193 files changed, 50751 insertions, 1395 deletions
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 45436bfc6dff..a3647352bff6 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -2,12 +2,20 @@
 
 menu "Crypto library routines"
 
+config CRYPTO_HASH_INFO
+	bool
+
 config CRYPTO_LIB_UTILS
 	tristate
 
 config CRYPTO_LIB_AES
 	tristate
 
+config CRYPTO_LIB_AESCFB
+	tristate
+	select CRYPTO_LIB_AES
+	select CRYPTO_LIB_UTILS
+
 config CRYPTO_LIB_AESGCM
 	tristate
 	select CRYPTO_LIB_AES
@@ -20,122 +28,201 @@ config CRYPTO_LIB_ARC4
 config CRYPTO_LIB_GF128MUL
 	tristate
 
-config CRYPTO_ARCH_HAVE_LIB_BLAKE2S
-	bool
+config CRYPTO_LIB_BLAKE2B
+	tristate
 	help
-	  Declares whether the architecture provides an arch-specific
-	  accelerated implementation of the Blake2s library interface,
-	  either builtin or as a module.
+	  The BLAKE2b library functions.  Select this if your module uses any of
+	  the functions from <crypto/blake2b.h>.
 
-config CRYPTO_LIB_BLAKE2S_GENERIC
-	def_bool !CRYPTO_ARCH_HAVE_LIB_BLAKE2S
-	help
-	  This symbol can be depended upon by arch implementations of the
-	  Blake2s library interface that require the generic code as a
-	  fallback, e.g., for SIMD implementations. If no arch specific
-	  implementation is enabled, this implementation serves the users
-	  of CRYPTO_LIB_BLAKE2S.
+config CRYPTO_LIB_BLAKE2B_ARCH
+	bool
+	depends on CRYPTO_LIB_BLAKE2B && !UML
+	default y if ARM && KERNEL_MODE_NEON
 
-config CRYPTO_ARCH_HAVE_LIB_CHACHA
-	tristate
-	help
-	  Declares whether the architecture provides an arch-specific
-	  accelerated implementation of the ChaCha library interface,
-	  either builtin or as a module.
+# BLAKE2s support is always built-in, so there's no CRYPTO_LIB_BLAKE2S option.
+
+config CRYPTO_LIB_BLAKE2S_ARCH
+	bool
+	depends on !UML
+	default y if ARM
+	default y if X86_64
 
-config CRYPTO_LIB_CHACHA_GENERIC
+config CRYPTO_LIB_CHACHA
 	tristate
 	select CRYPTO_LIB_UTILS
 	help
-	  This symbol can be depended upon by arch implementations of the
-	  ChaCha library interface that require the generic code as a
-	  fallback, e.g., for SIMD implementations. If no arch specific
-	  implementation is enabled, this implementation serves the users
-	  of CRYPTO_LIB_CHACHA.
+	  Enable the ChaCha library interface.  Select this if your module uses
+	  chacha_crypt() or hchacha_block().
 
-config CRYPTO_LIB_CHACHA
-	tristate "ChaCha library interface"
-	depends on CRYPTO_ARCH_HAVE_LIB_CHACHA || !CRYPTO_ARCH_HAVE_LIB_CHACHA
-	select CRYPTO_LIB_CHACHA_GENERIC if CRYPTO_ARCH_HAVE_LIB_CHACHA=n
-	help
-	  Enable the ChaCha library interface. This interface may be fulfilled
-	  by either the generic implementation or an arch-specific one, if one
-	  is available and enabled.
+config CRYPTO_LIB_CHACHA_ARCH
+	bool
+	depends on CRYPTO_LIB_CHACHA && !UML && !KMSAN
+	default y if ARM
+	default y if ARM64 && KERNEL_MODE_NEON
+	default y if MIPS && CPU_MIPS32_R2
+	default y if PPC64 && CPU_LITTLE_ENDIAN && VSX
+	default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+	default y if S390
+	default y if X86_64
 
-config CRYPTO_ARCH_HAVE_LIB_CURVE25519
+config CRYPTO_LIB_CURVE25519
 	tristate
+	select CRYPTO_LIB_UTILS
 	help
-	  Declares whether the architecture provides an arch-specific
-	  accelerated implementation of the Curve25519 library interface,
-	  either builtin or as a module.
+	  The Curve25519 library functions.  Select this if your module uses any
+	  of the functions from <crypto/curve25519.h>.
+
+config CRYPTO_LIB_CURVE25519_ARCH
+	bool
+	depends on CRYPTO_LIB_CURVE25519 && !UML && !KMSAN
+	default y if ARM && KERNEL_MODE_NEON && !CPU_BIG_ENDIAN
+	default y if PPC64 && CPU_LITTLE_ENDIAN
+	default y if X86_64
 
 config CRYPTO_LIB_CURVE25519_GENERIC
+	bool
+	depends on CRYPTO_LIB_CURVE25519
+	default y if !CRYPTO_LIB_CURVE25519_ARCH || ARM || X86_64
+
+config CRYPTO_LIB_DES
 	tristate
-	help
-	  This symbol can be depended upon by arch implementations of the
-	  Curve25519 library interface that require the generic code as a
-	  fallback, e.g., for SIMD implementations. If no arch specific
-	  implementation is enabled, this implementation serves the users
-	  of CRYPTO_LIB_CURVE25519.
 
-config CRYPTO_LIB_CURVE25519
-	tristate "Curve25519 scalar multiplication library"
-	depends on CRYPTO_ARCH_HAVE_LIB_CURVE25519 || !CRYPTO_ARCH_HAVE_LIB_CURVE25519
-	select CRYPTO_LIB_CURVE25519_GENERIC if CRYPTO_ARCH_HAVE_LIB_CURVE25519=n
-	select CRYPTO_LIB_UTILS
+config CRYPTO_LIB_MD5
+	tristate
 	help
-	  Enable the Curve25519 library interface. This interface may be
-	  fulfilled by either the generic implementation or an arch-specific
-	  one, if one is available and enabled.
+	  The MD5 and HMAC-MD5 library functions.  Select this if your module
+	  uses any of the functions from <crypto/md5.h>.
 
-config CRYPTO_LIB_DES
+config CRYPTO_LIB_MD5_ARCH
+	bool
+	depends on CRYPTO_LIB_MD5 && !UML
+	default y if MIPS && CPU_CAVIUM_OCTEON
+	default y if PPC
+	default y if SPARC64
+
+config CRYPTO_LIB_POLY1305
 	tristate
+	help
+	  The Poly1305 library functions.  Select this if your module uses any
+	  of the functions from <crypto/poly1305.h>.
+
+config CRYPTO_LIB_POLY1305_ARCH
+	bool
+	depends on CRYPTO_LIB_POLY1305 && !UML && !KMSAN
+	default y if ARM
+	default y if ARM64 && KERNEL_MODE_NEON
+	default y if MIPS
+	# The PPC64 code needs to be fixed to work in softirq context.
+	default y if PPC64 && CPU_LITTLE_ENDIAN && VSX && BROKEN
+	default y if RISCV
+	default y if X86_64
+
+# This symbol controls the inclusion of the Poly1305 generic code.  This differs
+# from most of the other algorithms, which handle the generic code
+# "automatically" via __maybe_unused.  This is needed so that the Adiantum code,
+# which calls the poly1305_core_*() functions directly, can enable them.
+config CRYPTO_LIB_POLY1305_GENERIC
+	bool
+	depends on CRYPTO_LIB_POLY1305
+	# Enable if there's no arch impl or the arch impl requires the generic
+	# impl as a fallback.  (Or if selected explicitly.)
+	default y if !CRYPTO_LIB_POLY1305_ARCH || PPC64
 
 config CRYPTO_LIB_POLY1305_RSIZE
 	int
-	default 2 if MIPS
+	default 2 if MIPS || RISCV
 	default 11 if X86_64
 	default 9 if ARM || ARM64
 	default 1
 
-config CRYPTO_ARCH_HAVE_LIB_POLY1305
-	tristate
-	help
-	  Declares whether the architecture provides an arch-specific
-	  accelerated implementation of the Poly1305 library interface,
-	  either builtin or as a module.
-
-config CRYPTO_LIB_POLY1305_GENERIC
+config CRYPTO_LIB_POLYVAL
 	tristate
 	help
-	  This symbol can be depended upon by arch implementations of the
-	  Poly1305 library interface that require the generic code as a
-	  fallback, e.g., for SIMD implementations. If no arch specific
-	  implementation is enabled, this implementation serves the users
-	  of CRYPTO_LIB_POLY1305.
+	  The POLYVAL library functions.  Select this if your module uses any of
+	  the functions from <crypto/polyval.h>.
 
-config CRYPTO_LIB_POLY1305
-	tristate "Poly1305 library interface"
-	depends on CRYPTO_ARCH_HAVE_LIB_POLY1305 || !CRYPTO_ARCH_HAVE_LIB_POLY1305
-	select CRYPTO_LIB_POLY1305_GENERIC if CRYPTO_ARCH_HAVE_LIB_POLY1305=n
-	help
-	  Enable the Poly1305 library interface. This interface may be fulfilled
-	  by either the generic implementation or an arch-specific one, if one
-	  is available and enabled.
+config CRYPTO_LIB_POLYVAL_ARCH
+	bool
+	depends on CRYPTO_LIB_POLYVAL && !UML
+	default y if ARM64 && KERNEL_MODE_NEON
+	default y if X86_64
 
 config CRYPTO_LIB_CHACHA20POLY1305
-	tristate "ChaCha20-Poly1305 AEAD support (8-byte nonce library version)"
-	depends on CRYPTO_ARCH_HAVE_LIB_CHACHA || !CRYPTO_ARCH_HAVE_LIB_CHACHA
-	depends on CRYPTO_ARCH_HAVE_LIB_POLY1305 || !CRYPTO_ARCH_HAVE_LIB_POLY1305
-	depends on CRYPTO
+	tristate
 	select CRYPTO_LIB_CHACHA
 	select CRYPTO_LIB_POLY1305
-	select CRYPTO_ALGAPI
+	select CRYPTO_LIB_UTILS
 
 config CRYPTO_LIB_SHA1
 	tristate
+	help
+	  The SHA-1 and HMAC-SHA1 library functions.  Select this if your module
+	  uses any of the functions from <crypto/sha1.h>.
+
+config CRYPTO_LIB_SHA1_ARCH
+	bool
+	depends on CRYPTO_LIB_SHA1 && !UML
+	default y if ARM
+	default y if ARM64 && KERNEL_MODE_NEON
+	default y if MIPS && CPU_CAVIUM_OCTEON
+	default y if PPC
+	default y if S390
+	default y if SPARC64
+	default y if X86_64
 
 config CRYPTO_LIB_SHA256
 	tristate
+	help
+	  The SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions.
+	  Select this if your module uses any of these functions from
+	  <crypto/sha2.h>.
+
+config CRYPTO_LIB_SHA256_ARCH
+	bool
+	depends on CRYPTO_LIB_SHA256 && !UML
+	default y if ARM && !CPU_V7M
+	default y if ARM64
+	default y if MIPS && CPU_CAVIUM_OCTEON
+	default y if PPC && SPE
+	default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+	default y if S390
+	default y if SPARC64
+	default y if X86_64
+
+config CRYPTO_LIB_SHA512
+	tristate
+	help
+	  The SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions.
+	  Select this if your module uses any of these functions from
+	  <crypto/sha2.h>.
+
+config CRYPTO_LIB_SHA512_ARCH
+	bool
+	depends on CRYPTO_LIB_SHA512 && !UML
+	default y if ARM && !CPU_V7M
+	default y if ARM64
+	default y if MIPS && CPU_CAVIUM_OCTEON
+	default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+	default y if S390
+	default y if SPARC64
+	default y if X86_64
+
+config CRYPTO_LIB_SHA3
+	tristate
+	select CRYPTO_LIB_UTILS
+	help
+	  The SHA3 library functions.  Select this if your module uses any of
+	  the functions from <crypto/sha3.h>.
+
+config CRYPTO_LIB_SHA3_ARCH
+	bool
+	depends on CRYPTO_LIB_SHA3 && !UML
+	default y if ARM64 && KERNEL_MODE_NEON
+	default y if S390
+
+config CRYPTO_LIB_SM3
+	tristate
+
+source "lib/crypto/tests/Kconfig"
 
 endmenu
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 6ec2d4543d9c..b5346cebbb55 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -1,15 +1,26 @@
 # SPDX-License-Identifier: GPL-2.0
 
+aflags-thumb2-$(CONFIG_THUMB2_KERNEL)  := -U__thumb2__ -D__thumb2__=1
+
+quiet_cmd_perlasm = PERLASM $@
+      cmd_perlasm = $(PERL) $(<) > $(@)
+
+quiet_cmd_perlasm_with_args = PERLASM $@
+      cmd_perlasm_with_args = $(PERL) $(<) void $(@)
+
+obj-$(CONFIG_KUNIT)				+= tests/
+
+obj-$(CONFIG_CRYPTO_HASH_INFO)			+= hash_info.o
+
 obj-$(CONFIG_CRYPTO_LIB_UTILS)			+= libcryptoutils.o
 libcryptoutils-y				:= memneq.o utils.o
 
-# chacha is used by the /dev/random driver which is always builtin
-obj-y						+= chacha.o
-obj-$(CONFIG_CRYPTO_LIB_CHACHA_GENERIC)		+= libchacha.o
-
 obj-$(CONFIG_CRYPTO_LIB_AES)			+= libaes.o
 libaes-y					:= aes.o
 
+obj-$(CONFIG_CRYPTO_LIB_AESCFB)			+= libaescfb.o
+libaescfb-y					:= aescfb.o
+
 obj-$(CONFIG_CRYPTO_LIB_AESGCM)			+= libaesgcm.o
 libaesgcm-y					:= aesgcm.o
 
@@ -18,38 +29,282 @@ libarc4-y					:= arc4.o
 
 obj-$(CONFIG_CRYPTO_LIB_GF128MUL)		+= gf128mul.o
 
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_BLAKE2B) += libblake2b.o
+libblake2b-y := blake2b.o
+CFLAGS_blake2b.o := -Wframe-larger-than=4096 #  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105930
+ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2B_ARCH),y)
+CFLAGS_blake2b.o += -I$(src)/$(SRCARCH)
+libblake2b-$(CONFIG_ARM) += arm/blake2b-neon-core.o
+endif # CONFIG_CRYPTO_LIB_BLAKE2B_ARCH
+
+################################################################################
+
 # blake2s is used by the /dev/random driver which is always builtin
-obj-y						+= libblake2s.o
-libblake2s-y					:= blake2s.o
-libblake2s-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC)	+= blake2s-generic.o
+obj-y += blake2s.o
+ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2S_ARCH),y)
+CFLAGS_blake2s.o += -I$(src)/$(SRCARCH)
+obj-$(CONFIG_ARM) += arm/blake2s-core.o
+obj-$(CONFIG_X86) += x86/blake2s-core.o
+endif
+
+################################################################################
+
+# chacha20_block() is used by the /dev/random driver which is always builtin
+obj-y += chacha-block-generic.o
+
+obj-$(CONFIG_CRYPTO_LIB_CHACHA) += libchacha.o
+libchacha-y := chacha.o
+
+ifeq ($(CONFIG_CRYPTO_LIB_CHACHA_ARCH),y)
+CFLAGS_chacha.o += -I$(src)/$(SRCARCH)
+
+ifeq ($(CONFIG_ARM),y)
+libchacha-y += arm/chacha-scalar-core.o
+libchacha-$(CONFIG_KERNEL_MODE_NEON) += arm/chacha-neon-core.o
+endif
+
+libchacha-$(CONFIG_ARM64) += arm64/chacha-neon-core.o
+
+ifeq ($(CONFIG_MIPS),y)
+libchacha-y += mips/chacha-core.o
+AFLAGS_mips/chacha-core.o += -O2 # needed to fill branch delay slots
+endif
+
+libchacha-$(CONFIG_PPC) += powerpc/chacha-p10le-8x.o
+libchacha-$(CONFIG_RISCV) += riscv/chacha-riscv64-zvkb.o
+libchacha-$(CONFIG_S390) += s390/chacha-s390.o
+libchacha-$(CONFIG_X86) += x86/chacha-ssse3-x86_64.o \
+			   x86/chacha-avx2-x86_64.o \
+			   x86/chacha-avx512vl-x86_64.o
+endif # CONFIG_CRYPTO_LIB_CHACHA_ARCH
+
+################################################################################
 
 obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305)	+= libchacha20poly1305.o
 libchacha20poly1305-y				+= chacha20poly1305.o
+libchacha20poly1305-$(CONFIG_CRYPTO_SELFTESTS)	+= chacha20poly1305-selftest.o
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_CURVE25519) += libcurve25519.o
+libcurve25519-y := curve25519.o
+
+# Disable GCOV in odd or sensitive code
+GCOV_PROFILE_curve25519.o := n
+
+ifeq ($(CONFIG_ARCH_SUPPORTS_INT128),y)
+libcurve25519-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += curve25519-hacl64.o
+else
+libcurve25519-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += curve25519-fiat32.o
+endif
+# clang versions prior to 18 may blow out the stack with KASAN
+ifeq ($(CONFIG_CC_IS_CLANG)_$(call clang-min-version, 180000),y_)
+KASAN_SANITIZE_curve25519-hacl64.o := n
+endif
 
-obj-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC)	+= libcurve25519-generic.o
-libcurve25519-generic-y				:= curve25519-fiat32.o
-libcurve25519-generic-$(CONFIG_ARCH_SUPPORTS_INT128)	:= curve25519-hacl64.o
-libcurve25519-generic-y				+= curve25519-generic.o
+ifeq ($(CONFIG_CRYPTO_LIB_CURVE25519_ARCH),y)
+CFLAGS_curve25519.o += -I$(src)/$(SRCARCH)
+libcurve25519-$(CONFIG_ARM) += arm/curve25519-core.o
+libcurve25519-$(CONFIG_PPC) += powerpc/curve25519-ppc64le_asm.o
+endif
 
-obj-$(CONFIG_CRYPTO_LIB_CURVE25519)		+= libcurve25519.o
-libcurve25519-y					+= curve25519.o
+################################################################################
 
 obj-$(CONFIG_CRYPTO_LIB_DES)			+= libdes.o
 libdes-y					:= des.o
 
-obj-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC)	+= libpoly1305.o
-libpoly1305-y					:= poly1305-donna32.o
-libpoly1305-$(CONFIG_ARCH_SUPPORTS_INT128)	:= poly1305-donna64.o
-libpoly1305-y					+= poly1305.o
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_MD5) += libmd5.o
+libmd5-y := md5.o
+ifeq ($(CONFIG_CRYPTO_LIB_MD5_ARCH),y)
+CFLAGS_md5.o += -I$(src)/$(SRCARCH)
+libmd5-$(CONFIG_PPC) += powerpc/md5-asm.o
+libmd5-$(CONFIG_SPARC) += sparc/md5_asm.o
+endif # CONFIG_CRYPTO_LIB_MD5_ARCH
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_POLY1305) += libpoly1305.o
+libpoly1305-y := poly1305.o
+ifeq ($(CONFIG_ARCH_SUPPORTS_INT128),y)
+libpoly1305-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += poly1305-donna64.o
+else
+libpoly1305-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += poly1305-donna32.o
+endif
+
+ifeq ($(CONFIG_CRYPTO_LIB_POLY1305_ARCH),y)
+CFLAGS_poly1305.o += -I$(src)/$(SRCARCH)
+
+ifeq ($(CONFIG_ARM),y)
+libpoly1305-y += arm/poly1305-core.o
+$(obj)/arm/poly1305-core.S: $(src)/arm/poly1305-armv4.pl
+	$(call cmd,perlasm)
+# massage the perlasm code a bit so we only get the NEON routine if we need it
+poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
+poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
+AFLAGS_arm/poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y)
+endif
+
+ifeq ($(CONFIG_ARM64),y)
+libpoly1305-y += arm64/poly1305-core.o
+$(obj)/arm64/poly1305-core.S: $(src)/arm64/poly1305-armv8.pl
+	$(call cmd,perlasm_with_args)
+endif
+
+ifeq ($(CONFIG_MIPS),y)
+libpoly1305-y += mips/poly1305-core.o
+poly1305-perlasm-flavour-$(CONFIG_32BIT) := o32
+poly1305-perlasm-flavour-$(CONFIG_64BIT) := 64
+quiet_cmd_perlasm_poly1305 = PERLASM $@
+      cmd_perlasm_poly1305 = $(PERL) $< $(poly1305-perlasm-flavour-y) $@
+# Use if_changed instead of cmd, in case the flavour changed.
+$(obj)/mips/poly1305-core.S: $(src)/mips/poly1305-mips.pl FORCE
+	$(call if_changed,perlasm_poly1305)
+targets += mips/poly1305-core.S
+endif
+
+libpoly1305-$(CONFIG_PPC) += powerpc/poly1305-p10le_64.o
+
+ifeq ($(CONFIG_RISCV),y)
+libpoly1305-y += riscv/poly1305-core.o
+poly1305-perlasm-flavour-$(CONFIG_32BIT) := 32
+poly1305-perlasm-flavour-$(CONFIG_64BIT) := 64
+quiet_cmd_perlasm_poly1305 = PERLASM $@
+      cmd_perlasm_poly1305 = $(PERL) $< $(poly1305-perlasm-flavour-y) $@
+# Use if_changed instead of cmd, in case the flavour changed.
+$(obj)/riscv/poly1305-core.S: $(src)/riscv/poly1305-riscv.pl FORCE
+	$(call if_changed,perlasm_poly1305)
+targets += riscv/poly1305-core.S
+AFLAGS_riscv/poly1305-core.o += -Dpoly1305_init=poly1305_block_init
+endif
+
+ifeq ($(CONFIG_X86),y)
+libpoly1305-y += x86/poly1305-x86_64-cryptogams.o
+$(obj)/x86/poly1305-x86_64-cryptogams.S: $(src)/x86/poly1305-x86_64-cryptogams.pl
+	$(call cmd,perlasm)
+endif
+
+endif # CONFIG_CRYPTO_LIB_POLY1305_ARCH
+
+# clean-files must be defined unconditionally
+clean-files += arm/poly1305-core.S \
+	       arm64/poly1305-core.S \
+	       mips/poly1305-core.S \
+	       riscv/poly1305-core.S \
+	       x86/poly1305-x86_64-cryptogams.S
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_POLYVAL) += libpolyval.o
+libpolyval-y := polyval.o
+ifeq ($(CONFIG_CRYPTO_LIB_POLYVAL_ARCH),y)
+CFLAGS_polyval.o += -I$(src)/$(SRCARCH)
+libpolyval-$(CONFIG_ARM64) += arm64/polyval-ce-core.o
+libpolyval-$(CONFIG_X86) += x86/polyval-pclmul-avx.o
+endif
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o
+libsha1-y := sha1.o
+ifeq ($(CONFIG_CRYPTO_LIB_SHA1_ARCH),y)
+CFLAGS_sha1.o += -I$(src)/$(SRCARCH)
+ifeq ($(CONFIG_ARM),y)
+libsha1-y += arm/sha1-armv4-large.o
+libsha1-$(CONFIG_KERNEL_MODE_NEON) += arm/sha1-armv7-neon.o \
+				      arm/sha1-ce-core.o
+endif
+libsha1-$(CONFIG_ARM64) += arm64/sha1-ce-core.o
+ifeq ($(CONFIG_PPC),y)
+libsha1-y += powerpc/sha1-powerpc-asm.o
+libsha1-$(CONFIG_SPE) += powerpc/sha1-spe-asm.o
+endif
+libsha1-$(CONFIG_SPARC) += sparc/sha1_asm.o
+libsha1-$(CONFIG_X86) += x86/sha1-ssse3-and-avx.o \
+			 x86/sha1-avx2-asm.o \
+			 x86/sha1-ni-asm.o
+endif # CONFIG_CRYPTO_LIB_SHA1_ARCH
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o
+libsha256-y := sha256.o
+ifeq ($(CONFIG_CRYPTO_LIB_SHA256_ARCH),y)
+CFLAGS_sha256.o += -I$(src)/$(SRCARCH)
 
-obj-$(CONFIG_CRYPTO_LIB_SHA1)			+= libsha1.o
-libsha1-y					:= sha1.o
+ifeq ($(CONFIG_ARM),y)
+libsha256-y += arm/sha256-ce.o arm/sha256-core.o
+$(obj)/arm/sha256-core.S: $(src)/arm/sha256-armv4.pl
+	$(call cmd,perlasm)
+AFLAGS_arm/sha256-core.o += $(aflags-thumb2-y)
+endif
+
+ifeq ($(CONFIG_ARM64),y)
+libsha256-y += arm64/sha256-core.o
+$(obj)/arm64/sha256-core.S: $(src)/arm64/sha2-armv8.pl
+	$(call cmd,perlasm_with_args)
+libsha256-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha256-ce.o
+endif
 
-obj-$(CONFIG_CRYPTO_LIB_SHA256)			+= libsha256.o
-libsha256-y					:= sha256.o
+libsha256-$(CONFIG_PPC) += powerpc/sha256-spe-asm.o
+libsha256-$(CONFIG_RISCV) += riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.o
+libsha256-$(CONFIG_SPARC) += sparc/sha256_asm.o
+libsha256-$(CONFIG_X86) += x86/sha256-ssse3-asm.o \
+			   x86/sha256-avx-asm.o \
+			   x86/sha256-avx2-asm.o \
+			   x86/sha256-ni-asm.o
+endif # CONFIG_CRYPTO_LIB_SHA256_ARCH
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_SHA512) += libsha512.o
+libsha512-y := sha512.o
+ifeq ($(CONFIG_CRYPTO_LIB_SHA512_ARCH),y)
+CFLAGS_sha512.o += -I$(src)/$(SRCARCH)
+
+ifeq ($(CONFIG_ARM),y)
+libsha512-y += arm/sha512-core.o
+$(obj)/arm/sha512-core.S: $(src)/arm/sha512-armv4.pl
+	$(call cmd,perlasm)
+AFLAGS_arm/sha512-core.o += $(aflags-thumb2-y)
+endif
 
-ifneq ($(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS),y)
-libblake2s-y					+= blake2s-selftest.o
-libchacha20poly1305-y				+= chacha20poly1305-selftest.o
-libcurve25519-y					+= curve25519-selftest.o
+ifeq ($(CONFIG_ARM64),y)
+libsha512-y += arm64/sha512-core.o
+$(obj)/arm64/sha512-core.S: $(src)/arm64/sha2-armv8.pl
+	$(call cmd,perlasm_with_args)
+libsha512-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha512-ce-core.o
 endif
+
+libsha512-$(CONFIG_RISCV) += riscv/sha512-riscv64-zvknhb-zvkb.o
+libsha512-$(CONFIG_SPARC) += sparc/sha512_asm.o
+libsha512-$(CONFIG_X86) += x86/sha512-ssse3-asm.o \
+			   x86/sha512-avx-asm.o \
+			   x86/sha512-avx2-asm.o
+endif # CONFIG_CRYPTO_LIB_SHA512_ARCH
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_SHA3) += libsha3.o
+libsha3-y := sha3.o
+
+ifeq ($(CONFIG_CRYPTO_LIB_SHA3_ARCH),y)
+CFLAGS_sha3.o += -I$(src)/$(SRCARCH)
+libsha3-$(CONFIG_ARM64) += arm64/sha3-ce-core.o
+endif # CONFIG_CRYPTO_LIB_SHA3_ARCH
+
+################################################################################
+
+obj-$(CONFIG_MPILIB) += mpi/
+
+obj-$(CONFIG_CRYPTO_SELFTESTS_FULL)		+= simd.o
+
+obj-$(CONFIG_CRYPTO_LIB_SM3)			+= libsm3.o
+libsm3-y					:= sm3.o
+
+# clean-files must be defined unconditionally
+clean-files += arm/sha256-core.S arm/sha512-core.S
+clean-files += arm64/sha256-core.S arm64/sha512-core.S
diff --git a/lib/crypto/aes.c b/lib/crypto/aes.c
index 827fe89922ff..b57fda3460f1 100644
--- a/lib/crypto/aes.c
+++ b/lib/crypto/aes.c
@@ -5,8 +5,9 @@
 
 #include <crypto/aes.h>
 #include <linux/crypto.h>
+#include <linux/export.h>
 #include <linux/module.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
 
 /*
  * Emit the sbox as volatile const to prevent the compiler from doing
diff --git a/lib/crypto/aescfb.c b/lib/crypto/aescfb.c
new file mode 100644
index 000000000000..0f294c8cbf3c
--- /dev/null
+++ b/lib/crypto/aescfb.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Minimal library implementation of AES in CFB mode
+ *
+ * Copyright 2023 Google LLC
+ */
+
+#include <crypto/aes.h>
+#include <crypto/algapi.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <asm/irqflags.h>
+
+static void aescfb_encrypt_block(const struct crypto_aes_ctx *ctx, void *dst,
+				 const void *src)
+{
+	unsigned long flags;
+
+	/*
+	 * In AES-CFB, the AES encryption operates on known 'plaintext' (the IV
+	 * and ciphertext), making it susceptible to timing attacks on the
+	 * encryption key. The AES library already mitigates this risk to some
+	 * extent by pulling the entire S-box into the caches before doing any
+	 * substitutions, but this strategy is more effective when running with
+	 * interrupts disabled.
+	 */
+	local_irq_save(flags);
+	aes_encrypt(ctx, dst, src);
+	local_irq_restore(flags);
+}
+
+/**
+ * aescfb_encrypt - Perform AES-CFB encryption on a block of data
+ *
+ * @ctx:	The AES-CFB key schedule
+ * @dst:	Pointer to the ciphertext output buffer
+ * @src:	Pointer the plaintext (may equal @dst for encryption in place)
+ * @len:	The size in bytes of the plaintext and ciphertext.
+ * @iv:		The initialization vector (IV) to use for this block of data
+ */
+void aescfb_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
+		    int len, const u8 iv[AES_BLOCK_SIZE])
+{
+	u8 ks[AES_BLOCK_SIZE];
+	const u8 *v = iv;
+
+	while (len > 0) {
+		aescfb_encrypt_block(ctx, ks, v);
+		crypto_xor_cpy(dst, src, ks, min(len, AES_BLOCK_SIZE));
+		v = dst;
+
+		dst += AES_BLOCK_SIZE;
+		src += AES_BLOCK_SIZE;
+		len -= AES_BLOCK_SIZE;
+	}
+
+	memzero_explicit(ks, sizeof(ks));
+}
+EXPORT_SYMBOL(aescfb_encrypt);
+
+/**
+ * aescfb_decrypt - Perform AES-CFB decryption on a block of data
+ *
+ * @ctx:	The AES-CFB key schedule
+ * @dst:	Pointer to the plaintext output buffer
+ * @src:	Pointer the ciphertext (may equal @dst for decryption in place)
+ * @len:	The size in bytes of the plaintext and ciphertext.
+ * @iv:		The initialization vector (IV) to use for this block of data
+ */
+void aescfb_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
+		    int len, const u8 iv[AES_BLOCK_SIZE])
+{
+	u8 ks[2][AES_BLOCK_SIZE];
+
+	aescfb_encrypt_block(ctx, ks[0], iv);
+
+	for (int i = 0; len > 0; i ^= 1) {
+		if (len > AES_BLOCK_SIZE)
+			/*
+			 * Generate the keystream for the next block before
+			 * performing the XOR, as that may update in place and
+			 * overwrite the ciphertext.
+			 */
+			aescfb_encrypt_block(ctx, ks[!i], src);
+
+		crypto_xor_cpy(dst, src, ks[i], min(len, AES_BLOCK_SIZE));
+
+		dst += AES_BLOCK_SIZE;
+		src += AES_BLOCK_SIZE;
+		len -= AES_BLOCK_SIZE;
+	}
+
+	memzero_explicit(ks, sizeof(ks));
+}
+EXPORT_SYMBOL(aescfb_decrypt);
+
+MODULE_DESCRIPTION("Generic AES-CFB library");
+MODULE_AUTHOR("Ard Biesheuvel <ardb@kernel.org>");
+MODULE_LICENSE("GPL");
+
+#ifdef CONFIG_CRYPTO_SELFTESTS
+
+/*
+ * Test code below. Vectors taken from crypto/testmgr.h
+ */
+
+static struct {
+	u8	ptext[64] __nonstring;
+	u8	ctext[64] __nonstring;
+
+	u8	key[AES_MAX_KEY_SIZE] __nonstring;
+	u8	iv[AES_BLOCK_SIZE] __nonstring;
+
+	int	klen;
+	int	len;
+} const aescfb_tv[] __initconst = {
+	{ /* From NIST SP800-38A */
+		.key    = "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+			  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.klen	= 16,
+		.iv	= "\x00\x01\x02\x03\x04\x05\x06\x07"
+			  "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+		.ptext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+			  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+			  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+			  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+			  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+			  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.ctext	= "\x3b\x3f\xd9\x2e\xb7\x2d\xad\x20"
+			  "\x33\x34\x49\xf8\xe8\x3c\xfb\x4a"
+			  "\xc8\xa6\x45\x37\xa0\xb3\xa9\x3f"
+			  "\xcd\xe3\xcd\xad\x9f\x1c\xe5\x8b"
+			  "\x26\x75\x1f\x67\xa3\xcb\xb1\x40"
+			  "\xb1\x80\x8c\xf1\x87\xa4\xf4\xdf"
+			  "\xc0\x4b\x05\x35\x7c\x5d\x1c\x0e"
+			  "\xea\xc4\xc6\x6f\x9f\xf7\xf2\xe6",
+		.len	= 64,
+	}, {
+		.key	= "\x8e\x73\xb0\xf7\xda\x0e\x64\x52"
+			  "\xc8\x10\xf3\x2b\x80\x90\x79\xe5"
+			  "\x62\xf8\xea\xd2\x52\x2c\x6b\x7b",
+		.klen	= 24,
+		.iv	= "\x00\x01\x02\x03\x04\x05\x06\x07"
+			  "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+		.ptext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+			  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+			  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+			  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+			  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+			  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.ctext	= "\xcd\xc8\x0d\x6f\xdd\xf1\x8c\xab"
+			  "\x34\xc2\x59\x09\xc9\x9a\x41\x74"
+			  "\x67\xce\x7f\x7f\x81\x17\x36\x21"
+			  "\x96\x1a\x2b\x70\x17\x1d\x3d\x7a"
+			  "\x2e\x1e\x8a\x1d\xd5\x9b\x88\xb1"
+			  "\xc8\xe6\x0f\xed\x1e\xfa\xc4\xc9"
+			  "\xc0\x5f\x9f\x9c\xa9\x83\x4f\xa0"
+			  "\x42\xae\x8f\xba\x58\x4b\x09\xff",
+		.len	= 64,
+	}, {
+		.key	= "\x60\x3d\xeb\x10\x15\xca\x71\xbe"
+			  "\x2b\x73\xae\xf0\x85\x7d\x77\x81"
+			  "\x1f\x35\x2c\x07\x3b\x61\x08\xd7"
+			  "\x2d\x98\x10\xa3\x09\x14\xdf\xf4",
+		.klen	= 32,
+		.iv	= "\x00\x01\x02\x03\x04\x05\x06\x07"
+			  "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+		.ptext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+			  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+			  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+			  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+			  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+			  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.ctext	= "\xdc\x7e\x84\xbf\xda\x79\x16\x4b"
+			  "\x7e\xcd\x84\x86\x98\x5d\x38\x60"
+			  "\x39\xff\xed\x14\x3b\x28\xb1\xc8"
+			  "\x32\x11\x3c\x63\x31\xe5\x40\x7b"
+			  "\xdf\x10\x13\x24\x15\xe5\x4b\x92"
+			  "\xa1\x3e\xd0\xa8\x26\x7a\xe2\xf9"
+			  "\x75\xa3\x85\x74\x1a\xb9\xce\xf8"
+			  "\x20\x31\x62\x3d\x55\xb1\xe4\x71",
+		.len	= 64,
+	}, { /* > 16 bytes, not a multiple of 16 bytes */
+		.key	= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+			  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.klen	= 16,
+		.iv	= "\x00\x01\x02\x03\x04\x05\x06\x07"
+			  "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+		.ptext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae",
+		.ctext	= "\x3b\x3f\xd9\x2e\xb7\x2d\xad\x20"
+			  "\x33\x34\x49\xf8\xe8\x3c\xfb\x4a"
+			  "\xc8",
+		.len	= 17,
+	}, { /* < 16 bytes */
+		.key	= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+			  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.klen	= 16,
+		.iv	= "\x00\x01\x02\x03\x04\x05\x06\x07"
+			  "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+		.ptext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f",
+		.ctext	= "\x3b\x3f\xd9\x2e\xb7\x2d\xad",
+		.len	= 7,
+	},
+};
+
+static int __init libaescfb_init(void)
+{
+	for (int i = 0; i < ARRAY_SIZE(aescfb_tv); i++) {
+		struct crypto_aes_ctx ctx;
+		u8 buf[64];
+
+		if (aes_expandkey(&ctx, aescfb_tv[i].key, aescfb_tv[i].klen)) {
+			pr_err("aes_expandkey() failed on vector %d\n", i);
+			return -ENODEV;
+		}
+
+		aescfb_encrypt(&ctx, buf, aescfb_tv[i].ptext, aescfb_tv[i].len,
+			       aescfb_tv[i].iv);
+		if (memcmp(buf, aescfb_tv[i].ctext, aescfb_tv[i].len)) {
+			pr_err("aescfb_encrypt() #1 failed on vector %d\n", i);
+			return -ENODEV;
+		}
+
+		/* decrypt in place */
+		aescfb_decrypt(&ctx, buf, buf, aescfb_tv[i].len, aescfb_tv[i].iv);
+		if (memcmp(buf, aescfb_tv[i].ptext, aescfb_tv[i].len)) {
+			pr_err("aescfb_decrypt() failed on vector %d\n", i);
+			return -ENODEV;
+		}
+
+		/* encrypt in place */
+		aescfb_encrypt(&ctx, buf, buf, aescfb_tv[i].len, aescfb_tv[i].iv);
+		if (memcmp(buf, aescfb_tv[i].ctext, aescfb_tv[i].len)) {
+			pr_err("aescfb_encrypt() #2 failed on vector %d\n", i);
+
+			return -ENODEV;
+		}
+
+	}
+	return 0;
+}
+module_init(libaescfb_init);
+
+static void __exit libaescfb_exit(void)
+{
+}
+module_exit(libaescfb_exit);
+#endif
diff --git a/lib/crypto/aesgcm.c b/lib/crypto/aesgcm.c
index c632d6e17af8..ac0b2fcfd606 100644
--- a/lib/crypto/aesgcm.c
+++ b/lib/crypto/aesgcm.c
@@ -5,12 +5,11 @@
  * Copyright 2022 Google LLC
  */
 
-#include <linux/module.h>
-
 #include <crypto/algapi.h>
 #include <crypto/gcm.h>
 #include <crypto/ghash.h>
-
+#include <linux/export.h>
+#include <linux/module.h>
 #include <asm/irqflags.h>
 
 static void aesgcm_encrypt_block(const struct crypto_aes_ctx *ctx, void *dst,
@@ -73,6 +72,19 @@ static void aesgcm_ghash(be128 *ghash, const be128 *key, const void *src,
 	}
 }
 
+/**
+ * aesgcm_mac - Generates the authentication tag using AES-GCM algorithm.
+ * @ctx: The data structure that will hold the AES-GCM key schedule
+ * @src: The input source data.
+ * @src_len: Length of the source data.
+ * @assoc: Points to the associated data.
+ * @assoc_len: Length of the associated data values.
+ * @ctr: Points to the counter value.
+ * @authtag: The output buffer for the authentication tag.
+ *
+ * It takes in the AES-GCM context, source data, associated data, counter value,
+ * and an output buffer for the authentication tag.
+ */
 static void aesgcm_mac(const struct aesgcm_ctx *ctx, const u8 *src, int src_len,
 		       const u8 *assoc, int assoc_len, __be32 *ctr, u8 *authtag)
 {
@@ -186,25 +198,25 @@ MODULE_DESCRIPTION("Generic AES-GCM library");
 MODULE_AUTHOR("Ard Biesheuvel <ardb@kernel.org>");
 MODULE_LICENSE("GPL");
 
-#ifndef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS
+#ifdef CONFIG_CRYPTO_SELFTESTS
 
 /*
  * Test code below. Vectors taken from crypto/testmgr.h
  */
 
-static const u8 __initconst ctext0[16] =
+static const u8 __initconst ctext0[16] __nonstring =
 	"\x58\xe2\xfc\xce\xfa\x7e\x30\x61"
 	"\x36\x7f\x1d\x57\xa4\xe7\x45\x5a";
 
 static const u8 __initconst ptext1[16];
 
-static const u8 __initconst ctext1[32] =
+static const u8 __initconst ctext1[32] __nonstring =
 	"\x03\x88\xda\xce\x60\xb6\xa3\x92"
 	"\xf3\x28\xc2\xb9\x71\xb2\xfe\x78"
 	"\xab\x6e\x47\xd4\x2c\xec\x13\xbd"
 	"\xf5\x3a\x67\xb2\x12\x57\xbd\xdf";
 
-static const u8 __initconst ptext2[64] =
+static const u8 __initconst ptext2[64] __nonstring =
 	"\xd9\x31\x32\x25\xf8\x84\x06\xe5"
 	"\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
 	"\x86\xa7\xa9\x53\x15\x34\xf7\xda"
@@ -214,7 +226,7 @@ static const u8 __initconst ptext2[64] =
 	"\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57"
 	"\xba\x63\x7b\x39\x1a\xaf\xd2\x55";
 
-static const u8 __initconst ctext2[80] =
+static const u8 __initconst ctext2[80] __nonstring =
 	"\x42\x83\x1e\xc2\x21\x77\x74\x24"
 	"\x4b\x72\x21\xb7\x84\xd0\xd4\x9c"
 	"\xe3\xaa\x21\x2f\x2c\x02\xa4\xe0"
@@ -226,7 +238,7 @@ static const u8 __initconst ctext2[80] =
 	"\x4d\x5c\x2a\xf3\x27\xcd\x64\xa6"
 	"\x2c\xf3\x5a\xbd\x2b\xa6\xfa\xb4";
 
-static const u8 __initconst ptext3[60] =
+static const u8 __initconst ptext3[60] __nonstring =
 	"\xd9\x31\x32\x25\xf8\x84\x06\xe5"
 	"\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
 	"\x86\xa7\xa9\x53\x15\x34\xf7\xda"
@@ -236,7 +248,7 @@ static const u8 __initconst ptext3[60] =
 	"\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57"
 	"\xba\x63\x7b\x39";
 
-static const u8 __initconst ctext3[76] =
+static const u8 __initconst ctext3[76] __nonstring =
 	"\x42\x83\x1e\xc2\x21\x77\x74\x24"
 	"\x4b\x72\x21\xb7\x84\xd0\xd4\x9c"
 	"\xe3\xaa\x21\x2f\x2c\x02\xa4\xe0"
@@ -248,17 +260,17 @@ static const u8 __initconst ctext3[76] =
 	"\x5b\xc9\x4f\xbc\x32\x21\xa5\xdb"
 	"\x94\xfa\xe9\x5a\xe7\x12\x1a\x47";
 
-static const u8 __initconst ctext4[16] =
+static const u8 __initconst ctext4[16] __nonstring =
 	"\xcd\x33\xb2\x8a\xc7\x73\xf7\x4b"
 	"\xa0\x0e\xd1\xf3\x12\x57\x24\x35";
 
-static const u8 __initconst ctext5[32] =
+static const u8 __initconst ctext5[32] __nonstring =
 	"\x98\xe7\x24\x7c\x07\xf0\xfe\x41"
 	"\x1c\x26\x7e\x43\x84\xb0\xf6\x00"
 	"\x2f\xf5\x8d\x80\x03\x39\x27\xab"
 	"\x8e\xf4\xd4\x58\x75\x14\xf0\xfb";
 
-static const u8 __initconst ptext6[64] =
+static const u8 __initconst ptext6[64] __nonstring =
 	"\xd9\x31\x32\x25\xf8\x84\x06\xe5"
 	"\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
 	"\x86\xa7\xa9\x53\x15\x34\xf7\xda"
@@ -268,7 +280,7 @@ static const u8 __initconst ptext6[64] =
 	"\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57"
 	"\xba\x63\x7b\x39\x1a\xaf\xd2\x55";
 
-static const u8 __initconst ctext6[80] =
+static const u8 __initconst ctext6[80] __nonstring =
 	"\x39\x80\xca\x0b\x3c\x00\xe8\x41"
 	"\xeb\x06\xfa\xc4\x87\x2a\x27\x57"
 	"\x85\x9e\x1c\xea\xa6\xef\xd9\x84"
@@ -280,17 +292,17 @@ static const u8 __initconst ctext6[80] =
 	"\x99\x24\xa7\xc8\x58\x73\x36\xbf"
 	"\xb1\x18\x02\x4d\xb8\x67\x4a\x14";
 
-static const u8 __initconst ctext7[16] =
+static const u8 __initconst ctext7[16] __nonstring =
 	"\x53\x0f\x8a\xfb\xc7\x45\x36\xb9"
 	"\xa9\x63\xb4\xf1\xc4\xcb\x73\x8b";
 
-static const u8 __initconst ctext8[32] =
+static const u8 __initconst ctext8[32] __nonstring =
 	"\xce\xa7\x40\x3d\x4d\x60\x6b\x6e"
 	"\x07\x4e\xc5\xd3\xba\xf3\x9d\x18"
 	"\xd0\xd1\xc8\xa7\x99\x99\x6b\xf0"
 	"\x26\x5b\x98\xb5\xd4\x8a\xb9\x19";
 
-static const u8 __initconst ptext9[64] =
+static const u8 __initconst ptext9[64] __nonstring =
 	"\xd9\x31\x32\x25\xf8\x84\x06\xe5"
 	"\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
 	"\x86\xa7\xa9\x53\x15\x34\xf7\xda"
@@ -300,7 +312,7 @@ static const u8 __initconst ptext9[64] =
 	"\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57"
 	"\xba\x63\x7b\x39\x1a\xaf\xd2\x55";
 
-static const u8 __initconst ctext9[80] =
+static const u8 __initconst ctext9[80] __nonstring =
 	"\x52\x2d\xc1\xf0\x99\x56\x7d\x07"
 	"\xf4\x7f\x37\xa3\x2a\x84\x42\x7d"
 	"\x64\x3a\x8c\xdc\xbf\xe5\xc0\xc9"
@@ -312,7 +324,7 @@ static const u8 __initconst ctext9[80] =
 	"\xb0\x94\xda\xc5\xd9\x34\x71\xbd"
 	"\xec\x1a\x50\x22\x70\xe3\xcc\x6c";
 
-static const u8 __initconst ptext10[60] =
+static const u8 __initconst ptext10[60] __nonstring =
 	"\xd9\x31\x32\x25\xf8\x84\x06\xe5"
 	"\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
 	"\x86\xa7\xa9\x53\x15\x34\xf7\xda"
@@ -322,7 +334,7 @@ static const u8 __initconst ptext10[60] =
 	"\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57"
 	"\xba\x63\x7b\x39";
 
-static const u8 __initconst ctext10[76] =
+static const u8 __initconst ctext10[76] __nonstring =
 	"\x52\x2d\xc1\xf0\x99\x56\x7d\x07"
 	"\xf4\x7f\x37\xa3\x2a\x84\x42\x7d"
 	"\x64\x3a\x8c\xdc\xbf\xe5\xc0\xc9"
@@ -334,7 +346,7 @@ static const u8 __initconst ctext10[76] =
 	"\x76\xfc\x6e\xce\x0f\x4e\x17\x68"
 	"\xcd\xdf\x88\x53\xbb\x2d\x55\x1b";
 
-static const u8 __initconst ptext11[60] =
+static const u8 __initconst ptext11[60] __nonstring =
 	"\xd9\x31\x32\x25\xf8\x84\x06\xe5"
 	"\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
 	"\x86\xa7\xa9\x53\x15\x34\xf7\xda"
@@ -344,7 +356,7 @@ static const u8 __initconst ptext11[60] =
 	"\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57"
 	"\xba\x63\x7b\x39";
 
-static const u8 __initconst ctext11[76] =
+static const u8 __initconst ctext11[76] __nonstring =
 	"\x39\x80\xca\x0b\x3c\x00\xe8\x41"
 	"\xeb\x06\xfa\xc4\x87\x2a\x27\x57"
 	"\x85\x9e\x1c\xea\xa6\xef\xd9\x84"
@@ -356,7 +368,7 @@ static const u8 __initconst ctext11[76] =
 	"\x25\x19\x49\x8e\x80\xf1\x47\x8f"
 	"\x37\xba\x55\xbd\x6d\x27\x61\x8c";
 
-static const u8 __initconst ptext12[719] =
+static const u8 __initconst ptext12[719] __nonstring =
 	"\x42\xc1\xcc\x08\x48\x6f\x41\x3f"
 	"\x2f\x11\x66\x8b\x2a\x16\xf0\xe0"
 	"\x58\x83\xf0\xc3\x70\x14\xc0\x5b"
@@ -448,7 +460,7 @@ static const u8 __initconst ptext12[719] =
 	"\x59\xfa\xfa\xaa\x44\x04\x01\xa7"
 	"\xa4\x78\xdb\x74\x3d\x8b\xb5";
 
-static const u8 __initconst ctext12[735] =
+static const u8 __initconst ctext12[735] __nonstring =
 	"\x84\x0b\xdb\xd5\xb7\xa8\xfe\x20"
 	"\xbb\xb1\x12\x7f\x41\xea\xb3\xc0"
 	"\xa2\xb4\x37\x19\x11\x58\xb6\x0b"
@@ -546,9 +558,9 @@ static struct {
 	const u8	*ptext;
 	const u8	*ctext;
 
-	u8		key[AES_MAX_KEY_SIZE];
-	u8		iv[GCM_AES_IV_SIZE];
-	u8		assoc[20];
+	u8		key[AES_MAX_KEY_SIZE] __nonstring;
+	u8		iv[GCM_AES_IV_SIZE] __nonstring;
+	u8		assoc[20] __nonstring;
 
 	int		klen;
 	int		clen;
@@ -684,7 +696,7 @@ static int __init libaesgcm_init(void)
 		u8 tagbuf[AES_BLOCK_SIZE];
 		int plen = aesgcm_tv[i].plen;
 		struct aesgcm_ctx ctx;
-		u8 buf[sizeof(ptext12)];
+		static u8 buf[sizeof(ptext12)];
 
 		if (aesgcm_expandkey(&ctx, aesgcm_tv[i].key, aesgcm_tv[i].klen,
 				     aesgcm_tv[i].clen - plen)) {
diff --git a/lib/crypto/arc4.c b/lib/crypto/arc4.c
index c2020f19c652..4e950e1e66d0 100644
--- a/lib/crypto/arc4.c
+++ b/lib/crypto/arc4.c
@@ -8,6 +8,7 @@
  */
 
 #include <crypto/arc4.h>
+#include <linux/export.h>
 #include <linux/module.h>
 
 int arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len)
@@ -71,4 +72,5 @@ void arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len)
 }
 EXPORT_SYMBOL(arc4_crypt);
 
+MODULE_DESCRIPTION("ARC4 Cipher Algorithm");
 MODULE_LICENSE("GPL");
diff --git a/lib/crypto/arm/.gitignore b/lib/crypto/arm/.gitignore
new file mode 100644
index 000000000000..f6c4e8ef80da
--- /dev/null
+++ b/lib/crypto/arm/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+poly1305-core.S
+sha256-core.S
+sha512-core.S
diff --git a/lib/crypto/arm/blake2b-neon-core.S b/lib/crypto/arm/blake2b-neon-core.S
new file mode 100644
index 000000000000..b55c37f0b88f
--- /dev/null
+++ b/lib/crypto/arm/blake2b-neon-core.S
@@ -0,0 +1,350 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * BLAKE2b digest algorithm optimized with ARM NEON instructions.  On ARM
+ * processors that have NEON support but not the ARMv8 Crypto Extensions,
+ * typically this BLAKE2b implementation is much faster than the SHA-2 family
+ * and slightly faster than SHA-1.
+ *
+ * Copyright 2020 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+	.text
+	.fpu		neon
+
+	// The arguments to blake2b_compress_neon()
+	CTX		.req	r0
+	DATA		.req	r1
+	NBLOCKS		.req	r2
+	INC		.req	r3
+
+	// Pointers to the rotation tables
+	ROR24_TABLE	.req	r4
+	ROR16_TABLE	.req	r5
+
+	// The original stack pointer
+	ORIG_SP		.req	r6
+
+	// NEON registers which contain the message words of the current block.
+	// M_0-M_3 are occasionally used for other purposes too.
+	M_0		.req	d16
+	M_1		.req	d17
+	M_2		.req	d18
+	M_3		.req	d19
+	M_4		.req	d20
+	M_5		.req	d21
+	M_6		.req	d22
+	M_7		.req	d23
+	M_8		.req	d24
+	M_9		.req	d25
+	M_10		.req	d26
+	M_11		.req	d27
+	M_12		.req	d28
+	M_13		.req	d29
+	M_14		.req	d30
+	M_15		.req	d31
+
+	.align		4
+	// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
+	// instruction.  This is the most efficient way to implement these
+	// rotation amounts with NEON.  (On Cortex-A53 it's the same speed as
+	// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
+.Lror24_table:
+	.byte		3, 4, 5, 6, 7, 0, 1, 2
+.Lror16_table:
+	.byte		2, 3, 4, 5, 6, 7, 0, 1
+	// The BLAKE2b initialization vector
+.Lblake2b_IV:
+	.quad		0x6a09e667f3bcc908, 0xbb67ae8584caa73b
+	.quad		0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
+	.quad		0x510e527fade682d1, 0x9b05688c2b3e6c1f
+	.quad		0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
+
+// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
+// NEON registers q0-q7.  The message block is in q8..q15 (M_0-M_15).  The stack
+// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
+// (M_0-M_3), so that they can be reloaded if they are used as temporary
+// registers.  The macro arguments s0-s15 give the order in which the message
+// words are used in this round.  'final' is 1 if this is the final round.
+.macro	_blake2b_round	s0, s1, s2, s3, s4, s5, s6, s7, \
+			s8, s9, s10, s11, s12, s13, s14, s15, final=0
+
+	// Mix the columns:
+	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
+	// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
+
+	// a += b + m[blake2b_sigma[r][2*i + 0]];
+	vadd.u64	q0, q0, q2
+	vadd.u64	q1, q1, q3
+	vadd.u64	d0, d0, M_\s0
+	vadd.u64	d1, d1, M_\s2
+	vadd.u64	d2, d2, M_\s4
+	vadd.u64	d3, d3, M_\s6
+
+	// d = ror64(d ^ a, 32);
+	veor		q6, q6, q0
+	veor		q7, q7, q1
+	vrev64.32	q6, q6
+	vrev64.32	q7, q7
+
+	// c += d;
+	vadd.u64	q4, q4, q6
+	vadd.u64	q5, q5, q7
+
+	// b = ror64(b ^ c, 24);
+	vld1.8		{M_0}, [ROR24_TABLE, :64]
+	veor		q2, q2, q4
+	veor		q3, q3, q5
+	vtbl.8		d4, {d4}, M_0
+	vtbl.8		d5, {d5}, M_0
+	vtbl.8		d6, {d6}, M_0
+	vtbl.8		d7, {d7}, M_0
+
+	// a += b + m[blake2b_sigma[r][2*i + 1]];
+	//
+	// M_0 got clobbered above, so we have to reload it if any of the four
+	// message words this step needs happens to be M_0.  Otherwise we don't
+	// need to reload it here, as it will just get clobbered again below.
+.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
+	vld1.8		{M_0}, [sp, :64]
+.endif
+	vadd.u64	q0, q0, q2
+	vadd.u64	q1, q1, q3
+	vadd.u64	d0, d0, M_\s1
+	vadd.u64	d1, d1, M_\s3
+	vadd.u64	d2, d2, M_\s5
+	vadd.u64	d3, d3, M_\s7
+
+	// d = ror64(d ^ a, 16);
+	vld1.8		{M_0}, [ROR16_TABLE, :64]
+	veor		q6, q6, q0
+	veor		q7, q7, q1
+	vtbl.8		d12, {d12}, M_0
+	vtbl.8		d13, {d13}, M_0
+	vtbl.8		d14, {d14}, M_0
+	vtbl.8		d15, {d15}, M_0
+
+	// c += d;
+	vadd.u64	q4, q4, q6
+	vadd.u64	q5, q5, q7
+
+	// b = ror64(b ^ c, 63);
+	//
+	// This rotation amount isn't a multiple of 8, so it has to be
+	// implemented using a pair of shifts, which requires temporary
+	// registers.  Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
+	veor		q8, q2, q4
+	veor		q9, q3, q5
+	vshr.u64	q2, q8, #63
+	vshr.u64	q3, q9, #63
+	vsli.u64	q2, q8, #1
+	vsli.u64	q3, q9, #1
+	vld1.8		{q8-q9}, [sp, :256]
+
+	// Mix the diagonals:
+	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
+	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
+	//
+	// There are two possible ways to do this: use 'vext' instructions to
+	// shift the rows of the matrix so that the diagonals become columns,
+	// and undo it afterwards; or just use 64-bit operations on 'd'
+	// registers instead of 128-bit operations on 'q' registers.  We use the
+	// latter approach, as it performs much better on Cortex-A7.
+
+	// a += b + m[blake2b_sigma[r][2*i + 0]];
+	vadd.u64	d0, d0, d5
+	vadd.u64	d1, d1, d6
+	vadd.u64	d2, d2, d7
+	vadd.u64	d3, d3, d4
+	vadd.u64	d0, d0, M_\s8
+	vadd.u64	d1, d1, M_\s10
+	vadd.u64	d2, d2, M_\s12
+	vadd.u64	d3, d3, M_\s14
+
+	// d = ror64(d ^ a, 32);
+	veor		d15, d15, d0
+	veor		d12, d12, d1
+	veor		d13, d13, d2
+	veor		d14, d14, d3
+	vrev64.32	d15, d15
+	vrev64.32	d12, d12
+	vrev64.32	d13, d13
+	vrev64.32	d14, d14
+
+	// c += d;
+	vadd.u64	d10, d10, d15
+	vadd.u64	d11, d11, d12
+	vadd.u64	d8, d8, d13
+	vadd.u64	d9, d9, d14
+
+	// b = ror64(b ^ c, 24);
+	vld1.8		{M_0}, [ROR24_TABLE, :64]
+	veor		d5, d5, d10
+	veor		d6, d6, d11
+	veor		d7, d7, d8
+	veor		d4, d4, d9
+	vtbl.8		d5, {d5}, M_0
+	vtbl.8		d6, {d6}, M_0
+	vtbl.8		d7, {d7}, M_0
+	vtbl.8		d4, {d4}, M_0
+
+	// a += b + m[blake2b_sigma[r][2*i + 1]];
+.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
+	vld1.8		{M_0}, [sp, :64]
+.endif
+	vadd.u64	d0, d0, d5
+	vadd.u64	d1, d1, d6
+	vadd.u64	d2, d2, d7
+	vadd.u64	d3, d3, d4
+	vadd.u64	d0, d0, M_\s9
+	vadd.u64	d1, d1, M_\s11
+	vadd.u64	d2, d2, M_\s13
+	vadd.u64	d3, d3, M_\s15
+
+	// d = ror64(d ^ a, 16);
+	vld1.8		{M_0}, [ROR16_TABLE, :64]
+	veor		d15, d15, d0
+	veor		d12, d12, d1
+	veor		d13, d13, d2
+	veor		d14, d14, d3
+	vtbl.8		d12, {d12}, M_0
+	vtbl.8		d13, {d13}, M_0
+	vtbl.8		d14, {d14}, M_0
+	vtbl.8		d15, {d15}, M_0
+
+	// c += d;
+	vadd.u64	d10, d10, d15
+	vadd.u64	d11, d11, d12
+	vadd.u64	d8, d8, d13
+	vadd.u64	d9, d9, d14
+
+	// b = ror64(b ^ c, 63);
+	veor		d16, d4, d9
+	veor		d17, d5, d10
+	veor		d18, d6, d11
+	veor		d19, d7, d8
+	vshr.u64	q2, q8, #63
+	vshr.u64	q3, q9, #63
+	vsli.u64	q2, q8, #1
+	vsli.u64	q3, q9, #1
+	// Reloading q8-q9 can be skipped on the final round.
+.if ! \final
+	vld1.8		{q8-q9}, [sp, :256]
+.endif
+.endm
+
+//
+// void blake2b_compress_neon(struct blake2b_ctx *ctx,
+//			      const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2b_ctx are used:
+//	u64 h[8];	(inout)
+//	u64 t[2];	(inout)
+//	u64 f[2];	(in)
+//
+	.align		5
+ENTRY(blake2b_compress_neon)
+	push		{r4-r10}
+
+	// Allocate a 32-byte stack buffer that is 32-byte aligned.
+	mov		ORIG_SP, sp
+	sub		ip, sp, #32
+	bic		ip, ip, #31
+	mov		sp, ip
+
+	adr		ROR24_TABLE, .Lror24_table
+	adr		ROR16_TABLE, .Lror16_table
+
+	mov		ip, CTX
+	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
+	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
+.Lnext_block:
+	  adr		r10, .Lblake2b_IV
+	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
+	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
+	  vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
+	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
+	  adds		r7, r7, INC		// Increment counter
+	bcs		.Lslow_inc_ctr
+	vmov.i32	d28[0], r7
+	vst1.64		{d28}, [ip]		// Update t[0]
+.Linc_ctr_done:
+
+	// Load the next message block and finish initializing the state matrix
+	// 'v'.  Fortunately, there are exactly enough NEON registers to fit the
+	// entire state matrix in q0-q7 and the entire message block in q8-15.
+	//
+	// However, _blake2b_round also needs some extra registers for rotates,
+	// so we have to spill some registers.  It's better to spill the message
+	// registers than the state registers, as the message doesn't change.
+	// Therefore we store a copy of the first 32 bytes of the message block
+	// (q8-q9) in an aligned buffer on the stack so that they can be
+	// reloaded when needed.  (We could just reload directly from the
+	// message buffer, but it's faster to use aligned loads.)
+	vld1.8		{q8-q9}, [DATA]!
+	  veor		q6, q6, q14	// v[12..13] = IV[4..5] ^ t[0..1]
+	vld1.8		{q10-q11}, [DATA]!
+	  veor		q7, q7, q15	// v[14..15] = IV[6..7] ^ f[0..1]
+	vld1.8		{q12-q13}, [DATA]!
+	vst1.8		{q8-q9}, [sp, :256]
+	  mov		ip, CTX
+	vld1.8		{q14-q15}, [DATA]!
+
+	// Execute the rounds.  Each round is provided the order in which it
+	// needs to use the message words.
+	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
+	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
+	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
+	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
+	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
+	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
+	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
+	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
+	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
+	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
+			final=1
+
+	// Fold the final state matrix into the hash chaining value:
+	//
+	//	for (i = 0; i < 8; i++)
+	//		h[i] ^= v[i] ^ v[i + 8];
+	//
+	  vld1.64	{q8-q9}, [ip]!		// Load old h[0..3]
+	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
+	veor		q1, q1, q5		// v[2..3] ^= v[10..11]
+	  vld1.64	{q10-q11}, [ip]		// Load old h[4..7]
+	veor		q2, q2, q6		// v[4..5] ^= v[12..13]
+	veor		q3, q3, q7		// v[6..7] ^= v[14..15]
+	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
+	veor		q1, q1, q9		// v[2..3] ^= h[2..3]
+	  mov		ip, CTX
+	  subs		NBLOCKS, NBLOCKS, #1	// nblocks--
+	  vst1.64	{q0-q1}, [ip]!		// Store new h[0..3]
+	veor		q2, q2, q10		// v[4..5] ^= h[4..5]
+	veor		q3, q3, q11		// v[6..7] ^= h[6..7]
+	  vst1.64	{q2-q3}, [ip]!		// Store new h[4..7]
+
+	// Advance to the next block, if there is one.
+	bne		.Lnext_block		// nblocks != 0?
+
+	mov		sp, ORIG_SP
+	pop		{r4-r10}
+	mov		pc, lr
+
+.Lslow_inc_ctr:
+	// Handle the case where the counter overflowed its low 32 bits, by
+	// carrying the overflow bit into the full 128-bit counter.
+	vmov		r9, r10, d29
+	adcs		r8, r8, #0
+	adcs		r9, r9, #0
+	adc		r10, r10, #0
+	vmov		d28, r7, r8
+	vmov		d29, r9, r10
+	vst1.64		{q14}, [ip]		// Update t[0] and t[1]
+	b		.Linc_ctr_done
+ENDPROC(blake2b_compress_neon)
diff --git a/lib/crypto/arm/blake2b.h b/lib/crypto/arm/blake2b.h
new file mode 100644
index 000000000000..5c76498521e6
--- /dev/null
+++ b/lib/crypto/arm/blake2b.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * BLAKE2b digest algorithm, NEON accelerated
+ *
+ * Copyright 2020 Google LLC
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+asmlinkage void blake2b_compress_neon(struct blake2b_ctx *ctx,
+				      const u8 *data, size_t nblocks, u32 inc);
+
+static void blake2b_compress(struct blake2b_ctx *ctx,
+			     const u8 *data, size_t nblocks, u32 inc)
+{
+	if (!static_branch_likely(&have_neon) || !may_use_simd()) {
+		blake2b_compress_generic(ctx, data, nblocks, inc);
+		return;
+	}
+	do {
+		const size_t blocks = min_t(size_t, nblocks,
+					    SZ_4K / BLAKE2B_BLOCK_SIZE);
+
+		scoped_ksimd()
+			blake2b_compress_neon(ctx, data, blocks, inc);
+
+		data += blocks * BLAKE2B_BLOCK_SIZE;
+		nblocks -= blocks;
+	} while (nblocks);
+}
+
+#define blake2b_mod_init_arch blake2b_mod_init_arch
+static void blake2b_mod_init_arch(void)
+{
+	if (elf_hwcap & HWCAP_NEON)
+		static_branch_enable(&have_neon);
+}
diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S
new file mode 100644
index 000000000000..933f0558b7cd
--- /dev/null
+++ b/lib/crypto/arm/blake2s-core.S
@@ -0,0 +1,309 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * BLAKE2s digest algorithm, ARM scalar implementation.  This is faster
+ * than the generic implementations of BLAKE2s and BLAKE2b, but slower
+ * than the NEON implementation of BLAKE2b.  There is no NEON
+ * implementation of BLAKE2s, since NEON doesn't really help with it.
+ *
+ * Copyright 2020 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	// Registers used to hold message words temporarily.  There aren't
+	// enough ARM registers to hold the whole message block, so we have to
+	// load the words on-demand.
+	M_0		.req	r12
+	M_1		.req	r14
+
+// The BLAKE2s initialization vector
+.Lblake2s_IV:
+	.word	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+	.word	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+
+.macro __ldrd		a, b, src, offset
+#if __LINUX_ARM_ARCH__ >= 6
+	ldrd		\a, \b, [\src, #\offset]
+#else
+	ldr		\a, [\src, #\offset]
+	ldr		\b, [\src, #\offset + 4]
+#endif
+.endm
+
+.macro __strd		a, b, dst, offset
+#if __LINUX_ARM_ARCH__ >= 6
+	strd		\a, \b, [\dst, #\offset]
+#else
+	str		\a, [\dst, #\offset]
+	str		\b, [\dst, #\offset + 4]
+#endif
+.endm
+
+.macro _le32_bswap	a, tmp
+#ifdef __ARMEB__
+	rev_l		\a, \tmp
+#endif
+.endm
+
+.macro _le32_bswap_8x	a, b, c, d, e, f, g, h,  tmp
+	_le32_bswap	\a, \tmp
+	_le32_bswap	\b, \tmp
+	_le32_bswap	\c, \tmp
+	_le32_bswap	\d, \tmp
+	_le32_bswap	\e, \tmp
+	_le32_bswap	\f, \tmp
+	_le32_bswap	\g, \tmp
+	_le32_bswap	\h, \tmp
+.endm
+
+// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
+// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
+// columns/diagonals.  s0-s1 are the word offsets to the message words the first
+// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
+// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
+//
+// Note that to save instructions, the rotations don't happen when the
+// pseudocode says they should, but rather they are delayed until the values are
+// used.  See the comment above _blake2s_round().
+.macro _blake2s_quarterround  a0, b0, c0, d0,  a1, b1, c1, d1,  s0, s1, s2, s3
+
+	ldr		M_0, [sp, #32 + 4 * \s0]
+	ldr		M_1, [sp, #32 + 4 * \s2]
+
+	// a += b + m[blake2s_sigma[r][2*i + 0]];
+	add		\a0, \a0, \b0, ror #brot
+	add		\a1, \a1, \b1, ror #brot
+	add		\a0, \a0, M_0
+	add		\a1, \a1, M_1
+
+	// d = ror32(d ^ a, 16);
+	eor		\d0, \a0, \d0, ror #drot
+	eor		\d1, \a1, \d1, ror #drot
+
+	// c += d;
+	add		\c0, \c0, \d0, ror #16
+	add		\c1, \c1, \d1, ror #16
+
+	// b = ror32(b ^ c, 12);
+	eor		\b0, \c0, \b0, ror #brot
+	eor		\b1, \c1, \b1, ror #brot
+
+	ldr		M_0, [sp, #32 + 4 * \s1]
+	ldr		M_1, [sp, #32 + 4 * \s3]
+
+	// a += b + m[blake2s_sigma[r][2*i + 1]];
+	add		\a0, \a0, \b0, ror #12
+	add		\a1, \a1, \b1, ror #12
+	add		\a0, \a0, M_0
+	add		\a1, \a1, M_1
+
+	// d = ror32(d ^ a, 8);
+	eor		\d0, \a0, \d0, ror#16
+	eor		\d1, \a1, \d1, ror#16
+
+	// c += d;
+	add		\c0, \c0, \d0, ror#8
+	add		\c1, \c1, \d1, ror#8
+
+	// b = ror32(b ^ c, 7);
+	eor		\b0, \c0, \b0, ror#12
+	eor		\b1, \c1, \b1, ror#12
+.endm
+
+// Execute one round of BLAKE2s by updating the state matrix v[0..15].  v[0..9]
+// are in r0..r9.  The stack pointer points to 8 bytes of scratch space for
+// spilling v[8..9], then to v[10..15], then to the message block.  r10-r12 and
+// r14 are free to use.  The macro arguments s0-s15 give the order in which the
+// message words are used in this round.
+//
+// All rotates are performed using the implicit rotate operand accepted by the
+// 'add' and 'eor' instructions.  This is faster than using explicit rotate
+// instructions.  To make this work, we allow the values in the second and last
+// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
+// wrong rotation amount.  The rotation amount is then fixed up just in time
+// when the values are used.  'brot' is the number of bits the values in row 'b'
+// need to be rotated right to arrive at the correct values, and 'drot'
+// similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
+// that they end up as (7, 8) after every round.
+.macro	_blake2s_round	s0, s1, s2, s3, s4, s5, s6, s7, \
+			s8, s9, s10, s11, s12, s13, s14, s15
+
+	// Mix first two columns:
+	// (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
+	__ldrd		r10, r11, sp, 16	// load v[12] and v[13]
+	_blake2s_quarterround	r0, r4, r8, r10,  r1, r5, r9, r11, \
+				\s0, \s1, \s2, \s3
+	__strd		r8, r9, sp, 0
+	__strd		r10, r11, sp, 16
+
+	// Mix second two columns:
+	// (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
+	__ldrd		r8, r9, sp, 8		// load v[10] and v[11]
+	__ldrd		r10, r11, sp, 24	// load v[14] and v[15]
+	_blake2s_quarterround	r2, r6, r8, r10,  r3, r7, r9, r11, \
+				\s4, \s5, \s6, \s7
+	str		r10, [sp, #24]		// store v[14]
+	// v[10], v[11], and v[15] are used below, so no need to store them yet.
+
+	.set brot, 7
+	.set drot, 8
+
+	// Mix first two diagonals:
+	// (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
+	ldr		r10, [sp, #16]		// load v[12]
+	_blake2s_quarterround	r0, r5, r8, r11,  r1, r6, r9, r10, \
+				\s8, \s9, \s10, \s11
+	__strd		r8, r9, sp, 8
+	str		r11, [sp, #28]
+	str		r10, [sp, #16]
+
+	// Mix second two diagonals:
+	// (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
+	__ldrd		r8, r9, sp, 0		// load v[8] and v[9]
+	__ldrd		r10, r11, sp, 20	// load v[13] and v[14]
+	_blake2s_quarterround	r2, r7, r8, r10,  r3, r4, r9, r11, \
+				\s12, \s13, \s14, \s15
+	__strd		r10, r11, sp, 20
+.endm
+
+//
+// void blake2s_compress(struct blake2s_ctx *ctx,
+//			 const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_ctx are used:
+//	u32 h[8];	(inout)
+//	u32 t[2];	(inout)
+//	u32 f[2];	(in)
+//
+	.align		5
+ENTRY(blake2s_compress)
+	push		{r0-r2,r4-r11,lr}	// keep this an even number
+
+.Lnext_block:
+	// r0 is 'ctx'
+	// r1 is 'data'
+	// r3 is 'inc'
+
+	// Load and increment the counter t[0..1].
+	__ldrd		r10, r11, r0, 32
+	adds		r10, r10, r3
+	adc		r11, r11, #0
+	__strd		r10, r11, r0, 32
+
+	// _blake2s_round is very short on registers, so copy the message block
+	// to the stack to save a register during the rounds.  This also has the
+	// advantage that misalignment only needs to be dealt with in one place.
+	sub		sp, sp, #64
+	mov		r12, sp
+	tst		r1, #3
+	bne		.Lcopy_block_misaligned
+	ldmia		r1!, {r2-r9}
+	_le32_bswap_8x	r2, r3, r4, r5, r6, r7, r8, r9,  r14
+	stmia		r12!, {r2-r9}
+	ldmia		r1!, {r2-r9}
+	_le32_bswap_8x	r2, r3, r4, r5, r6, r7, r8, r9,  r14
+	stmia		r12, {r2-r9}
+.Lcopy_block_done:
+	str		r1, [sp, #68]		// Update message pointer
+
+	// Calculate v[8..15].  Push v[10..15] onto the stack, and leave space
+	// for spilling v[8..9].  Leave v[8..9] in r8-r9.
+	mov		r14, r0			// r14 = ctx
+	adr		r12, .Lblake2s_IV
+	ldmia		r12!, {r8-r9}		// load IV[0..1]
+	__ldrd		r0, r1, r14, 40		// load f[0..1]
+	ldm		r12, {r2-r7}		// load IV[2..7]
+	eor		r4, r4, r10		// v[12] = IV[4] ^ t[0]
+	eor		r5, r5, r11		// v[13] = IV[5] ^ t[1]
+	eor		r6, r6, r0		// v[14] = IV[6] ^ f[0]
+	eor		r7, r7, r1		// v[15] = IV[7] ^ f[1]
+	push		{r2-r7}			// push v[10..15]
+	sub		sp, sp, #8		// leave space for v[8..9]
+
+	// Load h[0..7] == v[0..7].
+	ldm		r14, {r0-r7}
+
+	// Execute the rounds.  Each round is provided the order in which it
+	// needs to use the message words.
+	.set brot, 0
+	.set drot, 0
+	_blake2s_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+	_blake2s_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
+	_blake2s_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
+	_blake2s_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
+	_blake2s_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
+	_blake2s_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
+	_blake2s_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
+	_blake2s_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
+	_blake2s_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
+	_blake2s_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
+
+	// Fold the final state matrix into the hash chaining value:
+	//
+	//	for (i = 0; i < 8; i++)
+	//		h[i] ^= v[i] ^ v[i + 8];
+	//
+	ldr		r14, [sp, #96]		// r14 = &h[0]
+	add		sp, sp, #8		// v[8..9] are already loaded.
+	pop		{r10-r11}		// load v[10..11]
+	eor		r0, r0, r8
+	eor		r1, r1, r9
+	eor		r2, r2, r10
+	eor		r3, r3, r11
+	ldm		r14, {r8-r11}		// load h[0..3]
+	eor		r0, r0, r8
+	eor		r1, r1, r9
+	eor		r2, r2, r10
+	eor		r3, r3, r11
+	stmia		r14!, {r0-r3}		// store new h[0..3]
+	ldm		r14, {r0-r3}		// load old h[4..7]
+	pop		{r8-r11}		// load v[12..15]
+	eor		r0, r0, r4, ror #brot
+	eor		r1, r1, r5, ror #brot
+	eor		r2, r2, r6, ror #brot
+	eor		r3, r3, r7, ror #brot
+	eor		r0, r0, r8, ror #drot
+	eor		r1, r1, r9, ror #drot
+	eor		r2, r2, r10, ror #drot
+	eor		r3, r3, r11, ror #drot
+	  add		sp, sp, #64		// skip copy of message block
+	stm		r14, {r0-r3}		// store new h[4..7]
+
+	// Advance to the next block, if there is one.  Note that if there are
+	// multiple blocks, then 'inc' (the counter increment amount) must be
+	// 64.  So we can simply set it to 64 without re-loading it.
+	ldm		sp, {r0, r1, r2}	// load (ctx, data, nblocks)
+	mov		r3, #64			// set 'inc'
+	subs		r2, r2, #1		// nblocks--
+	str		r2, [sp, #8]
+	bne		.Lnext_block		// nblocks != 0?
+
+	pop		{r0-r2,r4-r11,pc}
+
+	// The next message block (pointed to by r1) isn't 4-byte aligned, so it
+	// can't be loaded using ldmia.  Copy it to the stack buffer (pointed to
+	// by r12) using an alternative method.  r2-r9 are free to use.
+.Lcopy_block_misaligned:
+	mov		r2, #64
+1:
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	ldr		r3, [r1], #4
+	_le32_bswap	r3, r4
+#else
+	ldrb		r3, [r1, #0]
+	ldrb		r4, [r1, #1]
+	ldrb		r5, [r1, #2]
+	ldrb		r6, [r1, #3]
+	add		r1, r1, #4
+	orr		r3, r3, r4, lsl #8
+	orr		r3, r3, r5, lsl #16
+	orr		r3, r3, r6, lsl #24
+#endif
+	subs		r2, r2, #4
+	str		r3, [r12], #4
+	bne		1b
+	b		.Lcopy_block_done
+ENDPROC(blake2s_compress)
diff --git a/lib/crypto/arm/blake2s.h b/lib/crypto/arm/blake2s.h
new file mode 100644
index 000000000000..42c04440c191
--- /dev/null
+++ b/lib/crypto/arm/blake2s.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+/* defined in blake2s-core.S */
+void blake2s_compress(struct blake2s_ctx *ctx,
+		      const u8 *data, size_t nblocks, u32 inc);
diff --git a/lib/crypto/arm/chacha-neon-core.S b/lib/crypto/arm/chacha-neon-core.S
new file mode 100644
index 000000000000..ddd62b6294a5
--- /dev/null
+++ b/lib/crypto/arm/chacha-neon-core.S
@@ -0,0 +1,643 @@
+/*
+ * ChaCha/HChaCha NEON helper functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+ /*
+  * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
+  *
+  * (a)  vshl.u32 + vsri.u32		(needs temporary register)
+  * (b)  vshl.u32 + vshr.u32 + vorr	(needs temporary register)
+  * (c)  vrev32.16			(16-bit rotations only)
+  * (d)  vtbl.8 + vtbl.8		(multiple of 8 bits rotations only,
+  *					 needs index vector)
+  *
+  * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
+  * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
+  * cycles of (b) on both Cortex-A7 and Cortex-A53.
+  *
+  * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
+  * and doesn't need a temporary register.
+  *
+  * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
+  * is twice as fast as (a), even when doing (a) on multiple registers
+  * simultaneously to eliminate the stall between vshl and vsri.  Also, it
+  * parallelizes better when temporary registers are scarce.
+  *
+  * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
+  * (a), so the need to load the rotation table actually makes the vtbl method
+  * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
+  * seems to be a good compromise to get a more significant speed boost on some
+  * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
+  */
+
+#include <linux/linkage.h>
+#include <asm/cache.h>
+
+	.text
+	.fpu		neon
+	.align		5
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers q0-q3.  It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * The round count is given in r3.
+ *
+ * Clobbers: r3, ip, q4-q5
+ */
+chacha_permute:
+
+	adr		ip, .Lrol8_table
+	vld1.8		{d10}, [ip, :64]
+
+.Ldoubleround:
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vadd.i32	q0, q0, q1
+	veor		q3, q3, q0
+	vrev32.16	q3, q3
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #12
+	vsri.u32	q1, q4, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vadd.i32	q0, q0, q1
+	veor		q3, q3, q0
+	vtbl.8		d6, {d6}, d10
+	vtbl.8		d7, {d7}, d10
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #7
+	vsri.u32	q1, q4, #25
+
+	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vext.8		q1, q1, q1, #4
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vext.8		q2, q2, q2, #8
+	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vext.8		q3, q3, q3, #12
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vadd.i32	q0, q0, q1
+	veor		q3, q3, q0
+	vrev32.16	q3, q3
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #12
+	vsri.u32	q1, q4, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vadd.i32	q0, q0, q1
+	veor		q3, q3, q0
+	vtbl.8		d6, {d6}, d10
+	vtbl.8		d7, {d7}, d10
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #7
+	vsri.u32	q1, q4, #25
+
+	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vext.8		q1, q1, q1, #12
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vext.8		q2, q2, q2, #8
+	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vext.8		q3, q3, q3, #4
+
+	subs		r3, r3, #2
+	bne		.Ldoubleround
+
+	bx		lr
+ENDPROC(chacha_permute)
+
+ENTRY(chacha_block_xor_neon)
+	// r0: Input state matrix, s
+	// r1: 1 data block output, o
+	// r2: 1 data block input, i
+	// r3: nrounds
+	push		{lr}
+
+	// x0..3 = s0..3
+	add		ip, r0, #0x20
+	vld1.32		{q0-q1}, [r0]
+	vld1.32		{q2-q3}, [ip]
+
+	vmov		q8, q0
+	vmov		q9, q1
+	vmov		q10, q2
+	vmov		q11, q3
+
+	bl		chacha_permute
+
+	add		ip, r2, #0x20
+	vld1.8		{q4-q5}, [r2]
+	vld1.8		{q6-q7}, [ip]
+
+	// o0 = i0 ^ (x0 + s0)
+	vadd.i32	q0, q0, q8
+	veor		q0, q0, q4
+
+	// o1 = i1 ^ (x1 + s1)
+	vadd.i32	q1, q1, q9
+	veor		q1, q1, q5
+
+	// o2 = i2 ^ (x2 + s2)
+	vadd.i32	q2, q2, q10
+	veor		q2, q2, q6
+
+	// o3 = i3 ^ (x3 + s3)
+	vadd.i32	q3, q3, q11
+	veor		q3, q3, q7
+
+	add		ip, r1, #0x20
+	vst1.8		{q0-q1}, [r1]
+	vst1.8		{q2-q3}, [ip]
+
+	pop		{pc}
+ENDPROC(chacha_block_xor_neon)
+
+ENTRY(hchacha_block_neon)
+	// r0: Input state matrix, s
+	// r1: output (8 32-bit words)
+	// r2: nrounds
+	push		{lr}
+
+	vld1.32		{q0-q1}, [r0]!
+	vld1.32		{q2-q3}, [r0]
+
+	mov		r3, r2
+	bl		chacha_permute
+
+	vst1.32		{q0}, [r1]!
+	vst1.32		{q3}, [r1]
+
+	pop		{pc}
+ENDPROC(hchacha_block_neon)
+
+	.align		4
+.Lctrinc:	.word	0, 1, 2, 3
+.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
+
+	.align		5
+ENTRY(chacha_4block_xor_neon)
+	push		{r4, lr}
+	mov		r4, sp			// preserve the stack pointer
+	sub		ip, sp, #0x20		// allocate a 32 byte buffer
+	bic		ip, ip, #0x1f		// aligned to 32 bytes
+	mov		sp, ip
+
+	// r0: Input state matrix, s
+	// r1: 4 data blocks output, o
+	// r2: 4 data blocks input, i
+	// r3: nrounds
+
+	//
+	// This function encrypts four consecutive ChaCha blocks by loading
+	// the state matrix in NEON registers four times. The algorithm performs
+	// each operation on the corresponding word of each state matrix, hence
+	// requires no word shuffling. The words are re-interleaved before the
+	// final addition of the original state and the XORing step.
+	//
+
+	// x0..15[0-3] = s0..15[0-3]
+	add		ip, r0, #0x20
+	vld1.32		{q0-q1}, [r0]
+	vld1.32		{q2-q3}, [ip]
+
+	adr		lr, .Lctrinc
+	vdup.32		q15, d7[1]
+	vdup.32		q14, d7[0]
+	vld1.32		{q4}, [lr, :128]
+	vdup.32		q13, d6[1]
+	vdup.32		q12, d6[0]
+	vdup.32		q11, d5[1]
+	vdup.32		q10, d5[0]
+	vadd.u32	q12, q12, q4		// x12 += counter values 0-3
+	vdup.32		q9, d4[1]
+	vdup.32		q8, d4[0]
+	vdup.32		q7, d3[1]
+	vdup.32		q6, d3[0]
+	vdup.32		q5, d2[1]
+	vdup.32		q4, d2[0]
+	vdup.32		q3, d1[1]
+	vdup.32		q2, d1[0]
+	vdup.32		q1, d0[1]
+	vdup.32		q0, d0[0]
+
+	adr		ip, .Lrol8_table
+	b		1f
+
+.Ldoubleround4:
+	vld1.32		{q8-q9}, [sp, :256]
+1:
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	vadd.i32	q0, q0, q4
+	vadd.i32	q1, q1, q5
+	vadd.i32	q2, q2, q6
+	vadd.i32	q3, q3, q7
+
+	veor		q12, q12, q0
+	veor		q13, q13, q1
+	veor		q14, q14, q2
+	veor		q15, q15, q3
+
+	vrev32.16	q12, q12
+	vrev32.16	q13, q13
+	vrev32.16	q14, q14
+	vrev32.16	q15, q15
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	vadd.i32	q8, q8, q12
+	vadd.i32	q9, q9, q13
+	vadd.i32	q10, q10, q14
+	vadd.i32	q11, q11, q15
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q4, q8
+	veor		q9, q5, q9
+	vshl.u32	q4, q8, #12
+	vshl.u32	q5, q9, #12
+	vsri.u32	q4, q8, #20
+	vsri.u32	q5, q9, #20
+
+	veor		q8, q6, q10
+	veor		q9, q7, q11
+	vshl.u32	q6, q8, #12
+	vshl.u32	q7, q9, #12
+	vsri.u32	q6, q8, #20
+	vsri.u32	q7, q9, #20
+
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	vld1.8		{d16}, [ip, :64]
+	vadd.i32	q0, q0, q4
+	vadd.i32	q1, q1, q5
+	vadd.i32	q2, q2, q6
+	vadd.i32	q3, q3, q7
+
+	veor		q12, q12, q0
+	veor		q13, q13, q1
+	veor		q14, q14, q2
+	veor		q15, q15, q3
+
+	vtbl.8		d24, {d24}, d16
+	vtbl.8		d25, {d25}, d16
+	vtbl.8		d26, {d26}, d16
+	vtbl.8		d27, {d27}, d16
+	vtbl.8		d28, {d28}, d16
+	vtbl.8		d29, {d29}, d16
+	vtbl.8		d30, {d30}, d16
+	vtbl.8		d31, {d31}, d16
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	vadd.i32	q8, q8, q12
+	vadd.i32	q9, q9, q13
+	vadd.i32	q10, q10, q14
+	vadd.i32	q11, q11, q15
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q4, q8
+	veor		q9, q5, q9
+	vshl.u32	q4, q8, #7
+	vshl.u32	q5, q9, #7
+	vsri.u32	q4, q8, #25
+	vsri.u32	q5, q9, #25
+
+	veor		q8, q6, q10
+	veor		q9, q7, q11
+	vshl.u32	q6, q8, #7
+	vshl.u32	q7, q9, #7
+	vsri.u32	q6, q8, #25
+	vsri.u32	q7, q9, #25
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	vadd.i32	q0, q0, q5
+	vadd.i32	q1, q1, q6
+	vadd.i32	q2, q2, q7
+	vadd.i32	q3, q3, q4
+
+	veor		q15, q15, q0
+	veor		q12, q12, q1
+	veor		q13, q13, q2
+	veor		q14, q14, q3
+
+	vrev32.16	q15, q15
+	vrev32.16	q12, q12
+	vrev32.16	q13, q13
+	vrev32.16	q14, q14
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	vadd.i32	q10, q10, q15
+	vadd.i32	q11, q11, q12
+	vadd.i32	q8, q8, q13
+	vadd.i32	q9, q9, q14
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q7, q8
+	veor		q9, q4, q9
+	vshl.u32	q7, q8, #12
+	vshl.u32	q4, q9, #12
+	vsri.u32	q7, q8, #20
+	vsri.u32	q4, q9, #20
+
+	veor		q8, q5, q10
+	veor		q9, q6, q11
+	vshl.u32	q5, q8, #12
+	vshl.u32	q6, q9, #12
+	vsri.u32	q5, q8, #20
+	vsri.u32	q6, q9, #20
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	vld1.8		{d16}, [ip, :64]
+	vadd.i32	q0, q0, q5
+	vadd.i32	q1, q1, q6
+	vadd.i32	q2, q2, q7
+	vadd.i32	q3, q3, q4
+
+	veor		q15, q15, q0
+	veor		q12, q12, q1
+	veor		q13, q13, q2
+	veor		q14, q14, q3
+
+	vtbl.8		d30, {d30}, d16
+	vtbl.8		d31, {d31}, d16
+	vtbl.8		d24, {d24}, d16
+	vtbl.8		d25, {d25}, d16
+	vtbl.8		d26, {d26}, d16
+	vtbl.8		d27, {d27}, d16
+	vtbl.8		d28, {d28}, d16
+	vtbl.8		d29, {d29}, d16
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	vadd.i32	q10, q10, q15
+	vadd.i32	q11, q11, q12
+	vadd.i32	q8, q8, q13
+	vadd.i32	q9, q9, q14
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q7, q8
+	veor		q9, q4, q9
+	vshl.u32	q7, q8, #7
+	vshl.u32	q4, q9, #7
+	vsri.u32	q7, q8, #25
+	vsri.u32	q4, q9, #25
+
+	veor		q8, q5, q10
+	veor		q9, q6, q11
+	vshl.u32	q5, q8, #7
+	vshl.u32	q6, q9, #7
+	vsri.u32	q5, q8, #25
+	vsri.u32	q6, q9, #25
+
+	subs		r3, r3, #2
+	bne		.Ldoubleround4
+
+	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
+	// x8..9[0-3] are on the stack.
+
+	// Re-interleave the words in the first two rows of each block (x0..7).
+	// Also add the counter values 0-3 to x12[0-3].
+	  vld1.32	{q8}, [lr, :128]	// load counter values 0-3
+	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
+	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
+	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
+	vzip.32		q6, q7			// => (6 7 6 7) (6 7 6 7)
+	  vadd.u32	q12, q8			// x12 += counter values 0-3
+	vswp		d1, d4
+	vswp		d3, d6
+	  vld1.32	{q8-q9}, [r0]!		// load s0..7
+	vswp		d9, d12
+	vswp		d11, d14
+
+	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
+	// after XORing the first 32 bytes.
+	vswp		q1, q4
+
+	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
+
+	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)
+	vadd.u32	q0, q0, q8
+	vadd.u32	q2, q2, q8
+	vadd.u32	q4, q4, q8
+	vadd.u32	q3, q3, q8
+
+	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)
+	vadd.u32	q1, q1, q9
+	vadd.u32	q6, q6, q9
+	vadd.u32	q5, q5, q9
+	vadd.u32	q7, q7, q9
+
+	// XOR first 32 bytes using keystream from first two rows of first block
+	vld1.8		{q8-q9}, [r2]!
+	veor		q8, q8, q0
+	veor		q9, q9, q1
+	vst1.8		{q8-q9}, [r1]!
+
+	// Re-interleave the words in the last two rows of each block (x8..15).
+	vld1.32		{q8-q9}, [sp, :256]
+	  mov		sp, r4		// restore original stack pointer
+	  ldr		r4, [r4, #8]	// load number of bytes
+	vzip.32		q12, q13	// => (12 13 12 13) (12 13 12 13)
+	vzip.32		q14, q15	// => (14 15 14 15) (14 15 14 15)
+	vzip.32		q8, q9		// => (8 9 8 9) (8 9 8 9)
+	vzip.32		q10, q11	// => (10 11 10 11) (10 11 10 11)
+	  vld1.32	{q0-q1}, [r0]	// load s8..15
+	vswp		d25, d28
+	vswp		d27, d30
+	vswp		d17, d20
+	vswp		d19, d22
+
+	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
+
+	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
+	vadd.u32	q8,  q8,  q0
+	vadd.u32	q10, q10, q0
+	vadd.u32	q9,  q9,  q0
+	vadd.u32	q11, q11, q0
+
+	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
+	vadd.u32	q12, q12, q1
+	vadd.u32	q14, q14, q1
+	vadd.u32	q13, q13, q1
+	vadd.u32	q15, q15, q1
+
+	// XOR the rest of the data with the keystream
+
+	vld1.8		{q0-q1}, [r2]!
+	subs		r4, r4, #96
+	veor		q0, q0, q8
+	veor		q1, q1, q12
+	ble		.Lle96
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	subs		r4, r4, #32
+	veor		q0, q0, q2
+	veor		q1, q1, q6
+	ble		.Lle128
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	subs		r4, r4, #32
+	veor		q0, q0, q10
+	veor		q1, q1, q14
+	ble		.Lle160
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	subs		r4, r4, #32
+	veor		q0, q0, q4
+	veor		q1, q1, q5
+	ble		.Lle192
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	subs		r4, r4, #32
+	veor		q0, q0, q9
+	veor		q1, q1, q13
+	ble		.Lle224
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	subs		r4, r4, #32
+	veor		q0, q0, q3
+	veor		q1, q1, q7
+	blt		.Llt256
+.Lout:
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]
+	veor		q0, q0, q11
+	veor		q1, q1, q15
+	vst1.8		{q0-q1}, [r1]
+
+	pop		{r4, pc}
+
+.Lle192:
+	vmov		q4, q9
+	vmov		q5, q13
+
+.Lle160:
+	// nothing to do
+
+.Lfinalblock:
+	// Process the final block if processing less than 4 full blocks.
+	// Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
+	// previous 32 byte output block that still needs to be written at
+	// [r1] in q0-q1.
+	beq		.Lfullblock
+
+.Lpartialblock:
+	adr		lr, .Lpermute + 32
+	add		r2, r2, r4
+	add		lr, lr, r4
+	add		r4, r4, r1
+
+	vld1.8		{q2-q3}, [lr]
+	vld1.8		{q6-q7}, [r2]
+
+	add		r4, r4, #32
+
+	vtbl.8		d4, {q4-q5}, d4
+	vtbl.8		d5, {q4-q5}, d5
+	vtbl.8		d6, {q4-q5}, d6
+	vtbl.8		d7, {q4-q5}, d7
+
+	veor		q6, q6, q2
+	veor		q7, q7, q3
+
+	vst1.8		{q6-q7}, [r4]	// overlapping stores
+	vst1.8		{q0-q1}, [r1]
+	pop		{r4, pc}
+
+.Lfullblock:
+	vmov		q11, q4
+	vmov		q15, q5
+	b		.Lout
+.Lle96:
+	vmov		q4, q2
+	vmov		q5, q6
+	b		.Lfinalblock
+.Lle128:
+	vmov		q4, q10
+	vmov		q5, q14
+	b		.Lfinalblock
+.Lle224:
+	vmov		q4, q3
+	vmov		q5, q7
+	b		.Lfinalblock
+.Llt256:
+	vmov		q4, q11
+	vmov		q5, q15
+	b		.Lpartialblock
+ENDPROC(chacha_4block_xor_neon)
+
+	.align		L1_CACHE_SHIFT
+.Lpermute:
+	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
diff --git a/lib/crypto/arm/chacha-scalar-core.S b/lib/crypto/arm/chacha-scalar-core.S
new file mode 100644
index 000000000000..4951df05c158
--- /dev/null
+++ b/lib/crypto/arm/chacha-scalar-core.S
@@ -0,0 +1,444 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Google, Inc.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * Design notes:
+ *
+ * 16 registers would be needed to hold the state matrix, but only 14 are
+ * available because 'sp' and 'pc' cannot be used.  So we spill the elements
+ * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
+ * 'ldrd' and one 'strd' instruction per round.
+ *
+ * All rotates are performed using the implicit rotate operand accepted by the
+ * 'add' and 'eor' instructions.  This is faster than using explicit rotate
+ * instructions.  To make this work, we allow the values in the second and last
+ * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
+ * wrong rotation amount.  The rotation amount is then fixed up just in time
+ * when the values are used.  'brot' is the number of bits the values in row 'b'
+ * need to be rotated right to arrive at the correct values, and 'drot'
+ * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
+ * that they end up as (25, 24) after every round.
+ */
+
+	// ChaCha state registers
+	X0	.req	r0
+	X1	.req	r1
+	X2	.req	r2
+	X3	.req	r3
+	X4	.req	r4
+	X5	.req	r5
+	X6	.req	r6
+	X7	.req	r7
+	X8_X10	.req	r8	// shared by x8 and x10
+	X9_X11	.req	r9	// shared by x9 and x11
+	X12	.req	r10
+	X13	.req	r11
+	X14	.req	r12
+	X15	.req	r14
+
+.macro _le32_bswap_4x	a, b, c, d,  tmp
+#ifdef __ARMEB__
+	rev_l		\a,  \tmp
+	rev_l		\b,  \tmp
+	rev_l		\c,  \tmp
+	rev_l		\d,  \tmp
+#endif
+.endm
+
+.macro __ldrd		a, b, src, offset
+#if __LINUX_ARM_ARCH__ >= 6
+	ldrd		\a, \b, [\src, #\offset]
+#else
+	ldr		\a, [\src, #\offset]
+	ldr		\b, [\src, #\offset + 4]
+#endif
+.endm
+
+.macro __strd		a, b, dst, offset
+#if __LINUX_ARM_ARCH__ >= 6
+	strd		\a, \b, [\dst, #\offset]
+#else
+	str		\a, [\dst, #\offset]
+	str		\b, [\dst, #\offset + 4]
+#endif
+.endm
+
+.macro _halfround	a1, b1, c1, d1,  a2, b2, c2, d2
+
+	// a += b; d ^= a; d = rol(d, 16);
+	add		\a1, \a1, \b1, ror #brot
+	add		\a2, \a2, \b2, ror #brot
+	eor		\d1, \a1, \d1, ror #drot
+	eor		\d2, \a2, \d2, ror #drot
+	// drot == 32 - 16 == 16
+
+	// c += d; b ^= c; b = rol(b, 12);
+	add		\c1, \c1, \d1, ror #16
+	add		\c2, \c2, \d2, ror #16
+	eor		\b1, \c1, \b1, ror #brot
+	eor		\b2, \c2, \b2, ror #brot
+	// brot == 32 - 12 == 20
+
+	// a += b; d ^= a; d = rol(d, 8);
+	add		\a1, \a1, \b1, ror #20
+	add		\a2, \a2, \b2, ror #20
+	eor		\d1, \a1, \d1, ror #16
+	eor		\d2, \a2, \d2, ror #16
+	// drot == 32 - 8 == 24
+
+	// c += d; b ^= c; b = rol(b, 7);
+	add		\c1, \c1, \d1, ror #24
+	add		\c2, \c2, \d2, ror #24
+	eor		\b1, \c1, \b1, ror #20
+	eor		\b2, \c2, \b2, ror #20
+	// brot == 32 - 7 == 25
+.endm
+
+.macro _doubleround
+
+	// column round
+
+	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
+	_halfround	X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13
+
+	// save (x8, x9); restore (x10, x11)
+	__strd		X8_X10, X9_X11, sp, 0
+	__ldrd		X8_X10, X9_X11, sp, 8
+
+	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
+	_halfround	X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15
+
+	.set brot, 25
+	.set drot, 24
+
+	// diagonal round
+
+	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
+	_halfround	X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12
+
+	// save (x10, x11); restore (x8, x9)
+	__strd		X8_X10, X9_X11, sp, 8
+	__ldrd		X8_X10, X9_X11, sp, 0
+
+	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
+	_halfround	X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
+.endm
+
+.macro _chacha_permute	nrounds
+	.set brot, 0
+	.set drot, 0
+	.rept \nrounds / 2
+	 _doubleround
+	.endr
+.endm
+
+.macro _chacha		nrounds
+
+.Lnext_block\@:
+	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
+	// Registers contain x0-x9,x12-x15.
+
+	// Do the core ChaCha permutation to update x0-x15.
+	_chacha_permute	\nrounds
+
+	add		sp, #8
+	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
+	// Registers contain x0-x9,x12-x15.
+	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
+	push		{X8_X10, X9_X11, X12, X13, X14, X15}
+
+	// Load (OUT, IN, LEN).
+	ldr		r14, [sp, #96]
+	ldr		r12, [sp, #100]
+	ldr		r11, [sp, #104]
+
+	orr		r10, r14, r12
+
+	// Use slow path if fewer than 64 bytes remain.
+	cmp		r11, #64
+	blt		.Lxor_slowpath\@
+
+	// Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
+	// ARMv6+, since ldmia and stmia (used below) still require alignment.
+	tst		r10, #3
+	bne		.Lxor_slowpath\@
+
+	// Fast path: XOR 64 bytes of aligned data.
+
+	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
+	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+	// x0-x3
+	__ldrd		r8, r9, sp, 32
+	__ldrd		r10, r11, sp, 40
+	add		X0, X0, r8
+	add		X1, X1, r9
+	add		X2, X2, r10
+	add		X3, X3, r11
+	_le32_bswap_4x	X0, X1, X2, X3,  r8
+	ldmia		r12!, {r8-r11}
+	eor		X0, X0, r8
+	eor		X1, X1, r9
+	eor		X2, X2, r10
+	eor		X3, X3, r11
+	stmia		r14!, {X0-X3}
+
+	// x4-x7
+	__ldrd		r8, r9, sp, 48
+	__ldrd		r10, r11, sp, 56
+	add		X4, r8, X4, ror #brot
+	add		X5, r9, X5, ror #brot
+	ldmia		r12!, {X0-X3}
+	add		X6, r10, X6, ror #brot
+	add		X7, r11, X7, ror #brot
+	_le32_bswap_4x	X4, X5, X6, X7,  r8
+	eor		X4, X4, X0
+	eor		X5, X5, X1
+	eor		X6, X6, X2
+	eor		X7, X7, X3
+	stmia		r14!, {X4-X7}
+
+	// x8-x15
+	pop		{r0-r7}			// (x8-x9,x12-x15,x10-x11)
+	__ldrd		r8, r9, sp, 32
+	__ldrd		r10, r11, sp, 40
+	add		r0, r0, r8		// x8
+	add		r1, r1, r9		// x9
+	add		r6, r6, r10		// x10
+	add		r7, r7, r11		// x11
+	_le32_bswap_4x	r0, r1, r6, r7,  r8
+	ldmia		r12!, {r8-r11}
+	eor		r0, r0, r8		// x8
+	eor		r1, r1, r9		// x9
+	eor		r6, r6, r10		// x10
+	eor		r7, r7, r11		// x11
+	stmia		r14!, {r0,r1,r6,r7}
+	ldmia		r12!, {r0,r1,r6,r7}
+	__ldrd		r8, r9, sp, 48
+	__ldrd		r10, r11, sp, 56
+	add		r2, r8, r2, ror #drot	// x12
+	add		r3, r9, r3, ror #drot	// x13
+	add		r4, r10, r4, ror #drot	// x14
+	add		r5, r11, r5, ror #drot	// x15
+	_le32_bswap_4x	r2, r3, r4, r5,  r9
+	  ldr		r9, [sp, #72]		// load LEN
+	eor		r2, r2, r0		// x12
+	eor		r3, r3, r1		// x13
+	eor		r4, r4, r6		// x14
+	eor		r5, r5, r7		// x15
+	  subs		r9, #64			// decrement and check LEN
+	stmia		r14!, {r2-r5}
+
+	beq		.Ldone\@
+
+.Lprepare_for_next_block\@:
+
+	// Stack: x0-x15 OUT IN LEN
+
+	// Increment block counter (x12)
+	add		r8, #1
+
+	// Store updated (OUT, IN, LEN)
+	str		r14, [sp, #64]
+	str		r12, [sp, #68]
+	str		r9, [sp, #72]
+
+	  mov		r14, sp
+
+	// Store updated block counter (x12)
+	str		r8, [sp, #48]
+
+	  sub		sp, #16
+
+	// Reload state and do next block
+	ldmia		r14!, {r0-r11}		// load x0-x11
+	__strd		r10, r11, sp, 8		// store x10-x11 before state
+	ldmia		r14, {r10-r12,r14}	// load x12-x15
+	b		.Lnext_block\@
+
+.Lxor_slowpath\@:
+	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
+	// We handle it by storing the 64 bytes of keystream to the stack, then
+	// XOR-ing the needed portion with the data.
+
+	// Allocate keystream buffer
+	sub		sp, #64
+	mov		r14, sp
+
+	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
+	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+	// Save keystream for x0-x3
+	__ldrd		r8, r9, sp, 96
+	__ldrd		r10, r11, sp, 104
+	add		X0, X0, r8
+	add		X1, X1, r9
+	add		X2, X2, r10
+	add		X3, X3, r11
+	_le32_bswap_4x	X0, X1, X2, X3,  r8
+	stmia		r14!, {X0-X3}
+
+	// Save keystream for x4-x7
+	__ldrd		r8, r9, sp, 112
+	__ldrd		r10, r11, sp, 120
+	add		X4, r8, X4, ror #brot
+	add		X5, r9, X5, ror #brot
+	add		X6, r10, X6, ror #brot
+	add		X7, r11, X7, ror #brot
+	_le32_bswap_4x	X4, X5, X6, X7,  r8
+	  add		r8, sp, #64
+	stmia		r14!, {X4-X7}
+
+	// Save keystream for x8-x15
+	ldm		r8, {r0-r7}		// (x8-x9,x12-x15,x10-x11)
+	__ldrd		r8, r9, sp, 128
+	__ldrd		r10, r11, sp, 136
+	add		r0, r0, r8		// x8
+	add		r1, r1, r9		// x9
+	add		r6, r6, r10		// x10
+	add		r7, r7, r11		// x11
+	_le32_bswap_4x	r0, r1, r6, r7,  r8
+	stmia		r14!, {r0,r1,r6,r7}
+	__ldrd		r8, r9, sp, 144
+	__ldrd		r10, r11, sp, 152
+	add		r2, r8, r2, ror #drot	// x12
+	add		r3, r9, r3, ror #drot	// x13
+	add		r4, r10, r4, ror #drot	// x14
+	add		r5, r11, r5, ror #drot	// x15
+	_le32_bswap_4x	r2, r3, r4, r5,  r9
+	stmia		r14, {r2-r5}
+
+	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
+	// Registers: r8 is block counter, r12 is IN.
+
+	ldr		r9, [sp, #168]		// LEN
+	ldr		r14, [sp, #160]		// OUT
+	cmp		r9, #64
+	  mov		r0, sp
+	movle		r1, r9
+	movgt		r1, #64
+	// r1 is number of bytes to XOR, in range [1, 64]
+
+.if __LINUX_ARM_ARCH__ < 6
+	orr		r2, r12, r14
+	tst		r2, #3			// IN or OUT misaligned?
+	bne		.Lxor_next_byte\@
+.endif
+
+	// XOR a word at a time
+.rept 16
+	subs		r1, #4
+	blt		.Lxor_words_done\@
+	ldr		r2, [r12], #4
+	ldr		r3, [r0], #4
+	eor		r2, r2, r3
+	str		r2, [r14], #4
+.endr
+	b		.Lxor_slowpath_done\@
+.Lxor_words_done\@:
+	ands		r1, r1, #3
+	beq		.Lxor_slowpath_done\@
+
+	// XOR a byte at a time
+.Lxor_next_byte\@:
+	ldrb		r2, [r12], #1
+	ldrb		r3, [r0], #1
+	eor		r2, r2, r3
+	strb		r2, [r14], #1
+	subs		r1, #1
+	bne		.Lxor_next_byte\@
+
+.Lxor_slowpath_done\@:
+	subs		r9, #64
+	add		sp, #96
+	bgt		.Lprepare_for_next_block\@
+
+.Ldone\@:
+.endm	// _chacha
+
+/*
+ * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
+ *		     const struct chacha_state *state, int nrounds);
+ */
+ENTRY(chacha_doarm)
+	cmp		r2, #0			// len == 0?
+	reteq		lr
+
+	ldr		ip, [sp]
+	cmp		ip, #12
+
+	push		{r0-r2,r4-r11,lr}
+
+	// Push state x0-x15 onto stack.
+	// Also store an extra copy of x10-x11 just before the state.
+
+	add		X12, r3, #48
+	ldm		X12, {X12,X13,X14,X15}
+	push		{X12,X13,X14,X15}
+	sub		sp, sp, #64
+
+	__ldrd		X8_X10, X9_X11, r3, 40
+	__strd		X8_X10, X9_X11, sp, 8
+	__strd		X8_X10, X9_X11, sp, 56
+	ldm		r3, {X0-X9_X11}
+	__strd		X0, X1, sp, 16
+	__strd		X2, X3, sp, 24
+	__strd		X4, X5, sp, 32
+	__strd		X6, X7, sp, 40
+	__strd		X8_X10, X9_X11, sp, 48
+
+	beq		1f
+	_chacha		20
+
+0:	add		sp, #76
+	pop		{r4-r11, pc}
+
+1:	_chacha		12
+	b		0b
+ENDPROC(chacha_doarm)
+
+/*
+ * void hchacha_block_arm(const struct chacha_state *state,
+ *			  u32 out[HCHACHA_OUT_WORDS], int nrounds);
+ */
+ENTRY(hchacha_block_arm)
+	push		{r1,r4-r11,lr}
+
+	cmp		r2, #12			// ChaCha12 ?
+
+	mov		r14, r0
+	ldmia		r14!, {r0-r11}		// load x0-x11
+	push		{r10-r11}		// store x10-x11 to stack
+	ldm		r14, {r10-r12,r14}	// load x12-x15
+	sub		sp, #8
+
+	beq		1f
+	_chacha_permute	20
+
+	// Skip over (unused0-unused1, x10-x11)
+0:	add		sp, #16
+
+	// Fix up rotations of x12-x15
+	ror		X12, X12, #drot
+	ror		X13, X13, #drot
+	  pop		{r4}			// load 'out'
+	ror		X14, X14, #drot
+	ror		X15, X15, #drot
+
+	// Store (x0-x3,x12-x15) to 'out'
+	stm		r4, {X0,X1,X2,X3,X12,X13,X14,X15}
+
+	pop		{r4-r11,pc}
+
+1:	_chacha_permute	12
+	b		0b
+ENDPROC(hchacha_block_arm)
diff --git a/lib/crypto/arm/chacha.h b/lib/crypto/arm/chacha.h
new file mode 100644
index 000000000000..836e49088e98
--- /dev/null
+++ b/lib/crypto/arm/chacha.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ChaCha and HChaCha functions (ARM optimized)
+ *
+ * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <crypto/internal/simd.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+#include <asm/cputype.h>
+#include <asm/hwcap.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
+				      u8 *dst, const u8 *src, int nrounds);
+asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
+				       u8 *dst, const u8 *src,
+				       int nrounds, unsigned int nbytes);
+asmlinkage void hchacha_block_arm(const struct chacha_state *state,
+				  u32 out[HCHACHA_OUT_WORDS], int nrounds);
+asmlinkage void hchacha_block_neon(const struct chacha_state *state,
+				   u32 out[HCHACHA_OUT_WORDS], int nrounds);
+
+asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
+			     const struct chacha_state *state, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
+
+static inline bool neon_usable(void)
+{
+	return static_branch_likely(&use_neon) && crypto_simd_usable();
+}
+
+static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
+			  unsigned int bytes, int nrounds)
+{
+	u8 buf[CHACHA_BLOCK_SIZE];
+
+	while (bytes > CHACHA_BLOCK_SIZE) {
+		unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
+
+		chacha_4block_xor_neon(state, dst, src, nrounds, l);
+		bytes -= l;
+		src += l;
+		dst += l;
+		state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
+	}
+	if (bytes) {
+		const u8 *s = src;
+		u8 *d = dst;
+
+		if (bytes != CHACHA_BLOCK_SIZE)
+			s = d = memcpy(buf, src, bytes);
+		chacha_block_xor_neon(state, d, s, nrounds);
+		if (d != dst)
+			memcpy(dst, buf, bytes);
+		state->x[12]++;
+	}
+}
+
+static void hchacha_block_arch(const struct chacha_state *state,
+			       u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
+		hchacha_block_arm(state, out, nrounds);
+	} else {
+		scoped_ksimd()
+			hchacha_block_neon(state, out, nrounds);
+	}
+}
+
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+			      const u8 *src, unsigned int bytes, int nrounds)
+{
+	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
+	    bytes <= CHACHA_BLOCK_SIZE) {
+		chacha_doarm(dst, src, bytes, state, nrounds);
+		state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
+		return;
+	}
+
+	do {
+		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+		scoped_ksimd()
+			chacha_doneon(state, dst, src, todo, nrounds);
+
+		bytes -= todo;
+		src += todo;
+		dst += todo;
+	} while (bytes);
+}
+
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
+{
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
+		switch (read_cpuid_part()) {
+		case ARM_CPU_PART_CORTEX_A7:
+		case ARM_CPU_PART_CORTEX_A5:
+			/*
+			 * The Cortex-A7 and Cortex-A5 do not perform well with
+			 * the NEON implementation but do incredibly with the
+			 * scalar one and use less power.
+			 */
+			break;
+		default:
+			static_branch_enable(&use_neon);
+		}
+	}
+}
diff --git a/lib/crypto/arm/curve25519-core.S b/lib/crypto/arm/curve25519-core.S
new file mode 100644
index 000000000000..b697fa5d059a
--- /dev/null
+++ b/lib/crypto/arm/curve25519-core.S
@@ -0,0 +1,2062 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
+ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
+ * manually reworked for use in kernel space.
+ */
+
+#include <linux/linkage.h>
+
+.text
+.arch armv7-a
+.fpu neon
+.align 4
+
+ENTRY(curve25519_neon)
+	push		{r4-r11, lr}
+	mov		ip, sp
+	sub		r3, sp, #704
+	and		r3, r3, #0xfffffff0
+	mov		sp, r3
+	movw		r4, #0
+	movw		r5, #254
+	vmov.i32	q0, #1
+	vshr.u64	q1, q0, #7
+	vshr.u64	q0, q0, #8
+	vmov.i32	d4, #19
+	vmov.i32	d5, #38
+	add		r6, sp, #480
+	vst1.8		{d2-d3}, [r6, : 128]!
+	vst1.8		{d0-d1}, [r6, : 128]!
+	vst1.8		{d4-d5}, [r6, : 128]
+	add		r6, r3, #0
+	vmov.i32	q2, #0
+	vst1.8		{d4-d5}, [r6, : 128]!
+	vst1.8		{d4-d5}, [r6, : 128]!
+	vst1.8		d4, [r6, : 64]
+	add		r6, r3, #0
+	movw		r7, #960
+	sub		r7, r7, #2
+	neg		r7, r7
+	sub		r7, r7, r7, LSL #7
+	str		r7, [r6]
+	add		r6, sp, #672
+	vld1.8		{d4-d5}, [r1]!
+	vld1.8		{d6-d7}, [r1]
+	vst1.8		{d4-d5}, [r6, : 128]!
+	vst1.8		{d6-d7}, [r6, : 128]
+	sub		r1, r6, #16
+	ldrb		r6, [r1]
+	and		r6, r6, #248
+	strb		r6, [r1]
+	ldrb		r6, [r1, #31]
+	and		r6, r6, #127
+	orr		r6, r6, #64
+	strb		r6, [r1, #31]
+	vmov.i64	q2, #0xffffffff
+	vshr.u64	q3, q2, #7
+	vshr.u64	q2, q2, #6
+	vld1.8		{d8}, [r2]
+	vld1.8		{d10}, [r2]
+	add		r2, r2, #6
+	vld1.8		{d12}, [r2]
+	vld1.8		{d14}, [r2]
+	add		r2, r2, #6
+	vld1.8		{d16}, [r2]
+	add		r2, r2, #4
+	vld1.8		{d18}, [r2]
+	vld1.8		{d20}, [r2]
+	add		r2, r2, #6
+	vld1.8		{d22}, [r2]
+	add		r2, r2, #2
+	vld1.8		{d24}, [r2]
+	vld1.8		{d26}, [r2]
+	vshr.u64	q5, q5, #26
+	vshr.u64	q6, q6, #3
+	vshr.u64	q7, q7, #29
+	vshr.u64	q8, q8, #6
+	vshr.u64	q10, q10, #25
+	vshr.u64	q11, q11, #3
+	vshr.u64	q12, q12, #12
+	vshr.u64	q13, q13, #38
+	vand		q4, q4, q2
+	vand		q6, q6, q2
+	vand		q8, q8, q2
+	vand		q10, q10, q2
+	vand		q2, q12, q2
+	vand		q5, q5, q3
+	vand		q7, q7, q3
+	vand		q9, q9, q3
+	vand		q11, q11, q3
+	vand		q3, q13, q3
+	add		r2, r3, #48
+	vadd.i64	q12, q4, q1
+	vadd.i64	q13, q10, q1
+	vshr.s64	q12, q12, #26
+	vshr.s64	q13, q13, #26
+	vadd.i64	q5, q5, q12
+	vshl.i64	q12, q12, #26
+	vadd.i64	q14, q5, q0
+	vadd.i64	q11, q11, q13
+	vshl.i64	q13, q13, #26
+	vadd.i64	q15, q11, q0
+	vsub.i64	q4, q4, q12
+	vshr.s64	q12, q14, #25
+	vsub.i64	q10, q10, q13
+	vshr.s64	q13, q15, #25
+	vadd.i64	q6, q6, q12
+	vshl.i64	q12, q12, #25
+	vadd.i64	q14, q6, q1
+	vadd.i64	q2, q2, q13
+	vsub.i64	q5, q5, q12
+	vshr.s64	q12, q14, #26
+	vshl.i64	q13, q13, #25
+	vadd.i64	q14, q2, q1
+	vadd.i64	q7, q7, q12
+	vshl.i64	q12, q12, #26
+	vadd.i64	q15, q7, q0
+	vsub.i64	q11, q11, q13
+	vshr.s64	q13, q14, #26
+	vsub.i64	q6, q6, q12
+	vshr.s64	q12, q15, #25
+	vadd.i64	q3, q3, q13
+	vshl.i64	q13, q13, #26
+	vadd.i64	q14, q3, q0
+	vadd.i64	q8, q8, q12
+	vshl.i64	q12, q12, #25
+	vadd.i64	q15, q8, q1
+	add		r2, r2, #8
+	vsub.i64	q2, q2, q13
+	vshr.s64	q13, q14, #25
+	vsub.i64	q7, q7, q12
+	vshr.s64	q12, q15, #26
+	vadd.i64	q14, q13, q13
+	vadd.i64	q9, q9, q12
+	vtrn.32		d12, d14
+	vshl.i64	q12, q12, #26
+	vtrn.32		d13, d15
+	vadd.i64	q0, q9, q0
+	vadd.i64	q4, q4, q14
+	vst1.8		d12, [r2, : 64]!
+	vshl.i64	q6, q13, #4
+	vsub.i64	q7, q8, q12
+	vshr.s64	q0, q0, #25
+	vadd.i64	q4, q4, q6
+	vadd.i64	q6, q10, q0
+	vshl.i64	q0, q0, #25
+	vadd.i64	q8, q6, q1
+	vadd.i64	q4, q4, q13
+	vshl.i64	q10, q13, #25
+	vadd.i64	q1, q4, q1
+	vsub.i64	q0, q9, q0
+	vshr.s64	q8, q8, #26
+	vsub.i64	q3, q3, q10
+	vtrn.32		d14, d0
+	vshr.s64	q1, q1, #26
+	vtrn.32		d15, d1
+	vadd.i64	q0, q11, q8
+	vst1.8		d14, [r2, : 64]
+	vshl.i64	q7, q8, #26
+	vadd.i64	q5, q5, q1
+	vtrn.32		d4, d6
+	vshl.i64	q1, q1, #26
+	vtrn.32		d5, d7
+	vsub.i64	q3, q6, q7
+	add		r2, r2, #16
+	vsub.i64	q1, q4, q1
+	vst1.8		d4, [r2, : 64]
+	vtrn.32		d6, d0
+	vtrn.32		d7, d1
+	sub		r2, r2, #8
+	vtrn.32		d2, d10
+	vtrn.32		d3, d11
+	vst1.8		d6, [r2, : 64]
+	sub		r2, r2, #24
+	vst1.8		d2, [r2, : 64]
+	add		r2, r3, #96
+	vmov.i32	q0, #0
+	vmov.i64	d2, #0xff
+	vmov.i64	d3, #0
+	vshr.u32	q1, q1, #7
+	vst1.8		{d2-d3}, [r2, : 128]!
+	vst1.8		{d0-d1}, [r2, : 128]!
+	vst1.8		d0, [r2, : 64]
+	add		r2, r3, #144
+	vmov.i32	q0, #0
+	vst1.8		{d0-d1}, [r2, : 128]!
+	vst1.8		{d0-d1}, [r2, : 128]!
+	vst1.8		d0, [r2, : 64]
+	add		r2, r3, #240
+	vmov.i32	q0, #0
+	vmov.i64	d2, #0xff
+	vmov.i64	d3, #0
+	vshr.u32	q1, q1, #7
+	vst1.8		{d2-d3}, [r2, : 128]!
+	vst1.8		{d0-d1}, [r2, : 128]!
+	vst1.8		d0, [r2, : 64]
+	add		r2, r3, #48
+	add		r6, r3, #192
+	vld1.8		{d0-d1}, [r2, : 128]!
+	vld1.8		{d2-d3}, [r2, : 128]!
+	vld1.8		{d4}, [r2, : 64]
+	vst1.8		{d0-d1}, [r6, : 128]!
+	vst1.8		{d2-d3}, [r6, : 128]!
+	vst1.8		d4, [r6, : 64]
+.Lmainloop:
+	mov		r2, r5, LSR #3
+	and		r6, r5, #7
+	ldrb		r2, [r1, r2]
+	mov		r2, r2, LSR r6
+	and		r2, r2, #1
+	str		r5, [sp, #456]
+	eor		r4, r4, r2
+	str		r2, [sp, #460]
+	neg		r2, r4
+	add		r4, r3, #96
+	add		r5, r3, #192
+	add		r6, r3, #144
+	vld1.8		{d8-d9}, [r4, : 128]!
+	add		r7, r3, #240
+	vld1.8		{d10-d11}, [r5, : 128]!
+	veor		q6, q4, q5
+	vld1.8		{d14-d15}, [r6, : 128]!
+	vdup.i32	q8, r2
+	vld1.8		{d18-d19}, [r7, : 128]!
+	veor		q10, q7, q9
+	vld1.8		{d22-d23}, [r4, : 128]!
+	vand		q6, q6, q8
+	vld1.8		{d24-d25}, [r5, : 128]!
+	vand		q10, q10, q8
+	vld1.8		{d26-d27}, [r6, : 128]!
+	veor		q4, q4, q6
+	vld1.8		{d28-d29}, [r7, : 128]!
+	veor		q5, q5, q6
+	vld1.8		{d0}, [r4, : 64]
+	veor		q6, q7, q10
+	vld1.8		{d2}, [r5, : 64]
+	veor		q7, q9, q10
+	vld1.8		{d4}, [r6, : 64]
+	veor		q9, q11, q12
+	vld1.8		{d6}, [r7, : 64]
+	veor		q10, q0, q1
+	sub		r2, r4, #32
+	vand		q9, q9, q8
+	sub		r4, r5, #32
+	vand		q10, q10, q8
+	sub		r5, r6, #32
+	veor		q11, q11, q9
+	sub		r6, r7, #32
+	veor		q0, q0, q10
+	veor		q9, q12, q9
+	veor		q1, q1, q10
+	veor		q10, q13, q14
+	veor		q12, q2, q3
+	vand		q10, q10, q8
+	vand		q8, q12, q8
+	veor		q12, q13, q10
+	veor		q2, q2, q8
+	veor		q10, q14, q10
+	veor		q3, q3, q8
+	vadd.i32	q8, q4, q6
+	vsub.i32	q4, q4, q6
+	vst1.8		{d16-d17}, [r2, : 128]!
+	vadd.i32	q6, q11, q12
+	vst1.8		{d8-d9}, [r5, : 128]!
+	vsub.i32	q4, q11, q12
+	vst1.8		{d12-d13}, [r2, : 128]!
+	vadd.i32	q6, q0, q2
+	vst1.8		{d8-d9}, [r5, : 128]!
+	vsub.i32	q0, q0, q2
+	vst1.8		d12, [r2, : 64]
+	vadd.i32	q2, q5, q7
+	vst1.8		d0, [r5, : 64]
+	vsub.i32	q0, q5, q7
+	vst1.8		{d4-d5}, [r4, : 128]!
+	vadd.i32	q2, q9, q10
+	vst1.8		{d0-d1}, [r6, : 128]!
+	vsub.i32	q0, q9, q10
+	vst1.8		{d4-d5}, [r4, : 128]!
+	vadd.i32	q2, q1, q3
+	vst1.8		{d0-d1}, [r6, : 128]!
+	vsub.i32	q0, q1, q3
+	vst1.8		d4, [r4, : 64]
+	vst1.8		d0, [r6, : 64]
+	add		r2, sp, #512
+	add		r4, r3, #96
+	add		r5, r3, #144
+	vld1.8		{d0-d1}, [r2, : 128]
+	vld1.8		{d2-d3}, [r4, : 128]!
+	vld1.8		{d4-d5}, [r5, : 128]!
+	vzip.i32	q1, q2
+	vld1.8		{d6-d7}, [r4, : 128]!
+	vld1.8		{d8-d9}, [r5, : 128]!
+	vshl.i32	q5, q1, #1
+	vzip.i32	q3, q4
+	vshl.i32	q6, q2, #1
+	vld1.8		{d14}, [r4, : 64]
+	vshl.i32	q8, q3, #1
+	vld1.8		{d15}, [r5, : 64]
+	vshl.i32	q9, q4, #1
+	vmul.i32	d21, d7, d1
+	vtrn.32		d14, d15
+	vmul.i32	q11, q4, q0
+	vmul.i32	q0, q7, q0
+	vmull.s32	q12, d2, d2
+	vmlal.s32	q12, d11, d1
+	vmlal.s32	q12, d12, d0
+	vmlal.s32	q12, d13, d23
+	vmlal.s32	q12, d16, d22
+	vmlal.s32	q12, d7, d21
+	vmull.s32	q10, d2, d11
+	vmlal.s32	q10, d4, d1
+	vmlal.s32	q10, d13, d0
+	vmlal.s32	q10, d6, d23
+	vmlal.s32	q10, d17, d22
+	vmull.s32	q13, d10, d4
+	vmlal.s32	q13, d11, d3
+	vmlal.s32	q13, d13, d1
+	vmlal.s32	q13, d16, d0
+	vmlal.s32	q13, d17, d23
+	vmlal.s32	q13, d8, d22
+	vmull.s32	q1, d10, d5
+	vmlal.s32	q1, d11, d4
+	vmlal.s32	q1, d6, d1
+	vmlal.s32	q1, d17, d0
+	vmlal.s32	q1, d8, d23
+	vmull.s32	q14, d10, d6
+	vmlal.s32	q14, d11, d13
+	vmlal.s32	q14, d4, d4
+	vmlal.s32	q14, d17, d1
+	vmlal.s32	q14, d18, d0
+	vmlal.s32	q14, d9, d23
+	vmull.s32	q11, d10, d7
+	vmlal.s32	q11, d11, d6
+	vmlal.s32	q11, d12, d5
+	vmlal.s32	q11, d8, d1
+	vmlal.s32	q11, d19, d0
+	vmull.s32	q15, d10, d8
+	vmlal.s32	q15, d11, d17
+	vmlal.s32	q15, d12, d6
+	vmlal.s32	q15, d13, d5
+	vmlal.s32	q15, d19, d1
+	vmlal.s32	q15, d14, d0
+	vmull.s32	q2, d10, d9
+	vmlal.s32	q2, d11, d8
+	vmlal.s32	q2, d12, d7
+	vmlal.s32	q2, d13, d6
+	vmlal.s32	q2, d14, d1
+	vmull.s32	q0, d15, d1
+	vmlal.s32	q0, d10, d14
+	vmlal.s32	q0, d11, d19
+	vmlal.s32	q0, d12, d8
+	vmlal.s32	q0, d13, d17
+	vmlal.s32	q0, d6, d6
+	add		r2, sp, #480
+	vld1.8		{d18-d19}, [r2, : 128]!
+	vmull.s32	q3, d16, d7
+	vmlal.s32	q3, d10, d15
+	vmlal.s32	q3, d11, d14
+	vmlal.s32	q3, d12, d9
+	vmlal.s32	q3, d13, d8
+	vld1.8		{d8-d9}, [r2, : 128]
+	vadd.i64	q5, q12, q9
+	vadd.i64	q6, q15, q9
+	vshr.s64	q5, q5, #26
+	vshr.s64	q6, q6, #26
+	vadd.i64	q7, q10, q5
+	vshl.i64	q5, q5, #26
+	vadd.i64	q8, q7, q4
+	vadd.i64	q2, q2, q6
+	vshl.i64	q6, q6, #26
+	vadd.i64	q10, q2, q4
+	vsub.i64	q5, q12, q5
+	vshr.s64	q8, q8, #25
+	vsub.i64	q6, q15, q6
+	vshr.s64	q10, q10, #25
+	vadd.i64	q12, q13, q8
+	vshl.i64	q8, q8, #25
+	vadd.i64	q13, q12, q9
+	vadd.i64	q0, q0, q10
+	vsub.i64	q7, q7, q8
+	vshr.s64	q8, q13, #26
+	vshl.i64	q10, q10, #25
+	vadd.i64	q13, q0, q9
+	vadd.i64	q1, q1, q8
+	vshl.i64	q8, q8, #26
+	vadd.i64	q15, q1, q4
+	vsub.i64	q2, q2, q10
+	vshr.s64	q10, q13, #26
+	vsub.i64	q8, q12, q8
+	vshr.s64	q12, q15, #25
+	vadd.i64	q3, q3, q10
+	vshl.i64	q10, q10, #26
+	vadd.i64	q13, q3, q4
+	vadd.i64	q14, q14, q12
+	add		r2, r3, #288
+	vshl.i64	q12, q12, #25
+	add		r4, r3, #336
+	vadd.i64	q15, q14, q9
+	add		r2, r2, #8
+	vsub.i64	q0, q0, q10
+	add		r4, r4, #8
+	vshr.s64	q10, q13, #25
+	vsub.i64	q1, q1, q12
+	vshr.s64	q12, q15, #26
+	vadd.i64	q13, q10, q10
+	vadd.i64	q11, q11, q12
+	vtrn.32		d16, d2
+	vshl.i64	q12, q12, #26
+	vtrn.32		d17, d3
+	vadd.i64	q1, q11, q4
+	vadd.i64	q4, q5, q13
+	vst1.8		d16, [r2, : 64]!
+	vshl.i64	q5, q10, #4
+	vst1.8		d17, [r4, : 64]!
+	vsub.i64	q8, q14, q12
+	vshr.s64	q1, q1, #25
+	vadd.i64	q4, q4, q5
+	vadd.i64	q5, q6, q1
+	vshl.i64	q1, q1, #25
+	vadd.i64	q6, q5, q9
+	vadd.i64	q4, q4, q10
+	vshl.i64	q10, q10, #25
+	vadd.i64	q9, q4, q9
+	vsub.i64	q1, q11, q1
+	vshr.s64	q6, q6, #26
+	vsub.i64	q3, q3, q10
+	vtrn.32		d16, d2
+	vshr.s64	q9, q9, #26
+	vtrn.32		d17, d3
+	vadd.i64	q1, q2, q6
+	vst1.8		d16, [r2, : 64]
+	vshl.i64	q2, q6, #26
+	vst1.8		d17, [r4, : 64]
+	vadd.i64	q6, q7, q9
+	vtrn.32		d0, d6
+	vshl.i64	q7, q9, #26
+	vtrn.32		d1, d7
+	vsub.i64	q2, q5, q2
+	add		r2, r2, #16
+	vsub.i64	q3, q4, q7
+	vst1.8		d0, [r2, : 64]
+	add		r4, r4, #16
+	vst1.8		d1, [r4, : 64]
+	vtrn.32		d4, d2
+	vtrn.32		d5, d3
+	sub		r2, r2, #8
+	sub		r4, r4, #8
+	vtrn.32		d6, d12
+	vtrn.32		d7, d13
+	vst1.8		d4, [r2, : 64]
+	vst1.8		d5, [r4, : 64]
+	sub		r2, r2, #24
+	sub		r4, r4, #24
+	vst1.8		d6, [r2, : 64]
+	vst1.8		d7, [r4, : 64]
+	add		r2, r3, #240
+	add		r4, r3, #96
+	vld1.8		{d0-d1}, [r4, : 128]!
+	vld1.8		{d2-d3}, [r4, : 128]!
+	vld1.8		{d4}, [r4, : 64]
+	add		r4, r3, #144
+	vld1.8		{d6-d7}, [r4, : 128]!
+	vtrn.32		q0, q3
+	vld1.8		{d8-d9}, [r4, : 128]!
+	vshl.i32	q5, q0, #4
+	vtrn.32		q1, q4
+	vshl.i32	q6, q3, #4
+	vadd.i32	q5, q5, q0
+	vadd.i32	q6, q6, q3
+	vshl.i32	q7, q1, #4
+	vld1.8		{d5}, [r4, : 64]
+	vshl.i32	q8, q4, #4
+	vtrn.32		d4, d5
+	vadd.i32	q7, q7, q1
+	vadd.i32	q8, q8, q4
+	vld1.8		{d18-d19}, [r2, : 128]!
+	vshl.i32	q10, q2, #4
+	vld1.8		{d22-d23}, [r2, : 128]!
+	vadd.i32	q10, q10, q2
+	vld1.8		{d24}, [r2, : 64]
+	vadd.i32	q5, q5, q0
+	add		r2, r3, #192
+	vld1.8		{d26-d27}, [r2, : 128]!
+	vadd.i32	q6, q6, q3
+	vld1.8		{d28-d29}, [r2, : 128]!
+	vadd.i32	q8, q8, q4
+	vld1.8		{d25}, [r2, : 64]
+	vadd.i32	q10, q10, q2
+	vtrn.32		q9, q13
+	vadd.i32	q7, q7, q1
+	vadd.i32	q5, q5, q0
+	vtrn.32		q11, q14
+	vadd.i32	q6, q6, q3
+	add		r2, sp, #528
+	vadd.i32	q10, q10, q2
+	vtrn.32		d24, d25
+	vst1.8		{d12-d13}, [r2, : 128]!
+	vshl.i32	q6, q13, #1
+	vst1.8		{d20-d21}, [r2, : 128]!
+	vshl.i32	q10, q14, #1
+	vst1.8		{d12-d13}, [r2, : 128]!
+	vshl.i32	q15, q12, #1
+	vadd.i32	q8, q8, q4
+	vext.32		d10, d31, d30, #0
+	vadd.i32	q7, q7, q1
+	vst1.8		{d16-d17}, [r2, : 128]!
+	vmull.s32	q8, d18, d5
+	vmlal.s32	q8, d26, d4
+	vmlal.s32	q8, d19, d9
+	vmlal.s32	q8, d27, d3
+	vmlal.s32	q8, d22, d8
+	vmlal.s32	q8, d28, d2
+	vmlal.s32	q8, d23, d7
+	vmlal.s32	q8, d29, d1
+	vmlal.s32	q8, d24, d6
+	vmlal.s32	q8, d25, d0
+	vst1.8		{d14-d15}, [r2, : 128]!
+	vmull.s32	q2, d18, d4
+	vmlal.s32	q2, d12, d9
+	vmlal.s32	q2, d13, d8
+	vmlal.s32	q2, d19, d3
+	vmlal.s32	q2, d22, d2
+	vmlal.s32	q2, d23, d1
+	vmlal.s32	q2, d24, d0
+	vst1.8		{d20-d21}, [r2, : 128]!
+	vmull.s32	q7, d18, d9
+	vmlal.s32	q7, d26, d3
+	vmlal.s32	q7, d19, d8
+	vmlal.s32	q7, d27, d2
+	vmlal.s32	q7, d22, d7
+	vmlal.s32	q7, d28, d1
+	vmlal.s32	q7, d23, d6
+	vmlal.s32	q7, d29, d0
+	vst1.8		{d10-d11}, [r2, : 128]!
+	vmull.s32	q5, d18, d3
+	vmlal.s32	q5, d19, d2
+	vmlal.s32	q5, d22, d1
+	vmlal.s32	q5, d23, d0
+	vmlal.s32	q5, d12, d8
+	vst1.8		{d16-d17}, [r2, : 128]
+	vmull.s32	q4, d18, d8
+	vmlal.s32	q4, d26, d2
+	vmlal.s32	q4, d19, d7
+	vmlal.s32	q4, d27, d1
+	vmlal.s32	q4, d22, d6
+	vmlal.s32	q4, d28, d0
+	vmull.s32	q8, d18, d7
+	vmlal.s32	q8, d26, d1
+	vmlal.s32	q8, d19, d6
+	vmlal.s32	q8, d27, d0
+	add		r2, sp, #544
+	vld1.8		{d20-d21}, [r2, : 128]
+	vmlal.s32	q7, d24, d21
+	vmlal.s32	q7, d25, d20
+	vmlal.s32	q4, d23, d21
+	vmlal.s32	q4, d29, d20
+	vmlal.s32	q8, d22, d21
+	vmlal.s32	q8, d28, d20
+	vmlal.s32	q5, d24, d20
+	vst1.8		{d14-d15}, [r2, : 128]
+	vmull.s32	q7, d18, d6
+	vmlal.s32	q7, d26, d0
+	add		r2, sp, #624
+	vld1.8		{d30-d31}, [r2, : 128]
+	vmlal.s32	q2, d30, d21
+	vmlal.s32	q7, d19, d21
+	vmlal.s32	q7, d27, d20
+	add		r2, sp, #592
+	vld1.8		{d26-d27}, [r2, : 128]
+	vmlal.s32	q4, d25, d27
+	vmlal.s32	q8, d29, d27
+	vmlal.s32	q8, d25, d26
+	vmlal.s32	q7, d28, d27
+	vmlal.s32	q7, d29, d26
+	add		r2, sp, #576
+	vld1.8		{d28-d29}, [r2, : 128]
+	vmlal.s32	q4, d24, d29
+	vmlal.s32	q8, d23, d29
+	vmlal.s32	q8, d24, d28
+	vmlal.s32	q7, d22, d29
+	vmlal.s32	q7, d23, d28
+	vst1.8		{d8-d9}, [r2, : 128]
+	add		r2, sp, #528
+	vld1.8		{d8-d9}, [r2, : 128]
+	vmlal.s32	q7, d24, d9
+	vmlal.s32	q7, d25, d31
+	vmull.s32	q1, d18, d2
+	vmlal.s32	q1, d19, d1
+	vmlal.s32	q1, d22, d0
+	vmlal.s32	q1, d24, d27
+	vmlal.s32	q1, d23, d20
+	vmlal.s32	q1, d12, d7
+	vmlal.s32	q1, d13, d6
+	vmull.s32	q6, d18, d1
+	vmlal.s32	q6, d19, d0
+	vmlal.s32	q6, d23, d27
+	vmlal.s32	q6, d22, d20
+	vmlal.s32	q6, d24, d26
+	vmull.s32	q0, d18, d0
+	vmlal.s32	q0, d22, d27
+	vmlal.s32	q0, d23, d26
+	vmlal.s32	q0, d24, d31
+	vmlal.s32	q0, d19, d20
+	add		r2, sp, #608
+	vld1.8		{d18-d19}, [r2, : 128]
+	vmlal.s32	q2, d18, d7
+	vmlal.s32	q5, d18, d6
+	vmlal.s32	q1, d18, d21
+	vmlal.s32	q0, d18, d28
+	vmlal.s32	q6, d18, d29
+	vmlal.s32	q2, d19, d6
+	vmlal.s32	q5, d19, d21
+	vmlal.s32	q1, d19, d29
+	vmlal.s32	q0, d19, d9
+	vmlal.s32	q6, d19, d28
+	add		r2, sp, #560
+	vld1.8		{d18-d19}, [r2, : 128]
+	add		r2, sp, #480
+	vld1.8		{d22-d23}, [r2, : 128]
+	vmlal.s32	q5, d19, d7
+	vmlal.s32	q0, d18, d21
+	vmlal.s32	q0, d19, d29
+	vmlal.s32	q6, d18, d6
+	add		r2, sp, #496
+	vld1.8		{d6-d7}, [r2, : 128]
+	vmlal.s32	q6, d19, d21
+	add		r2, sp, #544
+	vld1.8		{d18-d19}, [r2, : 128]
+	vmlal.s32	q0, d30, d8
+	add		r2, sp, #640
+	vld1.8		{d20-d21}, [r2, : 128]
+	vmlal.s32	q5, d30, d29
+	add		r2, sp, #576
+	vld1.8		{d24-d25}, [r2, : 128]
+	vmlal.s32	q1, d30, d28
+	vadd.i64	q13, q0, q11
+	vadd.i64	q14, q5, q11
+	vmlal.s32	q6, d30, d9
+	vshr.s64	q4, q13, #26
+	vshr.s64	q13, q14, #26
+	vadd.i64	q7, q7, q4
+	vshl.i64	q4, q4, #26
+	vadd.i64	q14, q7, q3
+	vadd.i64	q9, q9, q13
+	vshl.i64	q13, q13, #26
+	vadd.i64	q15, q9, q3
+	vsub.i64	q0, q0, q4
+	vshr.s64	q4, q14, #25
+	vsub.i64	q5, q5, q13
+	vshr.s64	q13, q15, #25
+	vadd.i64	q6, q6, q4
+	vshl.i64	q4, q4, #25
+	vadd.i64	q14, q6, q11
+	vadd.i64	q2, q2, q13
+	vsub.i64	q4, q7, q4
+	vshr.s64	q7, q14, #26
+	vshl.i64	q13, q13, #25
+	vadd.i64	q14, q2, q11
+	vadd.i64	q8, q8, q7
+	vshl.i64	q7, q7, #26
+	vadd.i64	q15, q8, q3
+	vsub.i64	q9, q9, q13
+	vshr.s64	q13, q14, #26
+	vsub.i64	q6, q6, q7
+	vshr.s64	q7, q15, #25
+	vadd.i64	q10, q10, q13
+	vshl.i64	q13, q13, #26
+	vadd.i64	q14, q10, q3
+	vadd.i64	q1, q1, q7
+	add		r2, r3, #144
+	vshl.i64	q7, q7, #25
+	add		r4, r3, #96
+	vadd.i64	q15, q1, q11
+	add		r2, r2, #8
+	vsub.i64	q2, q2, q13
+	add		r4, r4, #8
+	vshr.s64	q13, q14, #25
+	vsub.i64	q7, q8, q7
+	vshr.s64	q8, q15, #26
+	vadd.i64	q14, q13, q13
+	vadd.i64	q12, q12, q8
+	vtrn.32		d12, d14
+	vshl.i64	q8, q8, #26
+	vtrn.32		d13, d15
+	vadd.i64	q3, q12, q3
+	vadd.i64	q0, q0, q14
+	vst1.8		d12, [r2, : 64]!
+	vshl.i64	q7, q13, #4
+	vst1.8		d13, [r4, : 64]!
+	vsub.i64	q1, q1, q8
+	vshr.s64	q3, q3, #25
+	vadd.i64	q0, q0, q7
+	vadd.i64	q5, q5, q3
+	vshl.i64	q3, q3, #25
+	vadd.i64	q6, q5, q11
+	vadd.i64	q0, q0, q13
+	vshl.i64	q7, q13, #25
+	vadd.i64	q8, q0, q11
+	vsub.i64	q3, q12, q3
+	vshr.s64	q6, q6, #26
+	vsub.i64	q7, q10, q7
+	vtrn.32		d2, d6
+	vshr.s64	q8, q8, #26
+	vtrn.32		d3, d7
+	vadd.i64	q3, q9, q6
+	vst1.8		d2, [r2, : 64]
+	vshl.i64	q6, q6, #26
+	vst1.8		d3, [r4, : 64]
+	vadd.i64	q1, q4, q8
+	vtrn.32		d4, d14
+	vshl.i64	q4, q8, #26
+	vtrn.32		d5, d15
+	vsub.i64	q5, q5, q6
+	add		r2, r2, #16
+	vsub.i64	q0, q0, q4
+	vst1.8		d4, [r2, : 64]
+	add		r4, r4, #16
+	vst1.8		d5, [r4, : 64]
+	vtrn.32		d10, d6
+	vtrn.32		d11, d7
+	sub		r2, r2, #8
+	sub		r4, r4, #8
+	vtrn.32		d0, d2
+	vtrn.32		d1, d3
+	vst1.8		d10, [r2, : 64]
+	vst1.8		d11, [r4, : 64]
+	sub		r2, r2, #24
+	sub		r4, r4, #24
+	vst1.8		d0, [r2, : 64]
+	vst1.8		d1, [r4, : 64]
+	add		r2, r3, #288
+	add		r4, r3, #336
+	vld1.8		{d0-d1}, [r2, : 128]!
+	vld1.8		{d2-d3}, [r4, : 128]!
+	vsub.i32	q0, q0, q1
+	vld1.8		{d2-d3}, [r2, : 128]!
+	vld1.8		{d4-d5}, [r4, : 128]!
+	vsub.i32	q1, q1, q2
+	add		r5, r3, #240
+	vld1.8		{d4}, [r2, : 64]
+	vld1.8		{d6}, [r4, : 64]
+	vsub.i32	q2, q2, q3
+	vst1.8		{d0-d1}, [r5, : 128]!
+	vst1.8		{d2-d3}, [r5, : 128]!
+	vst1.8		d4, [r5, : 64]
+	add		r2, r3, #144
+	add		r4, r3, #96
+	add		r5, r3, #144
+	add		r6, r3, #192
+	vld1.8		{d0-d1}, [r2, : 128]!
+	vld1.8		{d2-d3}, [r4, : 128]!
+	vsub.i32	q2, q0, q1
+	vadd.i32	q0, q0, q1
+	vld1.8		{d2-d3}, [r2, : 128]!
+	vld1.8		{d6-d7}, [r4, : 128]!
+	vsub.i32	q4, q1, q3
+	vadd.i32	q1, q1, q3
+	vld1.8		{d6}, [r2, : 64]
+	vld1.8		{d10}, [r4, : 64]
+	vsub.i32	q6, q3, q5
+	vadd.i32	q3, q3, q5
+	vst1.8		{d4-d5}, [r5, : 128]!
+	vst1.8		{d0-d1}, [r6, : 128]!
+	vst1.8		{d8-d9}, [r5, : 128]!
+	vst1.8		{d2-d3}, [r6, : 128]!
+	vst1.8		d12, [r5, : 64]
+	vst1.8		d6, [r6, : 64]
+	add		r2, r3, #0
+	add		r4, r3, #240
+	vld1.8		{d0-d1}, [r4, : 128]!
+	vld1.8		{d2-d3}, [r4, : 128]!
+	vld1.8		{d4}, [r4, : 64]
+	add		r4, r3, #336
+	vld1.8		{d6-d7}, [r4, : 128]!
+	vtrn.32		q0, q3
+	vld1.8		{d8-d9}, [r4, : 128]!
+	vshl.i32	q5, q0, #4
+	vtrn.32		q1, q4
+	vshl.i32	q6, q3, #4
+	vadd.i32	q5, q5, q0
+	vadd.i32	q6, q6, q3
+	vshl.i32	q7, q1, #4
+	vld1.8		{d5}, [r4, : 64]
+	vshl.i32	q8, q4, #4
+	vtrn.32		d4, d5
+	vadd.i32	q7, q7, q1
+	vadd.i32	q8, q8, q4
+	vld1.8		{d18-d19}, [r2, : 128]!
+	vshl.i32	q10, q2, #4
+	vld1.8		{d22-d23}, [r2, : 128]!
+	vadd.i32	q10, q10, q2
+	vld1.8		{d24}, [r2, : 64]
+	vadd.i32	q5, q5, q0
+	add		r2, r3, #288
+	vld1.8		{d26-d27}, [r2, : 128]!
+	vadd.i32	q6, q6, q3
+	vld1.8		{d28-d29}, [r2, : 128]!
+	vadd.i32	q8, q8, q4
+	vld1.8		{d25}, [r2, : 64]
+	vadd.i32	q10, q10, q2
+	vtrn.32		q9, q13
+	vadd.i32	q7, q7, q1
+	vadd.i32	q5, q5, q0
+	vtrn.32		q11, q14
+	vadd.i32	q6, q6, q3
+	add		r2, sp, #528
+	vadd.i32	q10, q10, q2
+	vtrn.32		d24, d25
+	vst1.8		{d12-d13}, [r2, : 128]!
+	vshl.i32	q6, q13, #1
+	vst1.8		{d20-d21}, [r2, : 128]!
+	vshl.i32	q10, q14, #1
+	vst1.8		{d12-d13}, [r2, : 128]!
+	vshl.i32	q15, q12, #1
+	vadd.i32	q8, q8, q4
+	vext.32		d10, d31, d30, #0
+	vadd.i32	q7, q7, q1
+	vst1.8		{d16-d17}, [r2, : 128]!
+	vmull.s32	q8, d18, d5
+	vmlal.s32	q8, d26, d4
+	vmlal.s32	q8, d19, d9
+	vmlal.s32	q8, d27, d3
+	vmlal.s32	q8, d22, d8
+	vmlal.s32	q8, d28, d2
+	vmlal.s32	q8, d23, d7
+	vmlal.s32	q8, d29, d1
+	vmlal.s32	q8, d24, d6
+	vmlal.s32	q8, d25, d0
+	vst1.8		{d14-d15}, [r2, : 128]!
+	vmull.s32	q2, d18, d4
+	vmlal.s32	q2, d12, d9
+	vmlal.s32	q2, d13, d8
+	vmlal.s32	q2, d19, d3
+	vmlal.s32	q2, d22, d2
+	vmlal.s32	q2, d23, d1
+	vmlal.s32	q2, d24, d0
+	vst1.8		{d20-d21}, [r2, : 128]!
+	vmull.s32	q7, d18, d9
+	vmlal.s32	q7, d26, d3
+	vmlal.s32	q7, d19, d8
+	vmlal.s32	q7, d27, d2
+	vmlal.s32	q7, d22, d7
+	vmlal.s32	q7, d28, d1
+	vmlal.s32	q7, d23, d6
+	vmlal.s32	q7, d29, d0
+	vst1.8		{d10-d11}, [r2, : 128]!
+	vmull.s32	q5, d18, d3
+	vmlal.s32	q5, d19, d2
+	vmlal.s32	q5, d22, d1
+	vmlal.s32	q5, d23, d0
+	vmlal.s32	q5, d12, d8
+	vst1.8		{d16-d17}, [r2, : 128]!
+	vmull.s32	q4, d18, d8
+	vmlal.s32	q4, d26, d2
+	vmlal.s32	q4, d19, d7
+	vmlal.s32	q4, d27, d1
+	vmlal.s32	q4, d22, d6
+	vmlal.s32	q4, d28, d0
+	vmull.s32	q8, d18, d7
+	vmlal.s32	q8, d26, d1
+	vmlal.s32	q8, d19, d6
+	vmlal.s32	q8, d27, d0
+	add		r2, sp, #544
+	vld1.8		{d20-d21}, [r2, : 128]
+	vmlal.s32	q7, d24, d21
+	vmlal.s32	q7, d25, d20
+	vmlal.s32	q4, d23, d21
+	vmlal.s32	q4, d29, d20
+	vmlal.s32	q8, d22, d21
+	vmlal.s32	q8, d28, d20
+	vmlal.s32	q5, d24, d20
+	vst1.8		{d14-d15}, [r2, : 128]
+	vmull.s32	q7, d18, d6
+	vmlal.s32	q7, d26, d0
+	add		r2, sp, #624
+	vld1.8		{d30-d31}, [r2, : 128]
+	vmlal.s32	q2, d30, d21
+	vmlal.s32	q7, d19, d21
+	vmlal.s32	q7, d27, d20
+	add		r2, sp, #592
+	vld1.8		{d26-d27}, [r2, : 128]
+	vmlal.s32	q4, d25, d27
+	vmlal.s32	q8, d29, d27
+	vmlal.s32	q8, d25, d26
+	vmlal.s32	q7, d28, d27
+	vmlal.s32	q7, d29, d26
+	add		r2, sp, #576
+	vld1.8		{d28-d29}, [r2, : 128]
+	vmlal.s32	q4, d24, d29
+	vmlal.s32	q8, d23, d29
+	vmlal.s32	q8, d24, d28
+	vmlal.s32	q7, d22, d29
+	vmlal.s32	q7, d23, d28
+	vst1.8		{d8-d9}, [r2, : 128]
+	add		r2, sp, #528
+	vld1.8		{d8-d9}, [r2, : 128]
+	vmlal.s32	q7, d24, d9
+	vmlal.s32	q7, d25, d31
+	vmull.s32	q1, d18, d2
+	vmlal.s32	q1, d19, d1
+	vmlal.s32	q1, d22, d0
+	vmlal.s32	q1, d24, d27
+	vmlal.s32	q1, d23, d20
+	vmlal.s32	q1, d12, d7
+	vmlal.s32	q1, d13, d6
+	vmull.s32	q6, d18, d1
+	vmlal.s32	q6, d19, d0
+	vmlal.s32	q6, d23, d27
+	vmlal.s32	q6, d22, d20
+	vmlal.s32	q6, d24, d26
+	vmull.s32	q0, d18, d0
+	vmlal.s32	q0, d22, d27
+	vmlal.s32	q0, d23, d26
+	vmlal.s32	q0, d24, d31
+	vmlal.s32	q0, d19, d20
+	add		r2, sp, #608
+	vld1.8		{d18-d19}, [r2, : 128]
+	vmlal.s32	q2, d18, d7
+	vmlal.s32	q5, d18, d6
+	vmlal.s32	q1, d18, d21
+	vmlal.s32	q0, d18, d28
+	vmlal.s32	q6, d18, d29
+	vmlal.s32	q2, d19, d6
+	vmlal.s32	q5, d19, d21
+	vmlal.s32	q1, d19, d29
+	vmlal.s32	q0, d19, d9
+	vmlal.s32	q6, d19, d28
+	add		r2, sp, #560
+	vld1.8		{d18-d19}, [r2, : 128]
+	add		r2, sp, #480
+	vld1.8		{d22-d23}, [r2, : 128]
+	vmlal.s32	q5, d19, d7
+	vmlal.s32	q0, d18, d21
+	vmlal.s32	q0, d19, d29
+	vmlal.s32	q6, d18, d6
+	add		r2, sp, #496
+	vld1.8		{d6-d7}, [r2, : 128]
+	vmlal.s32	q6, d19, d21
+	add		r2, sp, #544
+	vld1.8		{d18-d19}, [r2, : 128]
+	vmlal.s32	q0, d30, d8
+	add		r2, sp, #640
+	vld1.8		{d20-d21}, [r2, : 128]
+	vmlal.s32	q5, d30, d29
+	add		r2, sp, #576
+	vld1.8		{d24-d25}, [r2, : 128]
+	vmlal.s32	q1, d30, d28
+	vadd.i64	q13, q0, q11
+	vadd.i64	q14, q5, q11
+	vmlal.s32	q6, d30, d9
+	vshr.s64	q4, q13, #26
+	vshr.s64	q13, q14, #26
+	vadd.i64	q7, q7, q4
+	vshl.i64	q4, q4, #26
+	vadd.i64	q14, q7, q3
+	vadd.i64	q9, q9, q13
+	vshl.i64	q13, q13, #26
+	vadd.i64	q15, q9, q3
+	vsub.i64	q0, q0, q4
+	vshr.s64	q4, q14, #25
+	vsub.i64	q5, q5, q13
+	vshr.s64	q13, q15, #25
+	vadd.i64	q6, q6, q4
+	vshl.i64	q4, q4, #25
+	vadd.i64	q14, q6, q11
+	vadd.i64	q2, q2, q13
+	vsub.i64	q4, q7, q4
+	vshr.s64	q7, q14, #26
+	vshl.i64	q13, q13, #25
+	vadd.i64	q14, q2, q11
+	vadd.i64	q8, q8, q7
+	vshl.i64	q7, q7, #26
+	vadd.i64	q15, q8, q3
+	vsub.i64	q9, q9, q13
+	vshr.s64	q13, q14, #26
+	vsub.i64	q6, q6, q7
+	vshr.s64	q7, q15, #25
+	vadd.i64	q10, q10, q13
+	vshl.i64	q13, q13, #26
+	vadd.i64	q14, q10, q3
+	vadd.i64	q1, q1, q7
+	add		r2, r3, #288
+	vshl.i64	q7, q7, #25
+	add		r4, r3, #96
+	vadd.i64	q15, q1, q11
+	add		r2, r2, #8
+	vsub.i64	q2, q2, q13
+	add		r4, r4, #8
+	vshr.s64	q13, q14, #25
+	vsub.i64	q7, q8, q7
+	vshr.s64	q8, q15, #26
+	vadd.i64	q14, q13, q13
+	vadd.i64	q12, q12, q8
+	vtrn.32		d12, d14
+	vshl.i64	q8, q8, #26
+	vtrn.32		d13, d15
+	vadd.i64	q3, q12, q3
+	vadd.i64	q0, q0, q14
+	vst1.8		d12, [r2, : 64]!
+	vshl.i64	q7, q13, #4
+	vst1.8		d13, [r4, : 64]!
+	vsub.i64	q1, q1, q8
+	vshr.s64	q3, q3, #25
+	vadd.i64	q0, q0, q7
+	vadd.i64	q5, q5, q3
+	vshl.i64	q3, q3, #25
+	vadd.i64	q6, q5, q11
+	vadd.i64	q0, q0, q13
+	vshl.i64	q7, q13, #25
+	vadd.i64	q8, q0, q11
+	vsub.i64	q3, q12, q3
+	vshr.s64	q6, q6, #26
+	vsub.i64	q7, q10, q7
+	vtrn.32		d2, d6
+	vshr.s64	q8, q8, #26
+	vtrn.32		d3, d7
+	vadd.i64	q3, q9, q6
+	vst1.8		d2, [r2, : 64]
+	vshl.i64	q6, q6, #26
+	vst1.8		d3, [r4, : 64]
+	vadd.i64	q1, q4, q8
+	vtrn.32		d4, d14
+	vshl.i64	q4, q8, #26
+	vtrn.32		d5, d15
+	vsub.i64	q5, q5, q6
+	add		r2, r2, #16
+	vsub.i64	q0, q0, q4
+	vst1.8		d4, [r2, : 64]
+	add		r4, r4, #16
+	vst1.8		d5, [r4, : 64]
+	vtrn.32		d10, d6
+	vtrn.32		d11, d7
+	sub		r2, r2, #8
+	sub		r4, r4, #8
+	vtrn.32		d0, d2
+	vtrn.32		d1, d3
+	vst1.8		d10, [r2, : 64]
+	vst1.8		d11, [r4, : 64]
+	sub		r2, r2, #24
+	sub		r4, r4, #24
+	vst1.8		d0, [r2, : 64]
+	vst1.8		d1, [r4, : 64]
+	add		r2, sp, #512
+	add		r4, r3, #144
+	add		r5, r3, #192
+	vld1.8		{d0-d1}, [r2, : 128]
+	vld1.8		{d2-d3}, [r4, : 128]!
+	vld1.8		{d4-d5}, [r5, : 128]!
+	vzip.i32	q1, q2
+	vld1.8		{d6-d7}, [r4, : 128]!
+	vld1.8		{d8-d9}, [r5, : 128]!
+	vshl.i32	q5, q1, #1
+	vzip.i32	q3, q4
+	vshl.i32	q6, q2, #1
+	vld1.8		{d14}, [r4, : 64]
+	vshl.i32	q8, q3, #1
+	vld1.8		{d15}, [r5, : 64]
+	vshl.i32	q9, q4, #1
+	vmul.i32	d21, d7, d1
+	vtrn.32		d14, d15
+	vmul.i32	q11, q4, q0
+	vmul.i32	q0, q7, q0
+	vmull.s32	q12, d2, d2
+	vmlal.s32	q12, d11, d1
+	vmlal.s32	q12, d12, d0
+	vmlal.s32	q12, d13, d23
+	vmlal.s32	q12, d16, d22
+	vmlal.s32	q12, d7, d21
+	vmull.s32	q10, d2, d11
+	vmlal.s32	q10, d4, d1
+	vmlal.s32	q10, d13, d0
+	vmlal.s32	q10, d6, d23
+	vmlal.s32	q10, d17, d22
+	vmull.s32	q13, d10, d4
+	vmlal.s32	q13, d11, d3
+	vmlal.s32	q13, d13, d1
+	vmlal.s32	q13, d16, d0
+	vmlal.s32	q13, d17, d23
+	vmlal.s32	q13, d8, d22
+	vmull.s32	q1, d10, d5
+	vmlal.s32	q1, d11, d4
+	vmlal.s32	q1, d6, d1
+	vmlal.s32	q1, d17, d0
+	vmlal.s32	q1, d8, d23
+	vmull.s32	q14, d10, d6
+	vmlal.s32	q14, d11, d13
+	vmlal.s32	q14, d4, d4
+	vmlal.s32	q14, d17, d1
+	vmlal.s32	q14, d18, d0
+	vmlal.s32	q14, d9, d23
+	vmull.s32	q11, d10, d7
+	vmlal.s32	q11, d11, d6
+	vmlal.s32	q11, d12, d5
+	vmlal.s32	q11, d8, d1
+	vmlal.s32	q11, d19, d0
+	vmull.s32	q15, d10, d8
+	vmlal.s32	q15, d11, d17
+	vmlal.s32	q15, d12, d6
+	vmlal.s32	q15, d13, d5
+	vmlal.s32	q15, d19, d1
+	vmlal.s32	q15, d14, d0
+	vmull.s32	q2, d10, d9
+	vmlal.s32	q2, d11, d8
+	vmlal.s32	q2, d12, d7
+	vmlal.s32	q2, d13, d6
+	vmlal.s32	q2, d14, d1
+	vmull.s32	q0, d15, d1
+	vmlal.s32	q0, d10, d14
+	vmlal.s32	q0, d11, d19
+	vmlal.s32	q0, d12, d8
+	vmlal.s32	q0, d13, d17
+	vmlal.s32	q0, d6, d6
+	add		r2, sp, #480
+	vld1.8		{d18-d19}, [r2, : 128]!
+	vmull.s32	q3, d16, d7
+	vmlal.s32	q3, d10, d15
+	vmlal.s32	q3, d11, d14
+	vmlal.s32	q3, d12, d9
+	vmlal.s32	q3, d13, d8
+	vld1.8		{d8-d9}, [r2, : 128]
+	vadd.i64	q5, q12, q9
+	vadd.i64	q6, q15, q9
+	vshr.s64	q5, q5, #26
+	vshr.s64	q6, q6, #26
+	vadd.i64	q7, q10, q5
+	vshl.i64	q5, q5, #26
+	vadd.i64	q8, q7, q4
+	vadd.i64	q2, q2, q6
+	vshl.i64	q6, q6, #26
+	vadd.i64	q10, q2, q4
+	vsub.i64	q5, q12, q5
+	vshr.s64	q8, q8, #25
+	vsub.i64	q6, q15, q6
+	vshr.s64	q10, q10, #25
+	vadd.i64	q12, q13, q8
+	vshl.i64	q8, q8, #25
+	vadd.i64	q13, q12, q9
+	vadd.i64	q0, q0, q10
+	vsub.i64	q7, q7, q8
+	vshr.s64	q8, q13, #26
+	vshl.i64	q10, q10, #25
+	vadd.i64	q13, q0, q9
+	vadd.i64	q1, q1, q8
+	vshl.i64	q8, q8, #26
+	vadd.i64	q15, q1, q4
+	vsub.i64	q2, q2, q10
+	vshr.s64	q10, q13, #26
+	vsub.i64	q8, q12, q8
+	vshr.s64	q12, q15, #25
+	vadd.i64	q3, q3, q10
+	vshl.i64	q10, q10, #26
+	vadd.i64	q13, q3, q4
+	vadd.i64	q14, q14, q12
+	add		r2, r3, #144
+	vshl.i64	q12, q12, #25
+	add		r4, r3, #192
+	vadd.i64	q15, q14, q9
+	add		r2, r2, #8
+	vsub.i64	q0, q0, q10
+	add		r4, r4, #8
+	vshr.s64	q10, q13, #25
+	vsub.i64	q1, q1, q12
+	vshr.s64	q12, q15, #26
+	vadd.i64	q13, q10, q10
+	vadd.i64	q11, q11, q12
+	vtrn.32		d16, d2
+	vshl.i64	q12, q12, #26
+	vtrn.32		d17, d3
+	vadd.i64	q1, q11, q4
+	vadd.i64	q4, q5, q13
+	vst1.8		d16, [r2, : 64]!
+	vshl.i64	q5, q10, #4
+	vst1.8		d17, [r4, : 64]!
+	vsub.i64	q8, q14, q12
+	vshr.s64	q1, q1, #25
+	vadd.i64	q4, q4, q5
+	vadd.i64	q5, q6, q1
+	vshl.i64	q1, q1, #25
+	vadd.i64	q6, q5, q9
+	vadd.i64	q4, q4, q10
+	vshl.i64	q10, q10, #25
+	vadd.i64	q9, q4, q9
+	vsub.i64	q1, q11, q1
+	vshr.s64	q6, q6, #26
+	vsub.i64	q3, q3, q10
+	vtrn.32		d16, d2
+	vshr.s64	q9, q9, #26
+	vtrn.32		d17, d3
+	vadd.i64	q1, q2, q6
+	vst1.8		d16, [r2, : 64]
+	vshl.i64	q2, q6, #26
+	vst1.8		d17, [r4, : 64]
+	vadd.i64	q6, q7, q9
+	vtrn.32		d0, d6
+	vshl.i64	q7, q9, #26
+	vtrn.32		d1, d7
+	vsub.i64	q2, q5, q2
+	add		r2, r2, #16
+	vsub.i64	q3, q4, q7
+	vst1.8		d0, [r2, : 64]
+	add		r4, r4, #16
+	vst1.8		d1, [r4, : 64]
+	vtrn.32		d4, d2
+	vtrn.32		d5, d3
+	sub		r2, r2, #8
+	sub		r4, r4, #8
+	vtrn.32		d6, d12
+	vtrn.32		d7, d13
+	vst1.8		d4, [r2, : 64]
+	vst1.8		d5, [r4, : 64]
+	sub		r2, r2, #24
+	sub		r4, r4, #24
+	vst1.8		d6, [r2, : 64]
+	vst1.8		d7, [r4, : 64]
+	add		r2, r3, #336
+	add		r4, r3, #288
+	vld1.8		{d0-d1}, [r2, : 128]!
+	vld1.8		{d2-d3}, [r4, : 128]!
+	vadd.i32	q0, q0, q1
+	vld1.8		{d2-d3}, [r2, : 128]!
+	vld1.8		{d4-d5}, [r4, : 128]!
+	vadd.i32	q1, q1, q2
+	add		r5, r3, #288
+	vld1.8		{d4}, [r2, : 64]
+	vld1.8		{d6}, [r4, : 64]
+	vadd.i32	q2, q2, q3
+	vst1.8		{d0-d1}, [r5, : 128]!
+	vst1.8		{d2-d3}, [r5, : 128]!
+	vst1.8		d4, [r5, : 64]
+	add		r2, r3, #48
+	add		r4, r3, #144
+	vld1.8		{d0-d1}, [r4, : 128]!
+	vld1.8		{d2-d3}, [r4, : 128]!
+	vld1.8		{d4}, [r4, : 64]
+	add		r4, r3, #288
+	vld1.8		{d6-d7}, [r4, : 128]!
+	vtrn.32		q0, q3
+	vld1.8		{d8-d9}, [r4, : 128]!
+	vshl.i32	q5, q0, #4
+	vtrn.32		q1, q4
+	vshl.i32	q6, q3, #4
+	vadd.i32	q5, q5, q0
+	vadd.i32	q6, q6, q3
+	vshl.i32	q7, q1, #4
+	vld1.8		{d5}, [r4, : 64]
+	vshl.i32	q8, q4, #4
+	vtrn.32		d4, d5
+	vadd.i32	q7, q7, q1
+	vadd.i32	q8, q8, q4
+	vld1.8		{d18-d19}, [r2, : 128]!
+	vshl.i32	q10, q2, #4
+	vld1.8		{d22-d23}, [r2, : 128]!
+	vadd.i32	q10, q10, q2
+	vld1.8		{d24}, [r2, : 64]
+	vadd.i32	q5, q5, q0
+	add		r2, r3, #240
+	vld1.8		{d26-d27}, [r2, : 128]!
+	vadd.i32	q6, q6, q3
+	vld1.8		{d28-d29}, [r2, : 128]!
+	vadd.i32	q8, q8, q4
+	vld1.8		{d25}, [r2, : 64]
+	vadd.i32	q10, q10, q2
+	vtrn.32		q9, q13
+	vadd.i32	q7, q7, q1
+	vadd.i32	q5, q5, q0
+	vtrn.32		q11, q14
+	vadd.i32	q6, q6, q3
+	add		r2, sp, #528
+	vadd.i32	q10, q10, q2
+	vtrn.32		d24, d25
+	vst1.8		{d12-d13}, [r2, : 128]!
+	vshl.i32	q6, q13, #1
+	vst1.8		{d20-d21}, [r2, : 128]!
+	vshl.i32	q10, q14, #1
+	vst1.8		{d12-d13}, [r2, : 128]!
+	vshl.i32	q15, q12, #1
+	vadd.i32	q8, q8, q4
+	vext.32		d10, d31, d30, #0
+	vadd.i32	q7, q7, q1
+	vst1.8		{d16-d17}, [r2, : 128]!
+	vmull.s32	q8, d18, d5
+	vmlal.s32	q8, d26, d4
+	vmlal.s32	q8, d19, d9
+	vmlal.s32	q8, d27, d3
+	vmlal.s32	q8, d22, d8
+	vmlal.s32	q8, d28, d2
+	vmlal.s32	q8, d23, d7
+	vmlal.s32	q8, d29, d1
+	vmlal.s32	q8, d24, d6
+	vmlal.s32	q8, d25, d0
+	vst1.8		{d14-d15}, [r2, : 128]!
+	vmull.s32	q2, d18, d4
+	vmlal.s32	q2, d12, d9
+	vmlal.s32	q2, d13, d8
+	vmlal.s32	q2, d19, d3
+	vmlal.s32	q2, d22, d2
+	vmlal.s32	q2, d23, d1
+	vmlal.s32	q2, d24, d0
+	vst1.8		{d20-d21}, [r2, : 128]!
+	vmull.s32	q7, d18, d9
+	vmlal.s32	q7, d26, d3
+	vmlal.s32	q7, d19, d8
+	vmlal.s32	q7, d27, d2
+	vmlal.s32	q7, d22, d7
+	vmlal.s32	q7, d28, d1
+	vmlal.s32	q7, d23, d6
+	vmlal.s32	q7, d29, d0
+	vst1.8		{d10-d11}, [r2, : 128]!
+	vmull.s32	q5, d18, d3
+	vmlal.s32	q5, d19, d2
+	vmlal.s32	q5, d22, d1
+	vmlal.s32	q5, d23, d0
+	vmlal.s32	q5, d12, d8
+	vst1.8		{d16-d17}, [r2, : 128]!
+	vmull.s32	q4, d18, d8
+	vmlal.s32	q4, d26, d2
+	vmlal.s32	q4, d19, d7
+	vmlal.s32	q4, d27, d1
+	vmlal.s32	q4, d22, d6
+	vmlal.s32	q4, d28, d0
+	vmull.s32	q8, d18, d7
+	vmlal.s32	q8, d26, d1
+	vmlal.s32	q8, d19, d6
+	vmlal.s32	q8, d27, d0
+	add		r2, sp, #544
+	vld1.8		{d20-d21}, [r2, : 128]
+	vmlal.s32	q7, d24, d21
+	vmlal.s32	q7, d25, d20
+	vmlal.s32	q4, d23, d21
+	vmlal.s32	q4, d29, d20
+	vmlal.s32	q8, d22, d21
+	vmlal.s32	q8, d28, d20
+	vmlal.s32	q5, d24, d20
+	vst1.8		{d14-d15}, [r2, : 128]
+	vmull.s32	q7, d18, d6
+	vmlal.s32	q7, d26, d0
+	add		r2, sp, #624
+	vld1.8		{d30-d31}, [r2, : 128]
+	vmlal.s32	q2, d30, d21
+	vmlal.s32	q7, d19, d21
+	vmlal.s32	q7, d27, d20
+	add		r2, sp, #592
+	vld1.8		{d26-d27}, [r2, : 128]
+	vmlal.s32	q4, d25, d27
+	vmlal.s32	q8, d29, d27
+	vmlal.s32	q8, d25, d26
+	vmlal.s32	q7, d28, d27
+	vmlal.s32	q7, d29, d26
+	add		r2, sp, #576
+	vld1.8		{d28-d29}, [r2, : 128]
+	vmlal.s32	q4, d24, d29
+	vmlal.s32	q8, d23, d29
+	vmlal.s32	q8, d24, d28
+	vmlal.s32	q7, d22, d29
+	vmlal.s32	q7, d23, d28
+	vst1.8		{d8-d9}, [r2, : 128]
+	add		r2, sp, #528
+	vld1.8		{d8-d9}, [r2, : 128]
+	vmlal.s32	q7, d24, d9
+	vmlal.s32	q7, d25, d31
+	vmull.s32	q1, d18, d2
+	vmlal.s32	q1, d19, d1
+	vmlal.s32	q1, d22, d0
+	vmlal.s32	q1, d24, d27
+	vmlal.s32	q1, d23, d20
+	vmlal.s32	q1, d12, d7
+	vmlal.s32	q1, d13, d6
+	vmull.s32	q6, d18, d1
+	vmlal.s32	q6, d19, d0
+	vmlal.s32	q6, d23, d27
+	vmlal.s32	q6, d22, d20
+	vmlal.s32	q6, d24, d26
+	vmull.s32	q0, d18, d0
+	vmlal.s32	q0, d22, d27
+	vmlal.s32	q0, d23, d26
+	vmlal.s32	q0, d24, d31
+	vmlal.s32	q0, d19, d20
+	add		r2, sp, #608
+	vld1.8		{d18-d19}, [r2, : 128]
+	vmlal.s32	q2, d18, d7
+	vmlal.s32	q5, d18, d6
+	vmlal.s32	q1, d18, d21
+	vmlal.s32	q0, d18, d28
+	vmlal.s32	q6, d18, d29
+	vmlal.s32	q2, d19, d6
+	vmlal.s32	q5, d19, d21
+	vmlal.s32	q1, d19, d29
+	vmlal.s32	q0, d19, d9
+	vmlal.s32	q6, d19, d28
+	add		r2, sp, #560
+	vld1.8		{d18-d19}, [r2, : 128]
+	add		r2, sp, #480
+	vld1.8		{d22-d23}, [r2, : 128]
+	vmlal.s32	q5, d19, d7
+	vmlal.s32	q0, d18, d21
+	vmlal.s32	q0, d19, d29
+	vmlal.s32	q6, d18, d6
+	add		r2, sp, #496
+	vld1.8		{d6-d7}, [r2, : 128]
+	vmlal.s32	q6, d19, d21
+	add		r2, sp, #544
+	vld1.8		{d18-d19}, [r2, : 128]
+	vmlal.s32	q0, d30, d8
+	add		r2, sp, #640
+	vld1.8		{d20-d21}, [r2, : 128]
+	vmlal.s32	q5, d30, d29
+	add		r2, sp, #576
+	vld1.8		{d24-d25}, [r2, : 128]
+	vmlal.s32	q1, d30, d28
+	vadd.i64	q13, q0, q11
+	vadd.i64	q14, q5, q11
+	vmlal.s32	q6, d30, d9
+	vshr.s64	q4, q13, #26
+	vshr.s64	q13, q14, #26
+	vadd.i64	q7, q7, q4
+	vshl.i64	q4, q4, #26
+	vadd.i64	q14, q7, q3
+	vadd.i64	q9, q9, q13
+	vshl.i64	q13, q13, #26
+	vadd.i64	q15, q9, q3
+	vsub.i64	q0, q0, q4
+	vshr.s64	q4, q14, #25
+	vsub.i64	q5, q5, q13
+	vshr.s64	q13, q15, #25
+	vadd.i64	q6, q6, q4
+	vshl.i64	q4, q4, #25
+	vadd.i64	q14, q6, q11
+	vadd.i64	q2, q2, q13
+	vsub.i64	q4, q7, q4
+	vshr.s64	q7, q14, #26
+	vshl.i64	q13, q13, #25
+	vadd.i64	q14, q2, q11
+	vadd.i64	q8, q8, q7
+	vshl.i64	q7, q7, #26
+	vadd.i64	q15, q8, q3
+	vsub.i64	q9, q9, q13
+	vshr.s64	q13, q14, #26
+	vsub.i64	q6, q6, q7
+	vshr.s64	q7, q15, #25
+	vadd.i64	q10, q10, q13
+	vshl.i64	q13, q13, #26
+	vadd.i64	q14, q10, q3
+	vadd.i64	q1, q1, q7
+	add		r2, r3, #240
+	vshl.i64	q7, q7, #25
+	add		r4, r3, #144
+	vadd.i64	q15, q1, q11
+	add		r2, r2, #8
+	vsub.i64	q2, q2, q13
+	add		r4, r4, #8
+	vshr.s64	q13, q14, #25
+	vsub.i64	q7, q8, q7
+	vshr.s64	q8, q15, #26
+	vadd.i64	q14, q13, q13
+	vadd.i64	q12, q12, q8
+	vtrn.32		d12, d14
+	vshl.i64	q8, q8, #26
+	vtrn.32		d13, d15
+	vadd.i64	q3, q12, q3
+	vadd.i64	q0, q0, q14
+	vst1.8		d12, [r2, : 64]!
+	vshl.i64	q7, q13, #4
+	vst1.8		d13, [r4, : 64]!
+	vsub.i64	q1, q1, q8
+	vshr.s64	q3, q3, #25
+	vadd.i64	q0, q0, q7
+	vadd.i64	q5, q5, q3
+	vshl.i64	q3, q3, #25
+	vadd.i64	q6, q5, q11
+	vadd.i64	q0, q0, q13
+	vshl.i64	q7, q13, #25
+	vadd.i64	q8, q0, q11
+	vsub.i64	q3, q12, q3
+	vshr.s64	q6, q6, #26
+	vsub.i64	q7, q10, q7
+	vtrn.32		d2, d6
+	vshr.s64	q8, q8, #26
+	vtrn.32		d3, d7
+	vadd.i64	q3, q9, q6
+	vst1.8		d2, [r2, : 64]
+	vshl.i64	q6, q6, #26
+	vst1.8		d3, [r4, : 64]
+	vadd.i64	q1, q4, q8
+	vtrn.32		d4, d14
+	vshl.i64	q4, q8, #26
+	vtrn.32		d5, d15
+	vsub.i64	q5, q5, q6
+	add		r2, r2, #16
+	vsub.i64	q0, q0, q4
+	vst1.8		d4, [r2, : 64]
+	add		r4, r4, #16
+	vst1.8		d5, [r4, : 64]
+	vtrn.32		d10, d6
+	vtrn.32		d11, d7
+	sub		r2, r2, #8
+	sub		r4, r4, #8
+	vtrn.32		d0, d2
+	vtrn.32		d1, d3
+	vst1.8		d10, [r2, : 64]
+	vst1.8		d11, [r4, : 64]
+	sub		r2, r2, #24
+	sub		r4, r4, #24
+	vst1.8		d0, [r2, : 64]
+	vst1.8		d1, [r4, : 64]
+	ldr		r2, [sp, #456]
+	ldr		r4, [sp, #460]
+	subs		r5, r2, #1
+	bge		.Lmainloop
+	add		r1, r3, #144
+	add		r2, r3, #336
+	vld1.8		{d0-d1}, [r1, : 128]!
+	vld1.8		{d2-d3}, [r1, : 128]!
+	vld1.8		{d4}, [r1, : 64]
+	vst1.8		{d0-d1}, [r2, : 128]!
+	vst1.8		{d2-d3}, [r2, : 128]!
+	vst1.8		d4, [r2, : 64]
+	movw		r1, #0
+.Linvertloop:
+	add		r2, r3, #144
+	movw		r4, #0
+	movw		r5, #2
+	cmp		r1, #1
+	moveq		r5, #1
+	addeq		r2, r3, #336
+	addeq		r4, r3, #48
+	cmp		r1, #2
+	moveq		r5, #1
+	addeq		r2, r3, #48
+	cmp		r1, #3
+	moveq		r5, #5
+	addeq		r4, r3, #336
+	cmp		r1, #4
+	moveq		r5, #10
+	cmp		r1, #5
+	moveq		r5, #20
+	cmp		r1, #6
+	moveq		r5, #10
+	addeq		r2, r3, #336
+	addeq		r4, r3, #336
+	cmp		r1, #7
+	moveq		r5, #50
+	cmp		r1, #8
+	moveq		r5, #100
+	cmp		r1, #9
+	moveq		r5, #50
+	addeq		r2, r3, #336
+	cmp		r1, #10
+	moveq		r5, #5
+	addeq		r2, r3, #48
+	cmp		r1, #11
+	moveq		r5, #0
+	addeq		r2, r3, #96
+	add		r6, r3, #144
+	add		r7, r3, #288
+	vld1.8		{d0-d1}, [r6, : 128]!
+	vld1.8		{d2-d3}, [r6, : 128]!
+	vld1.8		{d4}, [r6, : 64]
+	vst1.8		{d0-d1}, [r7, : 128]!
+	vst1.8		{d2-d3}, [r7, : 128]!
+	vst1.8		d4, [r7, : 64]
+	cmp		r5, #0
+	beq		.Lskipsquaringloop
+.Lsquaringloop:
+	add		r6, r3, #288
+	add		r7, r3, #288
+	add		r8, r3, #288
+	vmov.i32	q0, #19
+	vmov.i32	q1, #0
+	vmov.i32	q2, #1
+	vzip.i32	q1, q2
+	vld1.8		{d4-d5}, [r7, : 128]!
+	vld1.8		{d6-d7}, [r7, : 128]!
+	vld1.8		{d9}, [r7, : 64]
+	vld1.8		{d10-d11}, [r6, : 128]!
+	add		r7, sp, #384
+	vld1.8		{d12-d13}, [r6, : 128]!
+	vmul.i32	q7, q2, q0
+	vld1.8		{d8}, [r6, : 64]
+	vext.32		d17, d11, d10, #1
+	vmul.i32	q9, q3, q0
+	vext.32		d16, d10, d8, #1
+	vshl.u32	q10, q5, q1
+	vext.32		d22, d14, d4, #1
+	vext.32		d24, d18, d6, #1
+	vshl.u32	q13, q6, q1
+	vshl.u32	d28, d8, d2
+	vrev64.i32	d22, d22
+	vmul.i32	d1, d9, d1
+	vrev64.i32	d24, d24
+	vext.32		d29, d8, d13, #1
+	vext.32		d0, d1, d9, #1
+	vrev64.i32	d0, d0
+	vext.32		d2, d9, d1, #1
+	vext.32		d23, d15, d5, #1
+	vmull.s32	q4, d20, d4
+	vrev64.i32	d23, d23
+	vmlal.s32	q4, d21, d1
+	vrev64.i32	d2, d2
+	vmlal.s32	q4, d26, d19
+	vext.32		d3, d5, d15, #1
+	vmlal.s32	q4, d27, d18
+	vrev64.i32	d3, d3
+	vmlal.s32	q4, d28, d15
+	vext.32		d14, d12, d11, #1
+	vmull.s32	q5, d16, d23
+	vext.32		d15, d13, d12, #1
+	vmlal.s32	q5, d17, d4
+	vst1.8		d8, [r7, : 64]!
+	vmlal.s32	q5, d14, d1
+	vext.32		d12, d9, d8, #0
+	vmlal.s32	q5, d15, d19
+	vmov.i64	d13, #0
+	vmlal.s32	q5, d29, d18
+	vext.32		d25, d19, d7, #1
+	vmlal.s32	q6, d20, d5
+	vrev64.i32	d25, d25
+	vmlal.s32	q6, d21, d4
+	vst1.8		d11, [r7, : 64]!
+	vmlal.s32	q6, d26, d1
+	vext.32		d9, d10, d10, #0
+	vmlal.s32	q6, d27, d19
+	vmov.i64	d8, #0
+	vmlal.s32	q6, d28, d18
+	vmlal.s32	q4, d16, d24
+	vmlal.s32	q4, d17, d5
+	vmlal.s32	q4, d14, d4
+	vst1.8		d12, [r7, : 64]!
+	vmlal.s32	q4, d15, d1
+	vext.32		d10, d13, d12, #0
+	vmlal.s32	q4, d29, d19
+	vmov.i64	d11, #0
+	vmlal.s32	q5, d20, d6
+	vmlal.s32	q5, d21, d5
+	vmlal.s32	q5, d26, d4
+	vext.32		d13, d8, d8, #0
+	vmlal.s32	q5, d27, d1
+	vmov.i64	d12, #0
+	vmlal.s32	q5, d28, d19
+	vst1.8		d9, [r7, : 64]!
+	vmlal.s32	q6, d16, d25
+	vmlal.s32	q6, d17, d6
+	vst1.8		d10, [r7, : 64]
+	vmlal.s32	q6, d14, d5
+	vext.32		d8, d11, d10, #0
+	vmlal.s32	q6, d15, d4
+	vmov.i64	d9, #0
+	vmlal.s32	q6, d29, d1
+	vmlal.s32	q4, d20, d7
+	vmlal.s32	q4, d21, d6
+	vmlal.s32	q4, d26, d5
+	vext.32		d11, d12, d12, #0
+	vmlal.s32	q4, d27, d4
+	vmov.i64	d10, #0
+	vmlal.s32	q4, d28, d1
+	vmlal.s32	q5, d16, d0
+	sub		r6, r7, #32
+	vmlal.s32	q5, d17, d7
+	vmlal.s32	q5, d14, d6
+	vext.32		d30, d9, d8, #0
+	vmlal.s32	q5, d15, d5
+	vld1.8		{d31}, [r6, : 64]!
+	vmlal.s32	q5, d29, d4
+	vmlal.s32	q15, d20, d0
+	vext.32		d0, d6, d18, #1
+	vmlal.s32	q15, d21, d25
+	vrev64.i32	d0, d0
+	vmlal.s32	q15, d26, d24
+	vext.32		d1, d7, d19, #1
+	vext.32		d7, d10, d10, #0
+	vmlal.s32	q15, d27, d23
+	vrev64.i32	d1, d1
+	vld1.8		{d6}, [r6, : 64]
+	vmlal.s32	q15, d28, d22
+	vmlal.s32	q3, d16, d4
+	add		r6, r6, #24
+	vmlal.s32	q3, d17, d2
+	vext.32		d4, d31, d30, #0
+	vmov		d17, d11
+	vmlal.s32	q3, d14, d1
+	vext.32		d11, d13, d13, #0
+	vext.32		d13, d30, d30, #0
+	vmlal.s32	q3, d15, d0
+	vext.32		d1, d8, d8, #0
+	vmlal.s32	q3, d29, d3
+	vld1.8		{d5}, [r6, : 64]
+	sub		r6, r6, #16
+	vext.32		d10, d6, d6, #0
+	vmov.i32	q1, #0xffffffff
+	vshl.i64	q4, q1, #25
+	add		r7, sp, #480
+	vld1.8		{d14-d15}, [r7, : 128]
+	vadd.i64	q9, q2, q7
+	vshl.i64	q1, q1, #26
+	vshr.s64	q10, q9, #26
+	vld1.8		{d0}, [r6, : 64]!
+	vadd.i64	q5, q5, q10
+	vand		q9, q9, q1
+	vld1.8		{d16}, [r6, : 64]!
+	add		r6, sp, #496
+	vld1.8		{d20-d21}, [r6, : 128]
+	vadd.i64	q11, q5, q10
+	vsub.i64	q2, q2, q9
+	vshr.s64	q9, q11, #25
+	vext.32		d12, d5, d4, #0
+	vand		q11, q11, q4
+	vadd.i64	q0, q0, q9
+	vmov		d19, d7
+	vadd.i64	q3, q0, q7
+	vsub.i64	q5, q5, q11
+	vshr.s64	q11, q3, #26
+	vext.32		d18, d11, d10, #0
+	vand		q3, q3, q1
+	vadd.i64	q8, q8, q11
+	vadd.i64	q11, q8, q10
+	vsub.i64	q0, q0, q3
+	vshr.s64	q3, q11, #25
+	vand		q11, q11, q4
+	vadd.i64	q3, q6, q3
+	vadd.i64	q6, q3, q7
+	vsub.i64	q8, q8, q11
+	vshr.s64	q11, q6, #26
+	vand		q6, q6, q1
+	vadd.i64	q9, q9, q11
+	vadd.i64	d25, d19, d21
+	vsub.i64	q3, q3, q6
+	vshr.s64	d23, d25, #25
+	vand		q4, q12, q4
+	vadd.i64	d21, d23, d23
+	vshl.i64	d25, d23, #4
+	vadd.i64	d21, d21, d23
+	vadd.i64	d25, d25, d21
+	vadd.i64	d4, d4, d25
+	vzip.i32	q0, q8
+	vadd.i64	d12, d4, d14
+	add		r6, r8, #8
+	vst1.8		d0, [r6, : 64]
+	vsub.i64	d19, d19, d9
+	add		r6, r6, #16
+	vst1.8		d16, [r6, : 64]
+	vshr.s64	d22, d12, #26
+	vand		q0, q6, q1
+	vadd.i64	d10, d10, d22
+	vzip.i32	q3, q9
+	vsub.i64	d4, d4, d0
+	sub		r6, r6, #8
+	vst1.8		d6, [r6, : 64]
+	add		r6, r6, #16
+	vst1.8		d18, [r6, : 64]
+	vzip.i32	q2, q5
+	sub		r6, r6, #32
+	vst1.8		d4, [r6, : 64]
+	subs		r5, r5, #1
+	bhi		.Lsquaringloop
+.Lskipsquaringloop:
+	mov		r2, r2
+	add		r5, r3, #288
+	add		r6, r3, #144
+	vmov.i32	q0, #19
+	vmov.i32	q1, #0
+	vmov.i32	q2, #1
+	vzip.i32	q1, q2
+	vld1.8		{d4-d5}, [r5, : 128]!
+	vld1.8		{d6-d7}, [r5, : 128]!
+	vld1.8		{d9}, [r5, : 64]
+	vld1.8		{d10-d11}, [r2, : 128]!
+	add		r5, sp, #384
+	vld1.8		{d12-d13}, [r2, : 128]!
+	vmul.i32	q7, q2, q0
+	vld1.8		{d8}, [r2, : 64]
+	vext.32		d17, d11, d10, #1
+	vmul.i32	q9, q3, q0
+	vext.32		d16, d10, d8, #1
+	vshl.u32	q10, q5, q1
+	vext.32		d22, d14, d4, #1
+	vext.32		d24, d18, d6, #1
+	vshl.u32	q13, q6, q1
+	vshl.u32	d28, d8, d2
+	vrev64.i32	d22, d22
+	vmul.i32	d1, d9, d1
+	vrev64.i32	d24, d24
+	vext.32		d29, d8, d13, #1
+	vext.32		d0, d1, d9, #1
+	vrev64.i32	d0, d0
+	vext.32		d2, d9, d1, #1
+	vext.32		d23, d15, d5, #1
+	vmull.s32	q4, d20, d4
+	vrev64.i32	d23, d23
+	vmlal.s32	q4, d21, d1
+	vrev64.i32	d2, d2
+	vmlal.s32	q4, d26, d19
+	vext.32		d3, d5, d15, #1
+	vmlal.s32	q4, d27, d18
+	vrev64.i32	d3, d3
+	vmlal.s32	q4, d28, d15
+	vext.32		d14, d12, d11, #1
+	vmull.s32	q5, d16, d23
+	vext.32		d15, d13, d12, #1
+	vmlal.s32	q5, d17, d4
+	vst1.8		d8, [r5, : 64]!
+	vmlal.s32	q5, d14, d1
+	vext.32		d12, d9, d8, #0
+	vmlal.s32	q5, d15, d19
+	vmov.i64	d13, #0
+	vmlal.s32	q5, d29, d18
+	vext.32		d25, d19, d7, #1
+	vmlal.s32	q6, d20, d5
+	vrev64.i32	d25, d25
+	vmlal.s32	q6, d21, d4
+	vst1.8		d11, [r5, : 64]!
+	vmlal.s32	q6, d26, d1
+	vext.32		d9, d10, d10, #0
+	vmlal.s32	q6, d27, d19
+	vmov.i64	d8, #0
+	vmlal.s32	q6, d28, d18
+	vmlal.s32	q4, d16, d24
+	vmlal.s32	q4, d17, d5
+	vmlal.s32	q4, d14, d4
+	vst1.8		d12, [r5, : 64]!
+	vmlal.s32	q4, d15, d1
+	vext.32		d10, d13, d12, #0
+	vmlal.s32	q4, d29, d19
+	vmov.i64	d11, #0
+	vmlal.s32	q5, d20, d6
+	vmlal.s32	q5, d21, d5
+	vmlal.s32	q5, d26, d4
+	vext.32		d13, d8, d8, #0
+	vmlal.s32	q5, d27, d1
+	vmov.i64	d12, #0
+	vmlal.s32	q5, d28, d19
+	vst1.8		d9, [r5, : 64]!
+	vmlal.s32	q6, d16, d25
+	vmlal.s32	q6, d17, d6
+	vst1.8		d10, [r5, : 64]
+	vmlal.s32	q6, d14, d5
+	vext.32		d8, d11, d10, #0
+	vmlal.s32	q6, d15, d4
+	vmov.i64	d9, #0
+	vmlal.s32	q6, d29, d1
+	vmlal.s32	q4, d20, d7
+	vmlal.s32	q4, d21, d6
+	vmlal.s32	q4, d26, d5
+	vext.32		d11, d12, d12, #0
+	vmlal.s32	q4, d27, d4
+	vmov.i64	d10, #0
+	vmlal.s32	q4, d28, d1
+	vmlal.s32	q5, d16, d0
+	sub		r2, r5, #32
+	vmlal.s32	q5, d17, d7
+	vmlal.s32	q5, d14, d6
+	vext.32		d30, d9, d8, #0
+	vmlal.s32	q5, d15, d5
+	vld1.8		{d31}, [r2, : 64]!
+	vmlal.s32	q5, d29, d4
+	vmlal.s32	q15, d20, d0
+	vext.32		d0, d6, d18, #1
+	vmlal.s32	q15, d21, d25
+	vrev64.i32	d0, d0
+	vmlal.s32	q15, d26, d24
+	vext.32		d1, d7, d19, #1
+	vext.32		d7, d10, d10, #0
+	vmlal.s32	q15, d27, d23
+	vrev64.i32	d1, d1
+	vld1.8		{d6}, [r2, : 64]
+	vmlal.s32	q15, d28, d22
+	vmlal.s32	q3, d16, d4
+	add		r2, r2, #24
+	vmlal.s32	q3, d17, d2
+	vext.32		d4, d31, d30, #0
+	vmov		d17, d11
+	vmlal.s32	q3, d14, d1
+	vext.32		d11, d13, d13, #0
+	vext.32		d13, d30, d30, #0
+	vmlal.s32	q3, d15, d0
+	vext.32		d1, d8, d8, #0
+	vmlal.s32	q3, d29, d3
+	vld1.8		{d5}, [r2, : 64]
+	sub		r2, r2, #16
+	vext.32		d10, d6, d6, #0
+	vmov.i32	q1, #0xffffffff
+	vshl.i64	q4, q1, #25
+	add		r5, sp, #480
+	vld1.8		{d14-d15}, [r5, : 128]
+	vadd.i64	q9, q2, q7
+	vshl.i64	q1, q1, #26
+	vshr.s64	q10, q9, #26
+	vld1.8		{d0}, [r2, : 64]!
+	vadd.i64	q5, q5, q10
+	vand		q9, q9, q1
+	vld1.8		{d16}, [r2, : 64]!
+	add		r2, sp, #496
+	vld1.8		{d20-d21}, [r2, : 128]
+	vadd.i64	q11, q5, q10
+	vsub.i64	q2, q2, q9
+	vshr.s64	q9, q11, #25
+	vext.32		d12, d5, d4, #0
+	vand		q11, q11, q4
+	vadd.i64	q0, q0, q9
+	vmov		d19, d7
+	vadd.i64	q3, q0, q7
+	vsub.i64	q5, q5, q11
+	vshr.s64	q11, q3, #26
+	vext.32		d18, d11, d10, #0
+	vand		q3, q3, q1
+	vadd.i64	q8, q8, q11
+	vadd.i64	q11, q8, q10
+	vsub.i64	q0, q0, q3
+	vshr.s64	q3, q11, #25
+	vand		q11, q11, q4
+	vadd.i64	q3, q6, q3
+	vadd.i64	q6, q3, q7
+	vsub.i64	q8, q8, q11
+	vshr.s64	q11, q6, #26
+	vand		q6, q6, q1
+	vadd.i64	q9, q9, q11
+	vadd.i64	d25, d19, d21
+	vsub.i64	q3, q3, q6
+	vshr.s64	d23, d25, #25
+	vand		q4, q12, q4
+	vadd.i64	d21, d23, d23
+	vshl.i64	d25, d23, #4
+	vadd.i64	d21, d21, d23
+	vadd.i64	d25, d25, d21
+	vadd.i64	d4, d4, d25
+	vzip.i32	q0, q8
+	vadd.i64	d12, d4, d14
+	add		r2, r6, #8
+	vst1.8		d0, [r2, : 64]
+	vsub.i64	d19, d19, d9
+	add		r2, r2, #16
+	vst1.8		d16, [r2, : 64]
+	vshr.s64	d22, d12, #26
+	vand		q0, q6, q1
+	vadd.i64	d10, d10, d22
+	vzip.i32	q3, q9
+	vsub.i64	d4, d4, d0
+	sub		r2, r2, #8
+	vst1.8		d6, [r2, : 64]
+	add		r2, r2, #16
+	vst1.8		d18, [r2, : 64]
+	vzip.i32	q2, q5
+	sub		r2, r2, #32
+	vst1.8		d4, [r2, : 64]
+	cmp		r4, #0
+	beq		.Lskippostcopy
+	add		r2, r3, #144
+	mov		r4, r4
+	vld1.8		{d0-d1}, [r2, : 128]!
+	vld1.8		{d2-d3}, [r2, : 128]!
+	vld1.8		{d4}, [r2, : 64]
+	vst1.8		{d0-d1}, [r4, : 128]!
+	vst1.8		{d2-d3}, [r4, : 128]!
+	vst1.8		d4, [r4, : 64]
+.Lskippostcopy:
+	cmp		r1, #1
+	bne		.Lskipfinalcopy
+	add		r2, r3, #288
+	add		r4, r3, #144
+	vld1.8		{d0-d1}, [r2, : 128]!
+	vld1.8		{d2-d3}, [r2, : 128]!
+	vld1.8		{d4}, [r2, : 64]
+	vst1.8		{d0-d1}, [r4, : 128]!
+	vst1.8		{d2-d3}, [r4, : 128]!
+	vst1.8		d4, [r4, : 64]
+.Lskipfinalcopy:
+	add		r1, r1, #1
+	cmp		r1, #12
+	blo		.Linvertloop
+	add		r1, r3, #144
+	ldr		r2, [r1], #4
+	ldr		r3, [r1], #4
+	ldr		r4, [r1], #4
+	ldr		r5, [r1], #4
+	ldr		r6, [r1], #4
+	ldr		r7, [r1], #4
+	ldr		r8, [r1], #4
+	ldr		r9, [r1], #4
+	ldr		r10, [r1], #4
+	ldr		r1, [r1]
+	add		r11, r1, r1, LSL #4
+	add		r11, r11, r1, LSL #1
+	add		r11, r11, #16777216
+	mov		r11, r11, ASR #25
+	add		r11, r11, r2
+	mov		r11, r11, ASR #26
+	add		r11, r11, r3
+	mov		r11, r11, ASR #25
+	add		r11, r11, r4
+	mov		r11, r11, ASR #26
+	add		r11, r11, r5
+	mov		r11, r11, ASR #25
+	add		r11, r11, r6
+	mov		r11, r11, ASR #26
+	add		r11, r11, r7
+	mov		r11, r11, ASR #25
+	add		r11, r11, r8
+	mov		r11, r11, ASR #26
+	add		r11, r11, r9
+	mov		r11, r11, ASR #25
+	add		r11, r11, r10
+	mov		r11, r11, ASR #26
+	add		r11, r11, r1
+	mov		r11, r11, ASR #25
+	add		r2, r2, r11
+	add		r2, r2, r11, LSL #1
+	add		r2, r2, r11, LSL #4
+	mov		r11, r2, ASR #26
+	add		r3, r3, r11
+	sub		r2, r2, r11, LSL #26
+	mov		r11, r3, ASR #25
+	add		r4, r4, r11
+	sub		r3, r3, r11, LSL #25
+	mov		r11, r4, ASR #26
+	add		r5, r5, r11
+	sub		r4, r4, r11, LSL #26
+	mov		r11, r5, ASR #25
+	add		r6, r6, r11
+	sub		r5, r5, r11, LSL #25
+	mov		r11, r6, ASR #26
+	add		r7, r7, r11
+	sub		r6, r6, r11, LSL #26
+	mov		r11, r7, ASR #25
+	add		r8, r8, r11
+	sub		r7, r7, r11, LSL #25
+	mov		r11, r8, ASR #26
+	add		r9, r9, r11
+	sub		r8, r8, r11, LSL #26
+	mov		r11, r9, ASR #25
+	add		r10, r10, r11
+	sub		r9, r9, r11, LSL #25
+	mov		r11, r10, ASR #26
+	add		r1, r1, r11
+	sub		r10, r10, r11, LSL #26
+	mov		r11, r1, ASR #25
+	sub		r1, r1, r11, LSL #25
+	add		r2, r2, r3, LSL #26
+	mov		r3, r3, LSR #6
+	add		r3, r3, r4, LSL #19
+	mov		r4, r4, LSR #13
+	add		r4, r4, r5, LSL #13
+	mov		r5, r5, LSR #19
+	add		r5, r5, r6, LSL #6
+	add		r6, r7, r8, LSL #25
+	mov		r7, r8, LSR #7
+	add		r7, r7, r9, LSL #19
+	mov		r8, r9, LSR #13
+	add		r8, r8, r10, LSL #12
+	mov		r9, r10, LSR #20
+	add		r1, r9, r1, LSL #6
+	str		r2, [r0]
+	str		r3, [r0, #4]
+	str		r4, [r0, #8]
+	str		r5, [r0, #12]
+	str		r6, [r0, #16]
+	str		r7, [r0, #20]
+	str		r8, [r0, #24]
+	str		r1, [r0, #28]
+	movw		r0, #0
+	mov		sp, ip
+	pop		{r4-r11, pc}
+ENDPROC(curve25519_neon)
diff --git a/lib/crypto/arm/curve25519.h b/lib/crypto/arm/curve25519.h
new file mode 100644
index 000000000000..b1a566885e95
--- /dev/null
+++ b/lib/crypto/arm/curve25519.h
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
+ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
+ * manually reworked for use in kernel space.
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <linux/types.h>
+#include <linux/jump_label.h>
+
+asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
+				const u8 secret[CURVE25519_KEY_SIZE],
+				const u8 basepoint[CURVE25519_KEY_SIZE]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+static void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
+			    const u8 scalar[CURVE25519_KEY_SIZE],
+			    const u8 point[CURVE25519_KEY_SIZE])
+{
+	if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
+		scoped_ksimd()
+			curve25519_neon(out, scalar, point);
+	} else {
+		curve25519_generic(out, scalar, point);
+	}
+}
+
+static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+				 const u8 secret[CURVE25519_KEY_SIZE])
+{
+	curve25519_arch(pub, secret, curve25519_base_point);
+}
+
+#define curve25519_mod_init_arch curve25519_mod_init_arch
+static void curve25519_mod_init_arch(void)
+{
+	if (elf_hwcap & HWCAP_NEON)
+		static_branch_enable(&have_neon);
+}
diff --git a/lib/crypto/arm/poly1305-armv4.pl b/lib/crypto/arm/poly1305-armv4.pl
new file mode 100644
index 000000000000..34c11b7b44bd
--- /dev/null
+++ b/lib/crypto/arm/poly1305-armv4.pl
@@ -0,0 +1,1235 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
+# project.
+# ====================================================================
+#
+#			IALU(*)/gcc-4.4		NEON
+#
+# ARM11xx(ARMv6)	7.78/+100%		-
+# Cortex-A5		6.35/+130%		3.00
+# Cortex-A8		6.25/+115%		2.36
+# Cortex-A9		5.10/+95%		2.55
+# Cortex-A15		3.85/+85%		1.25(**)
+# Snapdragon S4		5.70/+100%		1.48(**)
+#
+# (*)	this is for -march=armv6, i.e. with bunch of ldrb loading data;
+# (**)	these are trade-off results, they can be improved by ~8% but at
+#	the cost of 15/12% regression on Cortex-A5/A7, it's even possible
+#	to improve Cortex-A9 result, but then A5/A7 loose more than 20%;
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
+
+($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
+
+$code.=<<___;
+#ifndef	__KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
+# define poly1305_init   poly1305_block_init
+# define poly1305_blocks poly1305_blocks_arm
+#endif
+
+#if defined(__thumb2__)
+.syntax	unified
+.thumb
+#else
+.code	32
+#endif
+
+.text
+
+.globl	poly1305_emit
+.globl	poly1305_blocks
+.globl	poly1305_init
+.type	poly1305_init,%function
+.align	5
+poly1305_init:
+.Lpoly1305_init:
+	stmdb	sp!,{r4-r11}
+
+	eor	r3,r3,r3
+	cmp	$inp,#0
+	str	r3,[$ctx,#0]		@ zero hash value
+	str	r3,[$ctx,#4]
+	str	r3,[$ctx,#8]
+	str	r3,[$ctx,#12]
+	str	r3,[$ctx,#16]
+	str	r3,[$ctx,#36]		@ clear is_base2_26
+	add	$ctx,$ctx,#20
+
+#ifdef	__thumb2__
+	it	eq
+#endif
+	moveq	r0,#0
+	beq	.Lno_key
+
+#if	__ARM_MAX_ARCH__>=7
+	mov	r3,#-1
+	str	r3,[$ctx,#28]		@ impossible key power value
+# ifndef __KERNEL__
+	adr	r11,.Lpoly1305_init
+	ldr	r12,.LOPENSSL_armcap
+# endif
+#endif
+	ldrb	r4,[$inp,#0]
+	mov	r10,#0x0fffffff
+	ldrb	r5,[$inp,#1]
+	and	r3,r10,#-4		@ 0x0ffffffc
+	ldrb	r6,[$inp,#2]
+	ldrb	r7,[$inp,#3]
+	orr	r4,r4,r5,lsl#8
+	ldrb	r5,[$inp,#4]
+	orr	r4,r4,r6,lsl#16
+	ldrb	r6,[$inp,#5]
+	orr	r4,r4,r7,lsl#24
+	ldrb	r7,[$inp,#6]
+	and	r4,r4,r10
+
+#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+# if !defined(_WIN32)
+	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
+	ldr	r12,[r12]
+# endif
+#endif
+	ldrb	r8,[$inp,#7]
+	orr	r5,r5,r6,lsl#8
+	ldrb	r6,[$inp,#8]
+	orr	r5,r5,r7,lsl#16
+	ldrb	r7,[$inp,#9]
+	orr	r5,r5,r8,lsl#24
+	ldrb	r8,[$inp,#10]
+	and	r5,r5,r3
+
+#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	tst	r12,#ARMV7_NEON		@ check for NEON
+# ifdef	__thumb2__
+	adr	r9,.Lpoly1305_blocks_neon
+	adr	r11,.Lpoly1305_blocks
+	it	ne
+	movne	r11,r9
+	adr	r12,.Lpoly1305_emit
+	orr	r11,r11,#1		@ thumb-ify addresses
+	orr	r12,r12,#1
+# else
+	add	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
+	ite	eq
+	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
+	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
+# endif
+#endif
+	ldrb	r9,[$inp,#11]
+	orr	r6,r6,r7,lsl#8
+	ldrb	r7,[$inp,#12]
+	orr	r6,r6,r8,lsl#16
+	ldrb	r8,[$inp,#13]
+	orr	r6,r6,r9,lsl#24
+	ldrb	r9,[$inp,#14]
+	and	r6,r6,r3
+
+	ldrb	r10,[$inp,#15]
+	orr	r7,r7,r8,lsl#8
+	str	r4,[$ctx,#0]
+	orr	r7,r7,r9,lsl#16
+	str	r5,[$ctx,#4]
+	orr	r7,r7,r10,lsl#24
+	str	r6,[$ctx,#8]
+	and	r7,r7,r3
+	str	r7,[$ctx,#12]
+#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	stmia	r2,{r11,r12}		@ fill functions table
+	mov	r0,#1
+#else
+	mov	r0,#0
+#endif
+.Lno_key:
+	ldmia	sp!,{r4-r11}
+#if	__ARM_ARCH__>=5
+	ret				@ bx	lr
+#else
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	poly1305_init,.-poly1305_init
+___
+{
+my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
+my ($s1,$s2,$s3)=($r1,$r2,$r3);
+
+$code.=<<___;
+.type	poly1305_blocks,%function
+.align	5
+poly1305_blocks:
+.Lpoly1305_blocks:
+	stmdb	sp!,{r3-r11,lr}
+
+	ands	$len,$len,#-16
+	beq	.Lno_data
+
+	add	$len,$len,$inp		@ end pointer
+	sub	sp,sp,#32
+
+#if __ARM_ARCH__<7
+	ldmia	$ctx,{$h0-$r3}		@ load context
+	add	$ctx,$ctx,#20
+	str	$len,[sp,#16]		@ offload stuff
+	str	$ctx,[sp,#12]
+#else
+	ldr	lr,[$ctx,#36]		@ is_base2_26
+	ldmia	$ctx!,{$h0-$h4}		@ load hash value
+	str	$len,[sp,#16]		@ offload stuff
+	str	$ctx,[sp,#12]
+
+	adds	$r0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
+	mov	$r1,$h1,lsr#6
+	adcs	$r1,$r1,$h2,lsl#20
+	mov	$r2,$h2,lsr#12
+	adcs	$r2,$r2,$h3,lsl#14
+	mov	$r3,$h3,lsr#18
+	adcs	$r3,$r3,$h4,lsl#8
+	mov	$len,#0
+	teq	lr,#0
+	str	$len,[$ctx,#16]		@ clear is_base2_26
+	adc	$len,$len,$h4,lsr#24
+
+	itttt	ne
+	movne	$h0,$r0			@ choose between radixes
+	movne	$h1,$r1
+	movne	$h2,$r2
+	movne	$h3,$r3
+	ldmia	$ctx,{$r0-$r3}		@ load key
+	it	ne
+	movne	$h4,$len
+#endif
+
+	mov	lr,$inp
+	cmp	$padbit,#0
+	str	$r1,[sp,#20]
+	str	$r2,[sp,#24]
+	str	$r3,[sp,#28]
+	b	.Loop
+
+.align	4
+.Loop:
+#if __ARM_ARCH__<7
+	ldrb	r0,[lr],#16		@ load input
+# ifdef	__thumb2__
+	it	hi
+# endif
+	addhi	$h4,$h4,#1		@ 1<<128
+	ldrb	r1,[lr,#-15]
+	ldrb	r2,[lr,#-14]
+	ldrb	r3,[lr,#-13]
+	orr	r1,r0,r1,lsl#8
+	ldrb	r0,[lr,#-12]
+	orr	r2,r1,r2,lsl#16
+	ldrb	r1,[lr,#-11]
+	orr	r3,r2,r3,lsl#24
+	ldrb	r2,[lr,#-10]
+	adds	$h0,$h0,r3		@ accumulate input
+
+	ldrb	r3,[lr,#-9]
+	orr	r1,r0,r1,lsl#8
+	ldrb	r0,[lr,#-8]
+	orr	r2,r1,r2,lsl#16
+	ldrb	r1,[lr,#-7]
+	orr	r3,r2,r3,lsl#24
+	ldrb	r2,[lr,#-6]
+	adcs	$h1,$h1,r3
+
+	ldrb	r3,[lr,#-5]
+	orr	r1,r0,r1,lsl#8
+	ldrb	r0,[lr,#-4]
+	orr	r2,r1,r2,lsl#16
+	ldrb	r1,[lr,#-3]
+	orr	r3,r2,r3,lsl#24
+	ldrb	r2,[lr,#-2]
+	adcs	$h2,$h2,r3
+
+	ldrb	r3,[lr,#-1]
+	orr	r1,r0,r1,lsl#8
+	str	lr,[sp,#8]		@ offload input pointer
+	orr	r2,r1,r2,lsl#16
+	add	$s1,$r1,$r1,lsr#2
+	orr	r3,r2,r3,lsl#24
+#else
+	ldr	r0,[lr],#16		@ load input
+	it	hi
+	addhi	$h4,$h4,#1		@ padbit
+	ldr	r1,[lr,#-12]
+	ldr	r2,[lr,#-8]
+	ldr	r3,[lr,#-4]
+# ifdef	__ARMEB__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+# endif
+	adds	$h0,$h0,r0		@ accumulate input
+	str	lr,[sp,#8]		@ offload input pointer
+	adcs	$h1,$h1,r1
+	add	$s1,$r1,$r1,lsr#2
+	adcs	$h2,$h2,r2
+#endif
+	add	$s2,$r2,$r2,lsr#2
+	adcs	$h3,$h3,r3
+	add	$s3,$r3,$r3,lsr#2
+
+	umull	r2,r3,$h1,$r0
+	 adc	$h4,$h4,#0
+	umull	r0,r1,$h0,$r0
+	umlal	r2,r3,$h4,$s1
+	umlal	r0,r1,$h3,$s1
+	ldr	$r1,[sp,#20]		@ reload $r1
+	umlal	r2,r3,$h2,$s3
+	umlal	r0,r1,$h1,$s3
+	umlal	r2,r3,$h3,$s2
+	umlal	r0,r1,$h2,$s2
+	umlal	r2,r3,$h0,$r1
+	str	r0,[sp,#0]		@ future $h0
+	 mul	r0,$s2,$h4
+	ldr	$r2,[sp,#24]		@ reload $r2
+	adds	r2,r2,r1		@ d1+=d0>>32
+	 eor	r1,r1,r1
+	adc	lr,r3,#0		@ future $h2
+	str	r2,[sp,#4]		@ future $h1
+
+	mul	r2,$s3,$h4
+	eor	r3,r3,r3
+	umlal	r0,r1,$h3,$s3
+	ldr	$r3,[sp,#28]		@ reload $r3
+	umlal	r2,r3,$h3,$r0
+	umlal	r0,r1,$h2,$r0
+	umlal	r2,r3,$h2,$r1
+	umlal	r0,r1,$h1,$r1
+	umlal	r2,r3,$h1,$r2
+	umlal	r0,r1,$h0,$r2
+	umlal	r2,r3,$h0,$r3
+	ldr	$h0,[sp,#0]
+	mul	$h4,$r0,$h4
+	ldr	$h1,[sp,#4]
+
+	adds	$h2,lr,r0		@ d2+=d1>>32
+	ldr	lr,[sp,#8]		@ reload input pointer
+	adc	r1,r1,#0
+	adds	$h3,r2,r1		@ d3+=d2>>32
+	ldr	r0,[sp,#16]		@ reload end pointer
+	adc	r3,r3,#0
+	add	$h4,$h4,r3		@ h4+=d3>>32
+
+	and	r1,$h4,#-4
+	and	$h4,$h4,#3
+	add	r1,r1,r1,lsr#2		@ *=5
+	adds	$h0,$h0,r1
+	adcs	$h1,$h1,#0
+	adcs	$h2,$h2,#0
+	adcs	$h3,$h3,#0
+	adc	$h4,$h4,#0
+
+	cmp	r0,lr			@ done yet?
+	bhi	.Loop
+
+	ldr	$ctx,[sp,#12]
+	add	sp,sp,#32
+	stmdb	$ctx,{$h0-$h4}		@ store the result
+
+.Lno_data:
+#if	__ARM_ARCH__>=5
+	ldmia	sp!,{r3-r11,pc}
+#else
+	ldmia	sp!,{r3-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	poly1305_blocks,.-poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce)=map("r$_",(0..2));
+my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
+my $g4=$ctx;
+
+$code.=<<___;
+.type	poly1305_emit,%function
+.align	5
+poly1305_emit:
+.Lpoly1305_emit:
+	stmdb	sp!,{r4-r11}
+
+	ldmia	$ctx,{$h0-$h4}
+
+#if __ARM_ARCH__>=7
+	ldr	ip,[$ctx,#36]		@ is_base2_26
+
+	adds	$g0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
+	mov	$g1,$h1,lsr#6
+	adcs	$g1,$g1,$h2,lsl#20
+	mov	$g2,$h2,lsr#12
+	adcs	$g2,$g2,$h3,lsl#14
+	mov	$g3,$h3,lsr#18
+	adcs	$g3,$g3,$h4,lsl#8
+	mov	$g4,#0
+	adc	$g4,$g4,$h4,lsr#24
+
+	tst	ip,ip
+	itttt	ne
+	movne	$h0,$g0
+	movne	$h1,$g1
+	movne	$h2,$g2
+	movne	$h3,$g3
+	it	ne
+	movne	$h4,$g4
+#endif
+
+	adds	$g0,$h0,#5		@ compare to modulus
+	adcs	$g1,$h1,#0
+	adcs	$g2,$h2,#0
+	adcs	$g3,$h3,#0
+	adc	$g4,$h4,#0
+	tst	$g4,#4			@ did it carry/borrow?
+
+#ifdef	__thumb2__
+	it	ne
+#endif
+	movne	$h0,$g0
+	ldr	$g0,[$nonce,#0]
+#ifdef	__thumb2__
+	it	ne
+#endif
+	movne	$h1,$g1
+	ldr	$g1,[$nonce,#4]
+#ifdef	__thumb2__
+	it	ne
+#endif
+	movne	$h2,$g2
+	ldr	$g2,[$nonce,#8]
+#ifdef	__thumb2__
+	it	ne
+#endif
+	movne	$h3,$g3
+	ldr	$g3,[$nonce,#12]
+
+	adds	$h0,$h0,$g0
+	adcs	$h1,$h1,$g1
+	adcs	$h2,$h2,$g2
+	adc	$h3,$h3,$g3
+
+#if __ARM_ARCH__>=7
+# ifdef __ARMEB__
+	rev	$h0,$h0
+	rev	$h1,$h1
+	rev	$h2,$h2
+	rev	$h3,$h3
+# endif
+	str	$h0,[$mac,#0]
+	str	$h1,[$mac,#4]
+	str	$h2,[$mac,#8]
+	str	$h3,[$mac,#12]
+#else
+	strb	$h0,[$mac,#0]
+	mov	$h0,$h0,lsr#8
+	strb	$h1,[$mac,#4]
+	mov	$h1,$h1,lsr#8
+	strb	$h2,[$mac,#8]
+	mov	$h2,$h2,lsr#8
+	strb	$h3,[$mac,#12]
+	mov	$h3,$h3,lsr#8
+
+	strb	$h0,[$mac,#1]
+	mov	$h0,$h0,lsr#8
+	strb	$h1,[$mac,#5]
+	mov	$h1,$h1,lsr#8
+	strb	$h2,[$mac,#9]
+	mov	$h2,$h2,lsr#8
+	strb	$h3,[$mac,#13]
+	mov	$h3,$h3,lsr#8
+
+	strb	$h0,[$mac,#2]
+	mov	$h0,$h0,lsr#8
+	strb	$h1,[$mac,#6]
+	mov	$h1,$h1,lsr#8
+	strb	$h2,[$mac,#10]
+	mov	$h2,$h2,lsr#8
+	strb	$h3,[$mac,#14]
+	mov	$h3,$h3,lsr#8
+
+	strb	$h0,[$mac,#3]
+	strb	$h1,[$mac,#7]
+	strb	$h2,[$mac,#11]
+	strb	$h3,[$mac,#15]
+#endif
+	ldmia	sp!,{r4-r11}
+#if	__ARM_ARCH__>=5
+	ret				@ bx	lr
+#else
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	poly1305_emit,.-poly1305_emit
+___
+{
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
+my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
+my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
+
+my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
+
+$code.=<<___;
+#if	__ARM_MAX_ARCH__>=7
+.fpu	neon
+
+.type	poly1305_init_neon,%function
+.align	5
+poly1305_init_neon:
+.Lpoly1305_init_neon:
+	ldr	r3,[$ctx,#48]		@ first table element
+	cmp	r3,#-1			@ is value impossible?
+	bne	.Lno_init_neon
+
+	ldr	r4,[$ctx,#20]		@ load key base 2^32
+	ldr	r5,[$ctx,#24]
+	ldr	r6,[$ctx,#28]
+	ldr	r7,[$ctx,#32]
+
+	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
+	mov	r3,r4,lsr#26
+	mov	r4,r5,lsr#20
+	orr	r3,r3,r5,lsl#6
+	mov	r5,r6,lsr#14
+	orr	r4,r4,r6,lsl#12
+	mov	r6,r7,lsr#8
+	orr	r5,r5,r7,lsl#18
+	and	r3,r3,#0x03ffffff
+	and	r4,r4,#0x03ffffff
+	and	r5,r5,#0x03ffffff
+
+	vdup.32	$R0,r2			@ r^1 in both lanes
+	add	r2,r3,r3,lsl#2		@ *5
+	vdup.32	$R1,r3
+	add	r3,r4,r4,lsl#2
+	vdup.32	$S1,r2
+	vdup.32	$R2,r4
+	add	r4,r5,r5,lsl#2
+	vdup.32	$S2,r3
+	vdup.32	$R3,r5
+	add	r5,r6,r6,lsl#2
+	vdup.32	$S3,r4
+	vdup.32	$R4,r6
+	vdup.32	$S4,r5
+
+	mov	$zeros,#2		@ counter
+
+.Lsquare_neon:
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+
+	vmull.u32	$D0,$R0,${R0}[1]
+	vmull.u32	$D1,$R1,${R0}[1]
+	vmull.u32	$D2,$R2,${R0}[1]
+	vmull.u32	$D3,$R3,${R0}[1]
+	vmull.u32	$D4,$R4,${R0}[1]
+
+	vmlal.u32	$D0,$R4,${S1}[1]
+	vmlal.u32	$D1,$R0,${R1}[1]
+	vmlal.u32	$D2,$R1,${R1}[1]
+	vmlal.u32	$D3,$R2,${R1}[1]
+	vmlal.u32	$D4,$R3,${R1}[1]
+
+	vmlal.u32	$D0,$R3,${S2}[1]
+	vmlal.u32	$D1,$R4,${S2}[1]
+	vmlal.u32	$D3,$R1,${R2}[1]
+	vmlal.u32	$D2,$R0,${R2}[1]
+	vmlal.u32	$D4,$R2,${R2}[1]
+
+	vmlal.u32	$D0,$R2,${S3}[1]
+	vmlal.u32	$D3,$R0,${R3}[1]
+	vmlal.u32	$D1,$R3,${S3}[1]
+	vmlal.u32	$D2,$R4,${S3}[1]
+	vmlal.u32	$D4,$R1,${R3}[1]
+
+	vmlal.u32	$D3,$R4,${S4}[1]
+	vmlal.u32	$D0,$R1,${S4}[1]
+	vmlal.u32	$D1,$R2,${S4}[1]
+	vmlal.u32	$D2,$R3,${S4}[1]
+	vmlal.u32	$D4,$R0,${R4}[1]
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+	@ and P. Schwabe
+	@
+	@ H0>>+H1>>+H2>>+H3>>+H4
+	@ H3>>+H4>>*5+H0>>+H1
+	@
+	@ Trivia.
+	@
+	@ Result of multiplication of n-bit number by m-bit number is
+	@ n+m bits wide. However! Even though 2^n is a n+1-bit number,
+	@ m-bit number multiplied by 2^n is still n+m bits wide.
+	@
+	@ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
+	@ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
+	@ one is n+1 bits wide.
+	@
+	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
+	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
+	@ can be 27. However! In cases when their width exceeds 26 bits
+	@ they are limited by 2^26+2^6. This in turn means that *sum*
+	@ of the products with these values can still be viewed as sum
+	@ of 52-bit numbers as long as the amount of addends is not a
+	@ power of 2. For example,
+	@
+	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
+	@
+	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
+	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
+	@ 8 * (2^52) or 2^55. However, the value is then multiplied by
+	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
+	@ which is less than 32 * (2^52) or 2^57. And when processing
+	@ data we are looking at triple as many addends...
+	@
+	@ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
+	@ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
+	@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
+	@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
+	@ instruction accepts 2x32-bit input and writes 2x64-bit result.
+	@ This means that result of reduction have to be compressed upon
+	@ loop wrap-around. This can be done in the process of reduction
+	@ to minimize amount of instructions [as well as amount of
+	@ 128-bit instructions, which benefits low-end processors], but
+	@ one has to watch for H2 (which is narrower than H0) and 5*H4
+	@ not being wider than 58 bits, so that result of right shift
+	@ by 26 bits fits in 32 bits. This is also useful on x86,
+	@ because it allows to use paddd in place for paddq, which
+	@ benefits Atom, where paddq is ridiculously slow.
+
+	vshr.u64	$T0,$D3,#26
+	vmovn.i64	$D3#lo,$D3
+	 vshr.u64	$T1,$D0,#26
+	 vmovn.i64	$D0#lo,$D0
+	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
+	vbic.i32	$D3#lo,#0xfc000000	@ &=0x03ffffff
+	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
+	 vbic.i32	$D0#lo,#0xfc000000
+
+	vshrn.u64	$T0#lo,$D4,#26
+	vmovn.i64	$D4#lo,$D4
+	 vshr.u64	$T1,$D1,#26
+	 vmovn.i64	$D1#lo,$D1
+	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
+	vbic.i32	$D4#lo,#0xfc000000
+	 vbic.i32	$D1#lo,#0xfc000000
+
+	vadd.i32	$D0#lo,$D0#lo,$T0#lo
+	vshl.u32	$T0#lo,$T0#lo,#2
+	 vshrn.u64	$T1#lo,$D2,#26
+	 vmovn.i64	$D2#lo,$D2
+	vadd.i32	$D0#lo,$D0#lo,$T0#lo	@ h4 -> h0
+	 vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
+	 vbic.i32	$D2#lo,#0xfc000000
+
+	vshr.u32	$T0#lo,$D0#lo,#26
+	vbic.i32	$D0#lo,#0xfc000000
+	 vshr.u32	$T1#lo,$D3#lo,#26
+	 vbic.i32	$D3#lo,#0xfc000000
+	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
+	 vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
+
+	subs		$zeros,$zeros,#1
+	beq		.Lsquare_break_neon
+
+	add		$tbl0,$ctx,#(48+0*9*4)
+	add		$tbl1,$ctx,#(48+1*9*4)
+
+	vtrn.32		$R0,$D0#lo		@ r^2:r^1
+	vtrn.32		$R2,$D2#lo
+	vtrn.32		$R3,$D3#lo
+	vtrn.32		$R1,$D1#lo
+	vtrn.32		$R4,$D4#lo
+
+	vshl.u32	$S2,$R2,#2		@ *5
+	vshl.u32	$S3,$R3,#2
+	vshl.u32	$S1,$R1,#2
+	vshl.u32	$S4,$R4,#2
+	vadd.i32	$S2,$S2,$R2
+	vadd.i32	$S1,$S1,$R1
+	vadd.i32	$S3,$S3,$R3
+	vadd.i32	$S4,$S4,$R4
+
+	vst4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+	vst4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+	vst4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+	vst4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+	vst1.32		{${S4}[0]},[$tbl0,:32]
+	vst1.32		{${S4}[1]},[$tbl1,:32]
+
+	b		.Lsquare_neon
+
+.align	4
+.Lsquare_break_neon:
+	add		$tbl0,$ctx,#(48+2*4*9)
+	add		$tbl1,$ctx,#(48+3*4*9)
+
+	vmov		$R0,$D0#lo		@ r^4:r^3
+	vshl.u32	$S1,$D1#lo,#2		@ *5
+	vmov		$R1,$D1#lo
+	vshl.u32	$S2,$D2#lo,#2
+	vmov		$R2,$D2#lo
+	vshl.u32	$S3,$D3#lo,#2
+	vmov		$R3,$D3#lo
+	vshl.u32	$S4,$D4#lo,#2
+	vmov		$R4,$D4#lo
+	vadd.i32	$S1,$S1,$D1#lo
+	vadd.i32	$S2,$S2,$D2#lo
+	vadd.i32	$S3,$S3,$D3#lo
+	vadd.i32	$S4,$S4,$D4#lo
+
+	vst4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+	vst4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+	vst4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+	vst4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+	vst1.32		{${S4}[0]},[$tbl0]
+	vst1.32		{${S4}[1]},[$tbl1]
+
+.Lno_init_neon:
+	ret				@ bx	lr
+.size	poly1305_init_neon,.-poly1305_init_neon
+
+.globl	poly1305_blocks_neon
+.type	poly1305_blocks_neon,%function
+.align	5
+poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
+	ldr	ip,[$ctx,#36]		@ is_base2_26
+
+	cmp	$len,#64
+	blo	.Lpoly1305_blocks
+
+	stmdb	sp!,{r4-r7}
+	vstmdb	sp!,{d8-d15}		@ ABI specification says so
+
+	tst	ip,ip			@ is_base2_26?
+	bne	.Lbase2_26_neon
+
+	stmdb	sp!,{r1-r3,lr}
+	bl	.Lpoly1305_init_neon
+
+	ldr	r4,[$ctx,#0]		@ load hash value base 2^32
+	ldr	r5,[$ctx,#4]
+	ldr	r6,[$ctx,#8]
+	ldr	r7,[$ctx,#12]
+	ldr	ip,[$ctx,#16]
+
+	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
+	mov	r3,r4,lsr#26
+	 veor	$D0#lo,$D0#lo,$D0#lo
+	mov	r4,r5,lsr#20
+	orr	r3,r3,r5,lsl#6
+	 veor	$D1#lo,$D1#lo,$D1#lo
+	mov	r5,r6,lsr#14
+	orr	r4,r4,r6,lsl#12
+	 veor	$D2#lo,$D2#lo,$D2#lo
+	mov	r6,r7,lsr#8
+	orr	r5,r5,r7,lsl#18
+	 veor	$D3#lo,$D3#lo,$D3#lo
+	and	r3,r3,#0x03ffffff
+	orr	r6,r6,ip,lsl#24
+	 veor	$D4#lo,$D4#lo,$D4#lo
+	and	r4,r4,#0x03ffffff
+	mov	r1,#1
+	and	r5,r5,#0x03ffffff
+	str	r1,[$ctx,#36]		@ set is_base2_26
+
+	vmov.32	$D0#lo[0],r2
+	vmov.32	$D1#lo[0],r3
+	vmov.32	$D2#lo[0],r4
+	vmov.32	$D3#lo[0],r5
+	vmov.32	$D4#lo[0],r6
+	adr	$zeros,.Lzeros
+
+	ldmia	sp!,{r1-r3,lr}
+	b	.Lhash_loaded
+
+.align	4
+.Lbase2_26_neon:
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ load hash value
+
+	veor		$D0#lo,$D0#lo,$D0#lo
+	veor		$D1#lo,$D1#lo,$D1#lo
+	veor		$D2#lo,$D2#lo,$D2#lo
+	veor		$D3#lo,$D3#lo,$D3#lo
+	veor		$D4#lo,$D4#lo,$D4#lo
+	vld4.32		{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+	adr		$zeros,.Lzeros
+	vld1.32		{$D4#lo[0]},[$ctx]
+	sub		$ctx,$ctx,#16		@ rewind
+
+.Lhash_loaded:
+	add		$in2,$inp,#32
+	mov		$padbit,$padbit,lsl#24
+	tst		$len,#31
+	beq		.Leven
+
+	vld4.32		{$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
+	vmov.32		$H4#lo[0],$padbit
+	sub		$len,$len,#16
+	add		$in2,$inp,#32
+
+# ifdef	__ARMEB__
+	vrev32.8	$H0,$H0
+	vrev32.8	$H3,$H3
+	vrev32.8	$H1,$H1
+	vrev32.8	$H2,$H2
+# endif
+	vsri.u32	$H4#lo,$H3#lo,#8	@ base 2^32 -> base 2^26
+	vshl.u32	$H3#lo,$H3#lo,#18
+
+	vsri.u32	$H3#lo,$H2#lo,#14
+	vshl.u32	$H2#lo,$H2#lo,#12
+	vadd.i32	$H4#hi,$H4#lo,$D4#lo	@ add hash value and move to #hi
+
+	vbic.i32	$H3#lo,#0xfc000000
+	vsri.u32	$H2#lo,$H1#lo,#20
+	vshl.u32	$H1#lo,$H1#lo,#6
+
+	vbic.i32	$H2#lo,#0xfc000000
+	vsri.u32	$H1#lo,$H0#lo,#26
+	vadd.i32	$H3#hi,$H3#lo,$D3#lo
+
+	vbic.i32	$H0#lo,#0xfc000000
+	vbic.i32	$H1#lo,#0xfc000000
+	vadd.i32	$H2#hi,$H2#lo,$D2#lo
+
+	vadd.i32	$H0#hi,$H0#lo,$D0#lo
+	vadd.i32	$H1#hi,$H1#lo,$D1#lo
+
+	mov		$tbl1,$zeros
+	add		$tbl0,$ctx,#48
+
+	cmp		$len,$len
+	b		.Long_tail
+
+.align	4
+.Leven:
+	subs		$len,$len,#64
+	it		lo
+	movlo		$in2,$zeros
+
+	vmov.i32	$H4,#1<<24		@ padbit, yes, always
+	vld4.32		{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
+	add		$inp,$inp,#64
+	vld4.32		{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
+	add		$in2,$in2,#64
+	itt		hi
+	addhi		$tbl1,$ctx,#(48+1*9*4)
+	addhi		$tbl0,$ctx,#(48+3*9*4)
+
+# ifdef	__ARMEB__
+	vrev32.8	$H0,$H0
+	vrev32.8	$H3,$H3
+	vrev32.8	$H1,$H1
+	vrev32.8	$H2,$H2
+# endif
+	vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
+	vshl.u32	$H3,$H3,#18
+
+	vsri.u32	$H3,$H2,#14
+	vshl.u32	$H2,$H2,#12
+
+	vbic.i32	$H3,#0xfc000000
+	vsri.u32	$H2,$H1,#20
+	vshl.u32	$H1,$H1,#6
+
+	vbic.i32	$H2,#0xfc000000
+	vsri.u32	$H1,$H0,#26
+
+	vbic.i32	$H0,#0xfc000000
+	vbic.i32	$H1,#0xfc000000
+
+	bls		.Lskip_loop
+
+	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^2
+	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4
+	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+	b		.Loop_neon
+
+.align	5
+.Loop_neon:
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+	@   \___________________/
+	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+	@   \___________________/ \____________________/
+	@
+	@ Note that we start with inp[2:3]*r^2. This is because it
+	@ doesn't depend on reduction in previous iteration.
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ inp[2:3]*r^2
+
+	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ accumulate inp[0:1]
+	vmull.u32	$D2,$H2#hi,${R0}[1]
+	vadd.i32	$H0#lo,$H0#lo,$D0#lo
+	vmull.u32	$D0,$H0#hi,${R0}[1]
+	vadd.i32	$H3#lo,$H3#lo,$D3#lo
+	vmull.u32	$D3,$H3#hi,${R0}[1]
+	vmlal.u32	$D2,$H1#hi,${R1}[1]
+	vadd.i32	$H1#lo,$H1#lo,$D1#lo
+	vmull.u32	$D1,$H1#hi,${R0}[1]
+
+	vadd.i32	$H4#lo,$H4#lo,$D4#lo
+	vmull.u32	$D4,$H4#hi,${R0}[1]
+	subs		$len,$len,#64
+	vmlal.u32	$D0,$H4#hi,${S1}[1]
+	it		lo
+	movlo		$in2,$zeros
+	vmlal.u32	$D3,$H2#hi,${R1}[1]
+	vld1.32		${S4}[1],[$tbl1,:32]
+	vmlal.u32	$D1,$H0#hi,${R1}[1]
+	vmlal.u32	$D4,$H3#hi,${R1}[1]
+
+	vmlal.u32	$D0,$H3#hi,${S2}[1]
+	vmlal.u32	$D3,$H1#hi,${R2}[1]
+	vmlal.u32	$D4,$H2#hi,${R2}[1]
+	vmlal.u32	$D1,$H4#hi,${S2}[1]
+	vmlal.u32	$D2,$H0#hi,${R2}[1]
+
+	vmlal.u32	$D3,$H0#hi,${R3}[1]
+	vmlal.u32	$D0,$H2#hi,${S3}[1]
+	vmlal.u32	$D4,$H1#hi,${R3}[1]
+	vmlal.u32	$D1,$H3#hi,${S3}[1]
+	vmlal.u32	$D2,$H4#hi,${S3}[1]
+
+	vmlal.u32	$D3,$H4#hi,${S4}[1]
+	vmlal.u32	$D0,$H1#hi,${S4}[1]
+	vmlal.u32	$D4,$H0#hi,${R4}[1]
+	vmlal.u32	$D1,$H2#hi,${S4}[1]
+	vmlal.u32	$D2,$H3#hi,${S4}[1]
+
+	vld4.32		{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
+	add		$in2,$in2,#64
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ (hash+inp[0:1])*r^4 and accumulate
+
+	vmlal.u32	$D3,$H3#lo,${R0}[0]
+	vmlal.u32	$D0,$H0#lo,${R0}[0]
+	vmlal.u32	$D4,$H4#lo,${R0}[0]
+	vmlal.u32	$D1,$H1#lo,${R0}[0]
+	vmlal.u32	$D2,$H2#lo,${R0}[0]
+	vld1.32		${S4}[0],[$tbl0,:32]
+
+	vmlal.u32	$D3,$H2#lo,${R1}[0]
+	vmlal.u32	$D0,$H4#lo,${S1}[0]
+	vmlal.u32	$D4,$H3#lo,${R1}[0]
+	vmlal.u32	$D1,$H0#lo,${R1}[0]
+	vmlal.u32	$D2,$H1#lo,${R1}[0]
+
+	vmlal.u32	$D3,$H1#lo,${R2}[0]
+	vmlal.u32	$D0,$H3#lo,${S2}[0]
+	vmlal.u32	$D4,$H2#lo,${R2}[0]
+	vmlal.u32	$D1,$H4#lo,${S2}[0]
+	vmlal.u32	$D2,$H0#lo,${R2}[0]
+
+	vmlal.u32	$D3,$H0#lo,${R3}[0]
+	vmlal.u32	$D0,$H2#lo,${S3}[0]
+	vmlal.u32	$D4,$H1#lo,${R3}[0]
+	vmlal.u32	$D1,$H3#lo,${S3}[0]
+	vmlal.u32	$D3,$H4#lo,${S4}[0]
+
+	vmlal.u32	$D2,$H4#lo,${S3}[0]
+	vmlal.u32	$D0,$H1#lo,${S4}[0]
+	vmlal.u32	$D4,$H0#lo,${R4}[0]
+	vmov.i32	$H4,#1<<24		@ padbit, yes, always
+	vmlal.u32	$D1,$H2#lo,${S4}[0]
+	vmlal.u32	$D2,$H3#lo,${S4}[0]
+
+	vld4.32		{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
+	add		$inp,$inp,#64
+# ifdef	__ARMEB__
+	vrev32.8	$H0,$H0
+	vrev32.8	$H1,$H1
+	vrev32.8	$H2,$H2
+	vrev32.8	$H3,$H3
+# endif
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
+	@ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
+
+	vshr.u64	$T0,$D3,#26
+	vmovn.i64	$D3#lo,$D3
+	 vshr.u64	$T1,$D0,#26
+	 vmovn.i64	$D0#lo,$D0
+	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
+	vbic.i32	$D3#lo,#0xfc000000
+	  vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
+	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
+	  vshl.u32	$H3,$H3,#18
+	 vbic.i32	$D0#lo,#0xfc000000
+
+	vshrn.u64	$T0#lo,$D4,#26
+	vmovn.i64	$D4#lo,$D4
+	 vshr.u64	$T1,$D1,#26
+	 vmovn.i64	$D1#lo,$D1
+	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
+	  vsri.u32	$H3,$H2,#14
+	vbic.i32	$D4#lo,#0xfc000000
+	  vshl.u32	$H2,$H2,#12
+	 vbic.i32	$D1#lo,#0xfc000000
+
+	vadd.i32	$D0#lo,$D0#lo,$T0#lo
+	vshl.u32	$T0#lo,$T0#lo,#2
+	  vbic.i32	$H3,#0xfc000000
+	 vshrn.u64	$T1#lo,$D2,#26
+	 vmovn.i64	$D2#lo,$D2
+	vaddl.u32	$D0,$D0#lo,$T0#lo	@ h4 -> h0 [widen for a sec]
+	  vsri.u32	$H2,$H1,#20
+	 vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
+	  vshl.u32	$H1,$H1,#6
+	 vbic.i32	$D2#lo,#0xfc000000
+	  vbic.i32	$H2,#0xfc000000
+
+	vshrn.u64	$T0#lo,$D0,#26		@ re-narrow
+	vmovn.i64	$D0#lo,$D0
+	  vsri.u32	$H1,$H0,#26
+	  vbic.i32	$H0,#0xfc000000
+	 vshr.u32	$T1#lo,$D3#lo,#26
+	 vbic.i32	$D3#lo,#0xfc000000
+	vbic.i32	$D0#lo,#0xfc000000
+	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
+	 vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
+	  vbic.i32	$H1,#0xfc000000
+
+	bhi		.Loop_neon
+
+.Lskip_loop:
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+	add		$tbl1,$ctx,#(48+0*9*4)
+	add		$tbl0,$ctx,#(48+1*9*4)
+	adds		$len,$len,#32
+	it		ne
+	movne		$len,#0
+	bne		.Long_tail
+
+	vadd.i32	$H2#hi,$H2#lo,$D2#lo	@ add hash value and move to #hi
+	vadd.i32	$H0#hi,$H0#lo,$D0#lo
+	vadd.i32	$H3#hi,$H3#lo,$D3#lo
+	vadd.i32	$H1#hi,$H1#lo,$D1#lo
+	vadd.i32	$H4#hi,$H4#lo,$D4#lo
+
+.Long_tail:
+	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^1
+	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^2
+
+	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ can be redundant
+	vmull.u32	$D2,$H2#hi,$R0
+	vadd.i32	$H0#lo,$H0#lo,$D0#lo
+	vmull.u32	$D0,$H0#hi,$R0
+	vadd.i32	$H3#lo,$H3#lo,$D3#lo
+	vmull.u32	$D3,$H3#hi,$R0
+	vadd.i32	$H1#lo,$H1#lo,$D1#lo
+	vmull.u32	$D1,$H1#hi,$R0
+	vadd.i32	$H4#lo,$H4#lo,$D4#lo
+	vmull.u32	$D4,$H4#hi,$R0
+
+	vmlal.u32	$D0,$H4#hi,$S1
+	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+	vmlal.u32	$D3,$H2#hi,$R1
+	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+	vmlal.u32	$D1,$H0#hi,$R1
+	vmlal.u32	$D4,$H3#hi,$R1
+	vmlal.u32	$D2,$H1#hi,$R1
+
+	vmlal.u32	$D3,$H1#hi,$R2
+	vld1.32		${S4}[1],[$tbl1,:32]
+	vmlal.u32	$D0,$H3#hi,$S2
+	vld1.32		${S4}[0],[$tbl0,:32]
+	vmlal.u32	$D4,$H2#hi,$R2
+	vmlal.u32	$D1,$H4#hi,$S2
+	vmlal.u32	$D2,$H0#hi,$R2
+
+	vmlal.u32	$D3,$H0#hi,$R3
+	 it		ne
+	 addne		$tbl1,$ctx,#(48+2*9*4)
+	vmlal.u32	$D0,$H2#hi,$S3
+	 it		ne
+	 addne		$tbl0,$ctx,#(48+3*9*4)
+	vmlal.u32	$D4,$H1#hi,$R3
+	vmlal.u32	$D1,$H3#hi,$S3
+	vmlal.u32	$D2,$H4#hi,$S3
+
+	vmlal.u32	$D3,$H4#hi,$S4
+	 vorn		$MASK,$MASK,$MASK	@ all-ones, can be redundant
+	vmlal.u32	$D0,$H1#hi,$S4
+	 vshr.u64	$MASK,$MASK,#38
+	vmlal.u32	$D4,$H0#hi,$R4
+	vmlal.u32	$D1,$H2#hi,$S4
+	vmlal.u32	$D2,$H3#hi,$S4
+
+	beq		.Lshort_tail
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ (hash+inp[0:1])*r^4:r^3 and accumulate
+
+	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^3
+	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4
+
+	vmlal.u32	$D2,$H2#lo,$R0
+	vmlal.u32	$D0,$H0#lo,$R0
+	vmlal.u32	$D3,$H3#lo,$R0
+	vmlal.u32	$D1,$H1#lo,$R0
+	vmlal.u32	$D4,$H4#lo,$R0
+
+	vmlal.u32	$D0,$H4#lo,$S1
+	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+	vmlal.u32	$D3,$H2#lo,$R1
+	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+	vmlal.u32	$D1,$H0#lo,$R1
+	vmlal.u32	$D4,$H3#lo,$R1
+	vmlal.u32	$D2,$H1#lo,$R1
+
+	vmlal.u32	$D3,$H1#lo,$R2
+	vld1.32		${S4}[1],[$tbl1,:32]
+	vmlal.u32	$D0,$H3#lo,$S2
+	vld1.32		${S4}[0],[$tbl0,:32]
+	vmlal.u32	$D4,$H2#lo,$R2
+	vmlal.u32	$D1,$H4#lo,$S2
+	vmlal.u32	$D2,$H0#lo,$R2
+
+	vmlal.u32	$D3,$H0#lo,$R3
+	vmlal.u32	$D0,$H2#lo,$S3
+	vmlal.u32	$D4,$H1#lo,$R3
+	vmlal.u32	$D1,$H3#lo,$S3
+	vmlal.u32	$D2,$H4#lo,$S3
+
+	vmlal.u32	$D3,$H4#lo,$S4
+	 vorn		$MASK,$MASK,$MASK	@ all-ones
+	vmlal.u32	$D0,$H1#lo,$S4
+	 vshr.u64	$MASK,$MASK,#38
+	vmlal.u32	$D4,$H0#lo,$R4
+	vmlal.u32	$D1,$H2#lo,$S4
+	vmlal.u32	$D2,$H3#lo,$S4
+
+.Lshort_tail:
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ horizontal addition
+
+	vadd.i64	$D3#lo,$D3#lo,$D3#hi
+	vadd.i64	$D0#lo,$D0#lo,$D0#hi
+	vadd.i64	$D4#lo,$D4#lo,$D4#hi
+	vadd.i64	$D1#lo,$D1#lo,$D1#hi
+	vadd.i64	$D2#lo,$D2#lo,$D2#hi
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ lazy reduction, but without narrowing
+
+	vshr.u64	$T0,$D3,#26
+	vand.i64	$D3,$D3,$MASK
+	 vshr.u64	$T1,$D0,#26
+	 vand.i64	$D0,$D0,$MASK
+	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
+	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
+
+	vshr.u64	$T0,$D4,#26
+	vand.i64	$D4,$D4,$MASK
+	 vshr.u64	$T1,$D1,#26
+	 vand.i64	$D1,$D1,$MASK
+	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
+
+	vadd.i64	$D0,$D0,$T0
+	vshl.u64	$T0,$T0,#2
+	 vshr.u64	$T1,$D2,#26
+	 vand.i64	$D2,$D2,$MASK
+	vadd.i64	$D0,$D0,$T0		@ h4 -> h0
+	 vadd.i64	$D3,$D3,$T1		@ h2 -> h3
+
+	vshr.u64	$T0,$D0,#26
+	vand.i64	$D0,$D0,$MASK
+	 vshr.u64	$T1,$D3,#26
+	 vand.i64	$D3,$D3,$MASK
+	vadd.i64	$D1,$D1,$T0		@ h0 -> h1
+	 vadd.i64	$D4,$D4,$T1		@ h3 -> h4
+
+	cmp		$len,#0
+	bne		.Leven
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ store hash value
+
+	vst4.32		{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+	vst1.32		{$D4#lo[0]},[$ctx]
+
+	vldmia	sp!,{d8-d15}			@ epilogue
+	ldmia	sp!,{r4-r7}
+	ret					@ bx	lr
+.size	poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.align	5
+.Lzeros:
+.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+#ifndef	__KERNEL__
+.LOPENSSL_armcap:
+# ifdef	_WIN32
+.word	OPENSSL_armcap_P
+# else
+.word	OPENSSL_armcap_P-.Lpoly1305_init
+# endif
+.comm	OPENSSL_armcap_P,4,4
+.hidden	OPENSSL_armcap_P
+#endif
+#endif
+___
+}	}
+$code.=<<___;
+.asciz	"Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
+.align	2
+___
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/geo;
+
+	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
+	s/\bret\b/bx	lr/go						or
+	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
+
+	print $_,"\n";
+}
+close STDOUT; # enforce flush
diff --git a/lib/crypto/arm/poly1305.h b/lib/crypto/arm/poly1305.h
new file mode 100644
index 000000000000..0fe903d8de55
--- /dev/null
+++ b/lib/crypto/arm/poly1305.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/hwcap.h>
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+asmlinkage void poly1305_block_init(struct poly1305_block_state *state,
+				    const u8 raw_key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state,
+				    const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
+				     const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit(const struct poly1305_state *state,
+			      u8 digest[POLY1305_DIGEST_SIZE],
+			      const u32 nonce[4]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src,
+			    unsigned int len, u32 padbit)
+{
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+	    static_branch_likely(&have_neon) && likely(may_use_simd())) {
+		do {
+			unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+			scoped_ksimd()
+				poly1305_blocks_neon(state, src, todo, padbit);
+
+			len -= todo;
+			src += todo;
+		} while (len);
+	} else
+		poly1305_blocks_arm(state, src, len, padbit);
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define poly1305_mod_init_arch poly1305_mod_init_arch
+static void poly1305_mod_init_arch(void)
+{
+	if (elf_hwcap & HWCAP_NEON)
+		static_branch_enable(&have_neon);
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
diff --git a/lib/crypto/arm/sha1-armv4-large.S b/lib/crypto/arm/sha1-armv4-large.S
new file mode 100644
index 000000000000..1c8b685149f2
--- /dev/null
+++ b/lib/crypto/arm/sha1-armv4-large.S
@@ -0,0 +1,507 @@
+#define __ARM_ARCH__ __LINUX_ARM_ARCH__
+@ SPDX-License-Identifier: GPL-2.0
+
+@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
+@ has relicensed it under the GPLv2. Therefore this program is free software;
+@ you can redistribute it and/or modify it under the terms of the GNU General
+@ Public License version 2 as published by the Free Software Foundation.
+@
+@ The original headers, including the original license headers, are
+@ included below for completeness.
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see https://www.openssl.org/~appro/cryptogams/.
+@ ====================================================================
+
+@ sha1_block procedure for ARMv4.
+@
+@ January 2007.
+
+@ Size/performance trade-off
+@ ====================================================================
+@ impl		size in bytes	comp cycles[*]	measured performance
+@ ====================================================================
+@ thumb		304		3212		4420
+@ armv4-small	392/+29%	1958/+64%	2250/+96%
+@ armv4-compact	740/+89%	1552/+26%	1840/+22%
+@ armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
+@ full unroll	~5100/+260%	~1260/+4%	~1300/+5%
+@ ====================================================================
+@ thumb		= same as 'small' but in Thumb instructions[**] and
+@		  with recurring code in two private functions;
+@ small		= detached Xload/update, loops are folded;
+@ compact	= detached Xload/update, 5x unroll;
+@ large		= interleaved Xload/update, 5x unroll;
+@ full unroll	= interleaved Xload/update, full unroll, estimated[!];
+@
+@ [*]	Manually counted instructions in "grand" loop body. Measured
+@	performance is affected by prologue and epilogue overhead,
+@	i-cache availability, branch penalties, etc.
+@ [**]	While each Thumb instruction is twice smaller, they are not as
+@	diverse as ARM ones: e.g., there are only two arithmetic
+@	instructions with 3 arguments, no [fixed] rotate, addressing
+@	modes are limited. As result it takes more instructions to do
+@	the same job in Thumb, therefore the code is never twice as
+@	small and always slower.
+@ [***]	which is also ~35% better than compiler generated code. Dual-
+@	issue Cortex A8 core was measured to process input block in
+@	~990 cycles.
+
+@ August 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 13% improvement on
+@ Cortex A8 core and in absolute terms ~870 cycles per input block
+@ [or 13.6 cycles per byte].
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 10%
+@ improvement on Cortex A8 core and 12.2 cycles per byte.
+
+#include <linux/linkage.h>
+
+.text
+
+.align	2
+ENTRY(sha1_block_data_order)
+	stmdb	sp!,{r4-r12,lr}
+	add	r2,r1,r2,lsl#6	@ r2 to point at the end of r1
+	ldmia	r0,{r3,r4,r5,r6,r7}
+.Lloop:
+	ldr	r8,.LK_00_19
+	mov	r14,sp
+	sub	sp,sp,#15*4
+	mov	r5,r5,ror#30
+	mov	r6,r6,ror#30
+	mov	r7,r7,ror#30		@ [6]
+.L_00_15:
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r7,r8,r7,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r5,r6			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r7,r8,r7,ror#2			@ E+=K_00_19
+	eor	r10,r5,r6			@ F_xx_xx
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r4,r10,ror#2
+	add	r7,r7,r9			@ E+=X[i]
+	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r6,r8,r6,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r4,r5			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r6,r8,r6,ror#2			@ E+=K_00_19
+	eor	r10,r4,r5			@ F_xx_xx
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r3,r10,ror#2
+	add	r6,r6,r9			@ E+=X[i]
+	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r5,r8,r5,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r3,r4			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r5,r8,r5,ror#2			@ E+=K_00_19
+	eor	r10,r3,r4			@ F_xx_xx
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r7,r10,ror#2
+	add	r5,r5,r9			@ E+=X[i]
+	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r4,r8,r4,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r7,r3			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r4,r8,r4,ror#2			@ E+=K_00_19
+	eor	r10,r7,r3			@ F_xx_xx
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r6,r10,ror#2
+	add	r4,r4,r9			@ E+=X[i]
+	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r3,r8,r3,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r6,r7			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r3,r8,r3,ror#2			@ E+=K_00_19
+	eor	r10,r6,r7			@ F_xx_xx
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r5,r10,ror#2
+	add	r3,r3,r9			@ E+=X[i]
+	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r3,r3,r10			@ E+=F_00_19(B,C,D)
+	cmp	r14,sp
+	bne	.L_00_15		@ [((11+4)*5+2)*3]
+	sub	sp,sp,#25*4
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r7,r8,r7,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r5,r6			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r7,r8,r7,ror#2			@ E+=K_00_19
+	eor	r10,r5,r6			@ F_xx_xx
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r4,r10,ror#2
+	add	r7,r7,r9			@ E+=X[i]
+	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r4,r5			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and r10,r3,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r6,r6,r9			@ E+=X[i]
+	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
+	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r3,r4			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and r10,r7,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r5,r5,r9			@ E+=X[i]
+	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
+	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r7,r3			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and r10,r6,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r4,r4,r9			@ E+=X[i]
+	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
+	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r6,r7			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and r10,r5,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r3,r3,r9			@ E+=X[i]
+	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
+	add	r3,r3,r10			@ E+=F_00_19(B,C,D)
+
+	ldr	r8,.LK_20_39		@ [+15+16*4]
+	cmn	sp,#0			@ [+3], clear carry to denote 20_39
+.L_20_39_or_60_79:
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r5,r6			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor r10,r4,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r7,r7,r9			@ E+=X[i]
+	add	r7,r7,r10			@ E+=F_20_39(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r4,r5			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor r10,r3,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r6,r6,r9			@ E+=X[i]
+	add	r6,r6,r10			@ E+=F_20_39(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r3,r4			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor r10,r7,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r5,r5,r9			@ E+=X[i]
+	add	r5,r5,r10			@ E+=F_20_39(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r7,r3			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor r10,r6,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r4,r4,r9			@ E+=X[i]
+	add	r4,r4,r10			@ E+=F_20_39(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r6,r7			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor r10,r5,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r3,r3,r9			@ E+=X[i]
+	add	r3,r3,r10			@ E+=F_20_39(B,C,D)
+ ARM(	teq	r14,sp		)	@ preserve carry
+ THUMB(	mov	r11,sp		)
+ THUMB(	teq	r14,r11		)	@ preserve carry
+	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
+	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes
+
+	ldr	r8,.LK_40_59
+	sub	sp,sp,#20*4		@ [+2]
+.L_40_59:
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r5,r6			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and r10,r4,r10,ror#2					@ F_xx_xx
+	and r11,r5,r6					@ F_xx_xx
+	add	r7,r7,r9			@ E+=X[i]
+	add	r7,r7,r10			@ E+=F_40_59(B,C,D)
+	add	r7,r7,r11,ror#2
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r4,r5			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and r10,r3,r10,ror#2					@ F_xx_xx
+	and r11,r4,r5					@ F_xx_xx
+	add	r6,r6,r9			@ E+=X[i]
+	add	r6,r6,r10			@ E+=F_40_59(B,C,D)
+	add	r6,r6,r11,ror#2
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r3,r4			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and r10,r7,r10,ror#2					@ F_xx_xx
+	and r11,r3,r4					@ F_xx_xx
+	add	r5,r5,r9			@ E+=X[i]
+	add	r5,r5,r10			@ E+=F_40_59(B,C,D)
+	add	r5,r5,r11,ror#2
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r7,r3			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and r10,r6,r10,ror#2					@ F_xx_xx
+	and r11,r7,r3					@ F_xx_xx
+	add	r4,r4,r9			@ E+=X[i]
+	add	r4,r4,r10			@ E+=F_40_59(B,C,D)
+	add	r4,r4,r11,ror#2
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r6,r7			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and r10,r5,r10,ror#2					@ F_xx_xx
+	and r11,r6,r7					@ F_xx_xx
+	add	r3,r3,r9			@ E+=X[i]
+	add	r3,r3,r10			@ E+=F_40_59(B,C,D)
+	add	r3,r3,r11,ror#2
+	cmp	r14,sp
+	bne	.L_40_59		@ [+((12+5)*5+2)*4]
+
+	ldr	r8,.LK_60_79
+	sub	sp,sp,#20*4
+	cmp	sp,#0			@ set carry to denote 60_79
+	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
+.L_done:
+	add	sp,sp,#80*4		@ "deallocate" stack frame
+	ldmia	r0,{r8,r9,r10,r11,r12}
+	add	r3,r8,r3
+	add	r4,r9,r4
+	add	r5,r10,r5,ror#2
+	add	r6,r11,r6,ror#2
+	add	r7,r12,r7,ror#2
+	stmia	r0,{r3,r4,r5,r6,r7}
+	teq	r1,r2
+	bne	.Lloop			@ [+18], total 1307
+
+	ldmia	sp!,{r4-r12,pc}
+.align	2
+.LK_00_19:	.word	0x5a827999
+.LK_20_39:	.word	0x6ed9eba1
+.LK_40_59:	.word	0x8f1bbcdc
+.LK_60_79:	.word	0xca62c1d6
+ENDPROC(sha1_block_data_order)
+.asciz	"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
+.align	2
diff --git a/lib/crypto/arm/sha1-armv7-neon.S b/lib/crypto/arm/sha1-armv7-neon.S
new file mode 100644
index 000000000000..a0323fa5c58a
--- /dev/null
+++ b/lib/crypto/arm/sha1-armv7-neon.S
@@ -0,0 +1,633 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* ARM/NEON accelerated SHA-1 transform function
+ *
+ * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+.syntax unified
+.fpu neon
+
+.text
+
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+
+
+/* Constants */
+
+#define K1  0x5A827999
+#define K2  0x6ED9EBA1
+#define K3  0x8F1BBCDC
+#define K4  0xCA62C1D6
+.align 4
+.LK_VEC:
+.LK1:	.long K1, K1, K1, K1
+.LK2:	.long K2, K2, K2, K2
+.LK3:	.long K3, K3, K3, K3
+.LK4:	.long K4, K4, K4, K4
+
+
+/* Register macros */
+
+#define RSTATE r0
+#define RDATA r1
+#define RNBLKS r2
+#define ROLDSTACK r3
+#define RWK lr
+
+#define _a r4
+#define _b r5
+#define _c r6
+#define _d r7
+#define _e r8
+
+#define RT0 r9
+#define RT1 r10
+#define RT2 r11
+#define RT3 r12
+
+#define W0 q0
+#define W1 q7
+#define W2 q2
+#define W3 q3
+#define W4 q4
+#define W5 q6
+#define W6 q5
+#define W7 q1
+
+#define tmp0 q8
+#define tmp1 q9
+#define tmp2 q10
+#define tmp3 q11
+
+#define qK1 q12
+#define qK2 q13
+#define qK3 q14
+#define qK4 q15
+
+#ifdef CONFIG_CPU_BIG_ENDIAN
+#define ARM_LE(code...)
+#else
+#define ARM_LE(code...)		code
+#endif
+
+/* Round function macros. */
+
+#define WK_offs(i) (((i) & 15) * 4)
+
+#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	ldr RT3, [sp, WK_offs(i)]; \
+		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+	bic RT0, d, b; \
+	add e, e, a, ror #(32 - 5); \
+	and RT1, c, b; \
+		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+	add RT0, RT0, RT3; \
+	add e, e, RT1; \
+	ror b, #(32 - 30); \
+		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+	add e, e, RT0;
+
+#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	ldr RT3, [sp, WK_offs(i)]; \
+		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+	eor RT0, d, b; \
+	add e, e, a, ror #(32 - 5); \
+	eor RT0, RT0, c; \
+		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+	add e, e, RT3; \
+	ror b, #(32 - 30); \
+		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+	add e, e, RT0; \
+
+#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	ldr RT3, [sp, WK_offs(i)]; \
+		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+	eor RT0, b, c; \
+	and RT1, b, c; \
+	add e, e, a, ror #(32 - 5); \
+		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+	and RT0, RT0, d; \
+	add RT1, RT1, RT3; \
+	add e, e, RT0; \
+	ror b, #(32 - 30); \
+		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+	add e, e, RT1;
+
+#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	_R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
+           W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	_R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define R(a,b,c,d,e,f,i) \
+	_R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
+	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define dummy(...)
+
+
+/* Input expansion macros. */
+
+/********* Precalc macros for rounds 0-15 *************************************/
+
+#define W_PRECALC_00_15() \
+	add       RWK, sp, #(WK_offs(0));			\
+	\
+	vld1.32   {W0, W7}, [RDATA]!;				\
+ ARM_LE(vrev32.8  W0, W0;	)	/* big => little */	\
+	vld1.32   {W6, W5}, [RDATA]!;				\
+	vadd.u32  tmp0, W0, curK;				\
+ ARM_LE(vrev32.8  W7, W7;	)	/* big => little */	\
+ ARM_LE(vrev32.8  W6, W6;	)	/* big => little */	\
+	vadd.u32  tmp1, W7, curK;				\
+ ARM_LE(vrev32.8  W5, W5;	)	/* big => little */	\
+	vadd.u32  tmp2, W6, curK;				\
+	vst1.32   {tmp0, tmp1}, [RWK]!;				\
+	vadd.u32  tmp3, W5, curK;				\
+	vst1.32   {tmp2, tmp3}, [RWK];				\
+
+#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vld1.32   {W0, W7}, [RDATA]!;				\
+
+#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	add       RWK, sp, #(WK_offs(0));			\
+
+#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ARM_LE(vrev32.8  W0, W0;	)	/* big => little */	\
+
+#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vld1.32   {W6, W5}, [RDATA]!;				\
+
+#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vadd.u32  tmp0, W0, curK;				\
+
+#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ARM_LE(vrev32.8  W7, W7;	)	/* big => little */	\
+
+#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ARM_LE(vrev32.8  W6, W6;	)	/* big => little */	\
+
+#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vadd.u32  tmp1, W7, curK;				\
+
+#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ARM_LE(vrev32.8  W5, W5;	)	/* big => little */	\
+
+#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vadd.u32  tmp2, W6, curK;				\
+
+#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vst1.32   {tmp0, tmp1}, [RWK]!;				\
+
+#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vadd.u32  tmp3, W5, curK;				\
+
+#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vst1.32   {tmp2, tmp3}, [RWK];				\
+
+
+/********* Precalc macros for rounds 16-31 ************************************/
+
+#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	veor      tmp0, tmp0;			\
+	vext.8    W, W_m16, W_m12, #8;		\
+
+#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	add       RWK, sp, #(WK_offs(i));	\
+	vext.8    tmp0, W_m04, tmp0, #4;	\
+
+#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	veor      tmp0, tmp0, W_m16;		\
+	veor.32   W, W, W_m08;			\
+
+#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	veor      tmp1, tmp1;			\
+	veor      W, W, tmp0;			\
+
+#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vshl.u32  tmp0, W, #1;			\
+
+#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vext.8    tmp1, tmp1, W, #(16-12);	\
+	vshr.u32  W, W, #31;			\
+
+#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vorr      tmp0, tmp0, W;		\
+	vshr.u32  W, tmp1, #30;			\
+
+#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vshl.u32  tmp1, tmp1, #2;		\
+
+#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	veor      tmp0, tmp0, W;		\
+
+#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	veor      W, tmp0, tmp1;		\
+
+#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vadd.u32  tmp0, W, curK;		\
+
+#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vst1.32   {tmp0}, [RWK];
+
+
+/********* Precalc macros for rounds 32-79 ************************************/
+
+#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	veor W, W_m28; \
+
+#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vext.8 tmp0, W_m08, W_m04, #8; \
+
+#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	veor W, W_m16; \
+
+#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	veor W, tmp0; \
+
+#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	add RWK, sp, #(WK_offs(i&~3)); \
+
+#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vshl.u32 tmp1, W, #2; \
+
+#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vshr.u32 tmp0, W, #30; \
+
+#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vorr W, tmp0, tmp1; \
+
+#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vadd.u32 tmp0, W, curK; \
+
+#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vst1.32 {tmp0}, [RWK];
+
+
+/*
+ * Transform nblocks*64 bytes (nblocks*16 32-bit words) at DATA.
+ *
+ * void sha1_transform_neon(struct sha1_block_state *state,
+ *			    const u8 *data, size_t nblocks);
+ */
+.align 3
+ENTRY(sha1_transform_neon)
+  /* input:
+   *	r0: state
+   *	r1: data (64*nblocks bytes)
+   *	r2: nblocks
+   */
+
+  cmp RNBLKS, #0;
+  beq .Ldo_nothing;
+
+  push {r4-r12, lr};
+  /*vpush {q4-q7};*/
+
+  adr RT3, .LK_VEC;
+
+  mov ROLDSTACK, sp;
+
+  /* Align stack. */
+  sub RT0, sp, #(16*4);
+  and RT0, #(~(16-1));
+  mov sp, RT0;
+
+  vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */
+
+  /* Get the values of the chaining variables. */
+  ldm RSTATE, {_a-_e};
+
+  vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */
+
+#undef curK
+#define curK qK1
+  /* Precalc 0-15. */
+  W_PRECALC_00_15();
+
+.Loop:
+  /* Transform 0-15 + Precalc 16-31. */
+  _R( _a, _b, _c, _d, _e, F1,  0,
+      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
+      W4, W5, W6, W7, W0, _, _, _ );
+  _R( _e, _a, _b, _c, _d, F1,  1,
+      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
+      W4, W5, W6, W7, W0, _, _, _ );
+  _R( _d, _e, _a, _b, _c, F1,  2,
+      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
+      W4, W5, W6, W7, W0, _, _, _ );
+  _R( _c, _d, _e, _a, _b, F1,  3,
+      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
+      W4, W5, W6, W7, W0, _, _, _ );
+
+#undef curK
+#define curK qK2
+  _R( _b, _c, _d, _e, _a, F1,  4,
+      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
+      W3, W4, W5, W6, W7, _, _, _ );
+  _R( _a, _b, _c, _d, _e, F1,  5,
+      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
+      W3, W4, W5, W6, W7, _, _, _ );
+  _R( _e, _a, _b, _c, _d, F1,  6,
+      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
+      W3, W4, W5, W6, W7, _, _, _ );
+  _R( _d, _e, _a, _b, _c, F1,  7,
+      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
+      W3, W4, W5, W6, W7, _, _, _ );
+
+  _R( _c, _d, _e, _a, _b, F1,  8,
+      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
+      W2, W3, W4, W5, W6, _, _, _ );
+  _R( _b, _c, _d, _e, _a, F1,  9,
+      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
+      W2, W3, W4, W5, W6, _, _, _ );
+  _R( _a, _b, _c, _d, _e, F1, 10,
+      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
+      W2, W3, W4, W5, W6, _, _, _ );
+  _R( _e, _a, _b, _c, _d, F1, 11,
+      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
+      W2, W3, W4, W5, W6, _, _, _ );
+
+  _R( _d, _e, _a, _b, _c, F1, 12,
+      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
+      W1, W2, W3, W4, W5, _, _, _ );
+  _R( _c, _d, _e, _a, _b, F1, 13,
+      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
+      W1, W2, W3, W4, W5, _, _, _ );
+  _R( _b, _c, _d, _e, _a, F1, 14,
+      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
+      W1, W2, W3, W4, W5, _, _, _ );
+  _R( _a, _b, _c, _d, _e, F1, 15,
+      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
+      W1, W2, W3, W4, W5, _, _, _ );
+
+  /* Transform 16-63 + Precalc 32-79. */
+  _R( _e, _a, _b, _c, _d, F1, 16,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+  _R( _d, _e, _a, _b, _c, F1, 17,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+  _R( _c, _d, _e, _a, _b, F1, 18,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 32,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+  _R( _b, _c, _d, _e, _a, F1, 19,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 32,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+
+  _R( _a, _b, _c, _d, _e, F2, 20,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+  _R( _e, _a, _b, _c, _d, F2, 21,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+  _R( _d, _e, _a, _b, _c, F2, 22,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 36,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+  _R( _c, _d, _e, _a, _b, F2, 23,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 36,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+
+#undef curK
+#define curK qK3
+  _R( _b, _c, _d, _e, _a, F2, 24,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+  _R( _a, _b, _c, _d, _e, F2, 25,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+  _R( _e, _a, _b, _c, _d, F2, 26,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 40,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+  _R( _d, _e, _a, _b, _c, F2, 27,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 40,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+
+  _R( _c, _d, _e, _a, _b, F2, 28,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+  _R( _b, _c, _d, _e, _a, F2, 29,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+  _R( _a, _b, _c, _d, _e, F2, 30,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 44,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+  _R( _e, _a, _b, _c, _d, F2, 31,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 44,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+
+  _R( _d, _e, _a, _b, _c, F2, 32,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
+      W4, W5, W6, W7, W0, W1, W2, W3);
+  _R( _c, _d, _e, _a, _b, F2, 33,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
+      W4, W5, W6, W7, W0, W1, W2, W3);
+  _R( _b, _c, _d, _e, _a, F2, 34,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 48,
+      W4, W5, W6, W7, W0, W1, W2, W3);
+  _R( _a, _b, _c, _d, _e, F2, 35,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 48,
+      W4, W5, W6, W7, W0, W1, W2, W3);
+
+  _R( _e, _a, _b, _c, _d, F2, 36,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
+      W3, W4, W5, W6, W7, W0, W1, W2);
+  _R( _d, _e, _a, _b, _c, F2, 37,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
+      W3, W4, W5, W6, W7, W0, W1, W2);
+  _R( _c, _d, _e, _a, _b, F2, 38,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 52,
+      W3, W4, W5, W6, W7, W0, W1, W2);
+  _R( _b, _c, _d, _e, _a, F2, 39,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 52,
+      W3, W4, W5, W6, W7, W0, W1, W2);
+
+  _R( _a, _b, _c, _d, _e, F3, 40,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
+      W2, W3, W4, W5, W6, W7, W0, W1);
+  _R( _e, _a, _b, _c, _d, F3, 41,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
+      W2, W3, W4, W5, W6, W7, W0, W1);
+  _R( _d, _e, _a, _b, _c, F3, 42,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 56,
+      W2, W3, W4, W5, W6, W7, W0, W1);
+  _R( _c, _d, _e, _a, _b, F3, 43,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 56,
+      W2, W3, W4, W5, W6, W7, W0, W1);
+
+#undef curK
+#define curK qK4
+  _R( _b, _c, _d, _e, _a, F3, 44,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
+      W1, W2, W3, W4, W5, W6, W7, W0);
+  _R( _a, _b, _c, _d, _e, F3, 45,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
+      W1, W2, W3, W4, W5, W6, W7, W0);
+  _R( _e, _a, _b, _c, _d, F3, 46,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 60,
+      W1, W2, W3, W4, W5, W6, W7, W0);
+  _R( _d, _e, _a, _b, _c, F3, 47,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 60,
+      W1, W2, W3, W4, W5, W6, W7, W0);
+
+  _R( _c, _d, _e, _a, _b, F3, 48,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+  _R( _b, _c, _d, _e, _a, F3, 49,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+  _R( _a, _b, _c, _d, _e, F3, 50,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 64,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+  _R( _e, _a, _b, _c, _d, F3, 51,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 64,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+
+  _R( _d, _e, _a, _b, _c, F3, 52,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+  _R( _c, _d, _e, _a, _b, F3, 53,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+  _R( _b, _c, _d, _e, _a, F3, 54,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 68,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+  _R( _a, _b, _c, _d, _e, F3, 55,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 68,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+
+  _R( _e, _a, _b, _c, _d, F3, 56,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+  _R( _d, _e, _a, _b, _c, F3, 57,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+  _R( _c, _d, _e, _a, _b, F3, 58,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 72,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+  _R( _b, _c, _d, _e, _a, F3, 59,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 72,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+
+  subs RNBLKS, #1;
+
+  _R( _a, _b, _c, _d, _e, F4, 60,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+  _R( _e, _a, _b, _c, _d, F4, 61,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+  _R( _d, _e, _a, _b, _c, F4, 62,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 76,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+  _R( _c, _d, _e, _a, _b, F4, 63,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 76,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+
+  beq .Lend;
+
+  /* Transform 64-79 + Precalc 0-15 of next block. */
+#undef curK
+#define curK qK1
+  _R( _b, _c, _d, _e, _a, F4, 64,
+      WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _a, _b, _c, _d, _e, F4, 65,
+      WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _e, _a, _b, _c, _d, F4, 66,
+      WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _d, _e, _a, _b, _c, F4, 67,
+      WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+  _R( _c, _d, _e, _a, _b, F4, 68,
+      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _b, _c, _d, _e, _a, F4, 69,
+      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _a, _b, _c, _d, _e, F4, 70,
+      WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _e, _a, _b, _c, _d, F4, 71,
+      WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+  _R( _d, _e, _a, _b, _c, F4, 72,
+      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _c, _d, _e, _a, _b, F4, 73,
+      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _b, _c, _d, _e, _a, F4, 74,
+      WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _a, _b, _c, _d, _e, F4, 75,
+      WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+  _R( _e, _a, _b, _c, _d, F4, 76,
+      WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _d, _e, _a, _b, _c, F4, 77,
+      WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _c, _d, _e, _a, _b, F4, 78,
+      WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _b, _c, _d, _e, _a, F4, 79,
+      WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );
+
+  /* Update the chaining variables. */
+  ldm RSTATE, {RT0-RT3};
+  add _a, RT0;
+  ldr RT0, [RSTATE, #state_h4];
+  add _b, RT1;
+  add _c, RT2;
+  add _d, RT3;
+  add _e, RT0;
+  stm RSTATE, {_a-_e};
+
+  b .Loop;
+
+.Lend:
+  /* Transform 64-79 */
+  R( _b, _c, _d, _e, _a, F4, 64 );
+  R( _a, _b, _c, _d, _e, F4, 65 );
+  R( _e, _a, _b, _c, _d, F4, 66 );
+  R( _d, _e, _a, _b, _c, F4, 67 );
+  R( _c, _d, _e, _a, _b, F4, 68 );
+  R( _b, _c, _d, _e, _a, F4, 69 );
+  R( _a, _b, _c, _d, _e, F4, 70 );
+  R( _e, _a, _b, _c, _d, F4, 71 );
+  R( _d, _e, _a, _b, _c, F4, 72 );
+  R( _c, _d, _e, _a, _b, F4, 73 );
+  R( _b, _c, _d, _e, _a, F4, 74 );
+  R( _a, _b, _c, _d, _e, F4, 75 );
+  R( _e, _a, _b, _c, _d, F4, 76 );
+  R( _d, _e, _a, _b, _c, F4, 77 );
+  R( _c, _d, _e, _a, _b, F4, 78 );
+  R( _b, _c, _d, _e, _a, F4, 79 );
+
+  mov sp, ROLDSTACK;
+
+  /* Update the chaining variables. */
+  ldm RSTATE, {RT0-RT3};
+  add _a, RT0;
+  ldr RT0, [RSTATE, #state_h4];
+  add _b, RT1;
+  add _c, RT2;
+  add _d, RT3;
+  /*vpop {q4-q7};*/
+  add _e, RT0;
+  stm RSTATE, {_a-_e};
+
+  pop {r4-r12, pc};
+
+.Ldo_nothing:
+  bx lr
+ENDPROC(sha1_transform_neon)
diff --git a/lib/crypto/arm/sha1-ce-core.S b/lib/crypto/arm/sha1-ce-core.S
new file mode 100644
index 000000000000..7d6b2631ca8d
--- /dev/null
+++ b/lib/crypto/arm/sha1-ce-core.S
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SHA-1 secure hash using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2015 Linaro Ltd.
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.arch		armv8-a
+	.fpu		crypto-neon-fp-armv8
+
+	k0		.req	q0
+	k1		.req	q1
+	k2		.req	q2
+	k3		.req	q3
+
+	ta0		.req	q4
+	ta1		.req	q5
+	tb0		.req	q5
+	tb1		.req	q4
+
+	dga		.req	q6
+	dgb		.req	q7
+	dgbs		.req	s28
+
+	dg0		.req	q12
+	dg1a0		.req	q13
+	dg1a1		.req	q14
+	dg1b0		.req	q14
+	dg1b1		.req	q13
+
+	.macro		add_only, op, ev, rc, s0, dg1
+	.ifnb		\s0
+	vadd.u32	tb\ev, q\s0, \rc
+	.endif
+	sha1h.32	dg1b\ev, dg0
+	.ifb		\dg1
+	sha1\op\().32	dg0, dg1a\ev, ta\ev
+	.else
+	sha1\op\().32	dg0, \dg1, ta\ev
+	.endif
+	.endm
+
+	.macro		add_update, op, ev, rc, s0, s1, s2, s3, dg1
+	sha1su0.32	q\s0, q\s1, q\s2
+	add_only	\op, \ev, \rc, \s1, \dg1
+	sha1su1.32	q\s0, q\s3
+	.endm
+
+	.align		6
+.Lsha1_rcon:
+	.word		0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999
+	.word		0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1
+	.word		0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc
+	.word		0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
+
+	/*
+	 * void sha1_ce_transform(struct sha1_block_state *state,
+	 *			  const u8 *data, size_t nblocks);
+	 */
+ENTRY(sha1_ce_transform)
+	/* load round constants */
+	adr		ip, .Lsha1_rcon
+	vld1.32		{k0-k1}, [ip, :128]!
+	vld1.32		{k2-k3}, [ip, :128]
+
+	/* load state */
+	vld1.32		{dga}, [r0]
+	vldr		dgbs, [r0, #16]
+
+	/* load input */
+0:	vld1.32		{q8-q9}, [r1]!
+	vld1.32		{q10-q11}, [r1]!
+	subs		r2, r2, #1
+
+#ifndef CONFIG_CPU_BIG_ENDIAN
+	vrev32.8	q8, q8
+	vrev32.8	q9, q9
+	vrev32.8	q10, q10
+	vrev32.8	q11, q11
+#endif
+
+	vadd.u32	ta0, q8, k0
+	vmov		dg0, dga
+
+	add_update	c, 0, k0,  8,  9, 10, 11, dgb
+	add_update	c, 1, k0,  9, 10, 11,  8
+	add_update	c, 0, k0, 10, 11,  8,  9
+	add_update	c, 1, k0, 11,  8,  9, 10
+	add_update	c, 0, k1,  8,  9, 10, 11
+
+	add_update	p, 1, k1,  9, 10, 11,  8
+	add_update	p, 0, k1, 10, 11,  8,  9
+	add_update	p, 1, k1, 11,  8,  9, 10
+	add_update	p, 0, k1,  8,  9, 10, 11
+	add_update	p, 1, k2,  9, 10, 11,  8
+
+	add_update	m, 0, k2, 10, 11,  8,  9
+	add_update	m, 1, k2, 11,  8,  9, 10
+	add_update	m, 0, k2,  8,  9, 10, 11
+	add_update	m, 1, k2,  9, 10, 11,  8
+	add_update	m, 0, k3, 10, 11,  8,  9
+
+	add_update	p, 1, k3, 11,  8,  9, 10
+	add_only	p, 0, k3,  9
+	add_only	p, 1, k3, 10
+	add_only	p, 0, k3, 11
+	add_only	p, 1
+
+	/* update state */
+	vadd.u32	dga, dga, dg0
+	vadd.u32	dgb, dgb, dg1a0
+	bne		0b
+
+	/* store new state */
+	vst1.32		{dga}, [r0]
+	vstr		dgbs, [r0, #16]
+	bx		lr
+ENDPROC(sha1_ce_transform)
diff --git a/lib/crypto/arm/sha1.h b/lib/crypto/arm/sha1.h
new file mode 100644
index 000000000000..3e2d8c7cab9f
--- /dev/null
+++ b/lib/crypto/arm/sha1.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-1 optimized for ARM
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+asmlinkage void sha1_block_data_order(struct sha1_block_state *state,
+				      const u8 *data, size_t nblocks);
+asmlinkage void sha1_transform_neon(struct sha1_block_state *state,
+				    const u8 *data, size_t nblocks);
+asmlinkage void sha1_ce_transform(struct sha1_block_state *state,
+				  const u8 *data, size_t nblocks);
+
+static void sha1_blocks(struct sha1_block_state *state,
+			const u8 *data, size_t nblocks)
+{
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+	    static_branch_likely(&have_neon) && likely(may_use_simd())) {
+		scoped_ksimd() {
+			if (static_branch_likely(&have_ce))
+				sha1_ce_transform(state, data, nblocks);
+			else
+				sha1_transform_neon(state, data, nblocks);
+		}
+	} else {
+		sha1_block_data_order(state, data, nblocks);
+	}
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define sha1_mod_init_arch sha1_mod_init_arch
+static void sha1_mod_init_arch(void)
+{
+	if (elf_hwcap & HWCAP_NEON) {
+		static_branch_enable(&have_neon);
+		if (elf_hwcap2 & HWCAP2_SHA1)
+			static_branch_enable(&have_ce);
+	}
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
diff --git a/lib/crypto/arm/sha256-armv4.pl b/lib/crypto/arm/sha256-armv4.pl
new file mode 100644
index 000000000000..f3a2b54efd4e
--- /dev/null
+++ b/lib/crypto/arm/sha256-armv4.pl
@@ -0,0 +1,724 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from the OpenSSL project but the author (Andy Polyakov)
+# has relicensed it under the GPLv2. Therefore this program is free software;
+# you can redistribute it and/or modify it under the terms of the GNU General
+# Public License version 2 as published by the Free Software Foundation.
+#
+# The original headers, including the original license headers, are
+# included below for completeness.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA256 block procedure for ARMv4. May 2007.
+
+# Performance is ~2x better than gcc 3.4 generated code and in "abso-
+# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+# byte [on single-issue Xscale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning that latter performs sub-optimally, nothing was done
+# about it).
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0";	$t0="r0";
+$inp="r1";	$t4="r1";
+$len="r2";	$t1="r2";
+$T1="r3";	$t3="r3";
+$A="r4";
+$B="r5";
+$C="r6";
+$D="r7";
+$E="r8";
+$F="r9";
+$G="r10";
+$H="r11";
+@V=($A,$B,$C,$D,$E,$F,$G,$H);
+$t2="r12";
+$Ktbl="r14";
+
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+	@ ldr	$t1,[$inp],#4			@ $i
+# if $i==15
+	str	$inp,[sp,#17*4]			@ make room for $t4
+# endif
+	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
+	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	$t1,$t1
+# endif
+#else
+	@ ldrb	$t1,[$inp,#3]			@ $i
+	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
+	ldrb	$t2,[$inp,#2]
+	ldrb	$t0,[$inp,#1]
+	orr	$t1,$t1,$t2,lsl#8
+	ldrb	$t2,[$inp],#4
+	orr	$t1,$t1,$t0,lsl#16
+# if $i==15
+	str	$inp,[sp,#17*4]			@ make room for $t4
+# endif
+	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+	orr	$t1,$t1,$t2,lsl#24
+	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
+#endif
+___
+$code.=<<___;
+	ldr	$t2,[$Ktbl],#4			@ *K256++
+	add	$h,$h,$t1			@ h+=X[i]
+	str	$t1,[sp,#`$i%16`*4]
+	eor	$t1,$f,$g
+	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
+	and	$t1,$t1,$e
+	add	$h,$h,$t2			@ h+=K256[i]
+	eor	$t1,$t1,$g			@ Ch(e,f,g)
+	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
+	add	$h,$h,$t1			@ h+=Ch(e,f,g)
+#if $i==31
+	and	$t2,$t2,#0xff
+	cmp	$t2,#0xf2			@ done?
+#endif
+#if $i<15
+# if __ARM_ARCH__>=7
+	ldr	$t1,[$inp],#4			@ prefetch
+# else
+	ldrb	$t1,[$inp,#3]
+# endif
+	eor	$t2,$a,$b			@ a^b, b^c in next round
+#else
+	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
+	eor	$t2,$a,$b			@ a^b, b^c in next round
+	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
+#endif
+	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
+	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
+	add	$d,$d,$h			@ d+=h
+	eor	$t3,$t3,$b			@ Maj(a,b,c)
+	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
+	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
+___
+	($t2,$t3)=($t3,$t2);
+}
+
+sub BODY_16_XX {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___;
+	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
+	@ ldr	$t4,[sp,#`($i+14)%16`*4]
+	mov	$t0,$t1,ror#$sigma0[0]
+	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
+	mov	$t2,$t4,ror#$sigma1[0]
+	eor	$t0,$t0,$t1,ror#$sigma0[1]
+	eor	$t2,$t2,$t4,ror#$sigma1[1]
+	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
+	ldr	$t1,[sp,#`($i+0)%16`*4]
+	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
+	ldr	$t4,[sp,#`($i+9)%16`*4]
+
+	add	$t2,$t2,$t0
+	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
+	add	$t1,$t1,$t2
+	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
+	add	$t1,$t1,$t4			@ X[i]
+___
+	&BODY_00_15(@_);
+}
+
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code	32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code   32
+# endif
+#endif
+
+.type	K256,%object
+.align	5
+K256:
+.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size	K256,.-K256
+.word	0				@ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align	5
+
+.global	sha256_block_data_order
+.type	sha256_block_data_order,%function
+sha256_block_data_order:
+.Lsha256_block_data_order:
+#if __ARM_ARCH__<7
+	sub	r3,pc,#8		@ sha256_block_data_order
+#else
+	adr	r3,.Lsha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+	tst	r12,#ARMV8_SHA256
+	bne	.LARMv8
+	tst	r12,#ARMV7_NEON
+	bne	.LNEON
+#endif
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
+	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
+	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
+	sub	$Ktbl,r3,#256+32	@ K256
+	sub	sp,sp,#16*4		@ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+	ldr	$t1,[$inp],#4
+# else
+	ldrb	$t1,[$inp,#3]
+# endif
+	eor	$t3,$B,$C		@ magic
+	eor	$t2,$t2,$t2
+___
+for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=".Lrounds_16_xx:\n";
+for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+#if __ARM_ARCH__>=7
+	ite	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	$t3,[sp,#16*4]		@ pull ctx
+	bne	.Lrounds_16_xx
+
+	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
+	ldr	$t0,[$t3,#0]
+	ldr	$t1,[$t3,#4]
+	ldr	$t2,[$t3,#8]
+	add	$A,$A,$t0
+	ldr	$t0,[$t3,#12]
+	add	$B,$B,$t1
+	ldr	$t1,[$t3,#16]
+	add	$C,$C,$t2
+	ldr	$t2,[$t3,#20]
+	add	$D,$D,$t0
+	ldr	$t0,[$t3,#24]
+	add	$E,$E,$t1
+	ldr	$t1,[$t3,#28]
+	add	$F,$F,$t2
+	ldr	$inp,[sp,#17*4]		@ pull inp
+	ldr	$t2,[sp,#18*4]		@ pull inp+len
+	add	$G,$G,$t0
+	add	$H,$H,$t1
+	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
+	cmp	$inp,$t2
+	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
+	bne	.Loop
+
+	add	sp,sp,#`16+3`*4	@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha256_block_data_order,.-sha256_block_data_order
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+
+sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+  my $arg = pop;
+    $arg = "#$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Xupdate()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	($T2,$T0,$sigma0[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	($T1,$T0,$sigma0[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vsli_32	($T2,$T0,32-$sigma0[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	($T3,$T0,$sigma0[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&veor		($T1,$T1,$T2);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vsli_32	($T3,$T0,32-$sigma0[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vld1_32	("{$T0}","[$Ktbl,:128]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	($T0,$T0,@X[0]);
+	 while($#insns>=2) { eval(shift(@insns)); }
+	&vst1_32	("{$T0}","[$Xfer,:128]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	push(@X,shift(@X));		# "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vld1_32	("{$T0}","[$Ktbl,:128]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vrev32_8	(@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	($T0,$T0,@X[0]);
+	 foreach (@insns) { eval; }	# remaining instructions
+	&vst1_32	("{$T0}","[$Xfer,:128]!");
+
+	push(@X,shift(@X));		# "rotate" X[]
+}
+
+sub body_00_15 () {
+	(
+	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
+	'&eor	($t1,$f,$g)',
+	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
+	'&and	($t1,$t1,$e)',
+	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
+	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
+	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
+	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
+	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
+	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
+	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
+	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
+	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
+	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
+	'&add	($d,$d,$h)',			# d+=h
+	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
+	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
+	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+	)
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.global	sha256_block_data_order_neon
+.type	sha256_block_data_order_neon,%function
+.align	4
+sha256_block_data_order_neon:
+.LNEON:
+	stmdb	sp!,{r4-r12,lr}
+
+	sub	$H,sp,#16*4+16
+	adr	$Ktbl,.Lsha256_block_data_order
+	sub	$Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
+	bic	$H,$H,#15		@ align for 128-bit stores
+	mov	$t2,sp
+	mov	sp,$H			@ alloca
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
+
+	vld1.8		{@X[0]},[$inp]!
+	vld1.8		{@X[1]},[$inp]!
+	vld1.8		{@X[2]},[$inp]!
+	vld1.8		{@X[3]},[$inp]!
+	vld1.32		{$T0},[$Ktbl,:128]!
+	vld1.32		{$T1},[$Ktbl,:128]!
+	vld1.32		{$T2},[$Ktbl,:128]!
+	vld1.32		{$T3},[$Ktbl,:128]!
+	vrev32.8	@X[0],@X[0]		@ yes, even on
+	str		$ctx,[sp,#64]
+	vrev32.8	@X[1],@X[1]		@ big-endian
+	str		$inp,[sp,#68]
+	mov		$Xfer,sp
+	vrev32.8	@X[2],@X[2]
+	str		$len,[sp,#72]
+	vrev32.8	@X[3],@X[3]
+	str		$t2,[sp,#76]		@ save original sp
+	vadd.i32	$T0,$T0,@X[0]
+	vadd.i32	$T1,$T1,@X[1]
+	vst1.32		{$T0},[$Xfer,:128]!
+	vadd.i32	$T2,$T2,@X[2]
+	vst1.32		{$T1},[$Xfer,:128]!
+	vadd.i32	$T3,$T3,@X[3]
+	vst1.32		{$T2},[$Xfer,:128]!
+	vst1.32		{$T3},[$Xfer,:128]!
+
+	ldmia		$ctx,{$A-$H}
+	sub		$Xfer,$Xfer,#64
+	ldr		$t1,[sp,#0]
+	eor		$t2,$t2,$t2
+	eor		$t3,$B,$C
+	b		.L_00_48
+
+.align	4
+.L_00_48:
+___
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+$code.=<<___;
+	teq	$t1,#0				@ check for K256 terminator
+	ldr	$t1,[sp,#0]
+	sub	$Xfer,$Xfer,#64
+	bne	.L_00_48
+
+	ldr		$inp,[sp,#68]
+	ldr		$t0,[sp,#72]
+	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
+	teq		$inp,$t0
+	it		eq
+	subeq		$inp,$inp,#64		@ avoid SEGV
+	vld1.8		{@X[0]},[$inp]!		@ load next input block
+	vld1.8		{@X[1]},[$inp]!
+	vld1.8		{@X[2]},[$inp]!
+	vld1.8		{@X[3]},[$inp]!
+	it		ne
+	strne		$inp,[sp,#68]
+	mov		$Xfer,sp
+___
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+$code.=<<___;
+	ldr	$t0,[$t1,#0]
+	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
+	ldr	$t2,[$t1,#4]
+	ldr	$t3,[$t1,#8]
+	ldr	$t4,[$t1,#12]
+	add	$A,$A,$t0			@ accumulate
+	ldr	$t0,[$t1,#16]
+	add	$B,$B,$t2
+	ldr	$t2,[$t1,#20]
+	add	$C,$C,$t3
+	ldr	$t3,[$t1,#24]
+	add	$D,$D,$t4
+	ldr	$t4,[$t1,#28]
+	add	$E,$E,$t0
+	str	$A,[$t1],#4
+	add	$F,$F,$t2
+	str	$B,[$t1],#4
+	add	$G,$G,$t3
+	str	$C,[$t1],#4
+	add	$H,$H,$t4
+	str	$D,[$t1],#4
+	stmia	$t1,{$E-$H}
+
+	ittte	ne
+	movne	$Xfer,sp
+	ldrne	$t1,[sp,#0]
+	eorne	$t2,$t2,$t2
+	ldreq	sp,[sp,#76]			@ restore original sp
+	itt	ne
+	eorne	$t3,$B,$C
+	bne	.L_00_48
+
+	ldmia	sp!,{r4-r12,pc}
+.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+___
+}}}
+######################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
+my @MSG=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
+my $Ktbl="r3";
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)	.byte	a,b,c,d
+# endif
+
+.type	sha256_block_data_order_armv8,%function
+.align	5
+sha256_block_data_order_armv8:
+.LARMv8:
+	vld1.32	{$ABCD,$EFGH},[$ctx]
+# ifdef __thumb2__
+	adr	$Ktbl,.LARMv8
+	sub	$Ktbl,$Ktbl,#.LARMv8-K256
+# else
+	adrl	$Ktbl,K256
+# endif
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
+
+.Loop_v8:
+	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
+	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
+	vld1.32		{$W0},[$Ktbl]!
+	vrev32.8	@MSG[0],@MSG[0]
+	vrev32.8	@MSG[1],@MSG[1]
+	vrev32.8	@MSG[2],@MSG[2]
+	vrev32.8	@MSG[3],@MSG[3]
+	vmov		$ABCD_SAVE,$ABCD	@ offload
+	vmov		$EFGH_SAVE,$EFGH
+	teq		$inp,$len
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+	vld1.32		{$W1},[$Ktbl]!
+	vadd.i32	$W0,$W0,@MSG[0]
+	sha256su0	@MSG[0],@MSG[1]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+	sha256su1	@MSG[0],@MSG[2],@MSG[3]
+___
+	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+	vld1.32		{$W1},[$Ktbl]!
+	vadd.i32	$W0,$W0,@MSG[0]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	vld1.32		{$W0},[$Ktbl]!
+	vadd.i32	$W1,$W1,@MSG[1]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	vld1.32		{$W1},[$Ktbl]
+	vadd.i32	$W0,$W0,@MSG[2]
+	sub		$Ktbl,$Ktbl,#256-16	@ rewind
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	vadd.i32	$W1,$W1,@MSG[3]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
+	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
+	it		ne
+	bne		.Loop_v8
+
+	vst1.32		{$ABCD,$EFGH},[$ctx]
+
+	ret		@ bx lr
+.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm   OPENSSL_armcap_P,4,4
+#endif
+___
+
+open SELF,$0;
+while(<SELF>) {
+	next if (/^#!/);
+	last if (!s/^#/@/ and !/^$/);
+	print;
+}
+close SELF;
+
+{   my  %opcode = (
+	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
+	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);
+
+    sub unsha256 {
+	my ($mnemonic,$arg)=@_;
+
+	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+					 |(($2&7)<<17)|(($2&8)<<4)
+					 |(($3&7)<<1) |(($3&8)<<2);
+	    # since ARMv7 instructions are always encoded little-endian.
+	    # correct solution is to use .inst directive, but older
+	    # assemblers don't implement it:-(
+	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+			$word&0xff,($word>>8)&0xff,
+			($word>>16)&0xff,($word>>24)&0xff,
+			$mnemonic,$arg;
+	}
+    }
+}
+
+foreach (split($/,$code)) {
+
+	s/\`([^\`]*)\`/eval $1/geo;
+
+	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
+
+	s/\bret\b/bx	lr/go		or
+	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
+
+	print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/lib/crypto/arm/sha256-ce.S b/lib/crypto/arm/sha256-ce.S
new file mode 100644
index 000000000000..144ee805f64a
--- /dev/null
+++ b/lib/crypto/arm/sha256-ce.S
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SHA-224/256 secure hash using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2015 Linaro Ltd.
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.arch		armv8-a
+	.fpu		crypto-neon-fp-armv8
+
+	k0		.req	q7
+	k1		.req	q8
+	rk		.req	r3
+
+	ta0		.req	q9
+	ta1		.req	q10
+	tb0		.req	q10
+	tb1		.req	q9
+
+	dga		.req	q11
+	dgb		.req	q12
+
+	dg0		.req	q13
+	dg1		.req	q14
+	dg2		.req	q15
+
+	.macro		add_only, ev, s0
+	vmov		dg2, dg0
+	.ifnb		\s0
+	vld1.32		{k\ev}, [rk, :128]!
+	.endif
+	sha256h.32	dg0, dg1, tb\ev
+	sha256h2.32	dg1, dg2, tb\ev
+	.ifnb		\s0
+	vadd.u32	ta\ev, q\s0, k\ev
+	.endif
+	.endm
+
+	.macro		add_update, ev, s0, s1, s2, s3
+	sha256su0.32	q\s0, q\s1
+	add_only	\ev, \s1
+	sha256su1.32	q\s0, q\s2, q\s3
+	.endm
+
+	.align		6
+.Lsha256_rcon:
+	.word		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+	.word		0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+	.word		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+	.word		0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+	.word		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+	.word		0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+	.word		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+	.word		0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+	.word		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+	.word		0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+	.word		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+	.word		0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+	.word		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+	.word		0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+	.word		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+	/*
+	 * void sha256_ce_transform(struct sha256_block_state *state,
+	 *			    const u8 *data, size_t nblocks);
+	 */
+ENTRY(sha256_ce_transform)
+	/* load state */
+	vld1.32		{dga-dgb}, [r0]
+
+	/* load input */
+0:	vld1.32		{q0-q1}, [r1]!
+	vld1.32		{q2-q3}, [r1]!
+	subs		r2, r2, #1
+
+#ifndef CONFIG_CPU_BIG_ENDIAN
+	vrev32.8	q0, q0
+	vrev32.8	q1, q1
+	vrev32.8	q2, q2
+	vrev32.8	q3, q3
+#endif
+
+	/* load first round constant */
+	adr		rk, .Lsha256_rcon
+	vld1.32		{k0}, [rk, :128]!
+
+	vadd.u32	ta0, q0, k0
+	vmov		dg0, dga
+	vmov		dg1, dgb
+
+	add_update	1, 0, 1, 2, 3
+	add_update	0, 1, 2, 3, 0
+	add_update	1, 2, 3, 0, 1
+	add_update	0, 3, 0, 1, 2
+	add_update	1, 0, 1, 2, 3
+	add_update	0, 1, 2, 3, 0
+	add_update	1, 2, 3, 0, 1
+	add_update	0, 3, 0, 1, 2
+	add_update	1, 0, 1, 2, 3
+	add_update	0, 1, 2, 3, 0
+	add_update	1, 2, 3, 0, 1
+	add_update	0, 3, 0, 1, 2
+
+	add_only	1, 1
+	add_only	0, 2
+	add_only	1, 3
+	add_only	0
+
+	/* update state */
+	vadd.u32	dga, dga, dg0
+	vadd.u32	dgb, dgb, dg1
+	bne		0b
+
+	/* store new state */
+	vst1.32		{dga-dgb}, [r0]
+	bx		lr
+ENDPROC(sha256_ce_transform)
diff --git a/lib/crypto/arm/sha256.h b/lib/crypto/arm/sha256.h
new file mode 100644
index 000000000000..ae7e52dd6e3b
--- /dev/null
+++ b/lib/crypto/arm/sha256.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 optimized for ARM
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+asmlinkage void sha256_block_data_order(struct sha256_block_state *state,
+					const u8 *data, size_t nblocks);
+asmlinkage void sha256_block_data_order_neon(struct sha256_block_state *state,
+					     const u8 *data, size_t nblocks);
+asmlinkage void sha256_ce_transform(struct sha256_block_state *state,
+				    const u8 *data, size_t nblocks);
+
+static void sha256_blocks(struct sha256_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+	    static_branch_likely(&have_neon) && likely(may_use_simd())) {
+		scoped_ksimd() {
+			if (static_branch_likely(&have_ce))
+				sha256_ce_transform(state, data, nblocks);
+			else
+				sha256_block_data_order_neon(state, data, nblocks);
+		}
+	} else {
+		sha256_block_data_order(state, data, nblocks);
+	}
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define sha256_mod_init_arch sha256_mod_init_arch
+static void sha256_mod_init_arch(void)
+{
+	if (elf_hwcap & HWCAP_NEON) {
+		static_branch_enable(&have_neon);
+		if (elf_hwcap2 & HWCAP2_SHA2)
+			static_branch_enable(&have_ce);
+	}
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
diff --git a/lib/crypto/arm/sha512-armv4.pl b/lib/crypto/arm/sha512-armv4.pl
new file mode 100644
index 000000000000..2fc3516912fa
--- /dev/null
+++ b/lib/crypto/arm/sha512-armv4.pl
@@ -0,0 +1,657 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from the OpenSSL project but the author (Andy Polyakov)
+# has relicensed it under the GPLv2. Therefore this program is free software;
+# you can redistribute it and/or modify it under the terms of the GNU General
+# Public License version 2 as published by the Free Software Foundation.
+#
+# The original headers, including the original license headers, are
+# included below for completeness.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA512 block procedure for ARMv4. September 2007.
+
+# This code is ~4.5 (four and a half) times faster than code generated
+# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+# Xscale PXA250 core].
+#
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 6% improvement on
+# Cortex A8 core and ~40 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Coxtex A8 core and ~38 cycles per byte.
+
+# March 2011.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process
+# one byte in 23.3 cycles or ~60% faster than integer-only code.
+
+# August 2012.
+#
+# Improve NEON performance by 12% on Snapdragon S4. In absolute
+# terms it's 22.6 cycles per byte, which is disappointing result.
+# Technical writers asserted that 3-way S4 pipeline can sustain
+# multiple NEON instructions per cycle, but dual NEON issue could
+# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html
+# for further details. On side note Cortex-A15 processes one byte in
+# 16 cycles.
+
+# Byte order [in]dependence. =========================================
+#
+# Originally caller was expected to maintain specific *dword* order in
+# h[0-7], namely with most significant dword at *lower* address, which
+# was reflected in below two parameters as 0 and 4. Now caller is
+# expected to maintain native byte order for whole 64-bit values.
+$hi="HI";
+$lo="LO";
+# ====================================================================
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0";	# parameter block
+$inp="r1";
+$len="r2";
+
+$Tlo="r3";
+$Thi="r4";
+$Alo="r5";
+$Ahi="r6";
+$Elo="r7";
+$Ehi="r8";
+$t0="r9";
+$t1="r10";
+$t2="r11";
+$t3="r12";
+############	r13 is stack pointer
+$Ktbl="r14";
+############	r15 is program counter
+
+$Aoff=8*0;
+$Boff=8*1;
+$Coff=8*2;
+$Doff=8*3;
+$Eoff=8*4;
+$Foff=8*5;
+$Goff=8*6;
+$Hoff=8*7;
+$Xoff=8*8;
+
+sub BODY_00_15() {
+my $magic = shift;
+$code.=<<___;
+	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
+	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+	mov	$t0,$Elo,lsr#14
+	str	$Tlo,[sp,#$Xoff+0]
+	mov	$t1,$Ehi,lsr#14
+	str	$Thi,[sp,#$Xoff+4]
+	eor	$t0,$t0,$Ehi,lsl#18
+	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
+	eor	$t1,$t1,$Elo,lsl#18
+	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
+	eor	$t0,$t0,$Elo,lsr#18
+	eor	$t1,$t1,$Ehi,lsr#18
+	eor	$t0,$t0,$Ehi,lsl#14
+	eor	$t1,$t1,$Elo,lsl#14
+	eor	$t0,$t0,$Ehi,lsr#9
+	eor	$t1,$t1,$Elo,lsr#9
+	eor	$t0,$t0,$Elo,lsl#23
+	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
+	adds	$Tlo,$Tlo,$t0
+	ldr	$t0,[sp,#$Foff+0]	@ f.lo
+	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
+	ldr	$t1,[sp,#$Foff+4]	@ f.hi
+	adds	$Tlo,$Tlo,$t2
+	ldr	$t2,[sp,#$Goff+0]	@ g.lo
+	adc	$Thi,$Thi,$t3		@ T += h
+	ldr	$t3,[sp,#$Goff+4]	@ g.hi
+
+	eor	$t0,$t0,$t2
+	str	$Elo,[sp,#$Eoff+0]
+	eor	$t1,$t1,$t3
+	str	$Ehi,[sp,#$Eoff+4]
+	and	$t0,$t0,$Elo
+	str	$Alo,[sp,#$Aoff+0]
+	and	$t1,$t1,$Ehi
+	str	$Ahi,[sp,#$Aoff+4]
+	eor	$t0,$t0,$t2
+	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
+	eor	$t1,$t1,$t3		@ Ch(e,f,g)
+	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi
+
+	adds	$Tlo,$Tlo,$t0
+	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
+	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
+	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
+	adds	$Tlo,$Tlo,$t2
+	and	$t0,$t2,#0xff
+	adc	$Thi,$Thi,$t3		@ T += K[i]
+	adds	$Elo,$Elo,$Tlo
+	ldr	$t2,[sp,#$Boff+0]	@ b.lo
+	adc	$Ehi,$Ehi,$Thi		@ d += T
+	teq	$t0,#$magic
+
+	ldr	$t3,[sp,#$Coff+0]	@ c.lo
+#if __ARM_ARCH__>=7
+	it	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	orreq	$Ktbl,$Ktbl,#1
+	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+	mov	$t0,$Alo,lsr#28
+	mov	$t1,$Ahi,lsr#28
+	eor	$t0,$t0,$Ahi,lsl#4
+	eor	$t1,$t1,$Alo,lsl#4
+	eor	$t0,$t0,$Ahi,lsr#2
+	eor	$t1,$t1,$Alo,lsr#2
+	eor	$t0,$t0,$Alo,lsl#30
+	eor	$t1,$t1,$Ahi,lsl#30
+	eor	$t0,$t0,$Ahi,lsr#7
+	eor	$t1,$t1,$Alo,lsr#7
+	eor	$t0,$t0,$Alo,lsl#25
+	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
+	adds	$Tlo,$Tlo,$t0
+	and	$t0,$Alo,$t2
+	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)
+
+	ldr	$t1,[sp,#$Boff+4]	@ b.hi
+	orr	$Alo,$Alo,$t2
+	ldr	$t2,[sp,#$Coff+4]	@ c.hi
+	and	$Alo,$Alo,$t3
+	and	$t3,$Ahi,$t1
+	orr	$Ahi,$Ahi,$t1
+	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
+	and	$Ahi,$Ahi,$t2
+	adds	$Alo,$Alo,$Tlo
+	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
+	sub	sp,sp,#8
+	adc	$Ahi,$Ahi,$Thi		@ h += T
+	tst	$Ktbl,#1
+	add	$Ktbl,$Ktbl,#8
+___
+}
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
+# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code	32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code   32
+# endif
+#endif
+
+.type	K512,%object
+.align	5
+K512:
+WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
+.size	K512,.-K512
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-sha512_block_data_order
+.skip	32-4
+#else
+.skip	32
+#endif
+
+.global	sha512_block_data_order
+.type	sha512_block_data_order,%function
+sha512_block_data_order:
+.Lsha512_block_data_order:
+#if __ARM_ARCH__<7
+	sub	r3,pc,#8		@ sha512_block_data_order
+#else
+	adr	r3,.Lsha512_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+	tst	r12,#1
+	bne	.LNEON
+#endif
+	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
+	stmdb	sp!,{r4-r12,lr}
+	sub	$Ktbl,r3,#672		@ K512
+	sub	sp,sp,#9*8
+
+	ldr	$Elo,[$ctx,#$Eoff+$lo]
+	ldr	$Ehi,[$ctx,#$Eoff+$hi]
+	ldr	$t0, [$ctx,#$Goff+$lo]
+	ldr	$t1, [$ctx,#$Goff+$hi]
+	ldr	$t2, [$ctx,#$Hoff+$lo]
+	ldr	$t3, [$ctx,#$Hoff+$hi]
+.Loop:
+	str	$t0, [sp,#$Goff+0]
+	str	$t1, [sp,#$Goff+4]
+	str	$t2, [sp,#$Hoff+0]
+	str	$t3, [sp,#$Hoff+4]
+	ldr	$Alo,[$ctx,#$Aoff+$lo]
+	ldr	$Ahi,[$ctx,#$Aoff+$hi]
+	ldr	$Tlo,[$ctx,#$Boff+$lo]
+	ldr	$Thi,[$ctx,#$Boff+$hi]
+	ldr	$t0, [$ctx,#$Coff+$lo]
+	ldr	$t1, [$ctx,#$Coff+$hi]
+	ldr	$t2, [$ctx,#$Doff+$lo]
+	ldr	$t3, [$ctx,#$Doff+$hi]
+	str	$Tlo,[sp,#$Boff+0]
+	str	$Thi,[sp,#$Boff+4]
+	str	$t0, [sp,#$Coff+0]
+	str	$t1, [sp,#$Coff+4]
+	str	$t2, [sp,#$Doff+0]
+	str	$t3, [sp,#$Doff+4]
+	ldr	$Tlo,[$ctx,#$Foff+$lo]
+	ldr	$Thi,[$ctx,#$Foff+$hi]
+	str	$Tlo,[sp,#$Foff+0]
+	str	$Thi,[sp,#$Foff+4]
+
+.L00_15:
+#if __ARM_ARCH__<7
+	ldrb	$Tlo,[$inp,#7]
+	ldrb	$t0, [$inp,#6]
+	ldrb	$t1, [$inp,#5]
+	ldrb	$t2, [$inp,#4]
+	ldrb	$Thi,[$inp,#3]
+	ldrb	$t3, [$inp,#2]
+	orr	$Tlo,$Tlo,$t0,lsl#8
+	ldrb	$t0, [$inp,#1]
+	orr	$Tlo,$Tlo,$t1,lsl#16
+	ldrb	$t1, [$inp],#8
+	orr	$Tlo,$Tlo,$t2,lsl#24
+	orr	$Thi,$Thi,$t3,lsl#8
+	orr	$Thi,$Thi,$t0,lsl#16
+	orr	$Thi,$Thi,$t1,lsl#24
+#else
+	ldr	$Tlo,[$inp,#4]
+	ldr	$Thi,[$inp],#8
+#ifdef __ARMEL__
+	rev	$Tlo,$Tlo
+	rev	$Thi,$Thi
+#endif
+#endif
+___
+	&BODY_00_15(0x94);
+$code.=<<___;
+	tst	$Ktbl,#1
+	beq	.L00_15
+	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
+	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
+	bic	$Ktbl,$Ktbl,#1
+.L16_79:
+	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
+	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
+	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
+	mov	$Tlo,$t0,lsr#1
+	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
+	mov	$Thi,$t1,lsr#1
+	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
+	eor	$Tlo,$Tlo,$t1,lsl#31
+	eor	$Thi,$Thi,$t0,lsl#31
+	eor	$Tlo,$Tlo,$t0,lsr#8
+	eor	$Thi,$Thi,$t1,lsr#8
+	eor	$Tlo,$Tlo,$t1,lsl#24
+	eor	$Thi,$Thi,$t0,lsl#24
+	eor	$Tlo,$Tlo,$t0,lsr#7
+	eor	$Thi,$Thi,$t1,lsr#7
+	eor	$Tlo,$Tlo,$t1,lsl#25
+
+	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
+	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
+	mov	$t0,$t2,lsr#19
+	mov	$t1,$t3,lsr#19
+	eor	$t0,$t0,$t3,lsl#13
+	eor	$t1,$t1,$t2,lsl#13
+	eor	$t0,$t0,$t3,lsr#29
+	eor	$t1,$t1,$t2,lsr#29
+	eor	$t0,$t0,$t2,lsl#3
+	eor	$t1,$t1,$t3,lsl#3
+	eor	$t0,$t0,$t2,lsr#6
+	eor	$t1,$t1,$t3,lsr#6
+	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
+	eor	$t0,$t0,$t3,lsl#26
+
+	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
+	adds	$Tlo,$Tlo,$t0
+	ldr	$t0,[sp,#`$Xoff+8*16`+0]
+	adc	$Thi,$Thi,$t1
+
+	ldr	$t1,[sp,#`$Xoff+8*16`+4]
+	adds	$Tlo,$Tlo,$t2
+	adc	$Thi,$Thi,$t3
+	adds	$Tlo,$Tlo,$t0
+	adc	$Thi,$Thi,$t1
+___
+	&BODY_00_15(0x17);
+$code.=<<___;
+#if __ARM_ARCH__>=7
+	ittt	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
+	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
+	beq	.L16_79
+	bic	$Ktbl,$Ktbl,#1
+
+	ldr	$Tlo,[sp,#$Boff+0]
+	ldr	$Thi,[sp,#$Boff+4]
+	ldr	$t0, [$ctx,#$Aoff+$lo]
+	ldr	$t1, [$ctx,#$Aoff+$hi]
+	ldr	$t2, [$ctx,#$Boff+$lo]
+	ldr	$t3, [$ctx,#$Boff+$hi]
+	adds	$t0,$Alo,$t0
+	str	$t0, [$ctx,#$Aoff+$lo]
+	adc	$t1,$Ahi,$t1
+	str	$t1, [$ctx,#$Aoff+$hi]
+	adds	$t2,$Tlo,$t2
+	str	$t2, [$ctx,#$Boff+$lo]
+	adc	$t3,$Thi,$t3
+	str	$t3, [$ctx,#$Boff+$hi]
+
+	ldr	$Alo,[sp,#$Coff+0]
+	ldr	$Ahi,[sp,#$Coff+4]
+	ldr	$Tlo,[sp,#$Doff+0]
+	ldr	$Thi,[sp,#$Doff+4]
+	ldr	$t0, [$ctx,#$Coff+$lo]
+	ldr	$t1, [$ctx,#$Coff+$hi]
+	ldr	$t2, [$ctx,#$Doff+$lo]
+	ldr	$t3, [$ctx,#$Doff+$hi]
+	adds	$t0,$Alo,$t0
+	str	$t0, [$ctx,#$Coff+$lo]
+	adc	$t1,$Ahi,$t1
+	str	$t1, [$ctx,#$Coff+$hi]
+	adds	$t2,$Tlo,$t2
+	str	$t2, [$ctx,#$Doff+$lo]
+	adc	$t3,$Thi,$t3
+	str	$t3, [$ctx,#$Doff+$hi]
+
+	ldr	$Tlo,[sp,#$Foff+0]
+	ldr	$Thi,[sp,#$Foff+4]
+	ldr	$t0, [$ctx,#$Eoff+$lo]
+	ldr	$t1, [$ctx,#$Eoff+$hi]
+	ldr	$t2, [$ctx,#$Foff+$lo]
+	ldr	$t3, [$ctx,#$Foff+$hi]
+	adds	$Elo,$Elo,$t0
+	str	$Elo,[$ctx,#$Eoff+$lo]
+	adc	$Ehi,$Ehi,$t1
+	str	$Ehi,[$ctx,#$Eoff+$hi]
+	adds	$t2,$Tlo,$t2
+	str	$t2, [$ctx,#$Foff+$lo]
+	adc	$t3,$Thi,$t3
+	str	$t3, [$ctx,#$Foff+$hi]
+
+	ldr	$Alo,[sp,#$Goff+0]
+	ldr	$Ahi,[sp,#$Goff+4]
+	ldr	$Tlo,[sp,#$Hoff+0]
+	ldr	$Thi,[sp,#$Hoff+4]
+	ldr	$t0, [$ctx,#$Goff+$lo]
+	ldr	$t1, [$ctx,#$Goff+$hi]
+	ldr	$t2, [$ctx,#$Hoff+$lo]
+	ldr	$t3, [$ctx,#$Hoff+$hi]
+	adds	$t0,$Alo,$t0
+	str	$t0, [$ctx,#$Goff+$lo]
+	adc	$t1,$Ahi,$t1
+	str	$t1, [$ctx,#$Goff+$hi]
+	adds	$t2,$Tlo,$t2
+	str	$t2, [$ctx,#$Hoff+$lo]
+	adc	$t3,$Thi,$t3
+	str	$t3, [$ctx,#$Hoff+$hi]
+
+	add	sp,sp,#640
+	sub	$Ktbl,$Ktbl,#640
+
+	teq	$inp,$len
+	bne	.Loop
+
+	add	sp,sp,#8*9		@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
+	ldmia	sp!,{r4-r12,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha512_block_data_order,.-sha512_block_data_order
+___
+
+{
+my @Sigma0=(28,34,39);
+my @Sigma1=(14,18,41);
+my @sigma0=(1, 8, 7);
+my @sigma1=(19,61,6);
+
+my $Ktbl="r3";
+my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch
+
+my @X=map("d$_",(0..15));
+my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
+
+sub NEON_00_15() {
+my $i=shift;
+my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps
+
+$code.=<<___ if ($i<16 || $i&1);
+	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
+#if $i<16
+	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
+#endif
+	vshr.u64	$t1,$e,#@Sigma1[1]
+#if $i>0
+	 vadd.i64	$a,$Maj			@ h+=Maj from the past
+#endif
+	vshr.u64	$t2,$e,#@Sigma1[2]
+___
+$code.=<<___;
+	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
+	vsli.64		$t0,$e,#`64-@Sigma1[0]`
+	vsli.64		$t1,$e,#`64-@Sigma1[1]`
+	vmov		$Ch,$e
+	vsli.64		$t2,$e,#`64-@Sigma1[2]`
+#if $i<16 && defined(__ARMEL__)
+	vrev64.8	@X[$i],@X[$i]
+#endif
+	veor		$t1,$t0
+	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
+	vshr.u64	$t0,$a,#@Sigma0[0]
+	veor		$t2,$t1			@ Sigma1(e)
+	vadd.i64	$T1,$Ch,$h
+	vshr.u64	$t1,$a,#@Sigma0[1]
+	vsli.64		$t0,$a,#`64-@Sigma0[0]`
+	vadd.i64	$T1,$t2
+	vshr.u64	$t2,$a,#@Sigma0[2]
+	vadd.i64	$K,@X[$i%16]
+	vsli.64		$t1,$a,#`64-@Sigma0[1]`
+	veor		$Maj,$a,$b
+	vsli.64		$t2,$a,#`64-@Sigma0[2]`
+	veor		$h,$t0,$t1
+	vadd.i64	$T1,$K
+	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
+	veor		$h,$t2			@ Sigma0(a)
+	vadd.i64	$d,$T1
+	vadd.i64	$Maj,$T1
+	@ vadd.i64	$h,$Maj
+___
+}
+
+sub NEON_16_79() {
+my $i=shift;
+
+if ($i&1)	{ &NEON_00_15($i,@_); return; }
+
+# 2x-vectorized, therefore runs every 2nd round
+my @X=map("q$_",(0..7));			# view @X as 128-bit vector
+my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
+my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
+my $e=@_[4];					# $e from NEON_00_15
+$i /= 2;
+$code.=<<___;
+	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
+	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
+	 vadd.i64	@_[0],d30			@ h+=Maj from the past
+	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
+	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
+	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
+	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
+	veor		$s1,$t0
+	vshr.u64	$t0,$s0,#@sigma0[0]
+	veor		$s1,$t1				@ sigma1(X[i+14])
+	vshr.u64	$t1,$s0,#@sigma0[1]
+	vadd.i64	@X[$i%8],$s1
+	vshr.u64	$s1,$s0,#@sigma0[2]
+	vsli.64		$t0,$s0,#`64-@sigma0[0]`
+	vsli.64		$t1,$s0,#`64-@sigma0[1]`
+	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
+	veor		$s1,$t0
+	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
+	vadd.i64	@X[$i%8],$s0
+	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
+	veor		$s1,$t1				@ sigma0(X[i+1])
+	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
+	vadd.i64	@X[$i%8],$s1
+___
+	&NEON_00_15(2*$i,@_);
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.global	sha512_block_data_order_neon
+.type	sha512_block_data_order_neon,%function
+.align	4
+sha512_block_data_order_neon:
+.LNEON:
+	dmb				@ errata #451034 on early Cortex A8
+	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
+	VFP_ABI_PUSH
+	adr	$Ktbl,.Lsha512_block_data_order
+	sub	$Ktbl,$Ktbl,.Lsha512_block_data_order-K512
+	vldmia	$ctx,{$A-$H}		@ load context
+.Loop_neon:
+___
+for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	mov		$cnt,#4
+.L16_79_neon:
+	subs		$cnt,#1
+___
+for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	bne		.L16_79_neon
+
+	 vadd.i64	$A,d30		@ h+=Maj from the past
+	vldmia		$ctx,{d24-d31}	@ load context to temp
+	vadd.i64	q8,q12		@ vectorized accumulate
+	vadd.i64	q9,q13
+	vadd.i64	q10,q14
+	vadd.i64	q11,q15
+	vstmia		$ctx,{$A-$H}	@ save context
+	teq		$inp,$len
+	sub		$Ktbl,#640	@ rewind K512
+	bne		.Loop_neon
+
+	VFP_ABI_POP
+	ret				@ bx lr
+.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm	OPENSSL_armcap_P,4,4
+#endif
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx	lr/gm;
+
+open SELF,$0;
+while(<SELF>) {
+	next if (/^#!/);
+	last if (!s/^#/@/ and !/^$/);
+	print;
+}
+close SELF;
+
+print $code;
+close STDOUT; # enforce flush
diff --git a/lib/crypto/arm/sha512.h b/lib/crypto/arm/sha512.h
new file mode 100644
index 000000000000..ed9bd81d6d78
--- /dev/null
+++ b/lib/crypto/arm/sha512.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * arm32-optimized SHA-512 block function
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+asmlinkage void sha512_block_data_order(struct sha512_block_state *state,
+					const u8 *data, size_t nblocks);
+asmlinkage void sha512_block_data_order_neon(struct sha512_block_state *state,
+					     const u8 *data, size_t nblocks);
+
+static void sha512_blocks(struct sha512_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+	    static_branch_likely(&have_neon) && likely(may_use_simd())) {
+		scoped_ksimd()
+			sha512_block_data_order_neon(state, data, nblocks);
+	} else {
+		sha512_block_data_order(state, data, nblocks);
+	}
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define sha512_mod_init_arch sha512_mod_init_arch
+static void sha512_mod_init_arch(void)
+{
+	if (cpu_has_neon())
+		static_branch_enable(&have_neon);
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
diff --git a/lib/crypto/arm64/.gitignore b/lib/crypto/arm64/.gitignore
new file mode 100644
index 000000000000..f6c4e8ef80da
--- /dev/null
+++ b/lib/crypto/arm64/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+poly1305-core.S
+sha256-core.S
+sha512-core.S
diff --git a/lib/crypto/arm64/chacha-neon-core.S b/lib/crypto/arm64/chacha-neon-core.S
new file mode 100644
index 000000000000..80079586ecc7
--- /dev/null
+++ b/lib/crypto/arm64/chacha-neon-core.S
@@ -0,0 +1,805 @@
+/*
+ * ChaCha/HChaCha NEON helper functions
+ *
+ * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Originally based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/cache.h>
+
+	.text
+	.align		6
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers v0-v3.  It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * The round count is given in w3.
+ *
+ * Clobbers: w3, x10, v4, v12
+ */
+SYM_FUNC_START_LOCAL(chacha_permute)
+
+	adr_l		x10, ROT8
+	ld1		{v12.4s}, [x10]
+
+.Ldoubleround:
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	rev32		v3.8h, v3.8h
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #12
+	sri		v1.4s, v4.4s, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	tbl		v3.16b, {v3.16b}, v12.16b
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #7
+	sri		v1.4s, v4.4s, #25
+
+	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	ext		v1.16b, v1.16b, v1.16b, #4
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	ext		v2.16b, v2.16b, v2.16b, #8
+	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	ext		v3.16b, v3.16b, v3.16b, #12
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	rev32		v3.8h, v3.8h
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #12
+	sri		v1.4s, v4.4s, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	tbl		v3.16b, {v3.16b}, v12.16b
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #7
+	sri		v1.4s, v4.4s, #25
+
+	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	ext		v1.16b, v1.16b, v1.16b, #12
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	ext		v2.16b, v2.16b, v2.16b, #8
+	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	ext		v3.16b, v3.16b, v3.16b, #4
+
+	subs		w3, w3, #2
+	b.ne		.Ldoubleround
+
+	ret
+SYM_FUNC_END(chacha_permute)
+
+SYM_FUNC_START(chacha_block_xor_neon)
+	// x0: Input state matrix, s
+	// x1: 1 data block output, o
+	// x2: 1 data block input, i
+	// w3: nrounds
+
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+
+	// x0..3 = s0..3
+	ld1		{v0.4s-v3.4s}, [x0]
+	ld1		{v8.4s-v11.4s}, [x0]
+
+	bl		chacha_permute
+
+	ld1		{v4.16b-v7.16b}, [x2]
+
+	// o0 = i0 ^ (x0 + s0)
+	add		v0.4s, v0.4s, v8.4s
+	eor		v0.16b, v0.16b, v4.16b
+
+	// o1 = i1 ^ (x1 + s1)
+	add		v1.4s, v1.4s, v9.4s
+	eor		v1.16b, v1.16b, v5.16b
+
+	// o2 = i2 ^ (x2 + s2)
+	add		v2.4s, v2.4s, v10.4s
+	eor		v2.16b, v2.16b, v6.16b
+
+	// o3 = i3 ^ (x3 + s3)
+	add		v3.4s, v3.4s, v11.4s
+	eor		v3.16b, v3.16b, v7.16b
+
+	st1		{v0.16b-v3.16b}, [x1]
+
+	ldp		x29, x30, [sp], #16
+	ret
+SYM_FUNC_END(chacha_block_xor_neon)
+
+SYM_FUNC_START(hchacha_block_neon)
+	// x0: Input state matrix, s
+	// x1: output (8 32-bit words)
+	// w2: nrounds
+
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+
+	ld1		{v0.4s-v3.4s}, [x0]
+
+	mov		w3, w2
+	bl		chacha_permute
+
+	st1		{v0.4s}, [x1], #16
+	st1		{v3.4s}, [x1]
+
+	ldp		x29, x30, [sp], #16
+	ret
+SYM_FUNC_END(hchacha_block_neon)
+
+	a0		.req	w12
+	a1		.req	w13
+	a2		.req	w14
+	a3		.req	w15
+	a4		.req	w16
+	a5		.req	w17
+	a6		.req	w19
+	a7		.req	w20
+	a8		.req	w21
+	a9		.req	w22
+	a10		.req	w23
+	a11		.req	w24
+	a12		.req	w25
+	a13		.req	w26
+	a14		.req	w27
+	a15		.req	w28
+
+	.align		6
+SYM_FUNC_START(chacha_4block_xor_neon)
+	frame_push	10
+
+	// x0: Input state matrix, s
+	// x1: 4 data blocks output, o
+	// x2: 4 data blocks input, i
+	// w3: nrounds
+	// x4: byte count
+
+	adr_l		x10, .Lpermute
+	and		x5, x4, #63
+	add		x10, x10, x5
+
+	//
+	// This function encrypts four consecutive ChaCha blocks by loading
+	// the state matrix in NEON registers four times. The algorithm performs
+	// each operation on the corresponding word of each state matrix, hence
+	// requires no word shuffling. For final XORing step we transpose the
+	// matrix by interleaving 32- and then 64-bit words, which allows us to
+	// do XOR in NEON registers.
+	//
+	// At the same time, a fifth block is encrypted in parallel using
+	// scalar registers
+	//
+	adr_l		x9, CTRINC		// ... and ROT8
+	ld1		{v30.4s-v31.4s}, [x9]
+
+	// x0..15[0-3] = s0..3[0..3]
+	add		x8, x0, #16
+	ld4r		{ v0.4s- v3.4s}, [x0]
+	ld4r		{ v4.4s- v7.4s}, [x8], #16
+	ld4r		{ v8.4s-v11.4s}, [x8], #16
+	ld4r		{v12.4s-v15.4s}, [x8]
+
+	mov		a0, v0.s[0]
+	mov		a1, v1.s[0]
+	mov		a2, v2.s[0]
+	mov		a3, v3.s[0]
+	mov		a4, v4.s[0]
+	mov		a5, v5.s[0]
+	mov		a6, v6.s[0]
+	mov		a7, v7.s[0]
+	mov		a8, v8.s[0]
+	mov		a9, v9.s[0]
+	mov		a10, v10.s[0]
+	mov		a11, v11.s[0]
+	mov		a12, v12.s[0]
+	mov		a13, v13.s[0]
+	mov		a14, v14.s[0]
+	mov		a15, v15.s[0]
+
+	// x12 += counter values 1-4
+	add		v12.4s, v12.4s, v30.4s
+
+.Ldoubleround4:
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	add		v0.4s, v0.4s, v4.4s
+	  add		a0, a0, a4
+	add		v1.4s, v1.4s, v5.4s
+	  add		a1, a1, a5
+	add		v2.4s, v2.4s, v6.4s
+	  add		a2, a2, a6
+	add		v3.4s, v3.4s, v7.4s
+	  add		a3, a3, a7
+
+	eor		v12.16b, v12.16b, v0.16b
+	  eor		a12, a12, a0
+	eor		v13.16b, v13.16b, v1.16b
+	  eor		a13, a13, a1
+	eor		v14.16b, v14.16b, v2.16b
+	  eor		a14, a14, a2
+	eor		v15.16b, v15.16b, v3.16b
+	  eor		a15, a15, a3
+
+	rev32		v12.8h, v12.8h
+	  ror		a12, a12, #16
+	rev32		v13.8h, v13.8h
+	  ror		a13, a13, #16
+	rev32		v14.8h, v14.8h
+	  ror		a14, a14, #16
+	rev32		v15.8h, v15.8h
+	  ror		a15, a15, #16
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	add		v8.4s, v8.4s, v12.4s
+	  add		a8, a8, a12
+	add		v9.4s, v9.4s, v13.4s
+	  add		a9, a9, a13
+	add		v10.4s, v10.4s, v14.4s
+	  add		a10, a10, a14
+	add		v11.4s, v11.4s, v15.4s
+	  add		a11, a11, a15
+
+	eor		v16.16b, v4.16b, v8.16b
+	  eor		a4, a4, a8
+	eor		v17.16b, v5.16b, v9.16b
+	  eor		a5, a5, a9
+	eor		v18.16b, v6.16b, v10.16b
+	  eor		a6, a6, a10
+	eor		v19.16b, v7.16b, v11.16b
+	  eor		a7, a7, a11
+
+	shl		v4.4s, v16.4s, #12
+	shl		v5.4s, v17.4s, #12
+	shl		v6.4s, v18.4s, #12
+	shl		v7.4s, v19.4s, #12
+
+	sri		v4.4s, v16.4s, #20
+	  ror		a4, a4, #20
+	sri		v5.4s, v17.4s, #20
+	  ror		a5, a5, #20
+	sri		v6.4s, v18.4s, #20
+	  ror		a6, a6, #20
+	sri		v7.4s, v19.4s, #20
+	  ror		a7, a7, #20
+
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	add		v0.4s, v0.4s, v4.4s
+	  add		a0, a0, a4
+	add		v1.4s, v1.4s, v5.4s
+	  add		a1, a1, a5
+	add		v2.4s, v2.4s, v6.4s
+	  add		a2, a2, a6
+	add		v3.4s, v3.4s, v7.4s
+	  add		a3, a3, a7
+
+	eor		v12.16b, v12.16b, v0.16b
+	  eor		a12, a12, a0
+	eor		v13.16b, v13.16b, v1.16b
+	  eor		a13, a13, a1
+	eor		v14.16b, v14.16b, v2.16b
+	  eor		a14, a14, a2
+	eor		v15.16b, v15.16b, v3.16b
+	  eor		a15, a15, a3
+
+	tbl		v12.16b, {v12.16b}, v31.16b
+	  ror		a12, a12, #24
+	tbl		v13.16b, {v13.16b}, v31.16b
+	  ror		a13, a13, #24
+	tbl		v14.16b, {v14.16b}, v31.16b
+	  ror		a14, a14, #24
+	tbl		v15.16b, {v15.16b}, v31.16b
+	  ror		a15, a15, #24
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	add		v8.4s, v8.4s, v12.4s
+	  add		a8, a8, a12
+	add		v9.4s, v9.4s, v13.4s
+	  add		a9, a9, a13
+	add		v10.4s, v10.4s, v14.4s
+	  add		a10, a10, a14
+	add		v11.4s, v11.4s, v15.4s
+	  add		a11, a11, a15
+
+	eor		v16.16b, v4.16b, v8.16b
+	  eor		a4, a4, a8
+	eor		v17.16b, v5.16b, v9.16b
+	  eor		a5, a5, a9
+	eor		v18.16b, v6.16b, v10.16b
+	  eor		a6, a6, a10
+	eor		v19.16b, v7.16b, v11.16b
+	  eor		a7, a7, a11
+
+	shl		v4.4s, v16.4s, #7
+	shl		v5.4s, v17.4s, #7
+	shl		v6.4s, v18.4s, #7
+	shl		v7.4s, v19.4s, #7
+
+	sri		v4.4s, v16.4s, #25
+	  ror		a4, a4, #25
+	sri		v5.4s, v17.4s, #25
+	  ror		a5, a5, #25
+	sri		v6.4s, v18.4s, #25
+	 ror		a6, a6, #25
+	sri		v7.4s, v19.4s, #25
+	  ror		a7, a7, #25
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	add		v0.4s, v0.4s, v5.4s
+	  add		a0, a0, a5
+	add		v1.4s, v1.4s, v6.4s
+	  add		a1, a1, a6
+	add		v2.4s, v2.4s, v7.4s
+	  add		a2, a2, a7
+	add		v3.4s, v3.4s, v4.4s
+	  add		a3, a3, a4
+
+	eor		v15.16b, v15.16b, v0.16b
+	  eor		a15, a15, a0
+	eor		v12.16b, v12.16b, v1.16b
+	  eor		a12, a12, a1
+	eor		v13.16b, v13.16b, v2.16b
+	  eor		a13, a13, a2
+	eor		v14.16b, v14.16b, v3.16b
+	  eor		a14, a14, a3
+
+	rev32		v15.8h, v15.8h
+	  ror		a15, a15, #16
+	rev32		v12.8h, v12.8h
+	  ror		a12, a12, #16
+	rev32		v13.8h, v13.8h
+	  ror		a13, a13, #16
+	rev32		v14.8h, v14.8h
+	  ror		a14, a14, #16
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	add		v10.4s, v10.4s, v15.4s
+	  add		a10, a10, a15
+	add		v11.4s, v11.4s, v12.4s
+	  add		a11, a11, a12
+	add		v8.4s, v8.4s, v13.4s
+	  add		a8, a8, a13
+	add		v9.4s, v9.4s, v14.4s
+	  add		a9, a9, a14
+
+	eor		v16.16b, v5.16b, v10.16b
+	  eor		a5, a5, a10
+	eor		v17.16b, v6.16b, v11.16b
+	  eor		a6, a6, a11
+	eor		v18.16b, v7.16b, v8.16b
+	  eor		a7, a7, a8
+	eor		v19.16b, v4.16b, v9.16b
+	  eor		a4, a4, a9
+
+	shl		v5.4s, v16.4s, #12
+	shl		v6.4s, v17.4s, #12
+	shl		v7.4s, v18.4s, #12
+	shl		v4.4s, v19.4s, #12
+
+	sri		v5.4s, v16.4s, #20
+	  ror		a5, a5, #20
+	sri		v6.4s, v17.4s, #20
+	  ror		a6, a6, #20
+	sri		v7.4s, v18.4s, #20
+	  ror		a7, a7, #20
+	sri		v4.4s, v19.4s, #20
+	  ror		a4, a4, #20
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	add		v0.4s, v0.4s, v5.4s
+	  add		a0, a0, a5
+	add		v1.4s, v1.4s, v6.4s
+	  add		a1, a1, a6
+	add		v2.4s, v2.4s, v7.4s
+	  add		a2, a2, a7
+	add		v3.4s, v3.4s, v4.4s
+	  add		a3, a3, a4
+
+	eor		v15.16b, v15.16b, v0.16b
+	  eor		a15, a15, a0
+	eor		v12.16b, v12.16b, v1.16b
+	  eor		a12, a12, a1
+	eor		v13.16b, v13.16b, v2.16b
+	  eor		a13, a13, a2
+	eor		v14.16b, v14.16b, v3.16b
+	  eor		a14, a14, a3
+
+	tbl		v15.16b, {v15.16b}, v31.16b
+	  ror		a15, a15, #24
+	tbl		v12.16b, {v12.16b}, v31.16b
+	  ror		a12, a12, #24
+	tbl		v13.16b, {v13.16b}, v31.16b
+	  ror		a13, a13, #24
+	tbl		v14.16b, {v14.16b}, v31.16b
+	  ror		a14, a14, #24
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	add		v10.4s, v10.4s, v15.4s
+	  add		a10, a10, a15
+	add		v11.4s, v11.4s, v12.4s
+	  add		a11, a11, a12
+	add		v8.4s, v8.4s, v13.4s
+	  add		a8, a8, a13
+	add		v9.4s, v9.4s, v14.4s
+	  add		a9, a9, a14
+
+	eor		v16.16b, v5.16b, v10.16b
+	  eor		a5, a5, a10
+	eor		v17.16b, v6.16b, v11.16b
+	  eor		a6, a6, a11
+	eor		v18.16b, v7.16b, v8.16b
+	  eor		a7, a7, a8
+	eor		v19.16b, v4.16b, v9.16b
+	  eor		a4, a4, a9
+
+	shl		v5.4s, v16.4s, #7
+	shl		v6.4s, v17.4s, #7
+	shl		v7.4s, v18.4s, #7
+	shl		v4.4s, v19.4s, #7
+
+	sri		v5.4s, v16.4s, #25
+	  ror		a5, a5, #25
+	sri		v6.4s, v17.4s, #25
+	  ror		a6, a6, #25
+	sri		v7.4s, v18.4s, #25
+	  ror		a7, a7, #25
+	sri		v4.4s, v19.4s, #25
+	  ror		a4, a4, #25
+
+	subs		w3, w3, #2
+	b.ne		.Ldoubleround4
+
+	ld4r		{v16.4s-v19.4s}, [x0], #16
+	ld4r		{v20.4s-v23.4s}, [x0], #16
+
+	// x12 += counter values 0-3
+	add		v12.4s, v12.4s, v30.4s
+
+	// x0[0-3] += s0[0]
+	// x1[0-3] += s0[1]
+	// x2[0-3] += s0[2]
+	// x3[0-3] += s0[3]
+	add		v0.4s, v0.4s, v16.4s
+	  mov		w6, v16.s[0]
+	  mov		w7, v17.s[0]
+	add		v1.4s, v1.4s, v17.4s
+	  mov		w8, v18.s[0]
+	  mov		w9, v19.s[0]
+	add		v2.4s, v2.4s, v18.4s
+	  add		a0, a0, w6
+	  add		a1, a1, w7
+	add		v3.4s, v3.4s, v19.4s
+	  add		a2, a2, w8
+	  add		a3, a3, w9
+CPU_BE(	  rev		a0, a0		)
+CPU_BE(	  rev		a1, a1		)
+CPU_BE(	  rev		a2, a2		)
+CPU_BE(	  rev		a3, a3		)
+
+	ld4r		{v24.4s-v27.4s}, [x0], #16
+	ld4r		{v28.4s-v31.4s}, [x0]
+
+	// x4[0-3] += s1[0]
+	// x5[0-3] += s1[1]
+	// x6[0-3] += s1[2]
+	// x7[0-3] += s1[3]
+	add		v4.4s, v4.4s, v20.4s
+	  mov		w6, v20.s[0]
+	  mov		w7, v21.s[0]
+	add		v5.4s, v5.4s, v21.4s
+	  mov		w8, v22.s[0]
+	  mov		w9, v23.s[0]
+	add		v6.4s, v6.4s, v22.4s
+	  add		a4, a4, w6
+	  add		a5, a5, w7
+	add		v7.4s, v7.4s, v23.4s
+	  add		a6, a6, w8
+	  add		a7, a7, w9
+CPU_BE(	  rev		a4, a4		)
+CPU_BE(	  rev		a5, a5		)
+CPU_BE(	  rev		a6, a6		)
+CPU_BE(	  rev		a7, a7		)
+
+	// x8[0-3] += s2[0]
+	// x9[0-3] += s2[1]
+	// x10[0-3] += s2[2]
+	// x11[0-3] += s2[3]
+	add		v8.4s, v8.4s, v24.4s
+	  mov		w6, v24.s[0]
+	  mov		w7, v25.s[0]
+	add		v9.4s, v9.4s, v25.4s
+	  mov		w8, v26.s[0]
+	  mov		w9, v27.s[0]
+	add		v10.4s, v10.4s, v26.4s
+	  add		a8, a8, w6
+	  add		a9, a9, w7
+	add		v11.4s, v11.4s, v27.4s
+	  add		a10, a10, w8
+	  add		a11, a11, w9
+CPU_BE(	  rev		a8, a8		)
+CPU_BE(	  rev		a9, a9		)
+CPU_BE(	  rev		a10, a10	)
+CPU_BE(	  rev		a11, a11	)
+
+	// x12[0-3] += s3[0]
+	// x13[0-3] += s3[1]
+	// x14[0-3] += s3[2]
+	// x15[0-3] += s3[3]
+	add		v12.4s, v12.4s, v28.4s
+	  mov		w6, v28.s[0]
+	  mov		w7, v29.s[0]
+	add		v13.4s, v13.4s, v29.4s
+	  mov		w8, v30.s[0]
+	  mov		w9, v31.s[0]
+	add		v14.4s, v14.4s, v30.4s
+	  add		a12, a12, w6
+	  add		a13, a13, w7
+	add		v15.4s, v15.4s, v31.4s
+	  add		a14, a14, w8
+	  add		a15, a15, w9
+CPU_BE(	  rev		a12, a12	)
+CPU_BE(	  rev		a13, a13	)
+CPU_BE(	  rev		a14, a14	)
+CPU_BE(	  rev		a15, a15	)
+
+	// interleave 32-bit words in state n, n+1
+	  ldp		w6, w7, [x2], #64
+	zip1		v16.4s, v0.4s, v1.4s
+	  ldp		w8, w9, [x2, #-56]
+	  eor		a0, a0, w6
+	zip2		v17.4s, v0.4s, v1.4s
+	  eor		a1, a1, w7
+	zip1		v18.4s, v2.4s, v3.4s
+	  eor		a2, a2, w8
+	zip2		v19.4s, v2.4s, v3.4s
+	  eor		a3, a3, w9
+	  ldp		w6, w7, [x2, #-48]
+	zip1		v20.4s, v4.4s, v5.4s
+	  ldp		w8, w9, [x2, #-40]
+	  eor		a4, a4, w6
+	zip2		v21.4s, v4.4s, v5.4s
+	  eor		a5, a5, w7
+	zip1		v22.4s, v6.4s, v7.4s
+	  eor		a6, a6, w8
+	zip2		v23.4s, v6.4s, v7.4s
+	  eor		a7, a7, w9
+	  ldp		w6, w7, [x2, #-32]
+	zip1		v24.4s, v8.4s, v9.4s
+	  ldp		w8, w9, [x2, #-24]
+	  eor		a8, a8, w6
+	zip2		v25.4s, v8.4s, v9.4s
+	  eor		a9, a9, w7
+	zip1		v26.4s, v10.4s, v11.4s
+	  eor		a10, a10, w8
+	zip2		v27.4s, v10.4s, v11.4s
+	  eor		a11, a11, w9
+	  ldp		w6, w7, [x2, #-16]
+	zip1		v28.4s, v12.4s, v13.4s
+	  ldp		w8, w9, [x2, #-8]
+	  eor		a12, a12, w6
+	zip2		v29.4s, v12.4s, v13.4s
+	  eor		a13, a13, w7
+	zip1		v30.4s, v14.4s, v15.4s
+	  eor		a14, a14, w8
+	zip2		v31.4s, v14.4s, v15.4s
+	  eor		a15, a15, w9
+
+	add		x3, x2, x4
+	sub		x3, x3, #128		// start of last block
+
+	subs		x5, x4, #128
+	csel		x2, x2, x3, ge
+
+	// interleave 64-bit words in state n, n+2
+	zip1		v0.2d, v16.2d, v18.2d
+	zip2		v4.2d, v16.2d, v18.2d
+	  stp		a0, a1, [x1], #64
+	zip1		v8.2d, v17.2d, v19.2d
+	zip2		v12.2d, v17.2d, v19.2d
+	  stp		a2, a3, [x1, #-56]
+
+	subs		x6, x4, #192
+	ld1		{v16.16b-v19.16b}, [x2], #64
+	csel		x2, x2, x3, ge
+
+	zip1		v1.2d, v20.2d, v22.2d
+	zip2		v5.2d, v20.2d, v22.2d
+	  stp		a4, a5, [x1, #-48]
+	zip1		v9.2d, v21.2d, v23.2d
+	zip2		v13.2d, v21.2d, v23.2d
+	  stp		a6, a7, [x1, #-40]
+
+	subs		x7, x4, #256
+	ld1		{v20.16b-v23.16b}, [x2], #64
+	csel		x2, x2, x3, ge
+
+	zip1		v2.2d, v24.2d, v26.2d
+	zip2		v6.2d, v24.2d, v26.2d
+	  stp		a8, a9, [x1, #-32]
+	zip1		v10.2d, v25.2d, v27.2d
+	zip2		v14.2d, v25.2d, v27.2d
+	  stp		a10, a11, [x1, #-24]
+
+	subs		x8, x4, #320
+	ld1		{v24.16b-v27.16b}, [x2], #64
+	csel		x2, x2, x3, ge
+
+	zip1		v3.2d, v28.2d, v30.2d
+	zip2		v7.2d, v28.2d, v30.2d
+	  stp		a12, a13, [x1, #-16]
+	zip1		v11.2d, v29.2d, v31.2d
+	zip2		v15.2d, v29.2d, v31.2d
+	  stp		a14, a15, [x1, #-8]
+
+	tbnz		x5, #63, .Lt128
+	ld1		{v28.16b-v31.16b}, [x2]
+
+	// xor with corresponding input, write to output
+	eor		v16.16b, v16.16b, v0.16b
+	eor		v17.16b, v17.16b, v1.16b
+	eor		v18.16b, v18.16b, v2.16b
+	eor		v19.16b, v19.16b, v3.16b
+
+	tbnz		x6, #63, .Lt192
+
+	eor		v20.16b, v20.16b, v4.16b
+	eor		v21.16b, v21.16b, v5.16b
+	eor		v22.16b, v22.16b, v6.16b
+	eor		v23.16b, v23.16b, v7.16b
+
+	st1		{v16.16b-v19.16b}, [x1], #64
+	tbnz		x7, #63, .Lt256
+
+	eor		v24.16b, v24.16b, v8.16b
+	eor		v25.16b, v25.16b, v9.16b
+	eor		v26.16b, v26.16b, v10.16b
+	eor		v27.16b, v27.16b, v11.16b
+
+	st1		{v20.16b-v23.16b}, [x1], #64
+	tbnz		x8, #63, .Lt320
+
+	eor		v28.16b, v28.16b, v12.16b
+	eor		v29.16b, v29.16b, v13.16b
+	eor		v30.16b, v30.16b, v14.16b
+	eor		v31.16b, v31.16b, v15.16b
+
+	st1		{v24.16b-v27.16b}, [x1], #64
+	st1		{v28.16b-v31.16b}, [x1]
+
+.Lout:	frame_pop
+	ret
+
+	// fewer than 192 bytes of in/output
+.Lt192:	cbz		x5, 1f				// exactly 128 bytes?
+	ld1		{v28.16b-v31.16b}, [x10]
+	add		x5, x5, x1
+	tbl		v28.16b, {v4.16b-v7.16b}, v28.16b
+	tbl		v29.16b, {v4.16b-v7.16b}, v29.16b
+	tbl		v30.16b, {v4.16b-v7.16b}, v30.16b
+	tbl		v31.16b, {v4.16b-v7.16b}, v31.16b
+
+0:	eor		v20.16b, v20.16b, v28.16b
+	eor		v21.16b, v21.16b, v29.16b
+	eor		v22.16b, v22.16b, v30.16b
+	eor		v23.16b, v23.16b, v31.16b
+	st1		{v20.16b-v23.16b}, [x5]		// overlapping stores
+1:	st1		{v16.16b-v19.16b}, [x1]
+	b		.Lout
+
+	// fewer than 128 bytes of in/output
+.Lt128:	ld1		{v28.16b-v31.16b}, [x10]
+	add		x5, x5, x1
+	sub		x1, x1, #64
+	tbl		v28.16b, {v0.16b-v3.16b}, v28.16b
+	tbl		v29.16b, {v0.16b-v3.16b}, v29.16b
+	tbl		v30.16b, {v0.16b-v3.16b}, v30.16b
+	tbl		v31.16b, {v0.16b-v3.16b}, v31.16b
+	ld1		{v16.16b-v19.16b}, [x1]		// reload first output block
+	b		0b
+
+	// fewer than 256 bytes of in/output
+.Lt256:	cbz		x6, 2f				// exactly 192 bytes?
+	ld1		{v4.16b-v7.16b}, [x10]
+	add		x6, x6, x1
+	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
+	tbl		v1.16b, {v8.16b-v11.16b}, v5.16b
+	tbl		v2.16b, {v8.16b-v11.16b}, v6.16b
+	tbl		v3.16b, {v8.16b-v11.16b}, v7.16b
+
+	eor		v28.16b, v28.16b, v0.16b
+	eor		v29.16b, v29.16b, v1.16b
+	eor		v30.16b, v30.16b, v2.16b
+	eor		v31.16b, v31.16b, v3.16b
+	st1		{v28.16b-v31.16b}, [x6]		// overlapping stores
+2:	st1		{v20.16b-v23.16b}, [x1]
+	b		.Lout
+
+	// fewer than 320 bytes of in/output
+.Lt320:	cbz		x7, 3f				// exactly 256 bytes?
+	ld1		{v4.16b-v7.16b}, [x10]
+	add		x7, x7, x1
+	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
+	tbl		v1.16b, {v12.16b-v15.16b}, v5.16b
+	tbl		v2.16b, {v12.16b-v15.16b}, v6.16b
+	tbl		v3.16b, {v12.16b-v15.16b}, v7.16b
+
+	eor		v28.16b, v28.16b, v0.16b
+	eor		v29.16b, v29.16b, v1.16b
+	eor		v30.16b, v30.16b, v2.16b
+	eor		v31.16b, v31.16b, v3.16b
+	st1		{v28.16b-v31.16b}, [x7]		// overlapping stores
+3:	st1		{v24.16b-v27.16b}, [x1]
+	b		.Lout
+SYM_FUNC_END(chacha_4block_xor_neon)
+
+	.section	".rodata", "a", %progbits
+	.align		L1_CACHE_SHIFT
+.Lpermute:
+	.set		.Li, 0
+	.rept		128
+	.byte		(.Li - 64)
+	.set		.Li, .Li + 1
+	.endr
+
+CTRINC:	.word		1, 2, 3, 4
+ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
diff --git a/lib/crypto/arm64/chacha.h b/lib/crypto/arm64/chacha.h
new file mode 100644
index 000000000000..ca8c6a8b0578
--- /dev/null
+++ b/lib/crypto/arm64/chacha.h
@@ -0,0 +1,96 @@
+/*
+ * ChaCha and HChaCha functions (ARM64 optimized)
+ *
+ * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/internal/simd.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+#include <asm/hwcap.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
+				      u8 *dst, const u8 *src, int nrounds);
+asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
+				       u8 *dst, const u8 *src,
+				       int nrounds, int bytes);
+asmlinkage void hchacha_block_neon(const struct chacha_state *state,
+				   u32 out[HCHACHA_OUT_WORDS], int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
+			  int bytes, int nrounds)
+{
+	while (bytes > 0) {
+		int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
+
+		if (l <= CHACHA_BLOCK_SIZE) {
+			u8 buf[CHACHA_BLOCK_SIZE];
+
+			memcpy(buf, src, l);
+			chacha_block_xor_neon(state, buf, buf, nrounds);
+			memcpy(dst, buf, l);
+			state->x[12] += 1;
+			break;
+		}
+		chacha_4block_xor_neon(state, dst, src, nrounds, l);
+		bytes -= l;
+		src += l;
+		dst += l;
+		state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
+	}
+}
+
+static void hchacha_block_arch(const struct chacha_state *state,
+			       u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+	if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
+		hchacha_block_generic(state, out, nrounds);
+	} else {
+		scoped_ksimd()
+			hchacha_block_neon(state, out, nrounds);
+	}
+}
+
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+			      const u8 *src, unsigned int bytes, int nrounds)
+{
+	if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
+	    !crypto_simd_usable())
+		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+	do {
+		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+		scoped_ksimd()
+			chacha_doneon(state, dst, src, todo, nrounds);
+
+		bytes -= todo;
+		src += todo;
+		dst += todo;
+	} while (bytes);
+}
+
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
+{
+	if (cpu_have_named_feature(ASIMD))
+		static_branch_enable(&have_neon);
+}
diff --git a/lib/crypto/arm64/poly1305-armv8.pl b/lib/crypto/arm64/poly1305-armv8.pl
new file mode 100644
index 000000000000..f1930c6b55ce
--- /dev/null
+++ b/lib/crypto/arm64/poly1305-armv8.pl
@@ -0,0 +1,920 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
+# project.
+# ====================================================================
+#
+# This module implements Poly1305 hash for ARMv8.
+#
+# June 2015
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone.
+#
+#		IALU/gcc-4.9	NEON
+#
+# Apple A7	1.86/+5%	0.72
+# Cortex-A53	2.69/+58%	1.47
+# Cortex-A57	2.70/+7%	1.14
+# Denver	1.64/+50%	1.18(*)
+# X-Gene	2.13/+68%	2.27
+# Mongoose	1.77/+75%	1.12
+# Kryo		2.70/+55%	1.13
+# ThunderX2	1.17/+95%	1.36
+#
+# (*)	estimate based on resources availability is less than 1.0,
+#	i.e. measured result is worse than expected, presumably binary
+#	translator is not almighty;
+
+$flavour=shift;
+$output=shift;
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
+
+my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
+my ($mac,$nonce)=($inp,$len);
+
+my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+.extern	OPENSSL_armcap_P
+#else
+# define poly1305_init   poly1305_block_init
+# define poly1305_blocks poly1305_blocks_arm64
+#endif
+
+.text
+
+// forward "declarations" are required for Apple
+.globl	poly1305_blocks
+.globl	poly1305_emit
+
+.globl	poly1305_init
+.type	poly1305_init,%function
+.align	5
+poly1305_init:
+	cmp	$inp,xzr
+	stp	xzr,xzr,[$ctx]		// zero hash value
+	stp	xzr,xzr,[$ctx,#16]	// [along with is_base2_26]
+
+	csel	x0,xzr,x0,eq
+	b.eq	.Lno_key
+
+#ifndef	__KERNEL__
+	adrp	x17,OPENSSL_armcap_P
+	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
+#endif
+
+	ldp	$r0,$r1,[$inp]		// load key
+	mov	$s1,#0xfffffffc0fffffff
+	movk	$s1,#0x0fff,lsl#48
+#ifdef	__AARCH64EB__
+	rev	$r0,$r0			// flip bytes
+	rev	$r1,$r1
+#endif
+	and	$r0,$r0,$s1		// &=0ffffffc0fffffff
+	and	$s1,$s1,#-4
+	and	$r1,$r1,$s1		// &=0ffffffc0ffffffc
+	mov	w#$s1,#-1
+	stp	$r0,$r1,[$ctx,#32]	// save key value
+	str	w#$s1,[$ctx,#48]	// impossible key power value
+
+#ifndef	__KERNEL__
+	tst	w17,#ARMV7_NEON
+
+	adr	$d0,.Lpoly1305_blocks
+	adr	$r0,.Lpoly1305_blocks_neon
+	adr	$d1,.Lpoly1305_emit
+
+	csel	$d0,$d0,$r0,eq
+
+# ifdef	__ILP32__
+	stp	w#$d0,w#$d1,[$len]
+# else
+	stp	$d0,$d1,[$len]
+# endif
+#endif
+	mov	x0,#1
+.Lno_key:
+	ret
+.size	poly1305_init,.-poly1305_init
+
+.type	poly1305_blocks,%function
+.align	5
+poly1305_blocks:
+.Lpoly1305_blocks:
+	ands	$len,$len,#-16
+	b.eq	.Lno_data
+
+	ldp	$h0,$h1,[$ctx]		// load hash value
+	ldp	$h2,x17,[$ctx,#16]	// [along with is_base2_26]
+	ldp	$r0,$r1,[$ctx,#32]	// load key value
+
+#ifdef	__AARCH64EB__
+	lsr	$d0,$h0,#32
+	mov	w#$d1,w#$h0
+	lsr	$d2,$h1,#32
+	mov	w15,w#$h1
+	lsr	x16,$h2,#32
+#else
+	mov	w#$d0,w#$h0
+	lsr	$d1,$h0,#32
+	mov	w#$d2,w#$h1
+	lsr	x15,$h1,#32
+	mov	w16,w#$h2
+#endif
+
+	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64
+	lsr	$d1,$d2,#12
+	adds	$d0,$d0,$d2,lsl#52
+	add	$d1,$d1,x15,lsl#14
+	adc	$d1,$d1,xzr
+	lsr	$d2,x16,#24
+	adds	$d1,$d1,x16,lsl#40
+	adc	$d2,$d2,xzr
+
+	cmp	x17,#0			// is_base2_26?
+	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
+	csel	$h0,$h0,$d0,eq		// choose between radixes
+	csel	$h1,$h1,$d1,eq
+	csel	$h2,$h2,$d2,eq
+
+.Loop:
+	ldp	$t0,$t1,[$inp],#16	// load input
+	sub	$len,$len,#16
+#ifdef	__AARCH64EB__
+	rev	$t0,$t0
+	rev	$t1,$t1
+#endif
+	adds	$h0,$h0,$t0		// accumulate input
+	adcs	$h1,$h1,$t1
+
+	mul	$d0,$h0,$r0		// h0*r0
+	adc	$h2,$h2,$padbit
+	umulh	$d1,$h0,$r0
+
+	mul	$t0,$h1,$s1		// h1*5*r1
+	umulh	$t1,$h1,$s1
+
+	adds	$d0,$d0,$t0
+	mul	$t0,$h0,$r1		// h0*r1
+	adc	$d1,$d1,$t1
+	umulh	$d2,$h0,$r1
+
+	adds	$d1,$d1,$t0
+	mul	$t0,$h1,$r0		// h1*r0
+	adc	$d2,$d2,xzr
+	umulh	$t1,$h1,$r0
+
+	adds	$d1,$d1,$t0
+	mul	$t0,$h2,$s1		// h2*5*r1
+	adc	$d2,$d2,$t1
+	mul	$t1,$h2,$r0		// h2*r0
+
+	adds	$d1,$d1,$t0
+	adc	$d2,$d2,$t1
+
+	and	$t0,$d2,#-4		// final reduction
+	and	$h2,$d2,#3
+	add	$t0,$t0,$d2,lsr#2
+	adds	$h0,$d0,$t0
+	adcs	$h1,$d1,xzr
+	adc	$h2,$h2,xzr
+
+	cbnz	$len,.Loop
+
+	stp	$h0,$h1,[$ctx]		// store hash value
+	stp	$h2,xzr,[$ctx,#16]	// [and clear is_base2_26]
+
+.Lno_data:
+	ret
+.size	poly1305_blocks,.-poly1305_blocks
+
+.type	poly1305_emit,%function
+.align	5
+poly1305_emit:
+.Lpoly1305_emit:
+	ldp	$h0,$h1,[$ctx]		// load hash base 2^64
+	ldp	$h2,$r0,[$ctx,#16]	// [along with is_base2_26]
+	ldp	$t0,$t1,[$nonce]	// load nonce
+
+#ifdef	__AARCH64EB__
+	lsr	$d0,$h0,#32
+	mov	w#$d1,w#$h0
+	lsr	$d2,$h1,#32
+	mov	w15,w#$h1
+	lsr	x16,$h2,#32
+#else
+	mov	w#$d0,w#$h0
+	lsr	$d1,$h0,#32
+	mov	w#$d2,w#$h1
+	lsr	x15,$h1,#32
+	mov	w16,w#$h2
+#endif
+
+	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64
+	lsr	$d1,$d2,#12
+	adds	$d0,$d0,$d2,lsl#52
+	add	$d1,$d1,x15,lsl#14
+	adc	$d1,$d1,xzr
+	lsr	$d2,x16,#24
+	adds	$d1,$d1,x16,lsl#40
+	adc	$d2,$d2,xzr
+
+	cmp	$r0,#0			// is_base2_26?
+	csel	$h0,$h0,$d0,eq		// choose between radixes
+	csel	$h1,$h1,$d1,eq
+	csel	$h2,$h2,$d2,eq
+
+	adds	$d0,$h0,#5		// compare to modulus
+	adcs	$d1,$h1,xzr
+	adc	$d2,$h2,xzr
+
+	tst	$d2,#-4			// see if it's carried/borrowed
+
+	csel	$h0,$h0,$d0,eq
+	csel	$h1,$h1,$d1,eq
+
+#ifdef	__AARCH64EB__
+	ror	$t0,$t0,#32		// flip nonce words
+	ror	$t1,$t1,#32
+#endif
+	adds	$h0,$h0,$t0		// accumulate nonce
+	adc	$h1,$h1,$t1
+#ifdef	__AARCH64EB__
+	rev	$h0,$h0			// flip output bytes
+	rev	$h1,$h1
+#endif
+	stp	$h0,$h1,[$mac]		// write result
+
+	ret
+.size	poly1305_emit,.-poly1305_emit
+___
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
+my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
+my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
+my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
+my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
+my ($T0,$T1,$MASK) = map("v$_",(29..31));
+
+my ($in2,$zeros)=("x16","x17");
+my $is_base2_26 = $zeros;		# borrow
+
+$code.=<<___;
+.type	poly1305_mult,%function
+.align	5
+poly1305_mult:
+	mul	$d0,$h0,$r0		// h0*r0
+	umulh	$d1,$h0,$r0
+
+	mul	$t0,$h1,$s1		// h1*5*r1
+	umulh	$t1,$h1,$s1
+
+	adds	$d0,$d0,$t0
+	mul	$t0,$h0,$r1		// h0*r1
+	adc	$d1,$d1,$t1
+	umulh	$d2,$h0,$r1
+
+	adds	$d1,$d1,$t0
+	mul	$t0,$h1,$r0		// h1*r0
+	adc	$d2,$d2,xzr
+	umulh	$t1,$h1,$r0
+
+	adds	$d1,$d1,$t0
+	mul	$t0,$h2,$s1		// h2*5*r1
+	adc	$d2,$d2,$t1
+	mul	$t1,$h2,$r0		// h2*r0
+
+	adds	$d1,$d1,$t0
+	adc	$d2,$d2,$t1
+
+	and	$t0,$d2,#-4		// final reduction
+	and	$h2,$d2,#3
+	add	$t0,$t0,$d2,lsr#2
+	adds	$h0,$d0,$t0
+	adcs	$h1,$d1,xzr
+	adc	$h2,$h2,xzr
+
+	ret
+.size	poly1305_mult,.-poly1305_mult
+
+.type	poly1305_splat,%function
+.align	4
+poly1305_splat:
+	and	x12,$h0,#0x03ffffff	// base 2^64 -> base 2^26
+	ubfx	x13,$h0,#26,#26
+	extr	x14,$h1,$h0,#52
+	and	x14,x14,#0x03ffffff
+	ubfx	x15,$h1,#14,#26
+	extr	x16,$h2,$h1,#40
+
+	str	w12,[$ctx,#16*0]	// r0
+	add	w12,w13,w13,lsl#2	// r1*5
+	str	w13,[$ctx,#16*1]	// r1
+	add	w13,w14,w14,lsl#2	// r2*5
+	str	w12,[$ctx,#16*2]	// s1
+	str	w14,[$ctx,#16*3]	// r2
+	add	w14,w15,w15,lsl#2	// r3*5
+	str	w13,[$ctx,#16*4]	// s2
+	str	w15,[$ctx,#16*5]	// r3
+	add	w15,w16,w16,lsl#2	// r4*5
+	str	w14,[$ctx,#16*6]	// s3
+	str	w16,[$ctx,#16*7]	// r4
+	str	w15,[$ctx,#16*8]	// s4
+
+	ret
+.size	poly1305_splat,.-poly1305_splat
+
+#ifdef	__KERNEL__
+.globl	poly1305_blocks_neon
+#endif
+.type	poly1305_blocks_neon,%function
+.align	5
+poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
+	ldr	$is_base2_26,[$ctx,#24]
+	cmp	$len,#128
+	b.lo	.Lpoly1305_blocks
+
+	.inst	0xd503233f		// paciasp
+	stp	x29,x30,[sp,#-80]!
+	add	x29,sp,#0
+
+	stp	d8,d9,[sp,#16]		// meet ABI requirements
+	stp	d10,d11,[sp,#32]
+	stp	d12,d13,[sp,#48]
+	stp	d14,d15,[sp,#64]
+
+	cbz	$is_base2_26,.Lbase2_64_neon
+
+	ldp	w10,w11,[$ctx]		// load hash value base 2^26
+	ldp	w12,w13,[$ctx,#8]
+	ldr	w14,[$ctx,#16]
+
+	tst	$len,#31
+	b.eq	.Leven_neon
+
+	ldp	$r0,$r1,[$ctx,#32]	// load key value
+
+	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
+	lsr	$h1,x12,#12
+	adds	$h0,$h0,x12,lsl#52
+	add	$h1,$h1,x13,lsl#14
+	adc	$h1,$h1,xzr
+	lsr	$h2,x14,#24
+	adds	$h1,$h1,x14,lsl#40
+	adc	$d2,$h2,xzr		// can be partially reduced...
+
+	ldp	$d0,$d1,[$inp],#16	// load input
+	sub	$len,$len,#16
+	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
+
+#ifdef	__AARCH64EB__
+	rev	$d0,$d0
+	rev	$d1,$d1
+#endif
+	adds	$h0,$h0,$d0		// accumulate input
+	adcs	$h1,$h1,$d1
+	adc	$h2,$h2,$padbit
+
+	bl	poly1305_mult
+
+	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
+	ubfx	x11,$h0,#26,#26
+	extr	x12,$h1,$h0,#52
+	and	x12,x12,#0x03ffffff
+	ubfx	x13,$h1,#14,#26
+	extr	x14,$h2,$h1,#40
+
+	b	.Leven_neon
+
+.align	4
+.Lbase2_64_neon:
+	ldp	$r0,$r1,[$ctx,#32]	// load key value
+
+	ldp	$h0,$h1,[$ctx]		// load hash value base 2^64
+	ldr	$h2,[$ctx,#16]
+
+	tst	$len,#31
+	b.eq	.Linit_neon
+
+	ldp	$d0,$d1,[$inp],#16	// load input
+	sub	$len,$len,#16
+	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
+#ifdef	__AARCH64EB__
+	rev	$d0,$d0
+	rev	$d1,$d1
+#endif
+	adds	$h0,$h0,$d0		// accumulate input
+	adcs	$h1,$h1,$d1
+	adc	$h2,$h2,$padbit
+
+	bl	poly1305_mult
+
+.Linit_neon:
+	ldr	w17,[$ctx,#48]		// first table element
+	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
+	ubfx	x11,$h0,#26,#26
+	extr	x12,$h1,$h0,#52
+	and	x12,x12,#0x03ffffff
+	ubfx	x13,$h1,#14,#26
+	extr	x14,$h2,$h1,#40
+
+	cmp	w17,#-1			// is value impossible?
+	b.ne	.Leven_neon
+
+	fmov	${H0},x10
+	fmov	${H1},x11
+	fmov	${H2},x12
+	fmov	${H3},x13
+	fmov	${H4},x14
+
+	////////////////////////////////// initialize r^n table
+	mov	$h0,$r0			// r^1
+	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
+	mov	$h1,$r1
+	mov	$h2,xzr
+	add	$ctx,$ctx,#48+12
+	bl	poly1305_splat
+
+	bl	poly1305_mult		// r^2
+	sub	$ctx,$ctx,#4
+	bl	poly1305_splat
+
+	bl	poly1305_mult		// r^3
+	sub	$ctx,$ctx,#4
+	bl	poly1305_splat
+
+	bl	poly1305_mult		// r^4
+	sub	$ctx,$ctx,#4
+	bl	poly1305_splat
+	sub	$ctx,$ctx,#48		// restore original $ctx
+	b	.Ldo_neon
+
+.align	4
+.Leven_neon:
+	fmov	${H0},x10
+	fmov	${H1},x11
+	fmov	${H2},x12
+	fmov	${H3},x13
+	fmov	${H4},x14
+
+.Ldo_neon:
+	ldp	x8,x12,[$inp,#32]	// inp[2:3]
+	subs	$len,$len,#64
+	ldp	x9,x13,[$inp,#48]
+	add	$in2,$inp,#96
+	adrp	$zeros,.Lzeros
+	add	$zeros,$zeros,#:lo12:.Lzeros
+
+	lsl	$padbit,$padbit,#24
+	add	x15,$ctx,#48
+
+#ifdef	__AARCH64EB__
+	rev	x8,x8
+	rev	x12,x12
+	rev	x9,x9
+	rev	x13,x13
+#endif
+	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
+	and	x5,x9,#0x03ffffff
+	ubfx	x6,x8,#26,#26
+	ubfx	x7,x9,#26,#26
+	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
+	extr	x8,x12,x8,#52
+	extr	x9,x13,x9,#52
+	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
+	fmov	$IN23_0,x4
+	and	x8,x8,#0x03ffffff
+	and	x9,x9,#0x03ffffff
+	ubfx	x10,x12,#14,#26
+	ubfx	x11,x13,#14,#26
+	add	x12,$padbit,x12,lsr#40
+	add	x13,$padbit,x13,lsr#40
+	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
+	fmov	$IN23_1,x6
+	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
+	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
+	fmov	$IN23_2,x8
+	fmov	$IN23_3,x10
+	fmov	$IN23_4,x12
+
+	ldp	x8,x12,[$inp],#16	// inp[0:1]
+	ldp	x9,x13,[$inp],#48
+
+	ld1	{$R0,$R1,$S1,$R2},[x15],#64
+	ld1	{$S2,$R3,$S3,$R4},[x15],#64
+	ld1	{$S4},[x15]
+
+#ifdef	__AARCH64EB__
+	rev	x8,x8
+	rev	x12,x12
+	rev	x9,x9
+	rev	x13,x13
+#endif
+	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
+	and	x5,x9,#0x03ffffff
+	ubfx	x6,x8,#26,#26
+	ubfx	x7,x9,#26,#26
+	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
+	extr	x8,x12,x8,#52
+	extr	x9,x13,x9,#52
+	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
+	fmov	$IN01_0,x4
+	and	x8,x8,#0x03ffffff
+	and	x9,x9,#0x03ffffff
+	ubfx	x10,x12,#14,#26
+	ubfx	x11,x13,#14,#26
+	add	x12,$padbit,x12,lsr#40
+	add	x13,$padbit,x13,lsr#40
+	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
+	fmov	$IN01_1,x6
+	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
+	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
+	movi	$MASK.2d,#-1
+	fmov	$IN01_2,x8
+	fmov	$IN01_3,x10
+	fmov	$IN01_4,x12
+	ushr	$MASK.2d,$MASK.2d,#38
+
+	b.ls	.Lskip_loop
+
+.align	4
+.Loop_neon:
+	////////////////////////////////////////////////////////////////
+	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+	//   \___________________/
+	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+	//   \___________________/ \____________________/
+	//
+	// Note that we start with inp[2:3]*r^2. This is because it
+	// doesn't depend on reduction in previous iteration.
+	////////////////////////////////////////////////////////////////
+	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
+	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
+	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
+	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
+	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
+
+	subs	$len,$len,#64
+	umull	$ACC4,$IN23_0,${R4}[2]
+	csel	$in2,$zeros,$in2,lo
+	umull	$ACC3,$IN23_0,${R3}[2]
+	umull	$ACC2,$IN23_0,${R2}[2]
+	 ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero)
+	umull	$ACC1,$IN23_0,${R1}[2]
+	 ldp	x9,x13,[$in2],#48
+	umull	$ACC0,$IN23_0,${R0}[2]
+#ifdef	__AARCH64EB__
+	 rev	x8,x8
+	 rev	x12,x12
+	 rev	x9,x9
+	 rev	x13,x13
+#endif
+
+	umlal	$ACC4,$IN23_1,${R3}[2]
+	 and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
+	umlal	$ACC3,$IN23_1,${R2}[2]
+	 and	x5,x9,#0x03ffffff
+	umlal	$ACC2,$IN23_1,${R1}[2]
+	 ubfx	x6,x8,#26,#26
+	umlal	$ACC1,$IN23_1,${R0}[2]
+	 ubfx	x7,x9,#26,#26
+	umlal	$ACC0,$IN23_1,${S4}[2]
+	 add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
+
+	umlal	$ACC4,$IN23_2,${R2}[2]
+	 extr	x8,x12,x8,#52
+	umlal	$ACC3,$IN23_2,${R1}[2]
+	 extr	x9,x13,x9,#52
+	umlal	$ACC2,$IN23_2,${R0}[2]
+	 add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
+	umlal	$ACC1,$IN23_2,${S4}[2]
+	 fmov	$IN23_0,x4
+	umlal	$ACC0,$IN23_2,${S3}[2]
+	 and	x8,x8,#0x03ffffff
+
+	umlal	$ACC4,$IN23_3,${R1}[2]
+	 and	x9,x9,#0x03ffffff
+	umlal	$ACC3,$IN23_3,${R0}[2]
+	 ubfx	x10,x12,#14,#26
+	umlal	$ACC2,$IN23_3,${S4}[2]
+	 ubfx	x11,x13,#14,#26
+	umlal	$ACC1,$IN23_3,${S3}[2]
+	 add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
+	umlal	$ACC0,$IN23_3,${S2}[2]
+	 fmov	$IN23_1,x6
+
+	add	$IN01_2,$IN01_2,$H2
+	 add	x12,$padbit,x12,lsr#40
+	umlal	$ACC4,$IN23_4,${R0}[2]
+	 add	x13,$padbit,x13,lsr#40
+	umlal	$ACC3,$IN23_4,${S4}[2]
+	 add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
+	umlal	$ACC2,$IN23_4,${S3}[2]
+	 add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
+	umlal	$ACC1,$IN23_4,${S2}[2]
+	 fmov	$IN23_2,x8
+	umlal	$ACC0,$IN23_4,${S1}[2]
+	 fmov	$IN23_3,x10
+
+	////////////////////////////////////////////////////////////////
+	// (hash+inp[0:1])*r^4 and accumulate
+
+	add	$IN01_0,$IN01_0,$H0
+	 fmov	$IN23_4,x12
+	umlal	$ACC3,$IN01_2,${R1}[0]
+	 ldp	x8,x12,[$inp],#16	// inp[0:1]
+	umlal	$ACC0,$IN01_2,${S3}[0]
+	 ldp	x9,x13,[$inp],#48
+	umlal	$ACC4,$IN01_2,${R2}[0]
+	umlal	$ACC1,$IN01_2,${S4}[0]
+	umlal	$ACC2,$IN01_2,${R0}[0]
+#ifdef	__AARCH64EB__
+	 rev	x8,x8
+	 rev	x12,x12
+	 rev	x9,x9
+	 rev	x13,x13
+#endif
+
+	add	$IN01_1,$IN01_1,$H1
+	umlal	$ACC3,$IN01_0,${R3}[0]
+	umlal	$ACC4,$IN01_0,${R4}[0]
+	 and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
+	umlal	$ACC2,$IN01_0,${R2}[0]
+	 and	x5,x9,#0x03ffffff
+	umlal	$ACC0,$IN01_0,${R0}[0]
+	 ubfx	x6,x8,#26,#26
+	umlal	$ACC1,$IN01_0,${R1}[0]
+	 ubfx	x7,x9,#26,#26
+
+	add	$IN01_3,$IN01_3,$H3
+	 add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
+	umlal	$ACC3,$IN01_1,${R2}[0]
+	 extr	x8,x12,x8,#52
+	umlal	$ACC4,$IN01_1,${R3}[0]
+	 extr	x9,x13,x9,#52
+	umlal	$ACC0,$IN01_1,${S4}[0]
+	 add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
+	umlal	$ACC2,$IN01_1,${R1}[0]
+	 fmov	$IN01_0,x4
+	umlal	$ACC1,$IN01_1,${R0}[0]
+	 and	x8,x8,#0x03ffffff
+
+	add	$IN01_4,$IN01_4,$H4
+	 and	x9,x9,#0x03ffffff
+	umlal	$ACC3,$IN01_3,${R0}[0]
+	 ubfx	x10,x12,#14,#26
+	umlal	$ACC0,$IN01_3,${S2}[0]
+	 ubfx	x11,x13,#14,#26
+	umlal	$ACC4,$IN01_3,${R1}[0]
+	 add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
+	umlal	$ACC1,$IN01_3,${S3}[0]
+	 fmov	$IN01_1,x6
+	umlal	$ACC2,$IN01_3,${S4}[0]
+	 add	x12,$padbit,x12,lsr#40
+
+	umlal	$ACC3,$IN01_4,${S4}[0]
+	 add	x13,$padbit,x13,lsr#40
+	umlal	$ACC0,$IN01_4,${S1}[0]
+	 add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
+	umlal	$ACC4,$IN01_4,${R0}[0]
+	 add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
+	umlal	$ACC1,$IN01_4,${S2}[0]
+	 fmov	$IN01_2,x8
+	umlal	$ACC2,$IN01_4,${S3}[0]
+	 fmov	$IN01_3,x10
+	 fmov	$IN01_4,x12
+
+	/////////////////////////////////////////////////////////////////
+	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+	// and P. Schwabe
+	//
+	// [see discussion in poly1305-armv4 module]
+
+	ushr	$T0.2d,$ACC3,#26
+	xtn	$H3,$ACC3
+	 ushr	$T1.2d,$ACC0,#26
+	 and	$ACC0,$ACC0,$MASK.2d
+	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
+	bic	$H3,#0xfc,lsl#24	// &=0x03ffffff
+	 add	$ACC1,$ACC1,$T1.2d	// h0 -> h1
+
+	ushr	$T0.2d,$ACC4,#26
+	xtn	$H4,$ACC4
+	 ushr	$T1.2d,$ACC1,#26
+	 xtn	$H1,$ACC1
+	bic	$H4,#0xfc,lsl#24
+	 add	$ACC2,$ACC2,$T1.2d	// h1 -> h2
+
+	add	$ACC0,$ACC0,$T0.2d
+	shl	$T0.2d,$T0.2d,#2
+	 shrn	$T1.2s,$ACC2,#26
+	 xtn	$H2,$ACC2
+	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
+	 bic	$H1,#0xfc,lsl#24
+	 add	$H3,$H3,$T1.2s		// h2 -> h3
+	 bic	$H2,#0xfc,lsl#24
+
+	shrn	$T0.2s,$ACC0,#26
+	xtn	$H0,$ACC0
+	 ushr	$T1.2s,$H3,#26
+	 bic	$H3,#0xfc,lsl#24
+	 bic	$H0,#0xfc,lsl#24
+	add	$H1,$H1,$T0.2s		// h0 -> h1
+	 add	$H4,$H4,$T1.2s		// h3 -> h4
+
+	b.hi	.Loop_neon
+
+.Lskip_loop:
+	dup	$IN23_2,${IN23_2}[0]
+	add	$IN01_2,$IN01_2,$H2
+
+	////////////////////////////////////////////////////////////////
+	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+	adds	$len,$len,#32
+	b.ne	.Long_tail
+
+	dup	$IN23_2,${IN01_2}[0]
+	add	$IN23_0,$IN01_0,$H0
+	add	$IN23_3,$IN01_3,$H3
+	add	$IN23_1,$IN01_1,$H1
+	add	$IN23_4,$IN01_4,$H4
+
+.Long_tail:
+	dup	$IN23_0,${IN23_0}[0]
+	umull2	$ACC0,$IN23_2,${S3}
+	umull2	$ACC3,$IN23_2,${R1}
+	umull2	$ACC4,$IN23_2,${R2}
+	umull2	$ACC2,$IN23_2,${R0}
+	umull2	$ACC1,$IN23_2,${S4}
+
+	dup	$IN23_1,${IN23_1}[0]
+	umlal2	$ACC0,$IN23_0,${R0}
+	umlal2	$ACC2,$IN23_0,${R2}
+	umlal2	$ACC3,$IN23_0,${R3}
+	umlal2	$ACC4,$IN23_0,${R4}
+	umlal2	$ACC1,$IN23_0,${R1}
+
+	dup	$IN23_3,${IN23_3}[0]
+	umlal2	$ACC0,$IN23_1,${S4}
+	umlal2	$ACC3,$IN23_1,${R2}
+	umlal2	$ACC2,$IN23_1,${R1}
+	umlal2	$ACC4,$IN23_1,${R3}
+	umlal2	$ACC1,$IN23_1,${R0}
+
+	dup	$IN23_4,${IN23_4}[0]
+	umlal2	$ACC3,$IN23_3,${R0}
+	umlal2	$ACC4,$IN23_3,${R1}
+	umlal2	$ACC0,$IN23_3,${S2}
+	umlal2	$ACC1,$IN23_3,${S3}
+	umlal2	$ACC2,$IN23_3,${S4}
+
+	umlal2	$ACC3,$IN23_4,${S4}
+	umlal2	$ACC0,$IN23_4,${S1}
+	umlal2	$ACC4,$IN23_4,${R0}
+	umlal2	$ACC1,$IN23_4,${S2}
+	umlal2	$ACC2,$IN23_4,${S3}
+
+	b.eq	.Lshort_tail
+
+	////////////////////////////////////////////////////////////////
+	// (hash+inp[0:1])*r^4:r^3 and accumulate
+
+	add	$IN01_0,$IN01_0,$H0
+	umlal	$ACC3,$IN01_2,${R1}
+	umlal	$ACC0,$IN01_2,${S3}
+	umlal	$ACC4,$IN01_2,${R2}
+	umlal	$ACC1,$IN01_2,${S4}
+	umlal	$ACC2,$IN01_2,${R0}
+
+	add	$IN01_1,$IN01_1,$H1
+	umlal	$ACC3,$IN01_0,${R3}
+	umlal	$ACC0,$IN01_0,${R0}
+	umlal	$ACC4,$IN01_0,${R4}
+	umlal	$ACC1,$IN01_0,${R1}
+	umlal	$ACC2,$IN01_0,${R2}
+
+	add	$IN01_3,$IN01_3,$H3
+	umlal	$ACC3,$IN01_1,${R2}
+	umlal	$ACC0,$IN01_1,${S4}
+	umlal	$ACC4,$IN01_1,${R3}
+	umlal	$ACC1,$IN01_1,${R0}
+	umlal	$ACC2,$IN01_1,${R1}
+
+	add	$IN01_4,$IN01_4,$H4
+	umlal	$ACC3,$IN01_3,${R0}
+	umlal	$ACC0,$IN01_3,${S2}
+	umlal	$ACC4,$IN01_3,${R1}
+	umlal	$ACC1,$IN01_3,${S3}
+	umlal	$ACC2,$IN01_3,${S4}
+
+	umlal	$ACC3,$IN01_4,${S4}
+	umlal	$ACC0,$IN01_4,${S1}
+	umlal	$ACC4,$IN01_4,${R0}
+	umlal	$ACC1,$IN01_4,${S2}
+	umlal	$ACC2,$IN01_4,${S3}
+
+.Lshort_tail:
+	////////////////////////////////////////////////////////////////
+	// horizontal add
+
+	addp	$ACC3,$ACC3,$ACC3
+	 ldp	d8,d9,[sp,#16]		// meet ABI requirements
+	addp	$ACC0,$ACC0,$ACC0
+	 ldp	d10,d11,[sp,#32]
+	addp	$ACC4,$ACC4,$ACC4
+	 ldp	d12,d13,[sp,#48]
+	addp	$ACC1,$ACC1,$ACC1
+	 ldp	d14,d15,[sp,#64]
+	addp	$ACC2,$ACC2,$ACC2
+	 ldr	x30,[sp,#8]
+
+	////////////////////////////////////////////////////////////////
+	// lazy reduction, but without narrowing
+
+	ushr	$T0.2d,$ACC3,#26
+	and	$ACC3,$ACC3,$MASK.2d
+	 ushr	$T1.2d,$ACC0,#26
+	 and	$ACC0,$ACC0,$MASK.2d
+
+	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
+	 add	$ACC1,$ACC1,$T1.2d	// h0 -> h1
+
+	ushr	$T0.2d,$ACC4,#26
+	and	$ACC4,$ACC4,$MASK.2d
+	 ushr	$T1.2d,$ACC1,#26
+	 and	$ACC1,$ACC1,$MASK.2d
+	 add	$ACC2,$ACC2,$T1.2d	// h1 -> h2
+
+	add	$ACC0,$ACC0,$T0.2d
+	shl	$T0.2d,$T0.2d,#2
+	 ushr	$T1.2d,$ACC2,#26
+	 and	$ACC2,$ACC2,$MASK.2d
+	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
+	 add	$ACC3,$ACC3,$T1.2d	// h2 -> h3
+
+	ushr	$T0.2d,$ACC0,#26
+	and	$ACC0,$ACC0,$MASK.2d
+	 ushr	$T1.2d,$ACC3,#26
+	 and	$ACC3,$ACC3,$MASK.2d
+	add	$ACC1,$ACC1,$T0.2d	// h0 -> h1
+	 add	$ACC4,$ACC4,$T1.2d	// h3 -> h4
+
+	////////////////////////////////////////////////////////////////
+	// write the result, can be partially reduced
+
+	st4	{$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
+	mov	x4,#1
+	st1	{$ACC4}[0],[$ctx]
+	str	x4,[$ctx,#8]		// set is_base2_26
+
+	ldr	x29,[sp],#80
+	 .inst	0xd50323bf		// autiasp
+	ret
+.size	poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.pushsection .rodata
+.align	5
+.Lzeros:
+.long	0,0,0,0,0,0,0,0
+.asciz	"Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
+.popsection
+
+.align	2
+#if !defined(__KERNEL__) && !defined(_WIN64)
+.comm	OPENSSL_armcap_P,4,4
+.hidden	OPENSSL_armcap_P
+#endif
+___
+
+foreach (split("\n",$code)) {
+	s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/			or
+	s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/	or
+	(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1))			or
+	(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1))	or
+	(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1))		or
+	(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1))		or
+	(m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
+
+	s/\.[124]([sd])\[/.$1\[/;
+	s/w#x([0-9]+)/w$1/g;
+
+	print $_,"\n";
+}
+close STDOUT;
diff --git a/lib/crypto/arm64/poly1305.h b/lib/crypto/arm64/poly1305.h
new file mode 100644
index 000000000000..b77669767cd6
--- /dev/null
+++ b/lib/crypto/arm64/poly1305.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/hwcap.h>
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+asmlinkage void poly1305_block_init(struct poly1305_block_state *state,
+				    const u8 raw_key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks_arm64(struct poly1305_block_state *state,
+				      const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
+				     const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit(const struct poly1305_state *state,
+			      u8 digest[POLY1305_DIGEST_SIZE],
+			      const u32 nonce[4]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src,
+			    unsigned int len, u32 padbit)
+{
+	if (static_branch_likely(&have_neon) && likely(may_use_simd())) {
+		do {
+			unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+			scoped_ksimd()
+				poly1305_blocks_neon(state, src, todo, padbit);
+
+			len -= todo;
+			src += todo;
+		} while (len);
+	} else
+		poly1305_blocks_arm64(state, src, len, padbit);
+}
+
+#define poly1305_mod_init_arch poly1305_mod_init_arch
+static void poly1305_mod_init_arch(void)
+{
+	if (cpu_have_named_feature(ASIMD))
+		static_branch_enable(&have_neon);
+}
diff --git a/lib/crypto/arm64/polyval-ce-core.S b/lib/crypto/arm64/polyval-ce-core.S
new file mode 100644
index 000000000000..7c731a044d02
--- /dev/null
+++ b/lib/crypto/arm64/polyval-ce-core.S
@@ -0,0 +1,359 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Implementation of POLYVAL using ARMv8 Crypto Extensions.
+ *
+ * Copyright 2021 Google LLC
+ */
+/*
+ * This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions
+ * It works on 8 blocks at a time, by precomputing the first 8 keys powers h^8,
+ * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split
+ * finite field multiplication into two steps.
+ *
+ * In the first step, we consider h^i, m_i as normal polynomials of degree less
+ * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
+ * is simply polynomial multiplication.
+ *
+ * In the second step, we compute the reduction of p(x) modulo the finite field
+ * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
+ * multiplication is finite field multiplication. The advantage is that the
+ * two-step process  only requires 1 finite field reduction for every 8
+ * polynomial multiplications. Further parallelism is gained by interleaving the
+ * multiplications and polynomial reductions.
+ */
+
+#include <linux/linkage.h>
+#define STRIDE_BLOCKS 8
+
+ACCUMULATOR	.req	x0
+KEY_POWERS	.req	x1
+MSG		.req	x2
+BLOCKS_LEFT	.req	x3
+KEY_START	.req	x10
+EXTRA_BYTES	.req	x11
+TMP	.req	x13
+
+M0	.req	v0
+M1	.req	v1
+M2	.req	v2
+M3	.req	v3
+M4	.req	v4
+M5	.req	v5
+M6	.req	v6
+M7	.req	v7
+KEY8	.req	v8
+KEY7	.req	v9
+KEY6	.req	v10
+KEY5	.req	v11
+KEY4	.req	v12
+KEY3	.req	v13
+KEY2	.req	v14
+KEY1	.req	v15
+PL	.req	v16
+PH	.req	v17
+TMP_V	.req	v18
+LO	.req	v20
+MI	.req	v21
+HI	.req	v22
+SUM	.req	v23
+GSTAR	.req	v24
+
+	.text
+
+	.arch	armv8-a+crypto
+	.align	4
+
+.Lgstar:
+	.quad	0xc200000000000000, 0xc200000000000000
+
+/*
+ * Computes the product of two 128-bit polynomials in X and Y and XORs the
+ * components of the 256-bit product into LO, MI, HI.
+ *
+ * Given:
+ *  X = [X_1 : X_0]
+ *  Y = [Y_1 : Y_0]
+ *
+ * We compute:
+ *  LO += X_0 * Y_0
+ *  MI += (X_0 + X_1) * (Y_0 + Y_1)
+ *  HI += X_1 * Y_1
+ *
+ * Later, the 256-bit result can be extracted as:
+ *   [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0]
+ * This step is done when computing the polynomial reduction for efficiency
+ * reasons.
+ *
+ * Karatsuba multiplication is used instead of Schoolbook multiplication because
+ * it was found to be slightly faster on ARM64 CPUs.
+ *
+ */
+.macro karatsuba1 X Y
+	X .req \X
+	Y .req \Y
+	ext	v25.16b, X.16b, X.16b, #8
+	ext	v26.16b, Y.16b, Y.16b, #8
+	eor	v25.16b, v25.16b, X.16b
+	eor	v26.16b, v26.16b, Y.16b
+	pmull2	v28.1q, X.2d, Y.2d
+	pmull	v29.1q, X.1d, Y.1d
+	pmull	v27.1q, v25.1d, v26.1d
+	eor	HI.16b, HI.16b, v28.16b
+	eor	LO.16b, LO.16b, v29.16b
+	eor	MI.16b, MI.16b, v27.16b
+	.unreq X
+	.unreq Y
+.endm
+
+/*
+ * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into
+ * them.
+ */
+.macro karatsuba1_store X Y
+	X .req \X
+	Y .req \Y
+	ext	v25.16b, X.16b, X.16b, #8
+	ext	v26.16b, Y.16b, Y.16b, #8
+	eor	v25.16b, v25.16b, X.16b
+	eor	v26.16b, v26.16b, Y.16b
+	pmull2	HI.1q, X.2d, Y.2d
+	pmull	LO.1q, X.1d, Y.1d
+	pmull	MI.1q, v25.1d, v26.1d
+	.unreq X
+	.unreq Y
+.endm
+
+/*
+ * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
+ * the result in PL, PH.
+ * [PH : PL] =
+ *   [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
+ */
+.macro karatsuba2
+	// v4 = [HI_1 + MI_1 : HI_0 + MI_0]
+	eor	v4.16b, HI.16b, MI.16b
+	// v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
+	eor	v4.16b, v4.16b, LO.16b
+	// v5 = [HI_0 : LO_1]
+	ext	v5.16b, LO.16b, HI.16b, #8
+	// v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
+	eor	v4.16b, v4.16b, v5.16b
+	// HI = [HI_0 : HI_1]
+	ext	HI.16b, HI.16b, HI.16b, #8
+	// LO = [LO_0 : LO_1]
+	ext	LO.16b, LO.16b, LO.16b, #8
+	// PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
+	ext	PH.16b, v4.16b, HI.16b, #8
+	// PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
+	ext	PL.16b, LO.16b, v4.16b, #8
+.endm
+
+/*
+ * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
+ *
+ * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
+ * x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
+ * product of two 128-bit polynomials in Montgomery form.  We need to reduce it
+ * mod g(x).  Also, since polynomials in Montgomery form have an "extra" factor
+ * of x^128, this product has two extra factors of x^128.  To get it back into
+ * Montgomery form, we need to remove one of these factors by dividing by x^128.
+ *
+ * To accomplish both of these goals, we add multiples of g(x) that cancel out
+ * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
+ * bits are zero, the polynomial division by x^128 can be done by right
+ * shifting.
+ *
+ * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
+ * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x).  The CPU can
+ * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
+ * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x).  Adding this to
+ * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
+ * = T_1 : T_0 = g*(x) * P_0.  Thus, bits 0-63 got "folded" into bits 64-191.
+ *
+ * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
+ * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
+ * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
+ * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
+ * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
+ *
+ * So our final computation is:
+ *   T = T_1 : T_0 = g*(x) * P_0
+ *   V = V_1 : V_0 = g*(x) * (P_1 + T_0)
+ *   p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
+ *
+ * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
+ * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
+ * T_1 into dest.  This allows us to reuse P_1 + T_0 when computing V.
+ */
+.macro montgomery_reduction dest
+	DEST .req \dest
+	// TMP_V = T_1 : T_0 = P_0 * g*(x)
+	pmull	TMP_V.1q, PL.1d, GSTAR.1d
+	// TMP_V = T_0 : T_1
+	ext	TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
+	// TMP_V = P_1 + T_0 : P_0 + T_1
+	eor	TMP_V.16b, PL.16b, TMP_V.16b
+	// PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
+	eor	PH.16b, PH.16b, TMP_V.16b
+	// TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
+	pmull2	TMP_V.1q, TMP_V.2d, GSTAR.2d
+	eor	DEST.16b, PH.16b, TMP_V.16b
+	.unreq DEST
+.endm
+
+/*
+ * Compute Polyval on 8 blocks.
+ *
+ * If reduce is set, also computes the montgomery reduction of the
+ * previous full_stride call and XORs with the first message block.
+ * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
+ * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
+ *
+ * Sets PL, PH.
+ */
+.macro full_stride reduce
+	eor		LO.16b, LO.16b, LO.16b
+	eor		MI.16b, MI.16b, MI.16b
+	eor		HI.16b, HI.16b, HI.16b
+
+	ld1		{M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
+	ld1		{M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64
+
+	karatsuba1 M7 KEY1
+	.if \reduce
+	pmull	TMP_V.1q, PL.1d, GSTAR.1d
+	.endif
+
+	karatsuba1 M6 KEY2
+	.if \reduce
+	ext	TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
+	.endif
+
+	karatsuba1 M5 KEY3
+	.if \reduce
+	eor	TMP_V.16b, PL.16b, TMP_V.16b
+	.endif
+
+	karatsuba1 M4 KEY4
+	.if \reduce
+	eor	PH.16b, PH.16b, TMP_V.16b
+	.endif
+
+	karatsuba1 M3 KEY5
+	.if \reduce
+	pmull2	TMP_V.1q, TMP_V.2d, GSTAR.2d
+	.endif
+
+	karatsuba1 M2 KEY6
+	.if \reduce
+	eor	SUM.16b, PH.16b, TMP_V.16b
+	.endif
+
+	karatsuba1 M1 KEY7
+	eor	M0.16b, M0.16b, SUM.16b
+
+	karatsuba1 M0 KEY8
+	karatsuba2
+.endm
+
+/*
+ * Handle any extra blocks after full_stride loop.
+ */
+.macro partial_stride
+	add	KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
+	sub	KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4
+	ld1	{KEY1.16b}, [KEY_POWERS], #16
+
+	ld1	{TMP_V.16b}, [MSG], #16
+	eor	SUM.16b, SUM.16b, TMP_V.16b
+	karatsuba1_store KEY1 SUM
+	sub	BLOCKS_LEFT, BLOCKS_LEFT, #1
+
+	tst	BLOCKS_LEFT, #4
+	beq	.Lpartial4BlocksDone
+	ld1	{M0.16b, M1.16b,  M2.16b, M3.16b}, [MSG], #64
+	ld1	{KEY8.16b, KEY7.16b, KEY6.16b,	KEY5.16b}, [KEY_POWERS], #64
+	karatsuba1 M0 KEY8
+	karatsuba1 M1 KEY7
+	karatsuba1 M2 KEY6
+	karatsuba1 M3 KEY5
+.Lpartial4BlocksDone:
+	tst	BLOCKS_LEFT, #2
+	beq	.Lpartial2BlocksDone
+	ld1	{M0.16b, M1.16b}, [MSG], #32
+	ld1	{KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
+	karatsuba1 M0 KEY8
+	karatsuba1 M1 KEY7
+.Lpartial2BlocksDone:
+	tst	BLOCKS_LEFT, #1
+	beq	.LpartialDone
+	ld1	{M0.16b}, [MSG], #16
+	ld1	{KEY8.16b}, [KEY_POWERS], #16
+	karatsuba1 M0 KEY8
+.LpartialDone:
+	karatsuba2
+	montgomery_reduction SUM
+.endm
+
+/*
+ * Computes a = a * b * x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * void polyval_mul_pmull(struct polyval_elem *a,
+ *			  const struct polyval_elem *b);
+ */
+SYM_FUNC_START(polyval_mul_pmull)
+	adr	TMP, .Lgstar
+	ld1	{GSTAR.2d}, [TMP]
+	ld1	{v0.16b}, [x0]
+	ld1	{v1.16b}, [x1]
+	karatsuba1_store v0 v1
+	karatsuba2
+	montgomery_reduction SUM
+	st1	{SUM.16b}, [x0]
+	ret
+SYM_FUNC_END(polyval_mul_pmull)
+
+/*
+ * Perform polynomial evaluation as specified by POLYVAL.  This computes:
+ *	h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
+ * where n=nblocks, h is the hash key, and m_i are the message blocks.
+ *
+ * x0 - pointer to accumulator
+ * x1 - pointer to precomputed key powers h^8 ... h^1
+ * x2 - pointer to message blocks
+ * x3 - number of blocks to hash
+ *
+ * void polyval_blocks_pmull(struct polyval_elem *acc,
+ *			     const struct polyval_key *key,
+ *			     const u8 *data, size_t nblocks);
+ */
+SYM_FUNC_START(polyval_blocks_pmull)
+	adr	TMP, .Lgstar
+	mov	KEY_START, KEY_POWERS
+	ld1	{GSTAR.2d}, [TMP]
+	ld1	{SUM.16b}, [ACCUMULATOR]
+	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+	blt .LstrideLoopExit
+	ld1	{KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
+	ld1	{KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
+	full_stride 0
+	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+	blt .LstrideLoopExitReduce
+.LstrideLoop:
+	full_stride 1
+	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+	bge	.LstrideLoop
+.LstrideLoopExitReduce:
+	montgomery_reduction SUM
+.LstrideLoopExit:
+	adds	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+	beq	.LskipPartial
+	partial_stride
+.LskipPartial:
+	st1	{SUM.16b}, [ACCUMULATOR]
+	ret
+SYM_FUNC_END(polyval_blocks_pmull)
diff --git a/lib/crypto/arm64/polyval.h b/lib/crypto/arm64/polyval.h
new file mode 100644
index 000000000000..a39763395e9b
--- /dev/null
+++ b/lib/crypto/arm64/polyval.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * POLYVAL library functions, arm64 optimized
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+#define NUM_H_POWERS 8
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
+
+asmlinkage void polyval_mul_pmull(struct polyval_elem *a,
+				  const struct polyval_elem *b);
+asmlinkage void polyval_blocks_pmull(struct polyval_elem *acc,
+				     const struct polyval_key *key,
+				     const u8 *data, size_t nblocks);
+
+static void polyval_preparekey_arch(struct polyval_key *key,
+				    const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+	static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS);
+	memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE);
+	if (static_branch_likely(&have_pmull) && may_use_simd()) {
+		scoped_ksimd() {
+			for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+				key->h_powers[i] = key->h_powers[i + 1];
+				polyval_mul_pmull(
+					&key->h_powers[i],
+					&key->h_powers[NUM_H_POWERS - 1]);
+			}
+		}
+	} else {
+		for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+			key->h_powers[i] = key->h_powers[i + 1];
+			polyval_mul_generic(&key->h_powers[i],
+					    &key->h_powers[NUM_H_POWERS - 1]);
+		}
+	}
+}
+
+static void polyval_mul_arch(struct polyval_elem *acc,
+			     const struct polyval_key *key)
+{
+	if (static_branch_likely(&have_pmull) && may_use_simd()) {
+		scoped_ksimd()
+			polyval_mul_pmull(acc, &key->h_powers[NUM_H_POWERS - 1]);
+	} else {
+		polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]);
+	}
+}
+
+static void polyval_blocks_arch(struct polyval_elem *acc,
+				const struct polyval_key *key,
+				const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_pmull) && may_use_simd()) {
+		do {
+			/* Allow rescheduling every 4 KiB. */
+			size_t n = min_t(size_t, nblocks,
+					 4096 / POLYVAL_BLOCK_SIZE);
+
+			scoped_ksimd()
+				polyval_blocks_pmull(acc, key, data, n);
+			data += n * POLYVAL_BLOCK_SIZE;
+			nblocks -= n;
+		} while (nblocks);
+	} else {
+		polyval_blocks_generic(acc, &key->h_powers[NUM_H_POWERS - 1],
+				       data, nblocks);
+	}
+}
+
+#define polyval_mod_init_arch polyval_mod_init_arch
+static void polyval_mod_init_arch(void)
+{
+	if (cpu_have_named_feature(PMULL))
+		static_branch_enable(&have_pmull);
+}
diff --git a/lib/crypto/arm64/sha1-ce-core.S b/lib/crypto/arm64/sha1-ce-core.S
new file mode 100644
index 000000000000..8fbd4767f0f0
--- /dev/null
+++ b/lib/crypto/arm64/sha1-ce-core.S
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SHA-1 secure hash using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.arch		armv8-a+crypto
+
+	k0		.req	v0
+	k1		.req	v1
+	k2		.req	v2
+	k3		.req	v3
+
+	t0		.req	v4
+	t1		.req	v5
+
+	dga		.req	q6
+	dgav		.req	v6
+	dgb		.req	s7
+	dgbv		.req	v7
+
+	dg0q		.req	q12
+	dg0s		.req	s12
+	dg0v		.req	v12
+	dg1s		.req	s13
+	dg1v		.req	v13
+	dg2s		.req	s14
+
+	.macro		add_only, op, ev, rc, s0, dg1
+	.ifc		\ev, ev
+	add		t1.4s, v\s0\().4s, \rc\().4s
+	sha1h		dg2s, dg0s
+	.ifnb		\dg1
+	sha1\op		dg0q, \dg1, t0.4s
+	.else
+	sha1\op		dg0q, dg1s, t0.4s
+	.endif
+	.else
+	.ifnb		\s0
+	add		t0.4s, v\s0\().4s, \rc\().4s
+	.endif
+	sha1h		dg1s, dg0s
+	sha1\op		dg0q, dg2s, t1.4s
+	.endif
+	.endm
+
+	.macro		add_update, op, ev, rc, s0, s1, s2, s3, dg1
+	sha1su0		v\s0\().4s, v\s1\().4s, v\s2\().4s
+	add_only	\op, \ev, \rc, \s1, \dg1
+	sha1su1		v\s0\().4s, v\s3\().4s
+	.endm
+
+	.macro		loadrc, k, val, tmp
+	movz		\tmp, :abs_g0_nc:\val
+	movk		\tmp, :abs_g1:\val
+	dup		\k, \tmp
+	.endm
+
+	/*
+	 * size_t __sha1_ce_transform(struct sha1_block_state *state,
+	 *			      const u8 *data, size_t nblocks);
+	 */
+SYM_FUNC_START(__sha1_ce_transform)
+	/* load round constants */
+	loadrc		k0.4s, 0x5a827999, w6
+	loadrc		k1.4s, 0x6ed9eba1, w6
+	loadrc		k2.4s, 0x8f1bbcdc, w6
+	loadrc		k3.4s, 0xca62c1d6, w6
+
+	/* load state */
+	ld1		{dgav.4s}, [x0]
+	ldr		dgb, [x0, #16]
+
+	/* load input */
+0:	ld1		{v8.4s-v11.4s}, [x1], #64
+	sub		x2, x2, #1
+
+CPU_LE(	rev32		v8.16b, v8.16b		)
+CPU_LE(	rev32		v9.16b, v9.16b		)
+CPU_LE(	rev32		v10.16b, v10.16b	)
+CPU_LE(	rev32		v11.16b, v11.16b	)
+
+	add		t0.4s, v8.4s, k0.4s
+	mov		dg0v.16b, dgav.16b
+
+	add_update	c, ev, k0,  8,  9, 10, 11, dgb
+	add_update	c, od, k0,  9, 10, 11,  8
+	add_update	c, ev, k0, 10, 11,  8,  9
+	add_update	c, od, k0, 11,  8,  9, 10
+	add_update	c, ev, k1,  8,  9, 10, 11
+
+	add_update	p, od, k1,  9, 10, 11,  8
+	add_update	p, ev, k1, 10, 11,  8,  9
+	add_update	p, od, k1, 11,  8,  9, 10
+	add_update	p, ev, k1,  8,  9, 10, 11
+	add_update	p, od, k2,  9, 10, 11,  8
+
+	add_update	m, ev, k2, 10, 11,  8,  9
+	add_update	m, od, k2, 11,  8,  9, 10
+	add_update	m, ev, k2,  8,  9, 10, 11
+	add_update	m, od, k2,  9, 10, 11,  8
+	add_update	m, ev, k3, 10, 11,  8,  9
+
+	add_update	p, od, k3, 11,  8,  9, 10
+	add_only	p, ev, k3,  9
+	add_only	p, od, k3, 10
+	add_only	p, ev, k3, 11
+	add_only	p, od
+
+	/* update state */
+	add		dgbv.2s, dgbv.2s, dg1v.2s
+	add		dgav.4s, dgav.4s, dg0v.4s
+
+	/* return early if voluntary preemption is needed */
+	cond_yield	1f, x5, x6
+
+	/* handled all input blocks? */
+	cbnz		x2, 0b
+
+	/* store new state */
+1:	st1		{dgav.4s}, [x0]
+	str		dgb, [x0, #16]
+	mov		x0, x2
+	ret
+SYM_FUNC_END(__sha1_ce_transform)
diff --git a/lib/crypto/arm64/sha1.h b/lib/crypto/arm64/sha1.h
new file mode 100644
index 000000000000..bc7071f1be09
--- /dev/null
+++ b/lib/crypto/arm64/sha1.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-1 optimized for ARM64
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+asmlinkage size_t __sha1_ce_transform(struct sha1_block_state *state,
+				      const u8 *data, size_t nblocks);
+
+static void sha1_blocks(struct sha1_block_state *state,
+			const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_ce) && likely(may_use_simd())) {
+		do {
+			size_t rem;
+
+			scoped_ksimd()
+				rem = __sha1_ce_transform(state, data, nblocks);
+
+			data += (nblocks - rem) * SHA1_BLOCK_SIZE;
+			nblocks = rem;
+		} while (nblocks);
+	} else {
+		sha1_blocks_generic(state, data, nblocks);
+	}
+}
+
+#define sha1_mod_init_arch sha1_mod_init_arch
+static void sha1_mod_init_arch(void)
+{
+	if (cpu_have_named_feature(SHA1))
+		static_branch_enable(&have_ce);
+}
diff --git a/lib/crypto/arm64/sha2-armv8.pl b/lib/crypto/arm64/sha2-armv8.pl
new file mode 100644
index 000000000000..35ec9ae99fe1
--- /dev/null
+++ b/lib/crypto/arm64/sha2-armv8.pl
@@ -0,0 +1,786 @@
+#! /usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from the OpenSSL project but the author (Andy Polyakov)
+# has relicensed it under the GPLv2. Therefore this program is free software;
+# you can redistribute it and/or modify it under the terms of the GNU General
+# Public License version 2 as published by the Free Software Foundation.
+#
+# The original headers, including the original license headers, are
+# included below for completeness.
+
+# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA256/512 for ARMv8.
+#
+# Performance in cycles per processed byte and improvement coefficient
+# over code generated with "default" compiler:
+#
+#		SHA256-hw	SHA256(*)	SHA512
+# Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
+# Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
+# Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
+# Denver	2.01		10.5 (+26%)	6.70 (+8%)
+# X-Gene			20.0 (+100%)	12.8 (+300%(***))
+# Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
+#
+# (*)	Software SHA256 results are of lesser relevance, presented
+#	mostly for informational purposes.
+# (**)	The result is a trade-off: it's possible to improve it by
+#	10% (or by 1 cycle per round), but at the cost of 20% loss
+#	on Cortex-A53 (or by 4 cycles per round).
+# (***)	Super-impressive coefficients over gcc-generated code are
+#	indication of some compiler "pathology", most notably code
+#	generated with -mgeneral-regs-only is significantly faster
+#	and the gap is only 40-90%.
+#
+# October 2016.
+#
+# Originally it was reckoned that it makes no sense to implement NEON
+# version of SHA256 for 64-bit processors. This is because performance
+# improvement on most wide-spread Cortex-A5x processors was observed
+# to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
+# observed that 32-bit NEON SHA256 performs significantly better than
+# 64-bit scalar version on *some* of the more recent processors. As
+# result 64-bit NEON version of SHA256 was added to provide best
+# all-round performance. For example it executes ~30% faster on X-Gene
+# and Mongoose. [For reference, NEON version of SHA512 is bound to
+# deliver much less improvement, likely *negative* on Cortex-A5x.
+# Which is why NEON support is limited to SHA256.]
+
+$output=pop;
+$flavour=pop;
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open OUT,"| \"$^X\" $xlate $flavour $output";
+    *STDOUT=*OUT;
+} else {
+    open STDOUT,">$output";
+}
+
+if ($output =~ /512/) {
+	$BITS=512;
+	$SZ=8;
+	@Sigma0=(28,34,39);
+	@Sigma1=(14,18,41);
+	@sigma0=(1,  8, 7);
+	@sigma1=(19,61, 6);
+	$rounds=80;
+	$reg_t="x";
+} else {
+	$BITS=256;
+	$SZ=4;
+	@Sigma0=( 2,13,22);
+	@Sigma1=( 6,11,25);
+	@sigma0=( 7,18, 3);
+	@sigma1=(17,19,10);
+	$rounds=64;
+	$reg_t="w";
+}
+
+$func="sha${BITS}_block_data_order";
+
+($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
+
+@X=map("$reg_t$_",(3..15,0..2));
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
+($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
+
+sub BODY_00_xx {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my $j=($i+1)&15;
+my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
+   $T0=@X[$i+3] if ($i<11);
+
+$code.=<<___	if ($i<16);
+#ifndef	__AARCH64EB__
+	rev	@X[$i],@X[$i]			// $i
+#endif
+___
+$code.=<<___	if ($i<13 && ($i&1));
+	ldp	@X[$i+1],@X[$i+2],[$inp],#2*$SZ
+___
+$code.=<<___	if ($i==13);
+	ldp	@X[14],@X[15],[$inp]
+___
+$code.=<<___	if ($i>=14);
+	ldr	@X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
+___
+$code.=<<___	if ($i>0 && $i<16);
+	add	$a,$a,$t1			// h+=Sigma0(a)
+___
+$code.=<<___	if ($i>=11);
+	str	@X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
+___
+# While ARMv8 specifies merged rotate-n-logical operation such as
+# 'eor x,y,z,ror#n', it was found to negatively affect performance
+# on Apple A7. The reason seems to be that it requires even 'y' to
+# be available earlier. This means that such merged instruction is
+# not necessarily best choice on critical path... On the other hand
+# Cortex-A5x handles merged instructions much better than disjoint
+# rotate and logical... See (**) footnote above.
+$code.=<<___	if ($i<15);
+	ror	$t0,$e,#$Sigma1[0]
+	add	$h,$h,$t2			// h+=K[i]
+	eor	$T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
+	and	$t1,$f,$e
+	bic	$t2,$g,$e
+	add	$h,$h,@X[$i&15]			// h+=X[i]
+	orr	$t1,$t1,$t2			// Ch(e,f,g)
+	eor	$t2,$a,$b			// a^b, b^c in next round
+	eor	$t0,$t0,$T0,ror#$Sigma1[1]	// Sigma1(e)
+	ror	$T0,$a,#$Sigma0[0]
+	add	$h,$h,$t1			// h+=Ch(e,f,g)
+	eor	$t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
+	add	$h,$h,$t0			// h+=Sigma1(e)
+	and	$t3,$t3,$t2			// (b^c)&=(a^b)
+	add	$d,$d,$h			// d+=h
+	eor	$t3,$t3,$b			// Maj(a,b,c)
+	eor	$t1,$T0,$t1,ror#$Sigma0[1]	// Sigma0(a)
+	add	$h,$h,$t3			// h+=Maj(a,b,c)
+	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
+	//add	$h,$h,$t1			// h+=Sigma0(a)
+___
+$code.=<<___	if ($i>=15);
+	ror	$t0,$e,#$Sigma1[0]
+	add	$h,$h,$t2			// h+=K[i]
+	ror	$T1,@X[($j+1)&15],#$sigma0[0]
+	and	$t1,$f,$e
+	ror	$T2,@X[($j+14)&15],#$sigma1[0]
+	bic	$t2,$g,$e
+	ror	$T0,$a,#$Sigma0[0]
+	add	$h,$h,@X[$i&15]			// h+=X[i]
+	eor	$t0,$t0,$e,ror#$Sigma1[1]
+	eor	$T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
+	orr	$t1,$t1,$t2			// Ch(e,f,g)
+	eor	$t2,$a,$b			// a^b, b^c in next round
+	eor	$t0,$t0,$e,ror#$Sigma1[2]	// Sigma1(e)
+	eor	$T0,$T0,$a,ror#$Sigma0[1]
+	add	$h,$h,$t1			// h+=Ch(e,f,g)
+	and	$t3,$t3,$t2			// (b^c)&=(a^b)
+	eor	$T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
+	eor	$T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]	// sigma0(X[i+1])
+	add	$h,$h,$t0			// h+=Sigma1(e)
+	eor	$t3,$t3,$b			// Maj(a,b,c)
+	eor	$t1,$T0,$a,ror#$Sigma0[2]	// Sigma0(a)
+	eor	$T2,$T2,@X[($j+14)&15],lsr#$sigma1[2]	// sigma1(X[i+14])
+	add	@X[$j],@X[$j],@X[($j+9)&15]
+	add	$d,$d,$h			// d+=h
+	add	$h,$h,$t3			// h+=Maj(a,b,c)
+	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
+	add	@X[$j],@X[$j],$T1
+	add	$h,$h,$t1			// h+=Sigma0(a)
+	add	@X[$j],@X[$j],$T2
+___
+	($t2,$t3)=($t3,$t2);
+}
+
+$code.=<<___;
+#ifndef	__KERNEL__
+# include "arm_arch.h"
+#endif
+
+.text
+
+.extern	OPENSSL_armcap_P
+.globl	$func
+.type	$func,%function
+.align	6
+$func:
+___
+$code.=<<___	if ($SZ==4);
+#ifndef	__KERNEL__
+# ifdef	__ILP32__
+	ldrsw	x16,.LOPENSSL_armcap_P
+# else
+	ldr	x16,.LOPENSSL_armcap_P
+# endif
+	adr	x17,.LOPENSSL_armcap_P
+	add	x16,x16,x17
+	ldr	w16,[x16]
+	tst	w16,#ARMV8_SHA256
+	b.ne	.Lv8_entry
+	tst	w16,#ARMV7_NEON
+	b.ne	.Lneon_entry
+#endif
+___
+$code.=<<___;
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#4*$SZ
+
+	ldp	$A,$B,[$ctx]				// load context
+	ldp	$C,$D,[$ctx,#2*$SZ]
+	ldp	$E,$F,[$ctx,#4*$SZ]
+	add	$num,$inp,$num,lsl#`log(16*$SZ)/log(2)`	// end of input
+	ldp	$G,$H,[$ctx,#6*$SZ]
+	adr	$Ktbl,.LK$BITS
+	stp	$ctx,$num,[x29,#96]
+
+.Loop:
+	ldp	@X[0],@X[1],[$inp],#2*$SZ
+	ldr	$t2,[$Ktbl],#$SZ			// *K++
+	eor	$t3,$B,$C				// magic seed
+	str	$inp,[x29,#112]
+___
+for ($i=0;$i<16;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=".Loop_16_xx:\n";
+for (;$i<32;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	cbnz	$t2,.Loop_16_xx
+
+	ldp	$ctx,$num,[x29,#96]
+	ldr	$inp,[x29,#112]
+	sub	$Ktbl,$Ktbl,#`$SZ*($rounds+1)`		// rewind
+
+	ldp	@X[0],@X[1],[$ctx]
+	ldp	@X[2],@X[3],[$ctx,#2*$SZ]
+	add	$inp,$inp,#14*$SZ			// advance input pointer
+	ldp	@X[4],@X[5],[$ctx,#4*$SZ]
+	add	$A,$A,@X[0]
+	ldp	@X[6],@X[7],[$ctx,#6*$SZ]
+	add	$B,$B,@X[1]
+	add	$C,$C,@X[2]
+	add	$D,$D,@X[3]
+	stp	$A,$B,[$ctx]
+	add	$E,$E,@X[4]
+	add	$F,$F,@X[5]
+	stp	$C,$D,[$ctx,#2*$SZ]
+	add	$G,$G,@X[6]
+	add	$H,$H,@X[7]
+	cmp	$inp,$num
+	stp	$E,$F,[$ctx,#4*$SZ]
+	stp	$G,$H,[$ctx,#6*$SZ]
+	b.ne	.Loop
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#4*$SZ
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#128
+	ret
+.size	$func,.-$func
+
+.align	6
+.type	.LK$BITS,%object
+.LK$BITS:
+___
+$code.=<<___ if ($SZ==8);
+	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad	0x3956c25bf348b538,0x59f111f1b605d019
+	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad	0xd807aa98a3030242,0x12835b0145706fbe
+	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad	0x06ca6351e003826f,0x142929670a0e6e70
+	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad	0x81c2c92e47edaee6,0x92722c851482353b
+	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad	0xd192e819d6ef5218,0xd69906245565a910
+	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad	0x90befffa23631e28,0xa4506cebde82bde9
+	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad	0xca273eceea26619c,0xd186b8c721c0c207
+	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad	0x113f9804bef90dae,0x1b710b35131c471b
+	.quad	0x28db77f523047d84,0x32caab7b40c72493
+	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+	.quad	0	// terminator
+___
+$code.=<<___ if ($SZ==4);
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+	.long	0	//terminator
+___
+$code.=<<___;
+.size	.LK$BITS,.-.LK$BITS
+#ifndef	__KERNEL__
+.align	3
+.LOPENSSL_armcap_P:
+# ifdef	__ILP32__
+	.long	OPENSSL_armcap_P-.
+# else
+	.quad	OPENSSL_armcap_P-.
+# endif
+#endif
+.asciz	"SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+___
+
+if ($SZ==4) {
+my $Ktbl="x3";
+
+my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
+my @MSG=map("v$_.16b",(4..7));
+my ($W0,$W1)=("v16.4s","v17.4s");
+my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
+
+$code.=<<___;
+#ifndef	__KERNEL__
+.type	sha256_block_armv8,%function
+.align	6
+sha256_block_armv8:
+.Lv8_entry:
+	stp		x29,x30,[sp,#-16]!
+	add		x29,sp,#0
+
+	ld1.32		{$ABCD,$EFGH},[$ctx]
+	adr		$Ktbl,.LK256
+
+.Loop_hw:
+	ld1		{@MSG[0]-@MSG[3]},[$inp],#64
+	sub		$num,$num,#1
+	ld1.32		{$W0},[$Ktbl],#16
+	rev32		@MSG[0],@MSG[0]
+	rev32		@MSG[1],@MSG[1]
+	rev32		@MSG[2],@MSG[2]
+	rev32		@MSG[3],@MSG[3]
+	orr		$ABCD_SAVE,$ABCD,$ABCD		// offload
+	orr		$EFGH_SAVE,$EFGH,$EFGH
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+	ld1.32		{$W1},[$Ktbl],#16
+	add.i32		$W0,$W0,@MSG[0]
+	sha256su0	@MSG[0],@MSG[1]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+	sha256su1	@MSG[0],@MSG[2],@MSG[3]
+___
+	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+	ld1.32		{$W1},[$Ktbl],#16
+	add.i32		$W0,$W0,@MSG[0]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	ld1.32		{$W0},[$Ktbl],#16
+	add.i32		$W1,$W1,@MSG[1]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	ld1.32		{$W1},[$Ktbl]
+	add.i32		$W0,$W0,@MSG[2]
+	sub		$Ktbl,$Ktbl,#$rounds*$SZ-16	// rewind
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	add.i32		$W1,$W1,@MSG[3]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	add.i32		$ABCD,$ABCD,$ABCD_SAVE
+	add.i32		$EFGH,$EFGH,$EFGH_SAVE
+
+	cbnz		$num,.Loop_hw
+
+	st1.32		{$ABCD,$EFGH},[$ctx]
+
+	ldr		x29,[sp],#16
+	ret
+.size	sha256_block_armv8,.-sha256_block_armv8
+#endif
+___
+}
+
+if ($SZ==4) {	######################################### NEON stuff #
+# You'll surely note a lot of similarities with sha256-armv4 module,
+# and of course it's not a coincidence. sha256-armv4 was used as
+# initial template, but was adapted for ARMv8 instruction set and
+# extensively re-tuned for all-round performance.
+
+my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
+my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
+my $Ktbl="x16";
+my $Xfer="x17";
+my @X = map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
+my $j=0;
+
+sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+  my $arg = pop;
+    $arg = "#$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
+sub Dlo     { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
+sub Dhi     { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
+
+sub Xupdate()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+	&ext_8		($T0,@X[0],@X[1],4);	# X[1..4]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&ext_8		($T3,@X[2],@X[3],4);	# X[9..12]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&mov		(&Dscalar($T7),&Dhi(@X[3]));	# X[14..15]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&ushr_32	($T2,$T0,$sigma0[0]);
+	 eval(shift(@insns));
+	&ushr_32	($T1,$T0,$sigma0[2]);
+	 eval(shift(@insns));
+	&add_32 	(@X[0],@X[0],$T3);	# X[0..3] += X[9..12]
+	 eval(shift(@insns));
+	&sli_32		($T2,$T0,32-$sigma0[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&ushr_32	($T3,$T0,$sigma0[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&eor_8		($T1,$T1,$T2);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&sli_32		($T3,$T0,32-$sigma0[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &ushr_32	($T4,$T7,$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&eor_8		($T1,$T1,$T3);		# sigma0(X[1..4])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &sli_32	($T4,$T7,32-$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &ushr_32	($T5,$T7,$sigma1[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &ushr_32	($T3,$T7,$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&add_32		(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &sli_u32	($T3,$T7,32-$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &eor_8	($T5,$T5,$T4);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &eor_8	($T5,$T5,$T3);		# sigma1(X[14..15])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&add_32		(@X[0],@X[0],$T5);	# X[0..1] += sigma1(X[14..15])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &ushr_32	($T6,@X[0],$sigma1[0]);
+	 eval(shift(@insns));
+	  &ushr_32	($T7,@X[0],$sigma1[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &sli_32	($T6,@X[0],32-$sigma1[0]);
+	 eval(shift(@insns));
+	  &ushr_32	($T5,@X[0],$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &eor_8	($T7,$T7,$T6);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &sli_32	($T5,@X[0],32-$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&ld1_32		("{$T0}","[$Ktbl], #16");
+	 eval(shift(@insns));
+	  &eor_8	($T7,$T7,$T5);		# sigma1(X[16..17])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&eor_8		($T5,$T5,$T5);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&mov		(&Dhi($T5), &Dlo($T7));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&add_32		(@X[0],@X[0],$T5);	# X[2..3] += sigma1(X[16..17])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&add_32		($T0,$T0,@X[0]);
+	 while($#insns>=1) { eval(shift(@insns)); }
+	&st1_32		("{$T0}","[$Xfer], #16");
+	 eval(shift(@insns));
+
+	push(@X,shift(@X));		# "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&ld1_8		("{@X[0]}","[$inp],#16");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&ld1_32		("{$T0}","[$Ktbl],#16");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&rev32		(@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&add_32		($T0,$T0,@X[0]);
+	 foreach (@insns) { eval; }	# remaining instructions
+	&st1_32		("{$T0}","[$Xfer], #16");
+
+	push(@X,shift(@X));		# "rotate" X[]
+}
+
+sub body_00_15 () {
+	(
+	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
+	'&add	($a,$a,$t4);'.			# h+=Sigma0(a) from the past
+	'&and	($t1,$f,$e)',
+	'&bic	($t4,$g,$e)',
+	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
+	'&orr	($t1,$t1,$t4)',			# Ch(e,f,g)
+	'&eor	($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
+	'&eor	($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
+	'&ror	($t0,$t0,"#$Sigma1[0]")',
+	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
+	'&eor	($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
+	'&add	($h,$h,$t0)',			# h+=Sigma1(e)
+	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
+	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
+	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
+	'&ror	($t4,$t4,"#$Sigma0[0]")',
+	'&add	($d,$d,$h)',			# d+=h
+	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
+	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+	)
+}
+
+$code.=<<___;
+#ifdef	__KERNEL__
+.globl	sha256_block_neon
+#endif
+.type	sha256_block_neon,%function
+.align	4
+sha256_block_neon:
+.Lneon_entry:
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
+	sub	sp,sp,#16*4
+
+	adr	$Ktbl,.LK256
+	add	$num,$inp,$num,lsl#6	// len to point at the end of inp
+
+	ld1.8	{@X[0]},[$inp], #16
+	ld1.8	{@X[1]},[$inp], #16
+	ld1.8	{@X[2]},[$inp], #16
+	ld1.8	{@X[3]},[$inp], #16
+	ld1.32	{$T0},[$Ktbl], #16
+	ld1.32	{$T1},[$Ktbl], #16
+	ld1.32	{$T2},[$Ktbl], #16
+	ld1.32	{$T3},[$Ktbl], #16
+	rev32	@X[0],@X[0]		// yes, even on
+	rev32	@X[1],@X[1]		// big-endian
+	rev32	@X[2],@X[2]
+	rev32	@X[3],@X[3]
+	mov	$Xfer,sp
+	add.32	$T0,$T0,@X[0]
+	add.32	$T1,$T1,@X[1]
+	add.32	$T2,$T2,@X[2]
+	st1.32	{$T0-$T1},[$Xfer], #32
+	add.32	$T3,$T3,@X[3]
+	st1.32	{$T2-$T3},[$Xfer]
+	sub	$Xfer,$Xfer,#32
+
+	ldp	$A,$B,[$ctx]
+	ldp	$C,$D,[$ctx,#8]
+	ldp	$E,$F,[$ctx,#16]
+	ldp	$G,$H,[$ctx,#24]
+	ldr	$t1,[sp,#0]
+	mov	$t2,wzr
+	eor	$t3,$B,$C
+	mov	$t4,wzr
+	b	.L_00_48
+
+.align	4
+.L_00_48:
+___
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+$code.=<<___;
+	cmp	$t1,#0				// check for K256 terminator
+	ldr	$t1,[sp,#0]
+	sub	$Xfer,$Xfer,#64
+	bne	.L_00_48
+
+	sub	$Ktbl,$Ktbl,#256		// rewind $Ktbl
+	cmp	$inp,$num
+	mov	$Xfer, #64
+	csel	$Xfer, $Xfer, xzr, eq
+	sub	$inp,$inp,$Xfer			// avoid SEGV
+	mov	$Xfer,sp
+___
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+$code.=<<___;
+	add	$A,$A,$t4			// h+=Sigma0(a) from the past
+	ldp	$t0,$t1,[$ctx,#0]
+	add	$A,$A,$t2			// h+=Maj(a,b,c) from the past
+	ldp	$t2,$t3,[$ctx,#8]
+	add	$A,$A,$t0			// accumulate
+	add	$B,$B,$t1
+	ldp	$t0,$t1,[$ctx,#16]
+	add	$C,$C,$t2
+	add	$D,$D,$t3
+	ldp	$t2,$t3,[$ctx,#24]
+	add	$E,$E,$t0
+	add	$F,$F,$t1
+	 ldr	$t1,[sp,#0]
+	stp	$A,$B,[$ctx,#0]
+	add	$G,$G,$t2
+	 mov	$t2,wzr
+	stp	$C,$D,[$ctx,#8]
+	add	$H,$H,$t3
+	stp	$E,$F,[$ctx,#16]
+	 eor	$t3,$B,$C
+	stp	$G,$H,[$ctx,#24]
+	 mov	$t4,wzr
+	 mov	$Xfer,sp
+	b.ne	.L_00_48
+
+	ldr	x29,[x29]
+	add	sp,sp,#16*4+16
+	ret
+.size	sha256_block_neon,.-sha256_block_neon
+___
+}
+
+$code.=<<___;
+#ifndef	__KERNEL__
+.comm	OPENSSL_armcap_P,4,4
+#endif
+___
+
+{   my  %opcode = (
+	"sha256h"	=> 0x5e004000,	"sha256h2"	=> 0x5e005000,
+	"sha256su0"	=> 0x5e282800,	"sha256su1"	=> 0x5e006000	);
+
+    sub unsha256 {
+	my ($mnemonic,$arg)=@_;
+
+	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+	&&
+	sprintf ".inst\t0x%08x\t//%s %s",
+			$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+			$mnemonic,$arg;
+    }
+}
+
+open SELF,$0;
+while(<SELF>) {
+        next if (/^#!/);
+        last if (!s/^#/\/\// and !/^$/);
+        print;
+}
+close SELF;
+
+foreach(split("\n",$code)) {
+
+	s/\`([^\`]*)\`/eval($1)/ge;
+
+	s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
+
+	s/\bq([0-9]+)\b/v$1.16b/g;		# old->new registers
+
+	s/\.[ui]?8(\s)/$1/;
+	s/\.\w?32\b//		and s/\.16b/\.4s/g;
+	m/(ld|st)1[^\[]+\[0\]/	and s/\.4s/\.s/g;
+
+	print $_,"\n";
+}
+
+close STDOUT;
diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S
new file mode 100644
index 000000000000..e4bfe42a61a9
--- /dev/null
+++ b/lib/crypto/arm64/sha256-ce.S
@@ -0,0 +1,408 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Core SHA-224/SHA-256 transform using v8 Crypto Extensions
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.arch		armv8-a+crypto
+
+	dga		.req	q20
+	dgav		.req	v20
+	dgb		.req	q21
+	dgbv		.req	v21
+
+	t0		.req	v22
+	t1		.req	v23
+
+	dg0q		.req	q24
+	dg0v		.req	v24
+	dg1q		.req	q25
+	dg1v		.req	v25
+	dg2q		.req	q26
+	dg2v		.req	v26
+
+	.macro		add_only, ev, rc, s0
+	mov		dg2v.16b, dg0v.16b
+	.ifeq		\ev
+	add		t1.4s, v\s0\().4s, \rc\().4s
+	sha256h		dg0q, dg1q, t0.4s
+	sha256h2	dg1q, dg2q, t0.4s
+	.else
+	.ifnb		\s0
+	add		t0.4s, v\s0\().4s, \rc\().4s
+	.endif
+	sha256h		dg0q, dg1q, t1.4s
+	sha256h2	dg1q, dg2q, t1.4s
+	.endif
+	.endm
+
+	.macro		add_update, ev, rc, s0, s1, s2, s3
+	sha256su0	v\s0\().4s, v\s1\().4s
+	add_only	\ev, \rc, \s1
+	sha256su1	v\s0\().4s, v\s2\().4s, v\s3\().4s
+	.endm
+
+	/*
+	 * The SHA-256 round constants
+	 */
+	.section	".rodata", "a"
+	.align		4
+.Lsha2_rcon:
+	.word		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+	.word		0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+	.word		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+	.word		0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+	.word		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+	.word		0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+	.word		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+	.word		0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+	.word		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+	.word		0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+	.word		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+	.word		0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+	.word		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+	.word		0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+	.word		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+	.macro load_round_constants	tmp
+	adr_l		\tmp, .Lsha2_rcon
+	ld1		{ v0.4s- v3.4s}, [\tmp], #64
+	ld1		{ v4.4s- v7.4s}, [\tmp], #64
+	ld1		{ v8.4s-v11.4s}, [\tmp], #64
+	ld1		{v12.4s-v15.4s}, [\tmp]
+	.endm
+
+	/*
+	 * size_t __sha256_ce_transform(struct sha256_block_state *state,
+	 *				const u8 *data, size_t nblocks);
+	 */
+	.text
+SYM_FUNC_START(__sha256_ce_transform)
+
+	load_round_constants	x8
+
+	/* load state */
+	ld1		{dgav.4s, dgbv.4s}, [x0]
+
+	/* load input */
+0:	ld1		{v16.4s-v19.4s}, [x1], #64
+	sub		x2, x2, #1
+
+CPU_LE(	rev32		v16.16b, v16.16b	)
+CPU_LE(	rev32		v17.16b, v17.16b	)
+CPU_LE(	rev32		v18.16b, v18.16b	)
+CPU_LE(	rev32		v19.16b, v19.16b	)
+
+	add		t0.4s, v16.4s, v0.4s
+	mov		dg0v.16b, dgav.16b
+	mov		dg1v.16b, dgbv.16b
+
+	add_update	0,  v1, 16, 17, 18, 19
+	add_update	1,  v2, 17, 18, 19, 16
+	add_update	0,  v3, 18, 19, 16, 17
+	add_update	1,  v4, 19, 16, 17, 18
+
+	add_update	0,  v5, 16, 17, 18, 19
+	add_update	1,  v6, 17, 18, 19, 16
+	add_update	0,  v7, 18, 19, 16, 17
+	add_update	1,  v8, 19, 16, 17, 18
+
+	add_update	0,  v9, 16, 17, 18, 19
+	add_update	1, v10, 17, 18, 19, 16
+	add_update	0, v11, 18, 19, 16, 17
+	add_update	1, v12, 19, 16, 17, 18
+
+	add_only	0, v13, 17
+	add_only	1, v14, 18
+	add_only	0, v15, 19
+	add_only	1
+
+	/* update state */
+	add		dgav.4s, dgav.4s, dg0v.4s
+	add		dgbv.4s, dgbv.4s, dg1v.4s
+
+	/* return early if voluntary preemption is needed */
+	cond_yield	1f, x5, x6
+
+	/* handled all input blocks? */
+	cbnz		x2, 0b
+
+	/* store new state */
+1:	st1		{dgav.4s, dgbv.4s}, [x0]
+	mov		x0, x2
+	ret
+SYM_FUNC_END(__sha256_ce_transform)
+
+	.unreq dga
+	.unreq dgav
+	.unreq dgb
+	.unreq dgbv
+	.unreq t0
+	.unreq t1
+	.unreq dg0q
+	.unreq dg0v
+	.unreq dg1q
+	.unreq dg1v
+	.unreq dg2q
+	.unreq dg2v
+
+	// parameters for sha256_ce_finup2x()
+	ctx		.req	x0
+	data1		.req	x1
+	data2		.req	x2
+	len		.req	w3
+	out1		.req	x4
+	out2		.req	x5
+
+	// other scalar variables
+	count		.req	x6
+	final_step	.req	w7
+
+	// x8-x9 are used as temporaries.
+
+	// v0-v15 are used to cache the SHA-256 round constants.
+	// v16-v19 are used for the message schedule for the first message.
+	// v20-v23 are used for the message schedule for the second message.
+	// v24-v31 are used for the state and temporaries as given below.
+	// *_a are for the first message and *_b for the second.
+	state0_a_q	.req	q24
+	state0_a	.req	v24
+	state1_a_q	.req	q25
+	state1_a	.req	v25
+	state0_b_q	.req	q26
+	state0_b	.req	v26
+	state1_b_q	.req	q27
+	state1_b	.req	v27
+	t0_a		.req	v28
+	t0_b		.req	v29
+	t1_a_q		.req	q30
+	t1_a		.req	v30
+	t1_b_q		.req	q31
+	t1_b		.req	v31
+
+#define OFFSETOF_BYTECOUNT	32 // offsetof(struct __sha256_ctx, bytecount)
+#define OFFSETOF_BUF		40 // offsetof(struct __sha256_ctx, buf)
+// offsetof(struct __sha256_ctx, state) is assumed to be 0.
+
+	// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a
+	// and m0_b contain the current 4 message schedule words for the first
+	// and second message respectively.
+	//
+	// If not all the message schedule words have been computed yet, then
+	// this also computes 4 more message schedule words for each message.
+	// m1_a-m3_a contain the next 3 groups of 4 message schedule words for
+	// the first message, and likewise m1_b-m3_b for the second.  After
+	// consuming the current value of m0_a, this macro computes the group
+	// after m3_a and writes it to m0_a, and likewise for *_b.  This means
+	// that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
+	// m3_a, m0_a), and likewise for *_b, so the caller must cycle through
+	// the registers accordingly.
+	.macro	do_4rounds_2x	i, k,  m0_a, m1_a, m2_a, m3_a,  \
+				       m0_b, m1_b, m2_b, m3_b
+	add		t0_a\().4s, \m0_a\().4s, \k\().4s
+	add		t0_b\().4s, \m0_b\().4s, \k\().4s
+	.if \i < 48
+	sha256su0	\m0_a\().4s, \m1_a\().4s
+	sha256su0	\m0_b\().4s, \m1_b\().4s
+	sha256su1	\m0_a\().4s, \m2_a\().4s, \m3_a\().4s
+	sha256su1	\m0_b\().4s, \m2_b\().4s, \m3_b\().4s
+	.endif
+	mov		t1_a.16b, state0_a.16b
+	mov		t1_b.16b, state0_b.16b
+	sha256h		state0_a_q, state1_a_q, t0_a\().4s
+	sha256h		state0_b_q, state1_b_q, t0_b\().4s
+	sha256h2	state1_a_q, t1_a_q, t0_a\().4s
+	sha256h2	state1_b_q, t1_b_q, t0_b\().4s
+	.endm
+
+	.macro	do_16rounds_2x	i, k0, k1, k2, k3
+	do_4rounds_2x	\i + 0,  \k0,  v16, v17, v18, v19,  v20, v21, v22, v23
+	do_4rounds_2x	\i + 4,  \k1,  v17, v18, v19, v16,  v21, v22, v23, v20
+	do_4rounds_2x	\i + 8,  \k2,  v18, v19, v16, v17,  v22, v23, v20, v21
+	do_4rounds_2x	\i + 12, \k3,  v19, v16, v17, v18,  v23, v20, v21, v22
+	.endm
+
+//
+// void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
+//			  const u8 *data1, const u8 *data2, int len,
+//			  u8 out1[SHA256_DIGEST_SIZE],
+//			  u8 out2[SHA256_DIGEST_SIZE]);
+//
+// This function computes the SHA-256 digests of two messages |data1| and
+// |data2| that are both |len| bytes long, starting from the initial context
+// |ctx|.  |len| must be at least SHA256_BLOCK_SIZE.
+//
+// The instructions for the two SHA-256 operations are interleaved.  On many
+// CPUs, this is almost twice as fast as hashing each message individually due
+// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
+//
+SYM_FUNC_START(sha256_ce_finup2x)
+	sub		sp, sp, #128
+	mov		final_step, #0
+	load_round_constants	x8
+
+	// Load the initial state from ctx->state.
+	ld1		{state0_a.4s-state1_a.4s}, [ctx]
+
+	// Load ctx->bytecount.  Take the mod 64 of it to get the number of
+	// bytes that are buffered in ctx->buf.  Also save it in a register with
+	// len added to it.
+	ldr		x8, [ctx, #OFFSETOF_BYTECOUNT]
+	add		count, x8, len, sxtw
+	and		x8, x8, #63
+	cbz		x8, .Lfinup2x_enter_loop	// No bytes buffered?
+
+	// x8 bytes (1 to 63) are currently buffered in ctx->buf.  Load them
+	// followed by the first 64 - x8 bytes of data.  Since len >= 64, we
+	// just load 64 bytes from each of ctx->buf, data1, and data2
+	// unconditionally and rearrange the data as needed.
+	add		x9, ctx, #OFFSETOF_BUF
+	ld1		{v16.16b-v19.16b}, [x9]
+	st1		{v16.16b-v19.16b}, [sp]
+
+	ld1		{v16.16b-v19.16b}, [data1], #64
+	add		x9, sp, x8
+	st1		{v16.16b-v19.16b}, [x9]
+	ld1		{v16.4s-v19.4s}, [sp]
+
+	ld1		{v20.16b-v23.16b}, [data2], #64
+	st1		{v20.16b-v23.16b}, [x9]
+	ld1		{v20.4s-v23.4s}, [sp]
+
+	sub		len, len, #64
+	sub		data1, data1, x8
+	sub		data2, data2, x8
+	add		len, len, w8
+	mov		state0_b.16b, state0_a.16b
+	mov		state1_b.16b, state1_a.16b
+	b		.Lfinup2x_loop_have_data
+
+.Lfinup2x_enter_loop:
+	sub		len, len, #64
+	mov		state0_b.16b, state0_a.16b
+	mov		state1_b.16b, state1_a.16b
+.Lfinup2x_loop:
+	// Load the next two data blocks.
+	ld1		{v16.4s-v19.4s}, [data1], #64
+	ld1		{v20.4s-v23.4s}, [data2], #64
+.Lfinup2x_loop_have_data:
+	// Convert the words of the data blocks from big endian.
+CPU_LE(	rev32		v16.16b, v16.16b	)
+CPU_LE(	rev32		v17.16b, v17.16b	)
+CPU_LE(	rev32		v18.16b, v18.16b	)
+CPU_LE(	rev32		v19.16b, v19.16b	)
+CPU_LE(	rev32		v20.16b, v20.16b	)
+CPU_LE(	rev32		v21.16b, v21.16b	)
+CPU_LE(	rev32		v22.16b, v22.16b	)
+CPU_LE(	rev32		v23.16b, v23.16b	)
+.Lfinup2x_loop_have_bswapped_data:
+
+	// Save the original state for each block.
+	st1		{state0_a.4s-state1_b.4s}, [sp]
+
+	// Do the SHA-256 rounds on each block.
+	do_16rounds_2x	0,  v0, v1, v2, v3
+	do_16rounds_2x	16, v4, v5, v6, v7
+	do_16rounds_2x	32, v8, v9, v10, v11
+	do_16rounds_2x	48, v12, v13, v14, v15
+
+	// Add the original state for each block.
+	ld1		{v16.4s-v19.4s}, [sp]
+	add		state0_a.4s, state0_a.4s, v16.4s
+	add		state1_a.4s, state1_a.4s, v17.4s
+	add		state0_b.4s, state0_b.4s, v18.4s
+	add		state1_b.4s, state1_b.4s, v19.4s
+
+	// Update len and loop back if more blocks remain.
+	sub		len, len, #64
+	tbz		len, #31, .Lfinup2x_loop	// len >= 0?
+
+	// Check if any final blocks need to be handled.
+	// final_step = 2: all done
+	// final_step = 1: need to do count-only padding block
+	// final_step = 0: need to do the block with 0x80 padding byte
+	tbnz		final_step, #1, .Lfinup2x_done
+	tbnz		final_step, #0, .Lfinup2x_finalize_countonly
+	add		len, len, #64
+	cbz		len, .Lfinup2x_finalize_blockaligned
+
+	// Not block-aligned; 1 <= len <= 63 data bytes remain.  Pad the block.
+	// To do this, write the padding starting with the 0x80 byte to
+	// &sp[64].  Then for each message, copy the last 64 data bytes to sp
+	// and load from &sp[64 - len] to get the needed padding block.  This
+	// code relies on the data buffers being >= 64 bytes in length.
+	sub		w8, len, #64		// w8 = len - 64
+	add		data1, data1, w8, sxtw	// data1 += len - 64
+	add		data2, data2, w8, sxtw	// data2 += len - 64
+CPU_LE(	mov		x9, #0x80		)
+CPU_LE(	fmov		d16, x9			)
+CPU_BE(	movi		v16.16b, #0		)
+CPU_BE(	mov		x9, #0x8000000000000000	)
+CPU_BE(	mov		v16.d[1], x9		)
+	movi		v17.16b, #0
+	stp		q16, q17, [sp, #64]
+	stp		q17, q17, [sp, #96]
+	sub		x9, sp, w8, sxtw	// x9 = &sp[64 - len]
+	cmp		len, #56
+	b.ge		1f		// will count spill into its own block?
+	lsl		count, count, #3
+CPU_LE(	rev		count, count		)
+	str		count, [x9, #56]
+	mov		final_step, #2	// won't need count-only block
+	b		2f
+1:
+	mov		final_step, #1	// will need count-only block
+2:
+	ld1		{v16.16b-v19.16b}, [data1]
+	st1		{v16.16b-v19.16b}, [sp]
+	ld1		{v16.4s-v19.4s}, [x9]
+	ld1		{v20.16b-v23.16b}, [data2]
+	st1		{v20.16b-v23.16b}, [sp]
+	ld1		{v20.4s-v23.4s}, [x9]
+	b		.Lfinup2x_loop_have_data
+
+	// Prepare a padding block, either:
+	//
+	//	{0x80, 0, 0, 0, ..., count (as __be64)}
+	//	This is for a block aligned message.
+	//
+	//	{   0, 0, 0, 0, ..., count (as __be64)}
+	//	This is for a message whose length mod 64 is >= 56.
+	//
+	// Pre-swap the endianness of the words.
+.Lfinup2x_finalize_countonly:
+	movi		v16.2d, #0
+	b		1f
+.Lfinup2x_finalize_blockaligned:
+	mov		x8, #0x80000000
+	fmov		d16, x8
+1:
+	movi		v17.2d, #0
+	movi		v18.2d, #0
+	ror		count, count, #29	// ror(lsl(count, 3), 32)
+	mov		v19.d[0], xzr
+	mov		v19.d[1], count
+	mov		v20.16b, v16.16b
+	movi		v21.2d, #0
+	movi		v22.2d, #0
+	mov		v23.16b, v19.16b
+	mov		final_step, #2
+	b		.Lfinup2x_loop_have_bswapped_data
+
+.Lfinup2x_done:
+	// Write the two digests with all bytes in the correct order.
+CPU_LE(	rev32		state0_a.16b, state0_a.16b	)
+CPU_LE(	rev32		state1_a.16b, state1_a.16b	)
+CPU_LE(	rev32		state0_b.16b, state0_b.16b	)
+CPU_LE(	rev32		state1_b.16b, state1_b.16b	)
+	st1		{state0_a.4s-state1_a.4s}, [out1]
+	st1		{state0_b.4s-state1_b.4s}, [out2]
+	add		sp, sp, #128
+	ret
+SYM_FUNC_END(sha256_ce_finup2x)
diff --git a/lib/crypto/arm64/sha256.h b/lib/crypto/arm64/sha256.h
new file mode 100644
index 000000000000..568dff0f276a
--- /dev/null
+++ b/lib/crypto/arm64/sha256.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 optimized for ARM64
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+asmlinkage void sha256_block_data_order(struct sha256_block_state *state,
+					const u8 *data, size_t nblocks);
+asmlinkage void sha256_block_neon(struct sha256_block_state *state,
+				  const u8 *data, size_t nblocks);
+asmlinkage size_t __sha256_ce_transform(struct sha256_block_state *state,
+					const u8 *data, size_t nblocks);
+
+static void sha256_blocks(struct sha256_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+	    static_branch_likely(&have_neon) && likely(may_use_simd())) {
+		if (static_branch_likely(&have_ce)) {
+			do {
+				size_t rem;
+
+				scoped_ksimd()
+					rem = __sha256_ce_transform(state, data,
+								    nblocks);
+
+				data += (nblocks - rem) * SHA256_BLOCK_SIZE;
+				nblocks = rem;
+			} while (nblocks);
+		} else {
+			scoped_ksimd()
+				sha256_block_neon(state, data, nblocks);
+		}
+	} else {
+		sha256_block_data_order(state, data, nblocks);
+	}
+}
+
+static_assert(offsetof(struct __sha256_ctx, state) == 0);
+static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
+static_assert(offsetof(struct __sha256_ctx, buf) == 40);
+asmlinkage void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
+				  const u8 *data1, const u8 *data2, int len,
+				  u8 out1[SHA256_DIGEST_SIZE],
+				  u8 out2[SHA256_DIGEST_SIZE]);
+
+#define sha256_finup_2x_arch sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+				 const u8 *data1, const u8 *data2, size_t len,
+				 u8 out1[SHA256_DIGEST_SIZE],
+				 u8 out2[SHA256_DIGEST_SIZE])
+{
+	/*
+	 * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
+	 * Further limit len to 65536 to avoid spending too long with preemption
+	 * disabled.  (Of course, in practice len is nearly always 4096 anyway.)
+	 */
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+	    static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE &&
+	    len <= 65536 && likely(may_use_simd())) {
+		scoped_ksimd()
+			sha256_ce_finup2x(ctx, data1, data2, len, out1, out2);
+		kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
+		kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
+		return true;
+	}
+	return false;
+}
+
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+	return static_key_enabled(&have_ce);
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define sha256_mod_init_arch sha256_mod_init_arch
+static void sha256_mod_init_arch(void)
+{
+	if (cpu_have_named_feature(ASIMD)) {
+		static_branch_enable(&have_neon);
+		if (cpu_have_named_feature(SHA2))
+			static_branch_enable(&have_ce);
+	}
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
diff --git a/lib/crypto/arm64/sha3-ce-core.S b/lib/crypto/arm64/sha3-ce-core.S
new file mode 100644
index 000000000000..ace90b506490
--- /dev/null
+++ b/lib/crypto/arm64/sha3-ce-core.S
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Core SHA-3 transform using v8.2 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.irp	b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+	.set	.Lv\b\().2d, \b
+	.set	.Lv\b\().16b, \b
+	.endr
+
+	/*
+	 * ARMv8.2 Crypto Extensions instructions
+	 */
+	.macro	eor3, rd, rn, rm, ra
+	.inst	0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
+	.endm
+
+	.macro	rax1, rd, rn, rm
+	.inst	0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
+	.endm
+
+	.macro	bcax, rd, rn, rm, ra
+	.inst	0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
+	.endm
+
+	.macro	xar, rd, rn, rm, imm6
+	.inst	0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16)
+	.endm
+
+	/*
+	 * size_t sha3_ce_transform(struct sha3_state *state, const u8 *data,
+	 *			    size_t nblocks, size_t block_size)
+	 *
+	 * block_size is assumed to be one of 72 (SHA3-512), 104 (SHA3-384), 136
+	 * (SHA3-256 and SHAKE256), 144 (SHA3-224), or 168 (SHAKE128).
+	 */
+	.text
+SYM_FUNC_START(sha3_ce_transform)
+	/* load state */
+	add	x8, x0, #32
+	ld1	{ v0.1d- v3.1d}, [x0]
+	ld1	{ v4.1d- v7.1d}, [x8], #32
+	ld1	{ v8.1d-v11.1d}, [x8], #32
+	ld1	{v12.1d-v15.1d}, [x8], #32
+	ld1	{v16.1d-v19.1d}, [x8], #32
+	ld1	{v20.1d-v23.1d}, [x8], #32
+	ld1	{v24.1d}, [x8]
+
+0:	sub	x2, x2, #1
+	mov	w8, #24
+	adr_l	x9, .Lsha3_rcon
+
+	/* load input */
+	ld1	{v25.8b-v28.8b}, [x1], #32
+	ld1	{v29.8b}, [x1], #8
+	eor	v0.8b, v0.8b, v25.8b
+	eor	v1.8b, v1.8b, v26.8b
+	eor	v2.8b, v2.8b, v27.8b
+	eor	v3.8b, v3.8b, v28.8b
+	eor	v4.8b, v4.8b, v29.8b
+
+	ld1	{v25.8b-v28.8b}, [x1], #32
+	eor	v5.8b, v5.8b, v25.8b
+	eor	v6.8b, v6.8b, v26.8b
+	eor	v7.8b, v7.8b, v27.8b
+	eor	v8.8b, v8.8b, v28.8b
+	cmp	x3, #72
+	b.eq	3f	/* SHA3-512 (block_size=72)? */
+
+	ld1	{v25.8b-v28.8b}, [x1], #32
+	eor	v9.8b, v9.8b, v25.8b
+	eor	v10.8b, v10.8b, v26.8b
+	eor	v11.8b, v11.8b, v27.8b
+	eor	v12.8b, v12.8b, v28.8b
+	cmp	x3, #104
+	b.eq	3f	/* SHA3-384 (block_size=104)? */
+
+	ld1	{v25.8b-v28.8b}, [x1], #32
+	eor	v13.8b, v13.8b, v25.8b
+	eor	v14.8b, v14.8b, v26.8b
+	eor	v15.8b, v15.8b, v27.8b
+	eor	v16.8b, v16.8b, v28.8b
+	cmp	x3, #144
+	b.lt	3f	/* SHA3-256 or SHAKE256 (block_size=136)? */
+	b.eq	2f	/* SHA3-224 (block_size=144)? */
+
+	/* SHAKE128 (block_size=168) */
+	ld1	{v25.8b-v28.8b}, [x1], #32
+	eor	v17.8b, v17.8b, v25.8b
+	eor	v18.8b, v18.8b, v26.8b
+	eor	v19.8b, v19.8b, v27.8b
+	eor	v20.8b, v20.8b, v28.8b
+	b	3f
+2:
+	/* SHA3-224 (block_size=144) */
+	ld1	{v25.8b}, [x1], #8
+	eor	v17.8b, v17.8b, v25.8b
+
+3:	sub	w8, w8, #1
+
+	eor3	v29.16b,  v4.16b,  v9.16b, v14.16b
+	eor3	v26.16b,  v1.16b,  v6.16b, v11.16b
+	eor3	v28.16b,  v3.16b,  v8.16b, v13.16b
+	eor3	v25.16b,  v0.16b,  v5.16b, v10.16b
+	eor3	v27.16b,  v2.16b,  v7.16b, v12.16b
+	eor3	v29.16b, v29.16b, v19.16b, v24.16b
+	eor3	v26.16b, v26.16b, v16.16b, v21.16b
+	eor3	v28.16b, v28.16b, v18.16b, v23.16b
+	eor3	v25.16b, v25.16b, v15.16b, v20.16b
+	eor3	v27.16b, v27.16b, v17.16b, v22.16b
+
+	rax1	v30.2d, v29.2d, v26.2d	// bc[0]
+	rax1	v26.2d, v26.2d, v28.2d	// bc[2]
+	rax1	v28.2d, v28.2d, v25.2d	// bc[4]
+	rax1	v25.2d, v25.2d, v27.2d	// bc[1]
+	rax1	v27.2d, v27.2d, v29.2d	// bc[3]
+
+	eor	 v0.16b,  v0.16b, v30.16b
+	xar	 v29.2d,   v1.2d,  v25.2d, (64 - 1)
+	xar	  v1.2d,   v6.2d,  v25.2d, (64 - 44)
+	xar	  v6.2d,   v9.2d,  v28.2d, (64 - 20)
+	xar	  v9.2d,  v22.2d,  v26.2d, (64 - 61)
+	xar	 v22.2d,  v14.2d,  v28.2d, (64 - 39)
+	xar	 v14.2d,  v20.2d,  v30.2d, (64 - 18)
+	xar	 v31.2d,   v2.2d,  v26.2d, (64 - 62)
+	xar	  v2.2d,  v12.2d,  v26.2d, (64 - 43)
+	xar	 v12.2d,  v13.2d,  v27.2d, (64 - 25)
+	xar	 v13.2d,  v19.2d,  v28.2d, (64 - 8)
+	xar	 v19.2d,  v23.2d,  v27.2d, (64 - 56)
+	xar	 v23.2d,  v15.2d,  v30.2d, (64 - 41)
+	xar	 v15.2d,   v4.2d,  v28.2d, (64 - 27)
+	xar	 v28.2d,  v24.2d,  v28.2d, (64 - 14)
+	xar	 v24.2d,  v21.2d,  v25.2d, (64 - 2)
+	xar	  v8.2d,   v8.2d,  v27.2d, (64 - 55)
+	xar	  v4.2d,  v16.2d,  v25.2d, (64 - 45)
+	xar	 v16.2d,   v5.2d,  v30.2d, (64 - 36)
+	xar	  v5.2d,   v3.2d,  v27.2d, (64 - 28)
+	xar	 v27.2d,  v18.2d,  v27.2d, (64 - 21)
+	xar	  v3.2d,  v17.2d,  v26.2d, (64 - 15)
+	xar	 v25.2d,  v11.2d,  v25.2d, (64 - 10)
+	xar	 v26.2d,   v7.2d,  v26.2d, (64 - 6)
+	xar	 v30.2d,  v10.2d,  v30.2d, (64 - 3)
+
+	bcax	v20.16b, v31.16b, v22.16b,  v8.16b
+	bcax	v21.16b,  v8.16b, v23.16b, v22.16b
+	bcax	v22.16b, v22.16b, v24.16b, v23.16b
+	bcax	v23.16b, v23.16b, v31.16b, v24.16b
+	bcax	v24.16b, v24.16b,  v8.16b, v31.16b
+
+	ld1r	{v31.2d}, [x9], #8
+
+	bcax	v17.16b, v25.16b, v19.16b,  v3.16b
+	bcax	v18.16b,  v3.16b, v15.16b, v19.16b
+	bcax	v19.16b, v19.16b, v16.16b, v15.16b
+	bcax	v15.16b, v15.16b, v25.16b, v16.16b
+	bcax	v16.16b, v16.16b,  v3.16b, v25.16b
+
+	bcax	v10.16b, v29.16b, v12.16b, v26.16b
+	bcax	v11.16b, v26.16b, v13.16b, v12.16b
+	bcax	v12.16b, v12.16b, v14.16b, v13.16b
+	bcax	v13.16b, v13.16b, v29.16b, v14.16b
+	bcax	v14.16b, v14.16b, v26.16b, v29.16b
+
+	bcax	 v7.16b, v30.16b,  v9.16b,  v4.16b
+	bcax	 v8.16b,  v4.16b,  v5.16b,  v9.16b
+	bcax	 v9.16b,  v9.16b,  v6.16b,  v5.16b
+	bcax	 v5.16b,  v5.16b, v30.16b,  v6.16b
+	bcax	 v6.16b,  v6.16b,  v4.16b, v30.16b
+
+	bcax	 v3.16b, v27.16b,  v0.16b, v28.16b
+	bcax	 v4.16b, v28.16b,  v1.16b,  v0.16b
+	bcax	 v0.16b,  v0.16b,  v2.16b,  v1.16b
+	bcax	 v1.16b,  v1.16b, v27.16b,  v2.16b
+	bcax	 v2.16b,  v2.16b, v28.16b, v27.16b
+
+	eor	 v0.16b,  v0.16b, v31.16b
+
+	cbnz	w8, 3b
+	cond_yield 4f, x8, x9
+	cbnz	x2, 0b
+
+	/* save state */
+4:	st1	{ v0.1d- v3.1d}, [x0], #32
+	st1	{ v4.1d- v7.1d}, [x0], #32
+	st1	{ v8.1d-v11.1d}, [x0], #32
+	st1	{v12.1d-v15.1d}, [x0], #32
+	st1	{v16.1d-v19.1d}, [x0], #32
+	st1	{v20.1d-v23.1d}, [x0], #32
+	st1	{v24.1d}, [x0]
+	mov	x0, x2
+	ret
+SYM_FUNC_END(sha3_ce_transform)
+
+	.section	".rodata", "a"
+	.align		8
+.Lsha3_rcon:
+	.quad	0x0000000000000001, 0x0000000000008082, 0x800000000000808a
+	.quad	0x8000000080008000, 0x000000000000808b, 0x0000000080000001
+	.quad	0x8000000080008081, 0x8000000000008009, 0x000000000000008a
+	.quad	0x0000000000000088, 0x0000000080008009, 0x000000008000000a
+	.quad	0x000000008000808b, 0x800000000000008b, 0x8000000000008089
+	.quad	0x8000000000008003, 0x8000000000008002, 0x8000000000000080
+	.quad	0x000000000000800a, 0x800000008000000a, 0x8000000080008081
+	.quad	0x8000000000008080, 0x0000000080000001, 0x8000000080008008
diff --git a/lib/crypto/arm64/sha3.h b/lib/crypto/arm64/sha3.h
new file mode 100644
index 000000000000..b602f1b3b282
--- /dev/null
+++ b/lib/crypto/arm64/sha3.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3);
+
+asmlinkage size_t sha3_ce_transform(struct sha3_state *state, const u8 *data,
+				    size_t nblocks, size_t block_size);
+
+static void sha3_absorb_blocks(struct sha3_state *state, const u8 *data,
+			       size_t nblocks, size_t block_size)
+{
+	if (static_branch_likely(&have_sha3) && likely(may_use_simd())) {
+		do {
+			size_t rem;
+
+			scoped_ksimd()
+				rem = sha3_ce_transform(state, data, nblocks,
+							block_size);
+			data += (nblocks - rem) * block_size;
+			nblocks = rem;
+		} while (nblocks);
+	} else {
+		sha3_absorb_blocks_generic(state, data, nblocks, block_size);
+	}
+}
+
+static void sha3_keccakf(struct sha3_state *state)
+{
+	if (static_branch_likely(&have_sha3) && likely(may_use_simd())) {
+		/*
+		 * Passing zeroes into sha3_ce_transform() gives the plain
+		 * Keccak-f permutation, which is what we want here.  Any
+		 * supported block size may be used.  Use SHA3_512_BLOCK_SIZE
+		 * since it's the shortest.
+		 */
+		static const u8 zeroes[SHA3_512_BLOCK_SIZE];
+
+		scoped_ksimd()
+			sha3_ce_transform(state, zeroes, 1, sizeof(zeroes));
+	} else {
+		sha3_keccakf_generic(state);
+	}
+}
+
+#define sha3_mod_init_arch sha3_mod_init_arch
+static void sha3_mod_init_arch(void)
+{
+	if (cpu_have_named_feature(SHA3))
+		static_branch_enable(&have_sha3);
+}
diff --git a/lib/crypto/arm64/sha512-ce-core.S b/lib/crypto/arm64/sha512-ce-core.S
new file mode 100644
index 000000000000..ffd51acfd1ee
--- /dev/null
+++ b/lib/crypto/arm64/sha512-ce-core.S
@@ -0,0 +1,197 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Core SHA-384/SHA-512 transform using v8 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	/*
+	 * We have to specify the "sha3" feature here, since the GNU and clang
+	 * assemblers both consider the SHA-512 instructions to be part of the
+	 * "sha3" feature.  (Except binutils 2.30 through 2.42, which used
+	 * "sha2".  But "sha3" implies "sha2", so "sha3" still works in those
+	 * versions.)  "sha3" doesn't make a lot of sense, since SHA-512 is part
+	 * of the SHA-2 family of algorithms, and also the Arm Architecture
+	 * Reference Manual defines FEAT_SHA512 and FEAT_SHA3 separately.
+	 * Regardless, we must use "sha3" to be compatible with the assemblers.
+	 */
+	.arch		armv8-a+sha3
+
+	/*
+	 * The SHA-512 round constants
+	 */
+	.section	".rodata", "a"
+	.align		4
+.Lsha512_rcon:
+	.quad		0x428a2f98d728ae22, 0x7137449123ef65cd
+	.quad		0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+	.quad		0x3956c25bf348b538, 0x59f111f1b605d019
+	.quad		0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+	.quad		0xd807aa98a3030242, 0x12835b0145706fbe
+	.quad		0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+	.quad		0x72be5d74f27b896f, 0x80deb1fe3b1696b1
+	.quad		0x9bdc06a725c71235, 0xc19bf174cf692694
+	.quad		0xe49b69c19ef14ad2, 0xefbe4786384f25e3
+	.quad		0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+	.quad		0x2de92c6f592b0275, 0x4a7484aa6ea6e483
+	.quad		0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+	.quad		0x983e5152ee66dfab, 0xa831c66d2db43210
+	.quad		0xb00327c898fb213f, 0xbf597fc7beef0ee4
+	.quad		0xc6e00bf33da88fc2, 0xd5a79147930aa725
+	.quad		0x06ca6351e003826f, 0x142929670a0e6e70
+	.quad		0x27b70a8546d22ffc, 0x2e1b21385c26c926
+	.quad		0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+	.quad		0x650a73548baf63de, 0x766a0abb3c77b2a8
+	.quad		0x81c2c92e47edaee6, 0x92722c851482353b
+	.quad		0xa2bfe8a14cf10364, 0xa81a664bbc423001
+	.quad		0xc24b8b70d0f89791, 0xc76c51a30654be30
+	.quad		0xd192e819d6ef5218, 0xd69906245565a910
+	.quad		0xf40e35855771202a, 0x106aa07032bbd1b8
+	.quad		0x19a4c116b8d2d0c8, 0x1e376c085141ab53
+	.quad		0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+	.quad		0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
+	.quad		0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+	.quad		0x748f82ee5defb2fc, 0x78a5636f43172f60
+	.quad		0x84c87814a1f0ab72, 0x8cc702081a6439ec
+	.quad		0x90befffa23631e28, 0xa4506cebde82bde9
+	.quad		0xbef9a3f7b2c67915, 0xc67178f2e372532b
+	.quad		0xca273eceea26619c, 0xd186b8c721c0c207
+	.quad		0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+	.quad		0x06f067aa72176fba, 0x0a637dc5a2c898a6
+	.quad		0x113f9804bef90dae, 0x1b710b35131c471b
+	.quad		0x28db77f523047d84, 0x32caab7b40c72493
+	.quad		0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+	.quad		0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
+	.quad		0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+
+	.macro		dround, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4
+	.ifnb		\rc1
+	ld1		{v\rc1\().2d}, [x4], #16
+	.endif
+	add		v5.2d, v\rc0\().2d, v\in0\().2d
+	ext		v6.16b, v\i2\().16b, v\i3\().16b, #8
+	ext		v5.16b, v5.16b, v5.16b, #8
+	ext		v7.16b, v\i1\().16b, v\i2\().16b, #8
+	add		v\i3\().2d, v\i3\().2d, v5.2d
+	.ifnb		\in1
+	ext		v5.16b, v\in3\().16b, v\in4\().16b, #8
+	sha512su0	v\in0\().2d, v\in1\().2d
+	.endif
+	sha512h		q\i3, q6, v7.2d
+	.ifnb		\in1
+	sha512su1	v\in0\().2d, v\in2\().2d, v5.2d
+	.endif
+	add		v\i4\().2d, v\i1\().2d, v\i3\().2d
+	sha512h2	q\i3, q\i1, v\i0\().2d
+	.endm
+
+	/*
+	 * size_t __sha512_ce_transform(struct sha512_block_state *state,
+	 *				const u8 *data, size_t nblocks);
+	 */
+	.text
+SYM_FUNC_START(__sha512_ce_transform)
+	/* load state */
+	ld1		{v8.2d-v11.2d}, [x0]
+
+	/* load first 4 round constants */
+	adr_l		x3, .Lsha512_rcon
+	ld1		{v20.2d-v23.2d}, [x3], #64
+
+	/* load input */
+0:	ld1		{v12.2d-v15.2d}, [x1], #64
+	ld1		{v16.2d-v19.2d}, [x1], #64
+	sub		x2, x2, #1
+
+CPU_LE(	rev64		v12.16b, v12.16b	)
+CPU_LE(	rev64		v13.16b, v13.16b	)
+CPU_LE(	rev64		v14.16b, v14.16b	)
+CPU_LE(	rev64		v15.16b, v15.16b	)
+CPU_LE(	rev64		v16.16b, v16.16b	)
+CPU_LE(	rev64		v17.16b, v17.16b	)
+CPU_LE(	rev64		v18.16b, v18.16b	)
+CPU_LE(	rev64		v19.16b, v19.16b	)
+
+	mov		x4, x3				// rc pointer
+
+	mov		v0.16b, v8.16b
+	mov		v1.16b, v9.16b
+	mov		v2.16b, v10.16b
+	mov		v3.16b, v11.16b
+
+	// v0  ab  cd  --  ef  gh  ab
+	// v1  cd  --  ef  gh  ab  cd
+	// v2  ef  gh  ab  cd  --  ef
+	// v3  gh  ab  cd  --  ef  gh
+	// v4  --  ef  gh  ab  cd  --
+
+	dround		0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17
+	dround		3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18
+	dround		2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19
+	dround		4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12
+	dround		1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13
+
+	dround		0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14
+	dround		3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15
+	dround		2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16
+	dround		4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17
+	dround		1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18
+
+	dround		0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19
+	dround		3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12
+	dround		2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13
+	dround		4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14
+	dround		1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15
+
+	dround		0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16
+	dround		3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17
+	dround		2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18
+	dround		4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19
+	dround		1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12
+
+	dround		0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13
+	dround		3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14
+	dround		2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15
+	dround		4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16
+	dround		1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17
+
+	dround		0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18
+	dround		3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19
+	dround		2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12
+	dround		4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13
+	dround		1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14
+
+	dround		0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15
+	dround		3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16
+	dround		2, 3, 1, 4, 0, 28, 24, 12
+	dround		4, 2, 0, 1, 3, 29, 25, 13
+	dround		1, 4, 3, 0, 2, 30, 26, 14
+
+	dround		0, 1, 2, 3, 4, 31, 27, 15
+	dround		3, 0, 4, 2, 1, 24,   , 16
+	dround		2, 3, 1, 4, 0, 25,   , 17
+	dround		4, 2, 0, 1, 3, 26,   , 18
+	dround		1, 4, 3, 0, 2, 27,   , 19
+
+	/* update state */
+	add		v8.2d, v8.2d, v0.2d
+	add		v9.2d, v9.2d, v1.2d
+	add		v10.2d, v10.2d, v2.2d
+	add		v11.2d, v11.2d, v3.2d
+
+	cond_yield	3f, x4, x5
+	/* handled all input blocks? */
+	cbnz		x2, 0b
+
+	/* store new state */
+3:	st1		{v8.2d-v11.2d}, [x0]
+	mov		x0, x2
+	ret
+SYM_FUNC_END(__sha512_ce_transform)
diff --git a/lib/crypto/arm64/sha512.h b/lib/crypto/arm64/sha512.h
new file mode 100644
index 000000000000..7eb7ef04d268
--- /dev/null
+++ b/lib/crypto/arm64/sha512.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * arm64-optimized SHA-512 block function
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha512_insns);
+
+asmlinkage void sha512_block_data_order(struct sha512_block_state *state,
+					const u8 *data, size_t nblocks);
+asmlinkage size_t __sha512_ce_transform(struct sha512_block_state *state,
+					const u8 *data, size_t nblocks);
+
+static void sha512_blocks(struct sha512_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+	    static_branch_likely(&have_sha512_insns) &&
+	    likely(may_use_simd())) {
+		do {
+			size_t rem;
+
+			scoped_ksimd()
+				rem = __sha512_ce_transform(state, data, nblocks);
+
+			data += (nblocks - rem) * SHA512_BLOCK_SIZE;
+			nblocks = rem;
+		} while (nblocks);
+	} else {
+		sha512_block_data_order(state, data, nblocks);
+	}
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define sha512_mod_init_arch sha512_mod_init_arch
+static void sha512_mod_init_arch(void)
+{
+	if (cpu_have_named_feature(SHA512))
+		static_branch_enable(&have_sha512_insns);
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
diff --git a/lib/crypto/blake2b.c b/lib/crypto/blake2b.c
new file mode 100644
index 000000000000..09c6d65d8a6e
--- /dev/null
+++ b/lib/crypto/blake2b.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2025 Google LLC
+ *
+ * This is an implementation of the BLAKE2b hash and PRF functions.
+ *
+ * Information: https://blake2.net/
+ */
+
+#include <crypto/blake2b.h>
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+static const u8 blake2b_sigma[12][16] = {
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+	{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+	{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+	{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+	{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+	{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
+};
+
+static inline void blake2b_increment_counter(struct blake2b_ctx *ctx, u32 inc)
+{
+	ctx->t[0] += inc;
+	ctx->t[1] += (ctx->t[0] < inc);
+}
+
+static void __maybe_unused
+blake2b_compress_generic(struct blake2b_ctx *ctx,
+			 const u8 *data, size_t nblocks, u32 inc)
+{
+	u64 m[16];
+	u64 v[16];
+	int i;
+
+	WARN_ON(IS_ENABLED(DEBUG) &&
+		(nblocks > 1 && inc != BLAKE2B_BLOCK_SIZE));
+
+	while (nblocks > 0) {
+		blake2b_increment_counter(ctx, inc);
+		memcpy(m, data, BLAKE2B_BLOCK_SIZE);
+		le64_to_cpu_array(m, ARRAY_SIZE(m));
+		memcpy(v, ctx->h, 64);
+		v[ 8] = BLAKE2B_IV0;
+		v[ 9] = BLAKE2B_IV1;
+		v[10] = BLAKE2B_IV2;
+		v[11] = BLAKE2B_IV3;
+		v[12] = BLAKE2B_IV4 ^ ctx->t[0];
+		v[13] = BLAKE2B_IV5 ^ ctx->t[1];
+		v[14] = BLAKE2B_IV6 ^ ctx->f[0];
+		v[15] = BLAKE2B_IV7 ^ ctx->f[1];
+
+#define G(r, i, a, b, c, d) do { \
+	a += b + m[blake2b_sigma[r][2 * i + 0]]; \
+	d = ror64(d ^ a, 32); \
+	c += d; \
+	b = ror64(b ^ c, 24); \
+	a += b + m[blake2b_sigma[r][2 * i + 1]]; \
+	d = ror64(d ^ a, 16); \
+	c += d; \
+	b = ror64(b ^ c, 63); \
+} while (0)
+
+#define ROUND(r) do { \
+	G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
+	G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
+	G(r, 2, v[2], v[ 6], v[10], v[14]); \
+	G(r, 3, v[3], v[ 7], v[11], v[15]); \
+	G(r, 4, v[0], v[ 5], v[10], v[15]); \
+	G(r, 5, v[1], v[ 6], v[11], v[12]); \
+	G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
+	G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
+} while (0)
+		ROUND(0);
+		ROUND(1);
+		ROUND(2);
+		ROUND(3);
+		ROUND(4);
+		ROUND(5);
+		ROUND(6);
+		ROUND(7);
+		ROUND(8);
+		ROUND(9);
+		ROUND(10);
+		ROUND(11);
+
+#undef G
+#undef ROUND
+
+		for (i = 0; i < 8; ++i)
+			ctx->h[i] ^= v[i] ^ v[i + 8];
+
+		data += BLAKE2B_BLOCK_SIZE;
+		--nblocks;
+	}
+}
+
+#ifdef CONFIG_CRYPTO_LIB_BLAKE2B_ARCH
+#include "blake2b.h" /* $(SRCARCH)/blake2b.h */
+#else
+#define blake2b_compress blake2b_compress_generic
+#endif
+
+static inline void blake2b_set_lastblock(struct blake2b_ctx *ctx)
+{
+	ctx->f[0] = -1;
+}
+
+void blake2b_update(struct blake2b_ctx *ctx, const u8 *in, size_t inlen)
+{
+	const size_t fill = BLAKE2B_BLOCK_SIZE - ctx->buflen;
+
+	if (unlikely(!inlen))
+		return;
+	if (inlen > fill) {
+		memcpy(ctx->buf + ctx->buflen, in, fill);
+		blake2b_compress(ctx, ctx->buf, 1, BLAKE2B_BLOCK_SIZE);
+		ctx->buflen = 0;
+		in += fill;
+		inlen -= fill;
+	}
+	if (inlen > BLAKE2B_BLOCK_SIZE) {
+		const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2B_BLOCK_SIZE);
+
+		blake2b_compress(ctx, in, nblocks - 1, BLAKE2B_BLOCK_SIZE);
+		in += BLAKE2B_BLOCK_SIZE * (nblocks - 1);
+		inlen -= BLAKE2B_BLOCK_SIZE * (nblocks - 1);
+	}
+	memcpy(ctx->buf + ctx->buflen, in, inlen);
+	ctx->buflen += inlen;
+}
+EXPORT_SYMBOL(blake2b_update);
+
+void blake2b_final(struct blake2b_ctx *ctx, u8 *out)
+{
+	WARN_ON(IS_ENABLED(DEBUG) && !out);
+	blake2b_set_lastblock(ctx);
+	memset(ctx->buf + ctx->buflen, 0,
+	       BLAKE2B_BLOCK_SIZE - ctx->buflen); /* Padding */
+	blake2b_compress(ctx, ctx->buf, 1, ctx->buflen);
+	cpu_to_le64_array(ctx->h, ARRAY_SIZE(ctx->h));
+	memcpy(out, ctx->h, ctx->outlen);
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL(blake2b_final);
+
+#ifdef blake2b_mod_init_arch
+static int __init blake2b_mod_init(void)
+{
+	blake2b_mod_init_arch();
+	return 0;
+}
+subsys_initcall(blake2b_mod_init);
+
+static void __exit blake2b_mod_exit(void)
+{
+}
+module_exit(blake2b_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("BLAKE2b hash function");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c
deleted file mode 100644
index 75ccb3e633e6..000000000000
--- a/lib/crypto/blake2s-generic.c
+++ /dev/null
@@ -1,115 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * This is an implementation of the BLAKE2s hash and PRF functions.
- *
- * Information: https://blake2.net/
- *
- */
-
-#include <crypto/internal/blake2s.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/bug.h>
-#include <asm/unaligned.h>
-
-static const u8 blake2s_sigma[10][16] = {
-	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-	{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
-	{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
-	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
-	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
-	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
-	{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
-	{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
-	{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
-	{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
-};
-
-static inline void blake2s_increment_counter(struct blake2s_state *state,
-					     const u32 inc)
-{
-	state->t[0] += inc;
-	state->t[1] += (state->t[0] < inc);
-}
-
-void blake2s_compress(struct blake2s_state *state, const u8 *block,
-		      size_t nblocks, const u32 inc)
-		      __weak __alias(blake2s_compress_generic);
-
-void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
-			      size_t nblocks, const u32 inc)
-{
-	u32 m[16];
-	u32 v[16];
-	int i;
-
-	WARN_ON(IS_ENABLED(DEBUG) &&
-		(nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));
-
-	while (nblocks > 0) {
-		blake2s_increment_counter(state, inc);
-		memcpy(m, block, BLAKE2S_BLOCK_SIZE);
-		le32_to_cpu_array(m, ARRAY_SIZE(m));
-		memcpy(v, state->h, 32);
-		v[ 8] = BLAKE2S_IV0;
-		v[ 9] = BLAKE2S_IV1;
-		v[10] = BLAKE2S_IV2;
-		v[11] = BLAKE2S_IV3;
-		v[12] = BLAKE2S_IV4 ^ state->t[0];
-		v[13] = BLAKE2S_IV5 ^ state->t[1];
-		v[14] = BLAKE2S_IV6 ^ state->f[0];
-		v[15] = BLAKE2S_IV7 ^ state->f[1];
-
-#define G(r, i, a, b, c, d) do { \
-	a += b + m[blake2s_sigma[r][2 * i + 0]]; \
-	d = ror32(d ^ a, 16); \
-	c += d; \
-	b = ror32(b ^ c, 12); \
-	a += b + m[blake2s_sigma[r][2 * i + 1]]; \
-	d = ror32(d ^ a, 8); \
-	c += d; \
-	b = ror32(b ^ c, 7); \
-} while (0)
-
-#define ROUND(r) do { \
-	G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
-	G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
-	G(r, 2, v[2], v[ 6], v[10], v[14]); \
-	G(r, 3, v[3], v[ 7], v[11], v[15]); \
-	G(r, 4, v[0], v[ 5], v[10], v[15]); \
-	G(r, 5, v[1], v[ 6], v[11], v[12]); \
-	G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
-	G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
-} while (0)
-		ROUND(0);
-		ROUND(1);
-		ROUND(2);
-		ROUND(3);
-		ROUND(4);
-		ROUND(5);
-		ROUND(6);
-		ROUND(7);
-		ROUND(8);
-		ROUND(9);
-
-#undef G
-#undef ROUND
-
-		for (i = 0; i < 8; ++i)
-			state->h[i] ^= v[i] ^ v[i + 8];
-
-		block += BLAKE2S_BLOCK_SIZE;
-		--nblocks;
-	}
-}
-
-EXPORT_SYMBOL(blake2s_compress_generic);
-
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("BLAKE2s hash function");
-MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/lib/crypto/blake2s-selftest.c b/lib/crypto/blake2s-selftest.c
deleted file mode 100644
index 7d77dea15587..000000000000
--- a/lib/crypto/blake2s-selftest.c
+++ /dev/null
@@ -1,632 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <crypto/internal/blake2s.h>
-#include <linux/kernel.h>
-#include <linux/random.h>
-#include <linux/string.h>
-
-/*
- * blake2s_testvecs[] generated with the program below (using libb2-dev and
- * libssl-dev [OpenSSL])
- *
- * #include <blake2.h>
- * #include <stdint.h>
- * #include <stdio.h>
- *
- * #include <openssl/evp.h>
- *
- * #define BLAKE2S_TESTVEC_COUNT	256
- *
- * static void print_vec(const uint8_t vec[], int len)
- * {
- *	int i;
- *
- *	printf("  { ");
- *	for (i = 0; i < len; i++) {
- *		if (i && (i % 12) == 0)
- *			printf("\n    ");
- *		printf("0x%02x, ", vec[i]);
- *	}
- *	printf("},\n");
- * }
- *
- * int main(void)
- * {
- *	uint8_t key[BLAKE2S_KEYBYTES];
- *	uint8_t buf[BLAKE2S_TESTVEC_COUNT];
- *	uint8_t hash[BLAKE2S_OUTBYTES];
- *	int i, j;
- *
- *	key[0] = key[1] = 1;
- *	for (i = 2; i < BLAKE2S_KEYBYTES; ++i)
- *		key[i] = key[i - 2] + key[i - 1];
- *
- *	for (i = 0; i < BLAKE2S_TESTVEC_COUNT; ++i)
- *		buf[i] = (uint8_t)i;
- *
- *	printf("static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {\n");
- *
- *	for (i = 0; i < BLAKE2S_TESTVEC_COUNT; ++i) {
- *		int outlen = 1 + i % BLAKE2S_OUTBYTES;
- *		int keylen = (13 * i) % (BLAKE2S_KEYBYTES + 1);
- *
- *		blake2s(hash, buf, key + BLAKE2S_KEYBYTES - keylen, outlen, i,
- *			keylen);
- *		print_vec(hash, outlen);
- *	}
- *	printf("};\n\n");
- *
- *	return 0;
- *}
- */
-static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {
-  { 0xa1, },
-  { 0x7c, 0x89, },
-  { 0x74, 0x0e, 0xd4, },
-  { 0x47, 0x0c, 0x21, 0x15, },
-  { 0x18, 0xd6, 0x9c, 0xa6, 0xc4, },
-  { 0x13, 0x5d, 0x16, 0x63, 0x2e, 0xf9, },
-  { 0x2c, 0xb5, 0x04, 0xb7, 0x99, 0xe2, 0x73, },
-  { 0x9a, 0x0f, 0xd2, 0x39, 0xd6, 0x68, 0x1b, 0x92, },
-  { 0xc8, 0xde, 0x7a, 0xea, 0x2f, 0xf4, 0xd2, 0xe3, 0x2b, },
-  { 0x5b, 0xf9, 0x43, 0x52, 0x0c, 0x12, 0xba, 0xb5, 0x93, 0x9f, },
-  { 0xc6, 0x2c, 0x4e, 0x80, 0xfc, 0x32, 0x5b, 0x33, 0xb8, 0xb8, 0x0a, },
-  { 0xa7, 0x5c, 0xfd, 0x3a, 0xcc, 0xbf, 0x90, 0xca, 0xb7, 0x97, 0xde, 0xd8, },
-  { 0x66, 0xca, 0x3c, 0xc4, 0x19, 0xef, 0x92, 0x66, 0x3f, 0x21, 0x8f, 0xda,
-    0xb7, },
-  { 0xba, 0xe5, 0xbb, 0x30, 0x25, 0x94, 0x6d, 0xc3, 0x89, 0x09, 0xc4, 0x25,
-    0x52, 0x3e, },
-  { 0xa2, 0xef, 0x0e, 0x52, 0x0b, 0x5f, 0xa2, 0x01, 0x6d, 0x0a, 0x25, 0xbc,
-    0x57, 0xe2, 0x27, },
-  { 0x4f, 0xe0, 0xf9, 0x52, 0x12, 0xda, 0x84, 0xb7, 0xab, 0xae, 0xb0, 0xa6,
-    0x47, 0x2a, 0xc7, 0xf5, },
-  { 0x56, 0xe7, 0xa8, 0x1c, 0x4c, 0xca, 0xed, 0x90, 0x31, 0xec, 0x87, 0x43,
-    0xe7, 0x72, 0x08, 0xec, 0xbe, },
-  { 0x7e, 0xdf, 0x80, 0x1c, 0x93, 0x33, 0xfd, 0x53, 0x44, 0xba, 0xfd, 0x96,
-    0xe1, 0xbb, 0xb5, 0x65, 0xa5, 0x00, },
-  { 0xec, 0x6b, 0xed, 0xf7, 0x7b, 0x62, 0x1d, 0x7d, 0xf4, 0x82, 0xf3, 0x1e,
-    0x18, 0xff, 0x2b, 0xc4, 0x06, 0x20, 0x2a, },
-  { 0x74, 0x98, 0xd7, 0x68, 0x63, 0xed, 0x87, 0xe4, 0x5d, 0x8d, 0x9e, 0x1d,
-    0xfd, 0x2a, 0xbb, 0x86, 0xac, 0xe9, 0x2a, 0x89, },
-  { 0x89, 0xc3, 0x88, 0xce, 0x2b, 0x33, 0x1e, 0x10, 0xd1, 0x37, 0x20, 0x86,
-    0x28, 0x43, 0x70, 0xd9, 0xfb, 0x96, 0xd9, 0xb5, 0xd3, },
-  { 0xcb, 0x56, 0x74, 0x41, 0x8d, 0x80, 0x01, 0x9a, 0x6b, 0x38, 0xe1, 0x41,
-    0xad, 0x9c, 0x62, 0x74, 0xce, 0x35, 0xd5, 0x6c, 0x89, 0x6e, },
-  { 0x79, 0xaf, 0x94, 0x59, 0x99, 0x26, 0xe1, 0xc9, 0x34, 0xfe, 0x7c, 0x22,
-    0xf7, 0x43, 0xd7, 0x65, 0xd4, 0x48, 0x18, 0xac, 0x3d, 0xfd, 0x93, },
-  { 0x85, 0x0d, 0xff, 0xb8, 0x3e, 0x87, 0x41, 0xb0, 0x95, 0xd3, 0x3d, 0x00,
-    0x47, 0x55, 0x9e, 0xd2, 0x69, 0xea, 0xbf, 0xe9, 0x7a, 0x2d, 0x61, 0x45, },
-  { 0x03, 0xe0, 0x85, 0xec, 0x54, 0xb5, 0x16, 0x53, 0xa8, 0xc4, 0x71, 0xe9,
-    0x6a, 0xe7, 0xcb, 0xc4, 0x15, 0x02, 0xfc, 0x34, 0xa4, 0xa4, 0x28, 0x13,
-    0xd1, },
-  { 0xe3, 0x34, 0x4b, 0xe1, 0xd0, 0x4b, 0x55, 0x61, 0x8f, 0xc0, 0x24, 0x05,
-    0xe6, 0xe0, 0x3d, 0x70, 0x24, 0x4d, 0xda, 0xb8, 0x91, 0x05, 0x29, 0x07,
-    0x01, 0x3e, },
-  { 0x61, 0xff, 0x01, 0x72, 0xb1, 0x4d, 0xf6, 0xfe, 0xd1, 0xd1, 0x08, 0x74,
-    0xe6, 0x91, 0x44, 0xeb, 0x61, 0xda, 0x40, 0xaf, 0xfc, 0x8c, 0x91, 0x6b,
-    0xec, 0x13, 0xed, },
-  { 0xd4, 0x40, 0xd2, 0xa0, 0x7f, 0xc1, 0x58, 0x0c, 0x85, 0xa0, 0x86, 0xc7,
-    0x86, 0xb9, 0x61, 0xc9, 0xea, 0x19, 0x86, 0x1f, 0xab, 0x07, 0xce, 0x37,
-    0x72, 0x67, 0x09, 0xfc, },
-  { 0x9e, 0xf8, 0x18, 0x67, 0x93, 0x10, 0x9b, 0x39, 0x75, 0xe8, 0x8b, 0x38,
-    0x82, 0x7d, 0xb8, 0xb7, 0xa5, 0xaf, 0xe6, 0x6a, 0x22, 0x5e, 0x1f, 0x9c,
-    0x95, 0x29, 0x19, 0xf2, 0x4b, },
-  { 0xc8, 0x62, 0x25, 0xf5, 0x98, 0xc9, 0xea, 0xe5, 0x29, 0x3a, 0xd3, 0x22,
-    0xeb, 0xeb, 0x07, 0x7c, 0x15, 0x07, 0xee, 0x15, 0x61, 0xbb, 0x05, 0x30,
-    0x99, 0x7f, 0x11, 0xf6, 0x0a, 0x1d, },
-  { 0x68, 0x70, 0xf7, 0x90, 0xa1, 0x8b, 0x1f, 0x0f, 0xbb, 0xce, 0xd2, 0x0e,
-    0x33, 0x1f, 0x7f, 0xa9, 0x78, 0xa8, 0xa6, 0x81, 0x66, 0xab, 0x8d, 0xcd,
-    0x58, 0x55, 0x3a, 0x0b, 0x7a, 0xdb, 0xb5, },
-  { 0xdd, 0x35, 0xd2, 0xb4, 0xf6, 0xc7, 0xea, 0xab, 0x64, 0x24, 0x4e, 0xfe,
-    0xe5, 0x3d, 0x4e, 0x95, 0x8b, 0x6d, 0x6c, 0xbc, 0xb0, 0xf8, 0x88, 0x61,
-    0x09, 0xb7, 0x78, 0xa3, 0x31, 0xfe, 0xd9, 0x2f, },
-  { 0x0a, },
-  { 0x6e, 0xd4, },
-  { 0x64, 0xe9, 0xd1, },
-  { 0x30, 0xdd, 0x71, 0xef, },
-  { 0x11, 0xb5, 0x0c, 0x87, 0xc9, },
-  { 0x06, 0x1c, 0x6d, 0x04, 0x82, 0xd0, },
-  { 0x5c, 0x42, 0x0b, 0xee, 0xc5, 0x9c, 0xb2, },
-  { 0xe8, 0x29, 0xd6, 0xb4, 0x5d, 0xf7, 0x2b, 0x93, },
-  { 0x18, 0xca, 0x27, 0x72, 0x43, 0x39, 0x16, 0xbc, 0x6a, },
-  { 0x39, 0x8f, 0xfd, 0x64, 0xf5, 0x57, 0x23, 0xb0, 0x45, 0xf8, },
-  { 0xbb, 0x3a, 0x78, 0x6b, 0x02, 0x1d, 0x0b, 0x16, 0xe3, 0xb2, 0x9a, },
-  { 0xb8, 0xb4, 0x0b, 0xe5, 0xd4, 0x1d, 0x0d, 0x85, 0x49, 0x91, 0x35, 0xfa, },
-  { 0x6d, 0x48, 0x2a, 0x0c, 0x42, 0x08, 0xbd, 0xa9, 0x78, 0x6f, 0x18, 0xaf,
-    0xe2, },
-  { 0x10, 0x45, 0xd4, 0x58, 0x88, 0xec, 0x4e, 0x1e, 0xf6, 0x14, 0x92, 0x64,
-    0x7e, 0xb0, },
-  { 0x8b, 0x0b, 0x95, 0xee, 0x92, 0xc6, 0x3b, 0x91, 0xf1, 0x1e, 0xeb, 0x51,
-    0x98, 0x0a, 0x8d, },
-  { 0xa3, 0x50, 0x4d, 0xa5, 0x1d, 0x03, 0x68, 0xe9, 0x57, 0x78, 0xd6, 0x04,
-    0xf1, 0xc3, 0x94, 0xd8, },
-  { 0xb8, 0x66, 0x6e, 0xdd, 0x46, 0x15, 0xae, 0x3d, 0x83, 0x7e, 0xcf, 0xe7,
-    0x2c, 0xe8, 0x8f, 0xc7, 0x34, },
-  { 0x2e, 0xc0, 0x1f, 0x29, 0xea, 0xf6, 0xb9, 0xe2, 0xc2, 0x93, 0xeb, 0x41,
-    0x0d, 0xf0, 0x0a, 0x13, 0x0e, 0xa2, },
-  { 0x71, 0xb8, 0x33, 0xa9, 0x1b, 0xac, 0xf1, 0xb5, 0x42, 0x8f, 0x5e, 0x81,
-    0x34, 0x43, 0xb7, 0xa4, 0x18, 0x5c, 0x47, },
-  { 0xda, 0x45, 0xb8, 0x2e, 0x82, 0x1e, 0xc0, 0x59, 0x77, 0x9d, 0xfa, 0xb4,
-    0x1c, 0x5e, 0xa0, 0x2b, 0x33, 0x96, 0x5a, 0x58, },
-  { 0xe3, 0x09, 0x05, 0xa9, 0xeb, 0x48, 0x13, 0xad, 0x71, 0x88, 0x81, 0x9a,
-    0x3e, 0x2c, 0xe1, 0x23, 0x99, 0x13, 0x35, 0x9f, 0xb5, },
-  { 0xb7, 0x86, 0x2d, 0x16, 0xe1, 0x04, 0x00, 0x47, 0x47, 0x61, 0x31, 0xfb,
-    0x14, 0xac, 0xd8, 0xe9, 0xe3, 0x49, 0xbd, 0xf7, 0x9c, 0x3f, },
-  { 0x7f, 0xd9, 0x95, 0xa8, 0xa7, 0xa0, 0xcc, 0xba, 0xef, 0xb1, 0x0a, 0xa9,
-    0x21, 0x62, 0x08, 0x0f, 0x1b, 0xff, 0x7b, 0x9d, 0xae, 0xb2, 0x95, },
-  { 0x85, 0x99, 0xea, 0x33, 0xe0, 0x56, 0xff, 0x13, 0xc6, 0x61, 0x8c, 0xf9,
-    0x57, 0x05, 0x03, 0x11, 0xf9, 0xfb, 0x3a, 0xf7, 0xce, 0xbb, 0x52, 0x30, },
-  { 0xb2, 0x72, 0x9c, 0xf8, 0x77, 0x4e, 0x8f, 0x6b, 0x01, 0x6c, 0xff, 0x4e,
-    0x4f, 0x02, 0xd2, 0xbc, 0xeb, 0x51, 0x28, 0x99, 0x50, 0xab, 0xc4, 0x42,
-    0xe3, },
-  { 0x8b, 0x0a, 0xb5, 0x90, 0x8f, 0xf5, 0x7b, 0xdd, 0xba, 0x47, 0x37, 0xc9,
-    0x2a, 0xd5, 0x4b, 0x25, 0x08, 0x8b, 0x02, 0x17, 0xa7, 0x9e, 0x6b, 0x6e,
-    0xe3, 0x90, },
-  { 0x90, 0xdd, 0xf7, 0x75, 0xa7, 0xa3, 0x99, 0x5e, 0x5b, 0x7d, 0x75, 0xc3,
-    0x39, 0x6b, 0xa0, 0xe2, 0x44, 0x53, 0xb1, 0x9e, 0xc8, 0xf1, 0x77, 0x10,
-    0x58, 0x06, 0x9a, },
-  { 0x99, 0x52, 0xf0, 0x49, 0xa8, 0x8c, 0xec, 0xa6, 0x97, 0x32, 0x13, 0xb5,
-    0xf7, 0xa3, 0x8e, 0xfb, 0x4b, 0x59, 0x31, 0x3d, 0x01, 0x59, 0x98, 0x5d,
-    0x53, 0x03, 0x1a, 0x39, },
-  { 0x9f, 0xe0, 0xc2, 0xe5, 0x5d, 0x93, 0xd6, 0x9b, 0x47, 0x8f, 0x9b, 0xe0,
-    0x26, 0x35, 0x84, 0x20, 0x1d, 0xc5, 0x53, 0x10, 0x0f, 0x22, 0xb9, 0xb5,
-    0xd4, 0x36, 0xb1, 0xac, 0x73, },
-  { 0x30, 0x32, 0x20, 0x3b, 0x10, 0x28, 0xec, 0x1f, 0x4f, 0x9b, 0x47, 0x59,
-    0xeb, 0x7b, 0xee, 0x45, 0xfb, 0x0c, 0x49, 0xd8, 0x3d, 0x69, 0xbd, 0x90,
-    0x2c, 0xf0, 0x9e, 0x8d, 0xbf, 0xd5, },
-  { 0x2a, 0x37, 0x73, 0x7f, 0xf9, 0x96, 0x19, 0xaa, 0x25, 0xd8, 0x13, 0x28,
-    0x01, 0x29, 0x89, 0xdf, 0x6e, 0x0c, 0x9b, 0x43, 0x44, 0x51, 0xe9, 0x75,
-    0x26, 0x0c, 0xb7, 0x87, 0x66, 0x0b, 0x5f, },
-  { 0x23, 0xdf, 0x96, 0x68, 0x91, 0x86, 0xd0, 0x93, 0x55, 0x33, 0x24, 0xf6,
-    0xba, 0x08, 0x75, 0x5b, 0x59, 0x11, 0x69, 0xb8, 0xb9, 0xe5, 0x2c, 0x77,
-    0x02, 0xf6, 0x47, 0xee, 0x81, 0xdd, 0xb9, 0x06, },
-  { 0x9d, },
-  { 0x9d, 0x7d, },
-  { 0xfd, 0xc3, 0xda, },
-  { 0xe8, 0x82, 0xcd, 0x21, },
-  { 0xc3, 0x1d, 0x42, 0x4c, 0x74, },
-  { 0xe9, 0xda, 0xf1, 0xa2, 0xe5, 0x7c, },
-  { 0x52, 0xb8, 0x6f, 0x81, 0x5c, 0x3a, 0x4c, },
-  { 0x5b, 0x39, 0x26, 0xfc, 0x92, 0x5e, 0xe0, 0x49, },
-  { 0x59, 0xe4, 0x7c, 0x93, 0x1c, 0xf9, 0x28, 0x93, 0xde, },
-  { 0xde, 0xdf, 0xb2, 0x43, 0x61, 0x0b, 0x86, 0x16, 0x4c, 0x2e, },
-  { 0x14, 0x8f, 0x75, 0x51, 0xaf, 0xb9, 0xee, 0x51, 0x5a, 0xae, 0x23, },
-  { 0x43, 0x5f, 0x50, 0xd5, 0x70, 0xb0, 0x5b, 0x87, 0xf5, 0xd9, 0xb3, 0x6d, },
-  { 0x66, 0x0a, 0x64, 0x93, 0x79, 0x71, 0x94, 0x40, 0xb7, 0x68, 0x2d, 0xd3,
-    0x63, },
-  { 0x15, 0x00, 0xc4, 0x0c, 0x7d, 0x1b, 0x10, 0xa9, 0x73, 0x1b, 0x90, 0x6f,
-    0xe6, 0xa9, },
-  { 0x34, 0x75, 0xf3, 0x86, 0x8f, 0x56, 0xcf, 0x2a, 0x0a, 0xf2, 0x62, 0x0a,
-    0xf6, 0x0e, 0x20, },
-  { 0xb1, 0xde, 0xc9, 0xf5, 0xdb, 0xf3, 0x2f, 0x4c, 0xd6, 0x41, 0x7d, 0x39,
-    0x18, 0x3e, 0xc7, 0xc3, },
-  { 0xc5, 0x89, 0xb2, 0xf8, 0xb8, 0xc0, 0xa3, 0xb9, 0x3b, 0x10, 0x6d, 0x7c,
-    0x92, 0xfc, 0x7f, 0x34, 0x41, },
-  { 0xc4, 0xd8, 0xef, 0xba, 0xef, 0xd2, 0xaa, 0xc5, 0x6c, 0x8e, 0x3e, 0xbb,
-    0x12, 0xfc, 0x0f, 0x72, 0xbf, 0x0f, },
-  { 0xdd, 0x91, 0xd1, 0x15, 0x9e, 0x7d, 0xf8, 0xc1, 0xb9, 0x14, 0x63, 0x96,
-    0xb5, 0xcb, 0x83, 0x1d, 0x35, 0x1c, 0xec, },
-  { 0xa9, 0xf8, 0x52, 0xc9, 0x67, 0x76, 0x2b, 0xad, 0xfb, 0xd8, 0x3a, 0xa6,
-    0x74, 0x02, 0xae, 0xb8, 0x25, 0x2c, 0x63, 0x49, },
-  { 0x77, 0x1f, 0x66, 0x70, 0xfd, 0x50, 0x29, 0xaa, 0xeb, 0xdc, 0xee, 0xba,
-    0x75, 0x98, 0xdc, 0x93, 0x12, 0x3f, 0xdc, 0x7c, 0x38, },
-  { 0xe2, 0xe1, 0x89, 0x5c, 0x37, 0x38, 0x6a, 0xa3, 0x40, 0xac, 0x3f, 0xb0,
-    0xca, 0xfc, 0xa7, 0xf3, 0xea, 0xf9, 0x0f, 0x5d, 0x8e, 0x39, },
-  { 0x0f, 0x67, 0xc8, 0x38, 0x01, 0xb1, 0xb7, 0xb8, 0xa2, 0xe7, 0x0a, 0x6d,
-    0xd2, 0x63, 0x69, 0x9e, 0xcc, 0xf0, 0xf2, 0xbe, 0x9b, 0x98, 0xdd, },
-  { 0x13, 0xe1, 0x36, 0x30, 0xfe, 0xc6, 0x01, 0x8a, 0xa1, 0x63, 0x96, 0x59,
-    0xc2, 0xa9, 0x68, 0x3f, 0x58, 0xd4, 0x19, 0x0c, 0x40, 0xf3, 0xde, 0x02, },
-  { 0xa3, 0x9e, 0xce, 0xda, 0x42, 0xee, 0x8c, 0x6c, 0x5a, 0x7d, 0xdc, 0x89,
-    0x02, 0x77, 0xdd, 0xe7, 0x95, 0xbb, 0xff, 0x0d, 0xa4, 0xb5, 0x38, 0x1e,
-    0xaf, },
-  { 0x9a, 0xf6, 0xb5, 0x9a, 0x4f, 0xa9, 0x4f, 0x2c, 0x35, 0x3c, 0x24, 0xdc,
-    0x97, 0x6f, 0xd9, 0xa1, 0x7d, 0x1a, 0x85, 0x0b, 0xf5, 0xda, 0x2e, 0xe7,
-    0xb1, 0x1d, },
-  { 0x84, 0x1e, 0x8e, 0x3d, 0x45, 0xa5, 0xf2, 0x27, 0xf3, 0x31, 0xfe, 0xb9,
-    0xfb, 0xc5, 0x45, 0x99, 0x99, 0xdd, 0x93, 0x43, 0x02, 0xee, 0x58, 0xaf,
-    0xee, 0x6a, 0xbe, },
-  { 0x07, 0x2f, 0xc0, 0xa2, 0x04, 0xc4, 0xab, 0x7c, 0x26, 0xbb, 0xa8, 0xd8,
-    0xe3, 0x1c, 0x75, 0x15, 0x64, 0x5d, 0x02, 0x6a, 0xf0, 0x86, 0xe9, 0xcd,
-    0x5c, 0xef, 0xa3, 0x25, },
-  { 0x2f, 0x3b, 0x1f, 0xb5, 0x91, 0x8f, 0x86, 0xe0, 0xdc, 0x31, 0x48, 0xb6,
-    0xa1, 0x8c, 0xfd, 0x75, 0xbb, 0x7d, 0x3d, 0xc1, 0xf0, 0x10, 0x9a, 0xd8,
-    0x4b, 0x0e, 0xe3, 0x94, 0x9f, },
-  { 0x29, 0xbb, 0x8f, 0x6c, 0xd1, 0xf2, 0xb6, 0xaf, 0xe5, 0xe3, 0x2d, 0xdc,
-    0x6f, 0xa4, 0x53, 0x88, 0xd8, 0xcf, 0x4d, 0x45, 0x42, 0x62, 0xdb, 0xdf,
-    0xf8, 0x45, 0xc2, 0x13, 0xec, 0x35, },
-  { 0x06, 0x3c, 0xe3, 0x2c, 0x15, 0xc6, 0x43, 0x03, 0x81, 0xfb, 0x08, 0x76,
-    0x33, 0xcb, 0x02, 0xc1, 0xba, 0x33, 0xe5, 0xe0, 0xd1, 0x92, 0xa8, 0x46,
-    0x28, 0x3f, 0x3e, 0x9d, 0x2c, 0x44, 0x54, },
-  { 0xea, 0xbb, 0x96, 0xf8, 0xd1, 0x8b, 0x04, 0x11, 0x40, 0x78, 0x42, 0x02,
-    0x19, 0xd1, 0xbc, 0x65, 0x92, 0xd3, 0xc3, 0xd6, 0xd9, 0x19, 0xe7, 0xc3,
-    0x40, 0x97, 0xbd, 0xd4, 0xed, 0xfa, 0x5e, 0x28, },
-  { 0x02, },
-  { 0x52, 0xa8, },
-  { 0x38, 0x25, 0x0d, },
-  { 0xe3, 0x04, 0xd4, 0x92, },
-  { 0x97, 0xdb, 0xf7, 0x81, 0xca, },
-  { 0x8a, 0x56, 0x9d, 0x62, 0x56, 0xcc, },
-  { 0xa1, 0x8e, 0x3c, 0x72, 0x8f, 0x63, 0x03, },
-  { 0xf7, 0xf3, 0x39, 0x09, 0x0a, 0xa1, 0xbb, 0x23, },
-  { 0x6b, 0x03, 0xc0, 0xe9, 0xd9, 0x83, 0x05, 0x22, 0x01, },
-  { 0x1b, 0x4b, 0xf5, 0xd6, 0x4f, 0x05, 0x75, 0x91, 0x4c, 0x7f, },
-  { 0x4c, 0x8c, 0x25, 0x20, 0x21, 0xcb, 0xc2, 0x4b, 0x3a, 0x5b, 0x8d, },
-  { 0x56, 0xe2, 0x77, 0xa0, 0xb6, 0x9f, 0x81, 0xec, 0x83, 0x75, 0xc4, 0xf9, },
-  { 0x71, 0x70, 0x0f, 0xad, 0x4d, 0x35, 0x81, 0x9d, 0x88, 0x69, 0xf9, 0xaa,
-    0xd3, },
-  { 0x50, 0x6e, 0x86, 0x6e, 0x43, 0xc0, 0xc2, 0x44, 0xc2, 0xe2, 0xa0, 0x1c,
-    0xb7, 0x9a, },
-  { 0xe4, 0x7e, 0x72, 0xc6, 0x12, 0x8e, 0x7c, 0xfc, 0xbd, 0xe2, 0x08, 0x31,
-    0x3d, 0x47, 0x3d, },
-  { 0x08, 0x97, 0x5b, 0x80, 0xae, 0xc4, 0x1d, 0x50, 0x77, 0xdf, 0x1f, 0xd0,
-    0x24, 0xf0, 0x17, 0xc0, },
-  { 0x01, 0xb6, 0x29, 0xf4, 0xaf, 0x78, 0x5f, 0xb6, 0x91, 0xdd, 0x76, 0x76,
-    0xd2, 0xfd, 0x0c, 0x47, 0x40, },
-  { 0xa1, 0xd8, 0x09, 0x97, 0x7a, 0xa6, 0xc8, 0x94, 0xf6, 0x91, 0x7b, 0xae,
-    0x2b, 0x9f, 0x0d, 0x83, 0x48, 0xf7, },
-  { 0x12, 0xd5, 0x53, 0x7d, 0x9a, 0xb0, 0xbe, 0xd9, 0xed, 0xe9, 0x9e, 0xee,
-    0x61, 0x5b, 0x42, 0xf2, 0xc0, 0x73, 0xc0, },
-  { 0xd5, 0x77, 0xd6, 0x5c, 0x6e, 0xa5, 0x69, 0x2b, 0x3b, 0x8c, 0xd6, 0x7d,
-    0x1d, 0xbe, 0x2c, 0xa1, 0x02, 0x21, 0xcd, 0x29, },
-  { 0xa4, 0x98, 0x80, 0xca, 0x22, 0xcf, 0x6a, 0xab, 0x5e, 0x40, 0x0d, 0x61,
-    0x08, 0x21, 0xef, 0xc0, 0x6c, 0x52, 0xb4, 0xb0, 0x53, },
-  { 0xbf, 0xaf, 0x8f, 0x3b, 0x7a, 0x97, 0x33, 0xe5, 0xca, 0x07, 0x37, 0xfd,
-    0x15, 0xdf, 0xce, 0x26, 0x2a, 0xb1, 0xa7, 0x0b, 0xb3, 0xac, },
-  { 0x16, 0x22, 0xe1, 0xbc, 0x99, 0x4e, 0x01, 0xf0, 0xfa, 0xff, 0x8f, 0xa5,
-    0x0c, 0x61, 0xb0, 0xad, 0xcc, 0xb1, 0xe1, 0x21, 0x46, 0xfa, 0x2e, },
-  { 0x11, 0x5b, 0x0b, 0x2b, 0xe6, 0x14, 0xc1, 0xd5, 0x4d, 0x71, 0x5e, 0x17,
-    0xea, 0x23, 0xdd, 0x6c, 0xbd, 0x1d, 0xbe, 0x12, 0x1b, 0xee, 0x4c, 0x1a, },
-  { 0x40, 0x88, 0x22, 0xf3, 0x20, 0x6c, 0xed, 0xe1, 0x36, 0x34, 0x62, 0x2c,
-    0x98, 0x83, 0x52, 0xe2, 0x25, 0xee, 0xe9, 0xf5, 0xe1, 0x17, 0xf0, 0x5c,
-    0xae, },
-  { 0xc3, 0x76, 0x37, 0xde, 0x95, 0x8c, 0xca, 0x2b, 0x0c, 0x23, 0xe7, 0xb5,
-    0x38, 0x70, 0x61, 0xcc, 0xff, 0xd3, 0x95, 0x7b, 0xf3, 0xff, 0x1f, 0x9d,
-    0x59, 0x00, },
-  { 0x0c, 0x19, 0x52, 0x05, 0x22, 0x53, 0xcb, 0x48, 0xd7, 0x10, 0x0e, 0x7e,
-    0x14, 0x69, 0xb5, 0xa2, 0x92, 0x43, 0xa3, 0x9e, 0x4b, 0x8f, 0x51, 0x2c,
-    0x5a, 0x2c, 0x3b, },
-  { 0xe1, 0x9d, 0x70, 0x70, 0x28, 0xec, 0x86, 0x40, 0x55, 0x33, 0x56, 0xda,
-    0x88, 0xca, 0xee, 0xc8, 0x6a, 0x20, 0xb1, 0xe5, 0x3d, 0x57, 0xf8, 0x3c,
-    0x10, 0x07, 0x2a, 0xc4, },
-  { 0x0b, 0xae, 0xf1, 0xc4, 0x79, 0xee, 0x1b, 0x3d, 0x27, 0x35, 0x8d, 0x14,
-    0xd6, 0xae, 0x4e, 0x3c, 0xe9, 0x53, 0x50, 0xb5, 0xcc, 0x0c, 0xf7, 0xdf,
-    0xee, 0xa1, 0x74, 0xd6, 0x71, },
-  { 0xe6, 0xa4, 0xf4, 0x99, 0x98, 0xb9, 0x80, 0xea, 0x96, 0x7f, 0x4f, 0x33,
-    0xcf, 0x74, 0x25, 0x6f, 0x17, 0x6c, 0xbf, 0xf5, 0x5c, 0x38, 0xd0, 0xff,
-    0x96, 0xcb, 0x13, 0xf9, 0xdf, 0xfd, },
-  { 0xbe, 0x92, 0xeb, 0xba, 0x44, 0x2c, 0x24, 0x74, 0xd4, 0x03, 0x27, 0x3c,
-    0x5d, 0x5b, 0x03, 0x30, 0x87, 0x63, 0x69, 0xe0, 0xb8, 0x94, 0xf4, 0x44,
-    0x7e, 0xad, 0xcd, 0x20, 0x12, 0x16, 0x79, },
-  { 0x30, 0xf1, 0xc4, 0x8e, 0x05, 0x90, 0x2a, 0x97, 0x63, 0x94, 0x46, 0xff,
-    0xce, 0xd8, 0x67, 0xa7, 0xac, 0x33, 0x8c, 0x95, 0xb7, 0xcd, 0xa3, 0x23,
-    0x98, 0x9d, 0x76, 0x6c, 0x9d, 0xa8, 0xd6, 0x8a, },
-  { 0xbe, },
-  { 0x17, 0x6c, },
-  { 0x1a, 0x42, 0x4f, },
-  { 0xba, 0xaf, 0xb7, 0x65, },
-  { 0xc2, 0x63, 0x43, 0x6a, 0xea, },
-  { 0xe4, 0x4d, 0xad, 0xf2, 0x0b, 0x02, },
-  { 0x04, 0xc7, 0xc4, 0x7f, 0xa9, 0x2b, 0xce, },
-  { 0x66, 0xf6, 0x67, 0xcb, 0x03, 0x53, 0xc8, 0xf1, },
-  { 0x56, 0xa3, 0x60, 0x78, 0xc9, 0x5f, 0x70, 0x1b, 0x5e, },
-  { 0x99, 0xff, 0x81, 0x7c, 0x13, 0x3c, 0x29, 0x79, 0x4b, 0x65, },
-  { 0x51, 0x10, 0x50, 0x93, 0x01, 0x93, 0xb7, 0x01, 0xc9, 0x18, 0xb7, },
-  { 0x8e, 0x3c, 0x42, 0x1e, 0x5e, 0x7d, 0xc1, 0x50, 0x70, 0x1f, 0x00, 0x98, },
-  { 0x5f, 0xd9, 0x9b, 0xc8, 0xd7, 0xb2, 0x72, 0x62, 0x1a, 0x1e, 0xba, 0x92,
-    0xe9, },
-  { 0x70, 0x2b, 0xba, 0xfe, 0xad, 0x5d, 0x96, 0x3f, 0x27, 0xc2, 0x41, 0x6d,
-    0xc4, 0xb3, },
-  { 0xae, 0xe0, 0xd5, 0xd4, 0xc7, 0xae, 0x15, 0x5e, 0xdc, 0xdd, 0x33, 0x60,
-    0xd7, 0xd3, 0x5e, },
-  { 0x79, 0x8e, 0xbc, 0x9e, 0x20, 0xb9, 0x19, 0x4b, 0x63, 0x80, 0xf3, 0x16,
-    0xaf, 0x39, 0xbd, 0x92, },
-  { 0xc2, 0x0e, 0x85, 0xa0, 0x0b, 0x9a, 0xb0, 0xec, 0xde, 0x38, 0xd3, 0x10,
-    0xd9, 0xa7, 0x66, 0x27, 0xcf, },
-  { 0x0e, 0x3b, 0x75, 0x80, 0x67, 0x14, 0x0c, 0x02, 0x90, 0xd6, 0xb3, 0x02,
-    0x81, 0xf6, 0xa6, 0x87, 0xce, 0x58, },
-  { 0x79, 0xb5, 0xe9, 0x5d, 0x52, 0x4d, 0xf7, 0x59, 0xf4, 0x2e, 0x27, 0xdd,
-    0xb3, 0xed, 0x57, 0x5b, 0x82, 0xea, 0x6f, },
-  { 0xa2, 0x97, 0xf5, 0x80, 0x02, 0x3d, 0xde, 0xa3, 0xf9, 0xf6, 0xab, 0xe3,
-    0x57, 0x63, 0x7b, 0x9b, 0x10, 0x42, 0x6f, 0xf2, },
-  { 0x12, 0x7a, 0xfc, 0xb7, 0x67, 0x06, 0x0c, 0x78, 0x1a, 0xfe, 0x88, 0x4f,
-    0xc6, 0xac, 0x52, 0x96, 0x64, 0x28, 0x97, 0x84, 0x06, },
-  { 0xc5, 0x04, 0x44, 0x6b, 0xb2, 0xa5, 0xa4, 0x66, 0xe1, 0x76, 0xa2, 0x51,
-    0xf9, 0x59, 0x69, 0x97, 0x56, 0x0b, 0xbf, 0x50, 0xb3, 0x34, },
-  { 0x21, 0x32, 0x6b, 0x42, 0xb5, 0xed, 0x71, 0x8d, 0xf7, 0x5a, 0x35, 0xe3,
-    0x90, 0xe2, 0xee, 0xaa, 0x89, 0xf6, 0xc9, 0x9c, 0x4d, 0x73, 0xf4, },
-  { 0x4c, 0xa6, 0x09, 0xf4, 0x48, 0xe7, 0x46, 0xbc, 0x49, 0xfc, 0xe5, 0xda,
-    0xd1, 0x87, 0x13, 0x17, 0x4c, 0x59, 0x71, 0x26, 0x5b, 0x2c, 0x42, 0xb7, },
-  { 0x13, 0x63, 0xf3, 0x40, 0x02, 0xe5, 0xa3, 0x3a, 0x5e, 0x8e, 0xf8, 0xb6,
-    0x8a, 0x49, 0x60, 0x76, 0x34, 0x72, 0x94, 0x73, 0xf6, 0xd9, 0x21, 0x6a,
-    0x26, },
-  { 0xdf, 0x75, 0x16, 0x10, 0x1b, 0x5e, 0x81, 0xc3, 0xc8, 0xde, 0x34, 0x24,
-    0xb0, 0x98, 0xeb, 0x1b, 0x8f, 0xa1, 0x9b, 0x05, 0xee, 0xa5, 0xe9, 0x35,
-    0xf4, 0x1d, },
-  { 0xcd, 0x21, 0x93, 0x6e, 0x5b, 0xa0, 0x26, 0x2b, 0x21, 0x0e, 0xa0, 0xb9,
-    0x1c, 0xb5, 0xbb, 0xb8, 0xf8, 0x1e, 0xff, 0x5c, 0xa8, 0xf9, 0x39, 0x46,
-    0x4e, 0x29, 0x26, },
-  { 0x73, 0x7f, 0x0e, 0x3b, 0x0b, 0x5c, 0xf9, 0x60, 0xaa, 0x88, 0xa1, 0x09,
-    0xb1, 0x5d, 0x38, 0x7b, 0x86, 0x8f, 0x13, 0x7a, 0x8d, 0x72, 0x7a, 0x98,
-    0x1a, 0x5b, 0xff, 0xc9, },
-  { 0xd3, 0x3c, 0x61, 0x71, 0x44, 0x7e, 0x31, 0x74, 0x98, 0x9d, 0x9a, 0xd2,
-    0x27, 0xf3, 0x46, 0x43, 0x42, 0x51, 0xd0, 0x5f, 0xe9, 0x1c, 0x5c, 0x69,
-    0xbf, 0xf6, 0xbe, 0x3c, 0x40, },
-  { 0x31, 0x99, 0x31, 0x9f, 0xaa, 0x43, 0x2e, 0x77, 0x3e, 0x74, 0x26, 0x31,
-    0x5e, 0x61, 0xf1, 0x87, 0xe2, 0xeb, 0x9b, 0xcd, 0xd0, 0x3a, 0xee, 0x20,
-    0x7e, 0x10, 0x0a, 0x0b, 0x7e, 0xfa, },
-  { 0xa4, 0x27, 0x80, 0x67, 0x81, 0x2a, 0xa7, 0x62, 0xf7, 0x6e, 0xda, 0xd4,
-    0x5c, 0x39, 0x74, 0xad, 0x7e, 0xbe, 0xad, 0xa5, 0x84, 0x7f, 0xa9, 0x30,
-    0x5d, 0xdb, 0xe2, 0x05, 0x43, 0xf7, 0x1b, },
-  { 0x0b, 0x37, 0xd8, 0x02, 0xe1, 0x83, 0xd6, 0x80, 0xf2, 0x35, 0xc2, 0xb0,
-    0x37, 0xef, 0xef, 0x5e, 0x43, 0x93, 0xf0, 0x49, 0x45, 0x0a, 0xef, 0xb5,
-    0x76, 0x70, 0x12, 0x44, 0xc4, 0xdb, 0xf5, 0x7a, },
-  { 0x1f, },
-  { 0x82, 0x60, },
-  { 0xcc, 0xe3, 0x08, },
-  { 0x56, 0x17, 0xe4, 0x59, },
-  { 0xe2, 0xd7, 0x9e, 0xc4, 0x4c, },
-  { 0xb2, 0xad, 0xd3, 0x78, 0x58, 0x5a, },
-  { 0xce, 0x43, 0xb4, 0x02, 0x96, 0xab, 0x3c, },
-  { 0xe6, 0x05, 0x1a, 0x73, 0x22, 0x32, 0xbb, 0x77, },
-  { 0x23, 0xe7, 0xda, 0xfe, 0x2c, 0xef, 0x8c, 0x22, 0xec, },
-  { 0xe9, 0x8e, 0x55, 0x38, 0xd1, 0xd7, 0x35, 0x23, 0x98, 0xc7, },
-  { 0xb5, 0x81, 0x1a, 0xe5, 0xb5, 0xa5, 0xd9, 0x4d, 0xca, 0x41, 0xe7, },
-  { 0x41, 0x16, 0x16, 0x95, 0x8d, 0x9e, 0x0c, 0xea, 0x8c, 0x71, 0x9a, 0xc1, },
-  { 0x7c, 0x33, 0xc0, 0xa4, 0x00, 0x62, 0xea, 0x60, 0x67, 0xe4, 0x20, 0xbc,
-    0x5b, },
-  { 0xdb, 0xb1, 0xdc, 0xfd, 0x08, 0xc0, 0xde, 0x82, 0xd1, 0xde, 0x38, 0xc0,
-    0x90, 0x48, },
-  { 0x37, 0x18, 0x2e, 0x0d, 0x61, 0xaa, 0x61, 0xd7, 0x86, 0x20, 0x16, 0x60,
-    0x04, 0xd9, 0xd5, },
-  { 0xb0, 0xcf, 0x2c, 0x4c, 0x5e, 0x5b, 0x4f, 0x2a, 0x23, 0x25, 0x58, 0x47,
-    0xe5, 0x31, 0x06, 0x70, },
-  { 0x91, 0xa0, 0xa3, 0x86, 0x4e, 0xe0, 0x72, 0x38, 0x06, 0x67, 0x59, 0x5c,
-    0x70, 0x25, 0xdb, 0x33, 0x27, },
-  { 0x44, 0x58, 0x66, 0xb8, 0x58, 0xc7, 0x13, 0xed, 0x4c, 0xc0, 0xf4, 0x9a,
-    0x1e, 0x67, 0x75, 0x33, 0xb6, 0xb8, },
-  { 0x7f, 0x98, 0x4a, 0x8e, 0x50, 0xa2, 0x5c, 0xcd, 0x59, 0xde, 0x72, 0xb3,
-    0x9d, 0xc3, 0x09, 0x8a, 0xab, 0x56, 0xf1, },
-  { 0x80, 0x96, 0x49, 0x1a, 0x59, 0xa2, 0xc5, 0xd5, 0xa7, 0x20, 0x8a, 0xb7,
-    0x27, 0x62, 0x84, 0x43, 0xc6, 0xe1, 0x1b, 0x5d, },
-  { 0x6b, 0xb7, 0x2b, 0x26, 0x62, 0x14, 0x70, 0x19, 0x3d, 0x4d, 0xac, 0xac,
-    0x63, 0x58, 0x5e, 0x94, 0xb5, 0xb7, 0xe8, 0xe8, 0xa2, },
-  { 0x20, 0xa8, 0xc0, 0xfd, 0x63, 0x3d, 0x6e, 0x98, 0xcf, 0x0c, 0x49, 0x98,
-    0xe4, 0x5a, 0xfe, 0x8c, 0xaa, 0x70, 0x82, 0x1c, 0x7b, 0x74, },
-  { 0xc8, 0xe8, 0xdd, 0xdf, 0x69, 0x30, 0x01, 0xc2, 0x0f, 0x7e, 0x2f, 0x11,
-    0xcc, 0x3e, 0x17, 0xa5, 0x69, 0x40, 0x3f, 0x0e, 0x79, 0x7f, 0xcf, },
-  { 0xdb, 0x61, 0xc0, 0xe2, 0x2e, 0x49, 0x07, 0x31, 0x1d, 0x91, 0x42, 0x8a,
-    0xfc, 0x5e, 0xd3, 0xf8, 0x56, 0x1f, 0x2b, 0x73, 0xfd, 0x9f, 0xb2, 0x8e, },
-  { 0x0c, 0x89, 0x55, 0x0c, 0x1f, 0x59, 0x2c, 0x9d, 0x1b, 0x29, 0x1d, 0x41,
-    0x1d, 0xe6, 0x47, 0x8f, 0x8c, 0x2b, 0xea, 0x8f, 0xf0, 0xff, 0x21, 0x70,
-    0x88, },
-  { 0x12, 0x18, 0x95, 0xa6, 0x59, 0xb1, 0x31, 0x24, 0x45, 0x67, 0x55, 0xa4,
-    0x1a, 0x2d, 0x48, 0x67, 0x1b, 0x43, 0x88, 0x2d, 0x8e, 0xa0, 0x70, 0xb3,
-    0xc6, 0xbb, },
-  { 0xe7, 0xb1, 0x1d, 0xb2, 0x76, 0x4d, 0x68, 0x68, 0x68, 0x23, 0x02, 0x55,
-    0x3a, 0xe2, 0xe5, 0xd5, 0x4b, 0x43, 0xf9, 0x34, 0x77, 0x5c, 0xa1, 0xf5,
-    0x55, 0xfd, 0x4f, },
-  { 0x8c, 0x87, 0x5a, 0x08, 0x3a, 0x73, 0xad, 0x61, 0xe1, 0xe7, 0x99, 0x7e,
-    0xf0, 0x5d, 0xe9, 0x5d, 0x16, 0x43, 0x80, 0x2f, 0xd0, 0x66, 0x34, 0xe2,
-    0x42, 0x64, 0x3b, 0x1a, },
-  { 0x39, 0xc1, 0x99, 0xcf, 0x22, 0xbf, 0x16, 0x8f, 0x9f, 0x80, 0x7f, 0x95,
-    0x0a, 0x05, 0x67, 0x27, 0xe7, 0x15, 0xdf, 0x9d, 0xb2, 0xfe, 0x1c, 0xb5,
-    0x1d, 0x60, 0x8f, 0x8a, 0x1d, },
-  { 0x9b, 0x6e, 0x08, 0x09, 0x06, 0x73, 0xab, 0x68, 0x02, 0x62, 0x1a, 0xe4,
-    0xd4, 0xdf, 0xc7, 0x02, 0x4c, 0x6a, 0x5f, 0xfd, 0x23, 0xac, 0xae, 0x6d,
-    0x43, 0xa4, 0x7a, 0x50, 0x60, 0x3c, },
-  { 0x1d, 0xb4, 0xc6, 0xe1, 0xb1, 0x4b, 0xe3, 0xf2, 0xe2, 0x1a, 0x73, 0x1b,
-    0xa0, 0x92, 0xa7, 0xf5, 0xff, 0x8f, 0x8b, 0x5d, 0xdf, 0xa8, 0x04, 0xb3,
-    0xb0, 0xf7, 0xcc, 0x12, 0xfa, 0x35, 0x46, },
-  { 0x49, 0x45, 0x97, 0x11, 0x0f, 0x1c, 0x60, 0x8e, 0xe8, 0x47, 0x30, 0xcf,
-    0x60, 0xa8, 0x71, 0xc5, 0x1b, 0xe9, 0x39, 0x4d, 0x49, 0xb6, 0x12, 0x1f,
-    0x24, 0xab, 0x37, 0xff, 0x83, 0xc2, 0xe1, 0x3a, },
-  { 0x60, },
-  { 0x24, 0x26, },
-  { 0x47, 0xeb, 0xc9, },
-  { 0x4a, 0xd0, 0xbc, 0xf0, },
-  { 0x8e, 0x2b, 0xc9, 0x85, 0x3c, },
-  { 0xa2, 0x07, 0x15, 0xb8, 0x12, 0x74, },
-  { 0x0f, 0xdb, 0x5b, 0x33, 0x69, 0xfe, 0x4b, },
-  { 0xa2, 0x86, 0x54, 0xf4, 0xfd, 0xb2, 0xd4, 0xe6, },
-  { 0xbb, 0x84, 0x78, 0x49, 0x27, 0x8e, 0x61, 0xda, 0x60, },
-  { 0x04, 0xc3, 0xcd, 0xaa, 0x8f, 0xa7, 0x03, 0xc9, 0xf9, 0xb6, },
-  { 0xf8, 0x27, 0x1d, 0x61, 0xdc, 0x21, 0x42, 0xdd, 0xad, 0x92, 0x40, },
-  { 0x12, 0x87, 0xdf, 0xc2, 0x41, 0x45, 0x5a, 0x36, 0x48, 0x5b, 0x51, 0x2b, },
-  { 0xbb, 0x37, 0x5d, 0x1f, 0xf1, 0x68, 0x7a, 0xc4, 0xa5, 0xd2, 0xa4, 0x91,
-    0x8d, },
-  { 0x5b, 0x27, 0xd1, 0x04, 0x54, 0x52, 0x9f, 0xa3, 0x47, 0x86, 0x33, 0x33,
-    0xbf, 0xa0, },
-  { 0xcf, 0x04, 0xea, 0xf8, 0x03, 0x2a, 0x43, 0xff, 0xa6, 0x68, 0x21, 0x4c,
-    0xd5, 0x4b, 0xed, },
-  { 0xaf, 0xb8, 0xbc, 0x63, 0x0f, 0x18, 0x4d, 0xe2, 0x7a, 0xdd, 0x46, 0x44,
-    0xc8, 0x24, 0x0a, 0xb7, },
-  { 0x3e, 0xdc, 0x36, 0xe4, 0x89, 0xb1, 0xfa, 0xc6, 0x40, 0x93, 0x2e, 0x75,
-    0xb2, 0x15, 0xd1, 0xb1, 0x10, },
-  { 0x6c, 0xd8, 0x20, 0x3b, 0x82, 0x79, 0xf9, 0xc8, 0xbc, 0x9d, 0xe0, 0x35,
-    0xbe, 0x1b, 0x49, 0x1a, 0xbc, 0x3a, },
-  { 0x78, 0x65, 0x2c, 0xbe, 0x35, 0x67, 0xdc, 0x78, 0xd4, 0x41, 0xf6, 0xc9,
-    0xde, 0xde, 0x1f, 0x18, 0x13, 0x31, 0x11, },
-  { 0x8a, 0x7f, 0xb1, 0x33, 0x8f, 0x0c, 0x3c, 0x0a, 0x06, 0x61, 0xf0, 0x47,
-    0x29, 0x1b, 0x29, 0xbc, 0x1c, 0x47, 0xef, 0x7a, },
-  { 0x65, 0x91, 0xf1, 0xe6, 0xb3, 0x96, 0xd3, 0x8c, 0xc2, 0x4a, 0x59, 0x35,
-    0x72, 0x8e, 0x0b, 0x9a, 0x87, 0xca, 0x34, 0x7b, 0x63, },
-  { 0x5f, 0x08, 0x87, 0x80, 0x56, 0x25, 0x89, 0x77, 0x61, 0x8c, 0x64, 0xa1,
-    0x59, 0x6d, 0x59, 0x62, 0xe8, 0x4a, 0xc8, 0x58, 0x99, 0xd1, },
-  { 0x23, 0x87, 0x1d, 0xed, 0x6f, 0xf2, 0x91, 0x90, 0xe2, 0xfe, 0x43, 0x21,
-    0xaf, 0x97, 0xc6, 0xbc, 0xd7, 0x15, 0xc7, 0x2d, 0x08, 0x77, 0x91, },
-  { 0x90, 0x47, 0x9a, 0x9e, 0x3a, 0xdf, 0xf3, 0xc9, 0x4c, 0x1e, 0xa7, 0xd4,
-    0x6a, 0x32, 0x90, 0xfe, 0xb7, 0xb6, 0x7b, 0xfa, 0x96, 0x61, 0xfb, 0xa4, },
-  { 0xb1, 0x67, 0x60, 0x45, 0xb0, 0x96, 0xc5, 0x15, 0x9f, 0x4d, 0x26, 0xd7,
-    0x9d, 0xf1, 0xf5, 0x6d, 0x21, 0x00, 0x94, 0x31, 0x64, 0x94, 0xd3, 0xa7,
-    0xd3, },
-  { 0x02, 0x3e, 0xaf, 0xf3, 0x79, 0x73, 0xa5, 0xf5, 0xcc, 0x7a, 0x7f, 0xfb,
-    0x79, 0x2b, 0x85, 0x8c, 0x88, 0x72, 0x06, 0xbe, 0xfe, 0xaf, 0xc1, 0x16,
-    0xa6, 0xd6, },
-  { 0x2a, 0xb0, 0x1a, 0xe5, 0xaa, 0x6e, 0xb3, 0xae, 0x53, 0x85, 0x33, 0x80,
-    0x75, 0xae, 0x30, 0xe6, 0xb8, 0x72, 0x42, 0xf6, 0x25, 0x4f, 0x38, 0x88,
-    0x55, 0xd1, 0xa9, },
-  { 0x90, 0xd8, 0x0c, 0xc0, 0x93, 0x4b, 0x4f, 0x9e, 0x65, 0x6c, 0xa1, 0x54,
-    0xa6, 0xf6, 0x6e, 0xca, 0xd2, 0xbb, 0x7e, 0x6a, 0x1c, 0xd3, 0xce, 0x46,
-    0xef, 0xb0, 0x00, 0x8d, },
-  { 0xed, 0x9c, 0x49, 0xcd, 0xc2, 0xde, 0x38, 0x0e, 0xe9, 0x98, 0x6c, 0xc8,
-    0x90, 0x9e, 0x3c, 0xd4, 0xd3, 0xeb, 0x88, 0x32, 0xc7, 0x28, 0xe3, 0x94,
-    0x1c, 0x9f, 0x8b, 0xf3, 0xcb, },
-  { 0xac, 0xe7, 0x92, 0x16, 0xb4, 0x14, 0xa0, 0xe4, 0x04, 0x79, 0xa2, 0xf4,
-    0x31, 0xe6, 0x0c, 0x26, 0xdc, 0xbf, 0x2f, 0x69, 0x1b, 0x55, 0x94, 0x67,
-    0xda, 0x0c, 0xd7, 0x32, 0x1f, 0xef, },
-  { 0x68, 0x63, 0x85, 0x57, 0x95, 0x9e, 0x42, 0x27, 0x41, 0x43, 0x42, 0x02,
-    0xa5, 0x78, 0xa7, 0xc6, 0x43, 0xc1, 0x6a, 0xba, 0x70, 0x80, 0xcd, 0x04,
-    0xb6, 0x78, 0x76, 0x29, 0xf3, 0xe8, 0xa0, },
-  { 0xe6, 0xac, 0x8d, 0x9d, 0xf0, 0xc0, 0xf7, 0xf7, 0xe3, 0x3e, 0x4e, 0x28,
-    0x0f, 0x59, 0xb2, 0x67, 0x9e, 0x84, 0x34, 0x42, 0x96, 0x30, 0x2b, 0xca,
-    0x49, 0xb6, 0xc5, 0x9a, 0x84, 0x59, 0xa7, 0x81, },
-  { 0x7e, },
-  { 0x1e, 0x21, },
-  { 0x26, 0xd3, 0xdd, },
-  { 0x2c, 0xd4, 0xb3, 0x3d, },
-  { 0x86, 0x7b, 0x76, 0x3c, 0xf0, },
-  { 0x12, 0xc3, 0x70, 0x1d, 0x55, 0x18, },
-  { 0x96, 0xc2, 0xbd, 0x61, 0x55, 0xf4, 0x24, },
-  { 0x20, 0x51, 0xf7, 0x86, 0x58, 0x8f, 0x07, 0x2a, },
-  { 0x93, 0x15, 0xa8, 0x1d, 0xda, 0x97, 0xee, 0x0e, 0x6c, },
-  { 0x39, 0x93, 0xdf, 0xd5, 0x0e, 0xca, 0xdc, 0x7a, 0x92, 0xce, },
-  { 0x60, 0xd5, 0xfd, 0xf5, 0x1b, 0x26, 0x82, 0x26, 0x73, 0x02, 0xbc, },
-  { 0x98, 0xf2, 0x34, 0xe1, 0xf5, 0xfb, 0x00, 0xac, 0x10, 0x4a, 0x38, 0x9f, },
-  { 0xda, 0x3a, 0x92, 0x8a, 0xd0, 0xcd, 0x12, 0xcd, 0x15, 0xbb, 0xab, 0x77,
-    0x66, },
-  { 0xa2, 0x92, 0x1a, 0xe5, 0xca, 0x0c, 0x30, 0x75, 0xeb, 0xaf, 0x00, 0x31,
-    0x55, 0x66, },
-  { 0x06, 0xea, 0xfd, 0x3e, 0x86, 0x38, 0x62, 0x4e, 0xa9, 0x12, 0xa4, 0x12,
-    0x43, 0xbf, 0xa1, },
-  { 0xe4, 0x71, 0x7b, 0x94, 0xdb, 0xa0, 0xd2, 0xff, 0x9b, 0xeb, 0xad, 0x8e,
-    0x95, 0x8a, 0xc5, 0xed, },
-  { 0x25, 0x5a, 0x77, 0x71, 0x41, 0x0e, 0x7a, 0xe9, 0xed, 0x0c, 0x10, 0xef,
-    0xf6, 0x2b, 0x3a, 0xba, 0x60, },
-  { 0xee, 0xe2, 0xa3, 0x67, 0x64, 0x1d, 0xc6, 0x04, 0xc4, 0xe1, 0x68, 0xd2,
-    0x6e, 0xd2, 0x91, 0x75, 0x53, 0x07, },
-  { 0xe0, 0xf6, 0x4d, 0x8f, 0x68, 0xfc, 0x06, 0x7e, 0x18, 0x79, 0x7f, 0x2b,
-    0x6d, 0xef, 0x46, 0x7f, 0xab, 0xb2, 0xad, },
-  { 0x3d, 0x35, 0x88, 0x9f, 0x2e, 0xcf, 0x96, 0x45, 0x07, 0x60, 0x71, 0x94,
-    0x00, 0x8d, 0xbf, 0xf4, 0xef, 0x46, 0x2e, 0x3c, },
-  { 0x43, 0xcf, 0x98, 0xf7, 0x2d, 0xf4, 0x17, 0xe7, 0x8c, 0x05, 0x2d, 0x9b,
-    0x24, 0xfb, 0x4d, 0xea, 0x4a, 0xec, 0x01, 0x25, 0x29, },
-  { 0x8e, 0x73, 0x9a, 0x78, 0x11, 0xfe, 0x48, 0xa0, 0x3b, 0x1a, 0x26, 0xdf,
-    0x25, 0xe9, 0x59, 0x1c, 0x70, 0x07, 0x9f, 0xdc, 0xa0, 0xa6, },
-  { 0xe8, 0x47, 0x71, 0xc7, 0x3e, 0xdf, 0xb5, 0x13, 0xb9, 0x85, 0x13, 0xa8,
-    0x54, 0x47, 0x6e, 0x59, 0x96, 0x09, 0x13, 0x5f, 0x82, 0x16, 0x0b, },
-  { 0xfb, 0xc0, 0x8c, 0x03, 0x21, 0xb3, 0xc4, 0xb5, 0x43, 0x32, 0x6c, 0xea,
-    0x7f, 0xa8, 0x43, 0x91, 0xe8, 0x4e, 0x3f, 0xbf, 0x45, 0x58, 0x6a, 0xa3, },
-  { 0x55, 0xf8, 0xf3, 0x00, 0x76, 0x09, 0xef, 0x69, 0x5d, 0xd2, 0x8a, 0xf2,
-    0x65, 0xc3, 0xcb, 0x9b, 0x43, 0xfd, 0xb1, 0x7e, 0x7f, 0xa1, 0x94, 0xb0,
-    0xd7, },
-  { 0xaa, 0x13, 0xc1, 0x51, 0x40, 0x6d, 0x8d, 0x4c, 0x0a, 0x95, 0x64, 0x7b,
-    0xd1, 0x96, 0xb6, 0x56, 0xb4, 0x5b, 0xcf, 0xd6, 0xd9, 0x15, 0x97, 0xdd,
-    0xb6, 0xef, },
-  { 0xaf, 0xb7, 0x36, 0xb0, 0x04, 0xdb, 0xd7, 0x9c, 0x9a, 0x44, 0xc4, 0xf6,
-    0x1f, 0x12, 0x21, 0x2d, 0x59, 0x30, 0x54, 0xab, 0x27, 0x61, 0xa3, 0x57,
-    0xef, 0xf8, 0x53, },
-  { 0x97, 0x34, 0x45, 0x3e, 0xce, 0x7c, 0x35, 0xa2, 0xda, 0x9f, 0x4b, 0x46,
-    0x6c, 0x11, 0x67, 0xff, 0x2f, 0x76, 0x58, 0x15, 0x71, 0xfa, 0x44, 0x89,
-    0x89, 0xfd, 0xf7, 0x99, },
-  { 0x1f, 0xb1, 0x62, 0xeb, 0x83, 0xc5, 0x9c, 0x89, 0xf9, 0x2c, 0xd2, 0x03,
-    0x61, 0xbc, 0xbb, 0xa5, 0x74, 0x0e, 0x9b, 0x7e, 0x82, 0x3e, 0x70, 0x0a,
-    0xa9, 0x8f, 0x2b, 0x59, 0xfb, },
-  { 0xf8, 0xca, 0x5e, 0x3a, 0x4f, 0x9e, 0x10, 0x69, 0x10, 0xd5, 0x4c, 0xeb,
-    0x1a, 0x0f, 0x3c, 0x6a, 0x98, 0xf5, 0xb0, 0x97, 0x5b, 0x37, 0x2f, 0x0d,
-    0xbd, 0x42, 0x4b, 0x69, 0xa1, 0x82, },
-  { 0x12, 0x8c, 0x6d, 0x52, 0x08, 0xef, 0x74, 0xb2, 0xe6, 0xaa, 0xd3, 0xb0,
-    0x26, 0xb0, 0xd9, 0x94, 0xb6, 0x11, 0x45, 0x0e, 0x36, 0x71, 0x14, 0x2d,
-    0x41, 0x8c, 0x21, 0x53, 0x31, 0xe9, 0x68, },
-  { 0xee, 0xea, 0x0d, 0x89, 0x47, 0x7e, 0x72, 0xd1, 0xd8, 0xce, 0x58, 0x4c,
-    0x94, 0x1f, 0x0d, 0x51, 0x08, 0xa3, 0xb6, 0x3d, 0xe7, 0x82, 0x46, 0x92,
-    0xd6, 0x98, 0x6b, 0x07, 0x10, 0x65, 0x52, 0x65, },
-};
-
-bool __init blake2s_selftest(void)
-{
-	u8 key[BLAKE2S_KEY_SIZE];
-	u8 buf[ARRAY_SIZE(blake2s_testvecs)];
-	u8 hash[BLAKE2S_HASH_SIZE];
-	struct blake2s_state state;
-	bool success = true;
-	int i, l;
-
-	key[0] = key[1] = 1;
-	for (i = 2; i < sizeof(key); ++i)
-		key[i] = key[i - 2] + key[i - 1];
-
-	for (i = 0; i < sizeof(buf); ++i)
-		buf[i] = (u8)i;
-
-	for (i = l = 0; i < ARRAY_SIZE(blake2s_testvecs); l = (l + 37) % ++i) {
-		int outlen = 1 + i % BLAKE2S_HASH_SIZE;
-		int keylen = (13 * i) % (BLAKE2S_KEY_SIZE + 1);
-
-		blake2s(hash, buf, key + BLAKE2S_KEY_SIZE - keylen, outlen, i,
-			keylen);
-		if (memcmp(hash, blake2s_testvecs[i], outlen)) {
-			pr_err("blake2s self-test %d: FAIL\n", i + 1);
-			success = false;
-		}
-
-		if (!keylen)
-			blake2s_init(&state, outlen);
-		else
-			blake2s_init_key(&state, outlen,
-					 key + BLAKE2S_KEY_SIZE - keylen,
-					 keylen);
-
-		blake2s_update(&state, buf, l);
-		blake2s_update(&state, buf + l, i - l);
-		blake2s_final(&state, hash);
-		if (memcmp(hash, blake2s_testvecs[i], outlen)) {
-			pr_err("blake2s init/update/final self-test %d: FAIL\n",
-			       i + 1);
-			success = false;
-		}
-	}
-
-	for (i = 0; i < 32; ++i) {
-		enum { TEST_ALIGNMENT = 16 };
-		u8 unaligned_block[BLAKE2S_BLOCK_SIZE + TEST_ALIGNMENT - 1]
-					__aligned(TEST_ALIGNMENT);
-		u8 blocks[BLAKE2S_BLOCK_SIZE * 2];
-		struct blake2s_state state1, state2;
-
-		get_random_bytes(blocks, sizeof(blocks));
-		get_random_bytes(&state, sizeof(state));
-
-#if defined(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) && \
-    defined(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S)
-		memcpy(&state1, &state, sizeof(state1));
-		memcpy(&state2, &state, sizeof(state2));
-		blake2s_compress(&state1, blocks, 2, BLAKE2S_BLOCK_SIZE);
-		blake2s_compress_generic(&state2, blocks, 2, BLAKE2S_BLOCK_SIZE);
-		if (memcmp(&state1, &state2, sizeof(state1))) {
-			pr_err("blake2s random compress self-test %d: FAIL\n",
-			       i + 1);
-			success = false;
-		}
-#endif
-
-		memcpy(&state1, &state, sizeof(state1));
-		blake2s_compress(&state1, blocks, 1, BLAKE2S_BLOCK_SIZE);
-		for (l = 1; l < TEST_ALIGNMENT; ++l) {
-			memcpy(unaligned_block + l, blocks,
-			       BLAKE2S_BLOCK_SIZE);
-			memcpy(&state2, &state, sizeof(state2));
-			blake2s_compress(&state2, unaligned_block + l, 1,
-					 BLAKE2S_BLOCK_SIZE);
-			if (memcmp(&state1, &state2, sizeof(state1))) {
-				pr_err("blake2s random compress align %d self-test %d: FAIL\n",
-				       l, i + 1);
-				success = false;
-			}
-		}
-	}
-
-	return success;
-}
diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
index 98e688c6d891..6182c21ed943 100644
--- a/lib/crypto/blake2s.c
+++ b/lib/crypto/blake2s.c
@@ -8,65 +8,158 @@
  *
  */
 
-#include <crypto/internal/blake2s.h>
-#include <linux/types.h>
-#include <linux/string.h>
+#include <crypto/blake2s.h>
+#include <linux/bug.h>
+#include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/init.h>
-#include <linux/bug.h>
+#include <linux/string.h>
+#include <linux/types.h>
 
-static inline void blake2s_set_lastblock(struct blake2s_state *state)
+static const u8 blake2s_sigma[10][16] = {
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+	{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+	{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+	{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+	{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+	{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+};
+
+static inline void blake2s_increment_counter(struct blake2s_ctx *ctx, u32 inc)
 {
-	state->f[0] = -1;
+	ctx->t[0] += inc;
+	ctx->t[1] += (ctx->t[0] < inc);
 }
 
-void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
+static void __maybe_unused
+blake2s_compress_generic(struct blake2s_ctx *ctx,
+			 const u8 *data, size_t nblocks, u32 inc)
 {
-	const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
+	u32 m[16];
+	u32 v[16];
+	int i;
+
+	WARN_ON(IS_ENABLED(DEBUG) &&
+		(nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));
+
+	while (nblocks > 0) {
+		blake2s_increment_counter(ctx, inc);
+		memcpy(m, data, BLAKE2S_BLOCK_SIZE);
+		le32_to_cpu_array(m, ARRAY_SIZE(m));
+		memcpy(v, ctx->h, 32);
+		v[ 8] = BLAKE2S_IV0;
+		v[ 9] = BLAKE2S_IV1;
+		v[10] = BLAKE2S_IV2;
+		v[11] = BLAKE2S_IV3;
+		v[12] = BLAKE2S_IV4 ^ ctx->t[0];
+		v[13] = BLAKE2S_IV5 ^ ctx->t[1];
+		v[14] = BLAKE2S_IV6 ^ ctx->f[0];
+		v[15] = BLAKE2S_IV7 ^ ctx->f[1];
+
+#define G(r, i, a, b, c, d) do { \
+	a += b + m[blake2s_sigma[r][2 * i + 0]]; \
+	d = ror32(d ^ a, 16); \
+	c += d; \
+	b = ror32(b ^ c, 12); \
+	a += b + m[blake2s_sigma[r][2 * i + 1]]; \
+	d = ror32(d ^ a, 8); \
+	c += d; \
+	b = ror32(b ^ c, 7); \
+} while (0)
+
+#define ROUND(r) do { \
+	G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
+	G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
+	G(r, 2, v[2], v[ 6], v[10], v[14]); \
+	G(r, 3, v[3], v[ 7], v[11], v[15]); \
+	G(r, 4, v[0], v[ 5], v[10], v[15]); \
+	G(r, 5, v[1], v[ 6], v[11], v[12]); \
+	G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
+	G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
+} while (0)
+		ROUND(0);
+		ROUND(1);
+		ROUND(2);
+		ROUND(3);
+		ROUND(4);
+		ROUND(5);
+		ROUND(6);
+		ROUND(7);
+		ROUND(8);
+		ROUND(9);
+
+#undef G
+#undef ROUND
+
+		for (i = 0; i < 8; ++i)
+			ctx->h[i] ^= v[i] ^ v[i + 8];
+
+		data += BLAKE2S_BLOCK_SIZE;
+		--nblocks;
+	}
+}
+
+#ifdef CONFIG_CRYPTO_LIB_BLAKE2S_ARCH
+#include "blake2s.h" /* $(SRCARCH)/blake2s.h */
+#else
+#define blake2s_compress blake2s_compress_generic
+#endif
+
+static inline void blake2s_set_lastblock(struct blake2s_ctx *ctx)
+{
+	ctx->f[0] = -1;
+}
+
+void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen)
+{
+	const size_t fill = BLAKE2S_BLOCK_SIZE - ctx->buflen;
 
 	if (unlikely(!inlen))
 		return;
 	if (inlen > fill) {
-		memcpy(state->buf + state->buflen, in, fill);
-		blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
-		state->buflen = 0;
+		memcpy(ctx->buf + ctx->buflen, in, fill);
+		blake2s_compress(ctx, ctx->buf, 1, BLAKE2S_BLOCK_SIZE);
+		ctx->buflen = 0;
 		in += fill;
 		inlen -= fill;
 	}
 	if (inlen > BLAKE2S_BLOCK_SIZE) {
 		const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
-		blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
+
+		blake2s_compress(ctx, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
 		in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
 		inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
 	}
-	memcpy(state->buf + state->buflen, in, inlen);
-	state->buflen += inlen;
+	memcpy(ctx->buf + ctx->buflen, in, inlen);
+	ctx->buflen += inlen;
 }
 EXPORT_SYMBOL(blake2s_update);
 
-void blake2s_final(struct blake2s_state *state, u8 *out)
+void blake2s_final(struct blake2s_ctx *ctx, u8 *out)
 {
 	WARN_ON(IS_ENABLED(DEBUG) && !out);
-	blake2s_set_lastblock(state);
-	memset(state->buf + state->buflen, 0,
-	       BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
-	blake2s_compress(state, state->buf, 1, state->buflen);
-	cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
-	memcpy(out, state->h, state->outlen);
-	memzero_explicit(state, sizeof(*state));
+	blake2s_set_lastblock(ctx);
+	memset(ctx->buf + ctx->buflen, 0,
+	       BLAKE2S_BLOCK_SIZE - ctx->buflen); /* Padding */
+	blake2s_compress(ctx, ctx->buf, 1, ctx->buflen);
+	cpu_to_le32_array(ctx->h, ARRAY_SIZE(ctx->h));
+	memcpy(out, ctx->h, ctx->outlen);
+	memzero_explicit(ctx, sizeof(*ctx));
 }
 EXPORT_SYMBOL(blake2s_final);
 
+#ifdef blake2s_mod_init_arch
 static int __init blake2s_mod_init(void)
 {
-	if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) &&
-	    WARN_ON(!blake2s_selftest()))
-		return -ENODEV;
+	blake2s_mod_init_arch();
 	return 0;
 }
+subsys_initcall(blake2s_mod_init);
+#endif
 
-module_init(blake2s_mod_init);
-MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("BLAKE2s hash function");
 MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/lib/crypto/chacha-block-generic.c b/lib/crypto/chacha-block-generic.c
new file mode 100644
index 000000000000..77f68de71066
--- /dev/null
+++ b/lib/crypto/chacha-block-generic.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <crypto/chacha.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+static void chacha_permute(struct chacha_state *state, int nrounds)
+{
+	u32 *x = state->x;
+	int i;
+
+	/* whitelist the allowed round counts */
+	WARN_ON_ONCE(nrounds != 20 && nrounds != 12);
+
+	for (i = 0; i < nrounds; i += 2) {
+		x[0]  += x[4];    x[12] = rol32(x[12] ^ x[0],  16);
+		x[1]  += x[5];    x[13] = rol32(x[13] ^ x[1],  16);
+		x[2]  += x[6];    x[14] = rol32(x[14] ^ x[2],  16);
+		x[3]  += x[7];    x[15] = rol32(x[15] ^ x[3],  16);
+
+		x[8]  += x[12];   x[4]  = rol32(x[4]  ^ x[8],  12);
+		x[9]  += x[13];   x[5]  = rol32(x[5]  ^ x[9],  12);
+		x[10] += x[14];   x[6]  = rol32(x[6]  ^ x[10], 12);
+		x[11] += x[15];   x[7]  = rol32(x[7]  ^ x[11], 12);
+
+		x[0]  += x[4];    x[12] = rol32(x[12] ^ x[0],   8);
+		x[1]  += x[5];    x[13] = rol32(x[13] ^ x[1],   8);
+		x[2]  += x[6];    x[14] = rol32(x[14] ^ x[2],   8);
+		x[3]  += x[7];    x[15] = rol32(x[15] ^ x[3],   8);
+
+		x[8]  += x[12];   x[4]  = rol32(x[4]  ^ x[8],   7);
+		x[9]  += x[13];   x[5]  = rol32(x[5]  ^ x[9],   7);
+		x[10] += x[14];   x[6]  = rol32(x[6]  ^ x[10],  7);
+		x[11] += x[15];   x[7]  = rol32(x[7]  ^ x[11],  7);
+
+		x[0]  += x[5];    x[15] = rol32(x[15] ^ x[0],  16);
+		x[1]  += x[6];    x[12] = rol32(x[12] ^ x[1],  16);
+		x[2]  += x[7];    x[13] = rol32(x[13] ^ x[2],  16);
+		x[3]  += x[4];    x[14] = rol32(x[14] ^ x[3],  16);
+
+		x[10] += x[15];   x[5]  = rol32(x[5]  ^ x[10], 12);
+		x[11] += x[12];   x[6]  = rol32(x[6]  ^ x[11], 12);
+		x[8]  += x[13];   x[7]  = rol32(x[7]  ^ x[8],  12);
+		x[9]  += x[14];   x[4]  = rol32(x[4]  ^ x[9],  12);
+
+		x[0]  += x[5];    x[15] = rol32(x[15] ^ x[0],   8);
+		x[1]  += x[6];    x[12] = rol32(x[12] ^ x[1],   8);
+		x[2]  += x[7];    x[13] = rol32(x[13] ^ x[2],   8);
+		x[3]  += x[4];    x[14] = rol32(x[14] ^ x[3],   8);
+
+		x[10] += x[15];   x[5]  = rol32(x[5]  ^ x[10],  7);
+		x[11] += x[12];   x[6]  = rol32(x[6]  ^ x[11],  7);
+		x[8]  += x[13];   x[7]  = rol32(x[7]  ^ x[8],   7);
+		x[9]  += x[14];   x[4]  = rol32(x[4]  ^ x[9],   7);
+	}
+}
+
+/**
+ * chacha_block_generic - generate one keystream block and increment block counter
+ * @state: input state matrix
+ * @out: output keystream block
+ * @nrounds: number of rounds (20 or 12; 20 is recommended)
+ *
+ * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
+ * The caller has already converted the endianness of the input.  This function
+ * also handles incrementing the block counter in the input matrix.
+ */
+void chacha_block_generic(struct chacha_state *state,
+			  u8 out[CHACHA_BLOCK_SIZE], int nrounds)
+{
+	struct chacha_state permuted_state = *state;
+	int i;
+
+	chacha_permute(&permuted_state, nrounds);
+
+	for (i = 0; i < ARRAY_SIZE(state->x); i++)
+		put_unaligned_le32(permuted_state.x[i] + state->x[i],
+				   &out[i * sizeof(u32)]);
+
+	state->x[12]++;
+}
+EXPORT_SYMBOL(chacha_block_generic);
+
+/**
+ * hchacha_block_generic - abbreviated ChaCha core, for XChaCha
+ * @state: input state matrix
+ * @out: the output words
+ * @nrounds: number of rounds (20 or 12; 20 is recommended)
+ *
+ * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
+ * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf).  HChaCha
+ * skips the final addition of the initial state, and outputs only certain words
+ * of the state.  It should not be used for streaming directly.
+ */
+void hchacha_block_generic(const struct chacha_state *state,
+			   u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+	struct chacha_state permuted_state = *state;
+
+	chacha_permute(&permuted_state, nrounds);
+
+	memcpy(&out[0], &permuted_state.x[0], 16);
+	memcpy(&out[4], &permuted_state.x[12], 16);
+}
+EXPORT_SYMBOL(hchacha_block_generic);
diff --git a/lib/crypto/chacha.c b/lib/crypto/chacha.c
index b748fd3d256e..e0c7cb4af318 100644
--- a/lib/crypto/chacha.c
+++ b/lib/crypto/chacha.c
@@ -1,114 +1,70 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
+ * The ChaCha stream cipher (RFC7539)
  *
  * Copyright (C) 2015 Martin Willi
  */
 
-#include <linux/bug.h>
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/bitops.h>
-#include <linux/string.h>
-#include <asm/unaligned.h>
+#include <crypto/algapi.h> // for crypto_xor_cpy
 #include <crypto/chacha.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 
-static void chacha_permute(u32 *x, int nrounds)
+static void __maybe_unused
+chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src,
+		     unsigned int bytes, int nrounds)
 {
-	int i;
-
-	/* whitelist the allowed round counts */
-	WARN_ON_ONCE(nrounds != 20 && nrounds != 12);
-
-	for (i = 0; i < nrounds; i += 2) {
-		x[0]  += x[4];    x[12] = rol32(x[12] ^ x[0],  16);
-		x[1]  += x[5];    x[13] = rol32(x[13] ^ x[1],  16);
-		x[2]  += x[6];    x[14] = rol32(x[14] ^ x[2],  16);
-		x[3]  += x[7];    x[15] = rol32(x[15] ^ x[3],  16);
-
-		x[8]  += x[12];   x[4]  = rol32(x[4]  ^ x[8],  12);
-		x[9]  += x[13];   x[5]  = rol32(x[5]  ^ x[9],  12);
-		x[10] += x[14];   x[6]  = rol32(x[6]  ^ x[10], 12);
-		x[11] += x[15];   x[7]  = rol32(x[7]  ^ x[11], 12);
-
-		x[0]  += x[4];    x[12] = rol32(x[12] ^ x[0],   8);
-		x[1]  += x[5];    x[13] = rol32(x[13] ^ x[1],   8);
-		x[2]  += x[6];    x[14] = rol32(x[14] ^ x[2],   8);
-		x[3]  += x[7];    x[15] = rol32(x[15] ^ x[3],   8);
-
-		x[8]  += x[12];   x[4]  = rol32(x[4]  ^ x[8],   7);
-		x[9]  += x[13];   x[5]  = rol32(x[5]  ^ x[9],   7);
-		x[10] += x[14];   x[6]  = rol32(x[6]  ^ x[10],  7);
-		x[11] += x[15];   x[7]  = rol32(x[7]  ^ x[11],  7);
-
-		x[0]  += x[5];    x[15] = rol32(x[15] ^ x[0],  16);
-		x[1]  += x[6];    x[12] = rol32(x[12] ^ x[1],  16);
-		x[2]  += x[7];    x[13] = rol32(x[13] ^ x[2],  16);
-		x[3]  += x[4];    x[14] = rol32(x[14] ^ x[3],  16);
-
-		x[10] += x[15];   x[5]  = rol32(x[5]  ^ x[10], 12);
-		x[11] += x[12];   x[6]  = rol32(x[6]  ^ x[11], 12);
-		x[8]  += x[13];   x[7]  = rol32(x[7]  ^ x[8],  12);
-		x[9]  += x[14];   x[4]  = rol32(x[4]  ^ x[9],  12);
-
-		x[0]  += x[5];    x[15] = rol32(x[15] ^ x[0],   8);
-		x[1]  += x[6];    x[12] = rol32(x[12] ^ x[1],   8);
-		x[2]  += x[7];    x[13] = rol32(x[13] ^ x[2],   8);
-		x[3]  += x[4];    x[14] = rol32(x[14] ^ x[3],   8);
-
-		x[10] += x[15];   x[5]  = rol32(x[5]  ^ x[10],  7);
-		x[11] += x[12];   x[6]  = rol32(x[6]  ^ x[11],  7);
-		x[8]  += x[13];   x[7]  = rol32(x[7]  ^ x[8],   7);
-		x[9]  += x[14];   x[4]  = rol32(x[4]  ^ x[9],   7);
+	/* aligned to potentially speed up crypto_xor() */
+	u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));
+
+	while (bytes >= CHACHA_BLOCK_SIZE) {
+		chacha_block_generic(state, stream, nrounds);
+		crypto_xor_cpy(dst, src, stream, CHACHA_BLOCK_SIZE);
+		bytes -= CHACHA_BLOCK_SIZE;
+		dst += CHACHA_BLOCK_SIZE;
+		src += CHACHA_BLOCK_SIZE;
+	}
+	if (bytes) {
+		chacha_block_generic(state, stream, nrounds);
+		crypto_xor_cpy(dst, src, stream, bytes);
 	}
 }
 
-/**
- * chacha_block_generic - generate one keystream block and increment block counter
- * @state: input state matrix (16 32-bit words)
- * @stream: output keystream block (64 bytes)
- * @nrounds: number of rounds (20 or 12; 20 is recommended)
- *
- * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
- * The caller has already converted the endianness of the input.  This function
- * also handles incrementing the block counter in the input matrix.
- */
-void chacha_block_generic(u32 *state, u8 *stream, int nrounds)
-{
-	u32 x[16];
-	int i;
-
-	memcpy(x, state, 64);
-
-	chacha_permute(x, nrounds);
+#ifdef CONFIG_CRYPTO_LIB_CHACHA_ARCH
+#include "chacha.h" /* $(SRCARCH)/chacha.h */
+#else
+#define chacha_crypt_arch chacha_crypt_generic
+#define hchacha_block_arch hchacha_block_generic
+#endif
 
-	for (i = 0; i < ARRAY_SIZE(x); i++)
-		put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]);
-
-	state[12]++;
+void chacha_crypt(struct chacha_state *state, u8 *dst, const u8 *src,
+		  unsigned int bytes, int nrounds)
+{
+	chacha_crypt_arch(state, dst, src, bytes, nrounds);
 }
-EXPORT_SYMBOL(chacha_block_generic);
+EXPORT_SYMBOL_GPL(chacha_crypt);
 
-/**
- * hchacha_block_generic - abbreviated ChaCha core, for XChaCha
- * @state: input state matrix (16 32-bit words)
- * @stream: output (8 32-bit words)
- * @nrounds: number of rounds (20 or 12; 20 is recommended)
- *
- * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
- * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf).  HChaCha
- * skips the final addition of the initial state, and outputs only certain words
- * of the state.  It should not be used for streaming directly.
- */
-void hchacha_block_generic(const u32 *state, u32 *stream, int nrounds)
+void hchacha_block(const struct chacha_state *state,
+		   u32 out[HCHACHA_OUT_WORDS], int nrounds)
 {
-	u32 x[16];
-
-	memcpy(x, state, 64);
+	hchacha_block_arch(state, out, nrounds);
+}
+EXPORT_SYMBOL_GPL(hchacha_block);
 
-	chacha_permute(x, nrounds);
+#ifdef chacha_mod_init_arch
+static int __init chacha_mod_init(void)
+{
+	chacha_mod_init_arch();
+	return 0;
+}
+subsys_initcall(chacha_mod_init);
 
-	memcpy(&stream[0], &x[0], 16);
-	memcpy(&stream[4], &x[12], 16);
+static void __exit chacha_mod_exit(void)
+{
 }
-EXPORT_SYMBOL(hchacha_block_generic);
+module_exit(chacha_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("ChaCha stream cipher (RFC7539)");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/chacha20poly1305-selftest.c b/lib/crypto/chacha20poly1305-selftest.c
index fa43deda2660..e4c85bc5a6d7 100644
--- a/lib/crypto/chacha20poly1305-selftest.c
+++ b/lib/crypto/chacha20poly1305-selftest.c
@@ -7,7 +7,7 @@
 #include <crypto/chacha.h>
 #include <crypto/poly1305.h>
 
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
 #include <linux/bug.h>
 #include <linux/init.h>
 #include <linux/mm.h>
@@ -8832,7 +8832,7 @@ chacha20poly1305_encrypt_bignonce(u8 *dst, const u8 *src, const size_t src_len,
 {
 	const u8 *pad0 = page_address(ZERO_PAGE(0));
 	struct poly1305_desc_ctx poly1305_state;
-	u32 chacha20_state[CHACHA_STATE_WORDS];
+	struct chacha_state chacha20_state;
 	union {
 		u8 block0[POLY1305_KEY_SIZE];
 		__le64 lens[2];
@@ -8844,12 +8844,12 @@ chacha20poly1305_encrypt_bignonce(u8 *dst, const u8 *src, const size_t src_len,
 	memcpy(&bottom_row[4], nonce, 12);
 	for (i = 0; i < 8; ++i)
 		le_key[i] = get_unaligned_le32(key + sizeof(le_key[i]) * i);
-	chacha_init(chacha20_state, le_key, bottom_row);
-	chacha20_crypt(chacha20_state, b.block0, b.block0, sizeof(b.block0));
+	chacha_init(&chacha20_state, le_key, bottom_row);
+	chacha20_crypt(&chacha20_state, b.block0, b.block0, sizeof(b.block0));
 	poly1305_init(&poly1305_state, b.block0);
 	poly1305_update(&poly1305_state, ad, ad_len);
 	poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf);
-	chacha20_crypt(chacha20_state, dst, src, src_len);
+	chacha20_crypt(&chacha20_state, dst, src, src_len);
 	poly1305_update(&poly1305_state, dst, src_len);
 	poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf);
 	b.lens[0] = cpu_to_le64(ad_len);
diff --git a/lib/crypto/chacha20poly1305.c b/lib/crypto/chacha20poly1305.c
index fa6a9440fc95..212ce33562af 100644
--- a/lib/crypto/chacha20poly1305.c
+++ b/lib/crypto/chacha20poly1305.c
@@ -7,19 +7,16 @@
  * Information: https://tools.ietf.org/html/rfc8439
  */
 
-#include <crypto/algapi.h>
-#include <crypto/chacha20poly1305.h>
 #include <crypto/chacha.h>
+#include <crypto/chacha20poly1305.h>
 #include <crypto/poly1305.h>
-#include <crypto/scatterwalk.h>
-
-#include <asm/unaligned.h>
-#include <linux/kernel.h>
+#include <crypto/utils.h>
+#include <linux/export.h>
 #include <linux/init.h>
+#include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/module.h>
-
-#define CHACHA_KEY_WORDS	(CHACHA_KEY_SIZE / sizeof(u32))
+#include <linux/unaligned.h>
 
 static void chacha_load_key(u32 *k, const u8 *in)
 {
@@ -33,7 +30,8 @@ static void chacha_load_key(u32 *k, const u8 *in)
 	k[7] = get_unaligned_le32(in + 28);
 }
 
-static void xchacha_init(u32 *chacha_state, const u8 *key, const u8 *nonce)
+static void xchacha_init(struct chacha_state *chacha_state,
+			 const u8 *key, const u8 *nonce)
 {
 	u32 k[CHACHA_KEY_WORDS];
 	u8 iv[CHACHA_IV_SIZE];
@@ -55,7 +53,8 @@ static void xchacha_init(u32 *chacha_state, const u8 *key, const u8 *nonce)
 
 static void
 __chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
-			   const u8 *ad, const size_t ad_len, u32 *chacha_state)
+			   const u8 *ad, const size_t ad_len,
+			   struct chacha_state *chacha_state)
 {
 	const u8 *pad0 = page_address(ZERO_PAGE(0));
 	struct poly1305_desc_ctx poly1305_state;
@@ -83,16 +82,16 @@ __chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
 
 	poly1305_final(&poly1305_state, dst + src_len);
 
-	memzero_explicit(chacha_state, CHACHA_STATE_WORDS * sizeof(u32));
+	chacha_zeroize_state(chacha_state);
 	memzero_explicit(&b, sizeof(b));
 }
 
 void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
 			      const u8 *ad, const size_t ad_len,
 			      const u64 nonce,
-			      const u8 key[CHACHA20POLY1305_KEY_SIZE])
+			      const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
 {
-	u32 chacha_state[CHACHA_STATE_WORDS];
+	struct chacha_state chacha_state;
 	u32 k[CHACHA_KEY_WORDS];
 	__le64 iv[2];
 
@@ -101,8 +100,9 @@ void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
 	iv[0] = 0;
 	iv[1] = cpu_to_le64(nonce);
 
-	chacha_init(chacha_state, k, (u8 *)iv);
-	__chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, chacha_state);
+	chacha_init(&chacha_state, k, (u8 *)iv);
+	__chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len,
+				   &chacha_state);
 
 	memzero_explicit(iv, sizeof(iv));
 	memzero_explicit(k, sizeof(k));
@@ -111,19 +111,21 @@ EXPORT_SYMBOL(chacha20poly1305_encrypt);
 
 void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
 			       const u8 *ad, const size_t ad_len,
-			       const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
-			       const u8 key[CHACHA20POLY1305_KEY_SIZE])
+			       const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE],
+			       const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
 {
-	u32 chacha_state[CHACHA_STATE_WORDS];
+	struct chacha_state chacha_state;
 
-	xchacha_init(chacha_state, key, nonce);
-	__chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, chacha_state);
+	xchacha_init(&chacha_state, key, nonce);
+	__chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len,
+				   &chacha_state);
 }
 EXPORT_SYMBOL(xchacha20poly1305_encrypt);
 
 static bool
 __chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
-			   const u8 *ad, const size_t ad_len, u32 *chacha_state)
+			   const u8 *ad, const size_t ad_len,
+			   struct chacha_state *chacha_state)
 {
 	const u8 *pad0 = page_address(ZERO_PAGE(0));
 	struct poly1305_desc_ctx poly1305_state;
@@ -168,9 +170,9 @@ __chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
 bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
 			      const u8 *ad, const size_t ad_len,
 			      const u64 nonce,
-			      const u8 key[CHACHA20POLY1305_KEY_SIZE])
+			      const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
 {
-	u32 chacha_state[CHACHA_STATE_WORDS];
+	struct chacha_state chacha_state;
 	u32 k[CHACHA_KEY_WORDS];
 	__le64 iv[2];
 	bool ret;
@@ -180,11 +182,11 @@ bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
 	iv[0] = 0;
 	iv[1] = cpu_to_le64(nonce);
 
-	chacha_init(chacha_state, k, (u8 *)iv);
+	chacha_init(&chacha_state, k, (u8 *)iv);
 	ret = __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len,
-					 chacha_state);
+					 &chacha_state);
 
-	memzero_explicit(chacha_state, sizeof(chacha_state));
+	chacha_zeroize_state(&chacha_state);
 	memzero_explicit(iv, sizeof(iv));
 	memzero_explicit(k, sizeof(k));
 	return ret;
@@ -193,14 +195,14 @@ EXPORT_SYMBOL(chacha20poly1305_decrypt);
 
 bool xchacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
 			       const u8 *ad, const size_t ad_len,
-			       const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
-			       const u8 key[CHACHA20POLY1305_KEY_SIZE])
+			       const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE],
+			       const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
 {
-	u32 chacha_state[CHACHA_STATE_WORDS];
+	struct chacha_state chacha_state;
 
-	xchacha_init(chacha_state, key, nonce);
+	xchacha_init(&chacha_state, key, nonce);
 	return __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len,
-					  chacha_state);
+					  &chacha_state);
 }
 EXPORT_SYMBOL(xchacha20poly1305_decrypt);
 
@@ -209,12 +211,12 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src,
 				       const size_t src_len,
 				       const u8 *ad, const size_t ad_len,
 				       const u64 nonce,
-				       const u8 key[CHACHA20POLY1305_KEY_SIZE],
+				       const u8 key[at_least CHACHA20POLY1305_KEY_SIZE],
 				       int encrypt)
 {
 	const u8 *pad0 = page_address(ZERO_PAGE(0));
 	struct poly1305_desc_ctx poly1305_state;
-	u32 chacha_state[CHACHA_STATE_WORDS];
+	struct chacha_state chacha_state;
 	struct sg_mapping_iter miter;
 	size_t partial = 0;
 	unsigned int flags;
@@ -241,8 +243,8 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src,
 	b.iv[0] = 0;
 	b.iv[1] = cpu_to_le64(nonce);
 
-	chacha_init(chacha_state, b.k, (u8 *)b.iv);
-	chacha20_crypt(chacha_state, b.block0, pad0, sizeof(b.block0));
+	chacha_init(&chacha_state, b.k, (u8 *)b.iv);
+	chacha20_crypt(&chacha_state, b.block0, pad0, sizeof(b.block0));
 	poly1305_init(&poly1305_state, b.block0);
 
 	if (unlikely(ad_len)) {
@@ -277,13 +279,13 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src,
 
 			if (unlikely(length < sl))
 				l &= ~(CHACHA_BLOCK_SIZE - 1);
-			chacha20_crypt(chacha_state, addr, addr, l);
+			chacha20_crypt(&chacha_state, addr, addr, l);
 			addr += l;
 			length -= l;
 		}
 
 		if (unlikely(length > 0)) {
-			chacha20_crypt(chacha_state, b.chacha_stream, pad0,
+			chacha20_crypt(&chacha_state, b.chacha_stream, pad0,
 				       CHACHA_BLOCK_SIZE);
 			crypto_xor(addr, b.chacha_stream, length);
 			partial = length;
@@ -318,13 +320,13 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src,
 
 	if (unlikely(sl > -POLY1305_DIGEST_SIZE)) {
 		poly1305_final(&poly1305_state, b.mac[1]);
-		scatterwalk_map_and_copy(b.mac[encrypt], src, src_len,
-					 sizeof(b.mac[1]), encrypt);
+		sg_copy_buffer(src, sg_nents(src), b.mac[encrypt],
+			       sizeof(b.mac[1]), src_len, !encrypt);
 		ret = encrypt ||
 		      !crypto_memneq(b.mac[0], b.mac[1], POLY1305_DIGEST_SIZE);
 	}
 
-	memzero_explicit(chacha_state, sizeof(chacha_state));
+	chacha_zeroize_state(&chacha_state);
 	memzero_explicit(&b, sizeof(b));
 
 	return ret;
@@ -333,7 +335,7 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src,
 bool chacha20poly1305_encrypt_sg_inplace(struct scatterlist *src, size_t src_len,
 					 const u8 *ad, const size_t ad_len,
 					 const u64 nonce,
-					 const u8 key[CHACHA20POLY1305_KEY_SIZE])
+					 const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
 {
 	return chacha20poly1305_crypt_sg_inplace(src, src_len, ad, ad_len,
 						 nonce, key, 1);
@@ -343,7 +345,7 @@ EXPORT_SYMBOL(chacha20poly1305_encrypt_sg_inplace);
 bool chacha20poly1305_decrypt_sg_inplace(struct scatterlist *src, size_t src_len,
 					 const u8 *ad, const size_t ad_len,
 					 const u64 nonce,
-					 const u8 key[CHACHA20POLY1305_KEY_SIZE])
+					 const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
 {
 	if (unlikely(src_len < POLY1305_DIGEST_SIZE))
 		return false;
@@ -356,7 +358,7 @@ EXPORT_SYMBOL(chacha20poly1305_decrypt_sg_inplace);
 
 static int __init chacha20poly1305_init(void)
 {
-	if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) &&
+	if (IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) &&
 	    WARN_ON(!chacha20poly1305_selftest()))
 		return -ENODEV;
 	return 0;
diff --git a/lib/crypto/curve25519-fiat32.c b/lib/crypto/curve25519-fiat32.c
index 2fde0ec33dbd..2e0ba634e299 100644
--- a/lib/crypto/curve25519-fiat32.c
+++ b/lib/crypto/curve25519-fiat32.c
@@ -10,7 +10,7 @@
  * with 128-bit integer types.
  */
 
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
 #include <crypto/curve25519.h>
 #include <linux/string.h>
 
diff --git a/lib/crypto/curve25519-generic.c b/lib/crypto/curve25519-generic.c
deleted file mode 100644
index de7c99172fa2..000000000000
--- a/lib/crypto/curve25519-generic.c
+++ /dev/null
@@ -1,24 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * This is an implementation of the Curve25519 ECDH algorithm, using either
- * a 32-bit implementation or a 64-bit implementation with 128-bit integers,
- * depending on what is supported by the target compiler.
- *
- * Information: https://cr.yp.to/ecdh.html
- */
-
-#include <crypto/curve25519.h>
-#include <linux/module.h>
-
-const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 };
-const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 };
-
-EXPORT_SYMBOL(curve25519_null_point);
-EXPORT_SYMBOL(curve25519_base_point);
-EXPORT_SYMBOL(curve25519_generic);
-
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("Curve25519 scalar multiplication");
-MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/lib/crypto/curve25519-hacl64.c b/lib/crypto/curve25519-hacl64.c
index 771d82dc5f14..c4204133afb7 100644
--- a/lib/crypto/curve25519-hacl64.c
+++ b/lib/crypto/curve25519-hacl64.c
@@ -10,12 +10,10 @@
  * integer types.
  */
 
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
 #include <crypto/curve25519.h>
 #include <linux/string.h>
 
-typedef __uint128_t u128;
-
 static __always_inline u64 u64_eq_mask(u64 a, u64 b)
 {
 	u64 x = a ^ b;
diff --git a/lib/crypto/curve25519.c b/lib/crypto/curve25519.c
index 064b352c6907..01e265dfbcd9 100644
--- a/lib/crypto/curve25519.c
+++ b/lib/crypto/curve25519.c
@@ -2,32 +2,77 @@
 /*
  * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
  *
- * This is an implementation of the Curve25519 ECDH algorithm, using either
- * a 32-bit implementation or a 64-bit implementation with 128-bit integers,
+ * This is an implementation of the Curve25519 ECDH algorithm, using either an
+ * architecture-optimized implementation or a generic implementation. The
+ * generic implementation is either 32-bit, or 64-bit with 128-bit integers,
  * depending on what is supported by the target compiler.
  *
  * Information: https://cr.yp.to/ecdh.html
  */
 
 #include <crypto/curve25519.h>
-#include <linux/module.h>
+#include <crypto/utils.h>
+#include <linux/export.h>
 #include <linux/init.h>
+#include <linux/module.h>
 
-static int __init curve25519_init(void)
+static const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 };
+static const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 };
+
+#ifdef CONFIG_CRYPTO_LIB_CURVE25519_ARCH
+#include "curve25519.h" /* $(SRCARCH)/curve25519.h */
+#else
+static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
+			    const u8 secret[CURVE25519_KEY_SIZE],
+			    const u8 basepoint[CURVE25519_KEY_SIZE])
 {
-	if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) &&
-	    WARN_ON(!curve25519_selftest()))
-		return -ENODEV;
-	return 0;
+	curve25519_generic(mypublic, secret, basepoint);
+}
+
+static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+				 const u8 secret[CURVE25519_KEY_SIZE])
+{
+	curve25519_generic(pub, secret, curve25519_base_point);
+}
+#endif
+
+bool __must_check
+curve25519(u8 mypublic[CURVE25519_KEY_SIZE],
+	   const u8 secret[CURVE25519_KEY_SIZE],
+	   const u8 basepoint[CURVE25519_KEY_SIZE])
+{
+	curve25519_arch(mypublic, secret, basepoint);
+	return crypto_memneq(mypublic, curve25519_null_point,
+			     CURVE25519_KEY_SIZE);
+}
+EXPORT_SYMBOL(curve25519);
+
+bool __must_check
+curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE],
+			   const u8 secret[CURVE25519_KEY_SIZE])
+{
+	if (unlikely(!crypto_memneq(secret, curve25519_null_point,
+				    CURVE25519_KEY_SIZE)))
+		return false;
+	curve25519_base_arch(pub, secret);
+	return crypto_memneq(pub, curve25519_null_point, CURVE25519_KEY_SIZE);
 }
+EXPORT_SYMBOL(curve25519_generate_public);
 
-static void __exit curve25519_exit(void)
+#ifdef curve25519_mod_init_arch
+static int __init curve25519_mod_init(void)
 {
+	curve25519_mod_init_arch();
+	return 0;
 }
+subsys_initcall(curve25519_mod_init);
 
-module_init(curve25519_init);
-module_exit(curve25519_exit);
+static void __exit curve25519_mod_exit(void)
+{
+}
+module_exit(curve25519_mod_exit);
+#endif
 
 MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("Curve25519 scalar multiplication");
+MODULE_DESCRIPTION("Curve25519 algorithm");
 MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/lib/crypto/des.c b/lib/crypto/des.c
index ef5bb8822aba..a906070136dc 100644
--- a/lib/crypto/des.c
+++ b/lib/crypto/des.c
@@ -7,20 +7,19 @@
  * Copyright (c) 2005 Dag Arne Osvik <da@osvik.no>
  */
 
+#include <crypto/des.h>
+#include <crypto/internal/des.h>
 #include <linux/bitops.h>
 #include <linux/compiler.h>
 #include <linux/crypto.h>
 #include <linux/errno.h>
+#include <linux/export.h>
 #include <linux/fips.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/types.h>
-
-#include <asm/unaligned.h>
-
-#include <crypto/des.h>
-#include <crypto/internal/des.h>
+#include <linux/unaligned.h>
 
 #define ROL(x, r) ((x) = rol32((x), (r)))
 #define ROR(x, r) ((x) = ror32((x), (r)))
@@ -899,4 +898,5 @@ void des3_ede_decrypt(const struct des3_ede_ctx *dctx, u8 *dst, const u8 *src)
 }
 EXPORT_SYMBOL_GPL(des3_ede_decrypt);
 
+MODULE_DESCRIPTION("DES & Triple DES EDE Cipher Algorithms");
 MODULE_LICENSE("GPL");
diff --git a/lib/crypto/fips.h b/lib/crypto/fips.h
new file mode 100644
index 000000000000..023410c2e0db
--- /dev/null
+++ b/lib/crypto/fips.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: gen-fips-testvecs.py */
+
+#include <linux/fips.h>
+
+static const u8 fips_test_data[] __initconst __maybe_unused = {
+	0x66, 0x69, 0x70, 0x73, 0x20, 0x74, 0x65, 0x73,
+	0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00,
+};
+
+static const u8 fips_test_key[] __initconst __maybe_unused = {
+	0x66, 0x69, 0x70, 0x73, 0x20, 0x74, 0x65, 0x73,
+	0x74, 0x20, 0x6b, 0x65, 0x79, 0x00, 0x00, 0x00,
+};
+
+static const u8 fips_test_hmac_sha1_value[] __initconst __maybe_unused = {
+	0x29, 0xa9, 0x88, 0xb8, 0x5c, 0xb4, 0xaf, 0x4b,
+	0x97, 0x2a, 0xee, 0x87, 0x5b, 0x0a, 0x02, 0x55,
+	0x99, 0xbf, 0x86, 0x78,
+};
+
+static const u8 fips_test_hmac_sha256_value[] __initconst __maybe_unused = {
+	0x59, 0x25, 0x85, 0xcc, 0x40, 0xe9, 0x64, 0x2f,
+	0xe9, 0xbf, 0x82, 0xb7, 0xd3, 0x15, 0x3d, 0x43,
+	0x22, 0x0b, 0x4c, 0x00, 0x90, 0x14, 0x25, 0xcf,
+	0x9e, 0x13, 0x2b, 0xc2, 0x30, 0xe6, 0xe8, 0x93,
+};
+
+static const u8 fips_test_hmac_sha512_value[] __initconst __maybe_unused = {
+	0x6b, 0xea, 0x5d, 0x27, 0x49, 0x5b, 0x3f, 0xea,
+	0xde, 0x2d, 0xfa, 0x32, 0x75, 0xdb, 0x77, 0xc8,
+	0x26, 0xe9, 0x4e, 0x95, 0x4d, 0xad, 0x88, 0x02,
+	0x87, 0xf9, 0x52, 0x0a, 0xd1, 0x92, 0x80, 0x1d,
+	0x92, 0x7e, 0x3c, 0xbd, 0xb1, 0x3c, 0x49, 0x98,
+	0x44, 0x9c, 0x8f, 0xee, 0x3f, 0x02, 0x71, 0x51,
+	0x57, 0x0b, 0x15, 0x38, 0x95, 0xd8, 0xa3, 0x81,
+	0xba, 0xb3, 0x15, 0x37, 0x5c, 0x6d, 0x57, 0x2b,
+};
+
+static const u8 fips_test_sha3_256_value[] __initconst __maybe_unused = {
+	0x77, 0xc4, 0x8b, 0x69, 0x70, 0x5f, 0x0a, 0xb1,
+	0xb1, 0xa5, 0x82, 0x0a, 0x22, 0x2b, 0x49, 0x31,
+	0xba, 0x9b, 0xb6, 0xaa, 0x32, 0xa7, 0x97, 0x00,
+	0x98, 0xdb, 0xff, 0xe7, 0xc6, 0xde, 0xb5, 0x82,
+};
diff --git a/lib/crypto/gf128mul.c b/lib/crypto/gf128mul.c
index 8f8c45e0cdcf..2a34590fe3f1 100644
--- a/lib/crypto/gf128mul.c
+++ b/lib/crypto/gf128mul.c
@@ -49,6 +49,7 @@
 */
 
 #include <crypto/gf128mul.h>
+#include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -225,44 +226,6 @@ void gf128mul_lle(be128 *r, const be128 *b)
 }
 EXPORT_SYMBOL(gf128mul_lle);
 
-void gf128mul_bbe(be128 *r, const be128 *b)
-{
-	be128 p[8];
-	int i;
-
-	p[0] = *r;
-	for (i = 0; i < 7; ++i)
-		gf128mul_x_bbe(&p[i + 1], &p[i]);
-
-	memset(r, 0, sizeof(*r));
-	for (i = 0;;) {
-		u8 ch = ((u8 *)b)[i];
-
-		if (ch & 0x80)
-			be128_xor(r, r, &p[7]);
-		if (ch & 0x40)
-			be128_xor(r, r, &p[6]);
-		if (ch & 0x20)
-			be128_xor(r, r, &p[5]);
-		if (ch & 0x10)
-			be128_xor(r, r, &p[4]);
-		if (ch & 0x08)
-			be128_xor(r, r, &p[3]);
-		if (ch & 0x04)
-			be128_xor(r, r, &p[2]);
-		if (ch & 0x02)
-			be128_xor(r, r, &p[1]);
-		if (ch & 0x01)
-			be128_xor(r, r, &p[0]);
-
-		if (++i >= 16)
-			break;
-
-		gf128mul_x8_bbe(r);
-	}
-}
-EXPORT_SYMBOL(gf128mul_bbe);
-
 /*      This version uses 64k bytes of table space.
     A 16 byte buffer has to be multiplied by a 16 byte key
     value in GF(2^128).  If we consider a GF(2^128) value in
@@ -380,28 +343,6 @@ out:
 }
 EXPORT_SYMBOL(gf128mul_init_4k_lle);
 
-struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g)
-{
-	struct gf128mul_4k *t;
-	int j, k;
-
-	t = kzalloc(sizeof(*t), GFP_KERNEL);
-	if (!t)
-		goto out;
-
-	t->t[1] = *g;
-	for (j = 1; j <= 64; j <<= 1)
-		gf128mul_x_bbe(&t->t[j + j], &t->t[j]);
-
-	for (j = 2; j < 256; j += j)
-		for (k = 1; k < j; ++k)
-			be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
-
-out:
-	return t;
-}
-EXPORT_SYMBOL(gf128mul_init_4k_bbe);
-
 void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t)
 {
 	u8 *ap = (u8 *)a;
@@ -417,20 +358,5 @@ void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t)
 }
 EXPORT_SYMBOL(gf128mul_4k_lle);
 
-void gf128mul_4k_bbe(be128 *a, const struct gf128mul_4k *t)
-{
-	u8 *ap = (u8 *)a;
-	be128 r[1];
-	int i = 0;
-
-	*r = t->t[ap[0]];
-	while (++i < 16) {
-		gf128mul_x8_bbe(r);
-		be128_xor(r, r, &t->t[ap[i]]);
-	}
-	*a = *r;
-}
-EXPORT_SYMBOL(gf128mul_4k_bbe);
-
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");
diff --git a/lib/crypto/hash_info.c b/lib/crypto/hash_info.c
new file mode 100644
index 000000000000..9a467638c971
--- /dev/null
+++ b/lib/crypto/hash_info.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Hash Info: Hash algorithms information
+ *
+ * Copyright (c) 2013 Dmitry Kasatkin <d.kasatkin@samsung.com>
+ */
+
+#include <linux/export.h>
+#include <crypto/hash_info.h>
+
+const char *const hash_algo_name[HASH_ALGO__LAST] = {
+	[HASH_ALGO_MD4]		= "md4",
+	[HASH_ALGO_MD5]		= "md5",
+	[HASH_ALGO_SHA1]	= "sha1",
+	[HASH_ALGO_RIPE_MD_160]	= "rmd160",
+	[HASH_ALGO_SHA256]	= "sha256",
+	[HASH_ALGO_SHA384]	= "sha384",
+	[HASH_ALGO_SHA512]	= "sha512",
+	[HASH_ALGO_SHA224]	= "sha224",
+	[HASH_ALGO_RIPE_MD_128]	= "rmd128",
+	[HASH_ALGO_RIPE_MD_256]	= "rmd256",
+	[HASH_ALGO_RIPE_MD_320]	= "rmd320",
+	[HASH_ALGO_WP_256]	= "wp256",
+	[HASH_ALGO_WP_384]	= "wp384",
+	[HASH_ALGO_WP_512]	= "wp512",
+	[HASH_ALGO_TGR_128]	= "tgr128",
+	[HASH_ALGO_TGR_160]	= "tgr160",
+	[HASH_ALGO_TGR_192]	= "tgr192",
+	[HASH_ALGO_SM3_256]	= "sm3",
+	[HASH_ALGO_STREEBOG_256] = "streebog256",
+	[HASH_ALGO_STREEBOG_512] = "streebog512",
+	[HASH_ALGO_SHA3_256]    = "sha3-256",
+	[HASH_ALGO_SHA3_384]    = "sha3-384",
+	[HASH_ALGO_SHA3_512]    = "sha3-512",
+};
+EXPORT_SYMBOL_GPL(hash_algo_name);
+
+const int hash_digest_size[HASH_ALGO__LAST] = {
+	[HASH_ALGO_MD4]		= MD5_DIGEST_SIZE,
+	[HASH_ALGO_MD5]		= MD5_DIGEST_SIZE,
+	[HASH_ALGO_SHA1]	= SHA1_DIGEST_SIZE,
+	[HASH_ALGO_RIPE_MD_160]	= RMD160_DIGEST_SIZE,
+	[HASH_ALGO_SHA256]	= SHA256_DIGEST_SIZE,
+	[HASH_ALGO_SHA384]	= SHA384_DIGEST_SIZE,
+	[HASH_ALGO_SHA512]	= SHA512_DIGEST_SIZE,
+	[HASH_ALGO_SHA224]	= SHA224_DIGEST_SIZE,
+	[HASH_ALGO_RIPE_MD_128]	= RMD128_DIGEST_SIZE,
+	[HASH_ALGO_RIPE_MD_256]	= RMD256_DIGEST_SIZE,
+	[HASH_ALGO_RIPE_MD_320]	= RMD320_DIGEST_SIZE,
+	[HASH_ALGO_WP_256]	= WP256_DIGEST_SIZE,
+	[HASH_ALGO_WP_384]	= WP384_DIGEST_SIZE,
+	[HASH_ALGO_WP_512]	= WP512_DIGEST_SIZE,
+	[HASH_ALGO_TGR_128]	= TGR128_DIGEST_SIZE,
+	[HASH_ALGO_TGR_160]	= TGR160_DIGEST_SIZE,
+	[HASH_ALGO_TGR_192]	= TGR192_DIGEST_SIZE,
+	[HASH_ALGO_SM3_256]	= SM3256_DIGEST_SIZE,
+	[HASH_ALGO_STREEBOG_256] = STREEBOG256_DIGEST_SIZE,
+	[HASH_ALGO_STREEBOG_512] = STREEBOG512_DIGEST_SIZE,
+	[HASH_ALGO_SHA3_256]    = SHA3_256_DIGEST_SIZE,
+	[HASH_ALGO_SHA3_384]    = SHA3_384_DIGEST_SIZE,
+	[HASH_ALGO_SHA3_512]    = SHA3_512_DIGEST_SIZE,
+};
+EXPORT_SYMBOL_GPL(hash_digest_size);
diff --git a/lib/crypto/libchacha.c b/lib/crypto/libchacha.c
deleted file mode 100644
index dabc3accae05..000000000000
--- a/lib/crypto/libchacha.c
+++ /dev/null
@@ -1,35 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * The ChaCha stream cipher (RFC7539)
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/module.h>
-
-#include <crypto/algapi.h> // for crypto_xor_cpy
-#include <crypto/chacha.h>
-
-void chacha_crypt_generic(u32 *state, u8 *dst, const u8 *src,
-			  unsigned int bytes, int nrounds)
-{
-	/* aligned to potentially speed up crypto_xor() */
-	u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));
-
-	while (bytes >= CHACHA_BLOCK_SIZE) {
-		chacha_block_generic(state, stream, nrounds);
-		crypto_xor_cpy(dst, src, stream, CHACHA_BLOCK_SIZE);
-		bytes -= CHACHA_BLOCK_SIZE;
-		dst += CHACHA_BLOCK_SIZE;
-		src += CHACHA_BLOCK_SIZE;
-	}
-	if (bytes) {
-		chacha_block_generic(state, stream, nrounds);
-		crypto_xor_cpy(dst, src, stream, bytes);
-	}
-}
-EXPORT_SYMBOL(chacha_crypt_generic);
-
-MODULE_LICENSE("GPL");
diff --git a/lib/crypto/md5.c b/lib/crypto/md5.c
new file mode 100644
index 000000000000..c0610ea1370e
--- /dev/null
+++ b/lib/crypto/md5.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * MD5 and HMAC-MD5 library functions
+ *
+ * md5_block_generic() is derived from cryptoapi implementation, originally
+ * based on the public domain implementation written by Colin Plumb in 1993.
+ *
+ * Copyright (c) Cryptoapi developers.
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * Copyright 2025 Google LLC
+ */
+
+#include <crypto/hmac.h>
+#include <crypto/md5.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+#include <linux/wordpart.h>
+
+static const struct md5_block_state md5_iv = {
+	.h = { MD5_H0, MD5_H1, MD5_H2, MD5_H3 },
+};
+
+#define F1(x, y, z) (z ^ (x & (y ^ z)))
+#define F2(x, y, z) F1(z, x, y)
+#define F3(x, y, z) (x ^ y ^ z)
+#define F4(x, y, z) (y ^ (x | ~z))
+
+#define MD5STEP(f, w, x, y, z, in, s) \
+	(w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x)
+
+static void md5_block_generic(struct md5_block_state *state,
+			      const u8 data[MD5_BLOCK_SIZE])
+{
+	u32 in[MD5_BLOCK_WORDS];
+	u32 a, b, c, d;
+
+	memcpy(in, data, MD5_BLOCK_SIZE);
+	le32_to_cpu_array(in, ARRAY_SIZE(in));
+
+	a = state->h[0];
+	b = state->h[1];
+	c = state->h[2];
+	d = state->h[3];
+
+	MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
+	MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
+	MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
+	MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
+	MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
+	MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
+	MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
+	MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
+	MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
+	MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
+	MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
+	MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
+	MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
+	MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
+	MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
+	MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
+
+	MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
+	MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
+	MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
+	MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
+	MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
+	MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
+	MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
+	MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
+	MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
+	MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
+	MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
+	MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
+	MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
+	MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
+	MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
+	MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
+
+	MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
+	MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
+	MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
+	MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
+	MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
+	MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
+	MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
+	MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
+	MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
+	MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
+	MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
+	MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
+	MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
+	MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
+	MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
+	MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
+
+	MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
+	MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
+	MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
+	MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
+	MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
+	MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
+	MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
+	MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
+	MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
+	MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
+	MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
+	MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
+	MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
+	MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
+	MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
+	MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
+
+	state->h[0] += a;
+	state->h[1] += b;
+	state->h[2] += c;
+	state->h[3] += d;
+}
+
+static void __maybe_unused md5_blocks_generic(struct md5_block_state *state,
+					      const u8 *data, size_t nblocks)
+{
+	do {
+		md5_block_generic(state, data);
+		data += MD5_BLOCK_SIZE;
+	} while (--nblocks);
+}
+
+#ifdef CONFIG_CRYPTO_LIB_MD5_ARCH
+#include "md5.h" /* $(SRCARCH)/md5.h */
+#else
+#define md5_blocks md5_blocks_generic
+#endif
+
+void md5_init(struct md5_ctx *ctx)
+{
+	ctx->state = md5_iv;
+	ctx->bytecount = 0;
+}
+EXPORT_SYMBOL_GPL(md5_init);
+
+void md5_update(struct md5_ctx *ctx, const u8 *data, size_t len)
+{
+	size_t partial = ctx->bytecount % MD5_BLOCK_SIZE;
+
+	ctx->bytecount += len;
+
+	if (partial + len >= MD5_BLOCK_SIZE) {
+		size_t nblocks;
+
+		if (partial) {
+			size_t l = MD5_BLOCK_SIZE - partial;
+
+			memcpy(&ctx->buf[partial], data, l);
+			data += l;
+			len -= l;
+
+			md5_blocks(&ctx->state, ctx->buf, 1);
+		}
+
+		nblocks = len / MD5_BLOCK_SIZE;
+		len %= MD5_BLOCK_SIZE;
+
+		if (nblocks) {
+			md5_blocks(&ctx->state, data, nblocks);
+			data += nblocks * MD5_BLOCK_SIZE;
+		}
+		partial = 0;
+	}
+	if (len)
+		memcpy(&ctx->buf[partial], data, len);
+}
+EXPORT_SYMBOL_GPL(md5_update);
+
+static void __md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE])
+{
+	u64 bitcount = ctx->bytecount << 3;
+	size_t partial = ctx->bytecount % MD5_BLOCK_SIZE;
+
+	ctx->buf[partial++] = 0x80;
+	if (partial > MD5_BLOCK_SIZE - 8) {
+		memset(&ctx->buf[partial], 0, MD5_BLOCK_SIZE - partial);
+		md5_blocks(&ctx->state, ctx->buf, 1);
+		partial = 0;
+	}
+	memset(&ctx->buf[partial], 0, MD5_BLOCK_SIZE - 8 - partial);
+	*(__le64 *)&ctx->buf[MD5_BLOCK_SIZE - 8] = cpu_to_le64(bitcount);
+	md5_blocks(&ctx->state, ctx->buf, 1);
+
+	cpu_to_le32_array(ctx->state.h, ARRAY_SIZE(ctx->state.h));
+	memcpy(out, ctx->state.h, MD5_DIGEST_SIZE);
+}
+
+void md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE])
+{
+	__md5_final(ctx, out);
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(md5_final);
+
+void md5(const u8 *data, size_t len, u8 out[MD5_DIGEST_SIZE])
+{
+	struct md5_ctx ctx;
+
+	md5_init(&ctx);
+	md5_update(&ctx, data, len);
+	md5_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(md5);
+
+static void __hmac_md5_preparekey(struct md5_block_state *istate,
+				  struct md5_block_state *ostate,
+				  const u8 *raw_key, size_t raw_key_len)
+{
+	union {
+		u8 b[MD5_BLOCK_SIZE];
+		unsigned long w[MD5_BLOCK_SIZE / sizeof(unsigned long)];
+	} derived_key = { 0 };
+
+	if (unlikely(raw_key_len > MD5_BLOCK_SIZE))
+		md5(raw_key, raw_key_len, derived_key.b);
+	else
+		memcpy(derived_key.b, raw_key, raw_key_len);
+
+	for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+		derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
+	*istate = md5_iv;
+	md5_blocks(istate, derived_key.b, 1);
+
+	for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+		derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
+						HMAC_IPAD_VALUE);
+	*ostate = md5_iv;
+	md5_blocks(ostate, derived_key.b, 1);
+
+	memzero_explicit(&derived_key, sizeof(derived_key));
+}
+
+void hmac_md5_preparekey(struct hmac_md5_key *key,
+			 const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_md5_preparekey(&key->istate, &key->ostate, raw_key, raw_key_len);
+}
+EXPORT_SYMBOL_GPL(hmac_md5_preparekey);
+
+void hmac_md5_init(struct hmac_md5_ctx *ctx, const struct hmac_md5_key *key)
+{
+	ctx->hash_ctx.state = key->istate;
+	ctx->hash_ctx.bytecount = MD5_BLOCK_SIZE;
+	ctx->ostate = key->ostate;
+}
+EXPORT_SYMBOL_GPL(hmac_md5_init);
+
+void hmac_md5_init_usingrawkey(struct hmac_md5_ctx *ctx,
+			       const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_md5_preparekey(&ctx->hash_ctx.state, &ctx->ostate,
+			      raw_key, raw_key_len);
+	ctx->hash_ctx.bytecount = MD5_BLOCK_SIZE;
+}
+EXPORT_SYMBOL_GPL(hmac_md5_init_usingrawkey);
+
+void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE])
+{
+	/* Generate the padded input for the outer hash in ctx->hash_ctx.buf. */
+	__md5_final(&ctx->hash_ctx, ctx->hash_ctx.buf);
+	memset(&ctx->hash_ctx.buf[MD5_DIGEST_SIZE], 0,
+	       MD5_BLOCK_SIZE - MD5_DIGEST_SIZE);
+	ctx->hash_ctx.buf[MD5_DIGEST_SIZE] = 0x80;
+	*(__le64 *)&ctx->hash_ctx.buf[MD5_BLOCK_SIZE - 8] =
+		cpu_to_le64(8 * (MD5_BLOCK_SIZE + MD5_DIGEST_SIZE));
+
+	/* Compute the outer hash, which gives the HMAC value. */
+	md5_blocks(&ctx->ostate, ctx->hash_ctx.buf, 1);
+	cpu_to_le32_array(ctx->ostate.h, ARRAY_SIZE(ctx->ostate.h));
+	memcpy(out, ctx->ostate.h, MD5_DIGEST_SIZE);
+
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(hmac_md5_final);
+
+void hmac_md5(const struct hmac_md5_key *key,
+	      const u8 *data, size_t data_len, u8 out[MD5_DIGEST_SIZE])
+{
+	struct hmac_md5_ctx ctx;
+
+	hmac_md5_init(&ctx, key);
+	hmac_md5_update(&ctx, data, data_len);
+	hmac_md5_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_md5);
+
+void hmac_md5_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+			  const u8 *data, size_t data_len,
+			  u8 out[MD5_DIGEST_SIZE])
+{
+	struct hmac_md5_ctx ctx;
+
+	hmac_md5_init_usingrawkey(&ctx, raw_key, raw_key_len);
+	hmac_md5_update(&ctx, data, data_len);
+	hmac_md5_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_md5_usingrawkey);
+
+#ifdef md5_mod_init_arch
+static int __init md5_mod_init(void)
+{
+	md5_mod_init_arch();
+	return 0;
+}
+subsys_initcall(md5_mod_init);
+
+static void __exit md5_mod_exit(void)
+{
+}
+module_exit(md5_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("MD5 and HMAC-MD5 library functions");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/memneq.c b/lib/crypto/memneq.c
index 243d8677cc51..44daacb8cb51 100644
--- a/lib/crypto/memneq.c
+++ b/lib/crypto/memneq.c
@@ -59,9 +59,10 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <asm/unaligned.h>
 #include <crypto/algapi.h>
+#include <linux/export.h>
 #include <linux/module.h>
+#include <linux/unaligned.h>
 
 /* Generic path for arbitrary size */
 static inline unsigned long
diff --git a/lib/crypto/mips/.gitignore b/lib/crypto/mips/.gitignore
new file mode 100644
index 000000000000..0d47d4f21c6d
--- /dev/null
+++ b/lib/crypto/mips/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+poly1305-core.S
diff --git a/lib/crypto/mips/chacha-core.S b/lib/crypto/mips/chacha-core.S
new file mode 100644
index 000000000000..706aeb850fb0
--- /dev/null
+++ b/lib/crypto/mips/chacha-core.S
@@ -0,0 +1,491 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#define MASK_U32		0x3c
+#define CHACHA20_BLOCK_SIZE	64
+#define STACK_SIZE		32
+
+#define X0	$t0
+#define X1	$t1
+#define X2	$t2
+#define X3	$t3
+#define X4	$t4
+#define X5	$t5
+#define X6	$t6
+#define X7	$t7
+#define X8	$t8
+#define X9	$t9
+#define X10	$v1
+#define X11	$s6
+#define X12	$s5
+#define X13	$s4
+#define X14	$s3
+#define X15	$s2
+/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
+#define T0	$s1
+#define T1	$s0
+#define T(n)	T ## n
+#define X(n)	X ## n
+
+/* Input arguments */
+#define STATE		$a0
+#define OUT		$a1
+#define IN		$a2
+#define BYTES		$a3
+
+/* Output argument */
+/* NONCE[0] is kept in a register and not in memory.
+ * We don't want to touch original value in memory.
+ * Must be incremented every loop iteration.
+ */
+#define NONCE_0		$v0
+
+/* SAVED_X and SAVED_CA are set in the jump table.
+ * Use regs which are overwritten on exit else we don't leak clear data.
+ * They are used to handling the last bytes which are not multiple of 4.
+ */
+#define SAVED_X		X15
+#define SAVED_CA	$s7
+
+#define IS_UNALIGNED	$s7
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define MSB 0
+#define LSB 3
+#define	CPU_TO_LE32(n) \
+	wsbh	n, n; \
+	rotr	n, 16;
+#else
+#define MSB 3
+#define LSB 0
+#define CPU_TO_LE32(n)
+#endif
+
+#define FOR_EACH_WORD(x) \
+	x( 0); \
+	x( 1); \
+	x( 2); \
+	x( 3); \
+	x( 4); \
+	x( 5); \
+	x( 6); \
+	x( 7); \
+	x( 8); \
+	x( 9); \
+	x(10); \
+	x(11); \
+	x(12); \
+	x(13); \
+	x(14); \
+	x(15);
+
+#define FOR_EACH_WORD_REV(x) \
+	x(15); \
+	x(14); \
+	x(13); \
+	x(12); \
+	x(11); \
+	x(10); \
+	x( 9); \
+	x( 8); \
+	x( 7); \
+	x( 6); \
+	x( 5); \
+	x( 4); \
+	x( 3); \
+	x( 2); \
+	x( 1); \
+	x( 0);
+
+#define PLUS_ONE_0	 1
+#define PLUS_ONE_1	 2
+#define PLUS_ONE_2	 3
+#define PLUS_ONE_3	 4
+#define PLUS_ONE_4	 5
+#define PLUS_ONE_5	 6
+#define PLUS_ONE_6	 7
+#define PLUS_ONE_7	 8
+#define PLUS_ONE_8	 9
+#define PLUS_ONE_9	10
+#define PLUS_ONE_10	11
+#define PLUS_ONE_11	12
+#define PLUS_ONE_12	13
+#define PLUS_ONE_13	14
+#define PLUS_ONE_14	15
+#define PLUS_ONE_15	16
+#define PLUS_ONE(x)	PLUS_ONE_ ## x
+#define _CONCAT3(a,b,c)	a ## b ## c
+#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)
+
+#define STORE_UNALIGNED(x) \
+CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
+	.if (x != 12); \
+		lw	T0, (x*4)(STATE); \
+	.endif; \
+	lwl	T1, (x*4)+MSB ## (IN); \
+	lwr	T1, (x*4)+LSB ## (IN); \
+	.if (x == 12); \
+		addu	X ## x, NONCE_0; \
+	.else; \
+		addu	X ## x, T0; \
+	.endif; \
+	CPU_TO_LE32(X ## x); \
+	xor	X ## x, T1; \
+	swl	X ## x, (x*4)+MSB ## (OUT); \
+	swr	X ## x, (x*4)+LSB ## (OUT);
+
+#define STORE_ALIGNED(x) \
+CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
+	.if (x != 12); \
+		lw	T0, (x*4)(STATE); \
+	.endif; \
+	lw	T1, (x*4) ## (IN); \
+	.if (x == 12); \
+		addu	X ## x, NONCE_0; \
+	.else; \
+		addu	X ## x, T0; \
+	.endif; \
+	CPU_TO_LE32(X ## x); \
+	xor	X ## x, T1; \
+	sw	X ## x, (x*4) ## (OUT);
+
+/* Jump table macro.
+ * Used for setup and handling the last bytes, which are not multiple of 4.
+ * X15 is free to store Xn
+ * Every jumptable entry must be equal in size.
+ */
+#define JMPTBL_ALIGNED(x) \
+.Lchacha_mips_jmptbl_aligned_ ## x: ; \
+	.set	noreorder; \
+	b	.Lchacha_mips_xor_aligned_ ## x ## _b; \
+	.if (x == 12); \
+		addu	SAVED_X, X ## x, NONCE_0; \
+	.else; \
+		addu	SAVED_X, X ## x, SAVED_CA; \
+	.endif; \
+	.set	reorder
+
+#define JMPTBL_UNALIGNED(x) \
+.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
+	.set	noreorder; \
+	b	.Lchacha_mips_xor_unaligned_ ## x ## _b; \
+	.if (x == 12); \
+		addu	SAVED_X, X ## x, NONCE_0; \
+	.else; \
+		addu	SAVED_X, X ## x, SAVED_CA; \
+	.endif; \
+	.set	reorder
+
+#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
+	addu	X(A), X(K); \
+	addu	X(B), X(L); \
+	addu	X(C), X(M); \
+	addu	X(D), X(N); \
+	xor	X(V), X(A); \
+	xor	X(W), X(B); \
+	xor	X(Y), X(C); \
+	xor	X(Z), X(D); \
+	rotr	X(V), 32 - S; \
+	rotr	X(W), 32 - S; \
+	rotr	X(Y), 32 - S; \
+	rotr	X(Z), 32 - S;
+
+.text
+.set	reorder
+.set	noat
+.globl	chacha_crypt_arch
+.ent	chacha_crypt_arch
+chacha_crypt_arch:
+	.frame	$sp, STACK_SIZE, $ra
+
+	/* Load number of rounds */
+	lw	$at, 16($sp)
+
+	addiu	$sp, -STACK_SIZE
+
+	/* Return bytes = 0. */
+	beqz	BYTES, .Lchacha_mips_end
+
+	lw	NONCE_0, 48(STATE)
+
+	/* Save s0-s7 */
+	sw	$s0,  0($sp)
+	sw	$s1,  4($sp)
+	sw	$s2,  8($sp)
+	sw	$s3, 12($sp)
+	sw	$s4, 16($sp)
+	sw	$s5, 20($sp)
+	sw	$s6, 24($sp)
+	sw	$s7, 28($sp)
+
+	/* Test IN or OUT is unaligned.
+	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
+	 */
+	or	IS_UNALIGNED, IN, OUT
+	andi	IS_UNALIGNED, 0x3
+
+	b	.Lchacha_rounds_start
+
+.align 4
+.Loop_chacha_rounds:
+	addiu	IN,  CHACHA20_BLOCK_SIZE
+	addiu	OUT, CHACHA20_BLOCK_SIZE
+	addiu	NONCE_0, 1
+
+.Lchacha_rounds_start:
+	lw	X0,  0(STATE)
+	lw	X1,  4(STATE)
+	lw	X2,  8(STATE)
+	lw	X3,  12(STATE)
+
+	lw	X4,  16(STATE)
+	lw	X5,  20(STATE)
+	lw	X6,  24(STATE)
+	lw	X7,  28(STATE)
+	lw	X8,  32(STATE)
+	lw	X9,  36(STATE)
+	lw	X10, 40(STATE)
+	lw	X11, 44(STATE)
+
+	move	X12, NONCE_0
+	lw	X13, 52(STATE)
+	lw	X14, 56(STATE)
+	lw	X15, 60(STATE)
+
+.Loop_chacha_xor_rounds:
+	addiu	$at, -2
+	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
+	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
+	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
+	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
+	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
+	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
+	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
+	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
+	bnez	$at, .Loop_chacha_xor_rounds
+
+	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)
+
+	/* Is data src/dst unaligned? Jump */
+	bnez	IS_UNALIGNED, .Loop_chacha_unaligned
+
+	/* Set number rounds here to fill delayslot. */
+	lw	$at, (STACK_SIZE+16)($sp)
+
+	/* BYTES < 0, it has no full block. */
+	bltz	BYTES, .Lchacha_mips_no_full_block_aligned
+
+	FOR_EACH_WORD_REV(STORE_ALIGNED)
+
+	/* BYTES > 0? Loop again. */
+	bgtz	BYTES, .Loop_chacha_rounds
+
+	/* Place this here to fill delay slot */
+	addiu	NONCE_0, 1
+
+	/* BYTES < 0? Handle last bytes */
+	bltz	BYTES, .Lchacha_mips_xor_bytes
+
+.Lchacha_mips_xor_done:
+	/* Restore used registers */
+	lw	$s0,  0($sp)
+	lw	$s1,  4($sp)
+	lw	$s2,  8($sp)
+	lw	$s3, 12($sp)
+	lw	$s4, 16($sp)
+	lw	$s5, 20($sp)
+	lw	$s6, 24($sp)
+	lw	$s7, 28($sp)
+
+	/* Write NONCE_0 back to right location in state */
+	sw	NONCE_0, 48(STATE)
+
+.Lchacha_mips_end:
+	addiu	$sp, STACK_SIZE
+	jr	$ra
+
+.Lchacha_mips_no_full_block_aligned:
+	/* Restore the offset on BYTES */
+	addiu	BYTES, CHACHA20_BLOCK_SIZE
+
+	/* Get number of full WORDS */
+	andi	$at, BYTES, MASK_U32
+
+	/* Load upper half of jump table addr */
+	lui	T0, %hi(.Lchacha_mips_jmptbl_aligned_0)
+
+	/* Calculate lower half jump table offset */
+	ins	T0, $at, 1, 6
+
+	/* Add offset to STATE */
+	addu	T1, STATE, $at
+
+	/* Add lower half jump table addr */
+	addiu	T0, %lo(.Lchacha_mips_jmptbl_aligned_0)
+
+	/* Read value from STATE */
+	lw	SAVED_CA, 0(T1)
+
+	/* Store remaining bytecounter as negative value */
+	subu	BYTES, $at, BYTES
+
+	jr	T0
+
+	/* Jump table */
+	FOR_EACH_WORD(JMPTBL_ALIGNED)
+
+
+.Loop_chacha_unaligned:
+	/* Set number rounds here to fill delayslot. */
+	lw	$at, (STACK_SIZE+16)($sp)
+
+	/* BYTES > 0, it has no full block. */
+	bltz	BYTES, .Lchacha_mips_no_full_block_unaligned
+
+	FOR_EACH_WORD_REV(STORE_UNALIGNED)
+
+	/* BYTES > 0? Loop again. */
+	bgtz	BYTES, .Loop_chacha_rounds
+
+	/* Write NONCE_0 back to right location in state */
+	sw	NONCE_0, 48(STATE)
+
+	.set noreorder
+	/* Fall through to byte handling */
+	bgez	BYTES, .Lchacha_mips_xor_done
+.Lchacha_mips_xor_unaligned_0_b:
+.Lchacha_mips_xor_aligned_0_b:
+	/* Place this here to fill delay slot */
+	addiu	NONCE_0, 1
+	.set reorder
+
+.Lchacha_mips_xor_bytes:
+	addu	IN, $at
+	addu	OUT, $at
+	/* First byte */
+	lbu	T1, 0(IN)
+	addiu	$at, BYTES, 1
+	xor	T1, SAVED_X
+	sb	T1, 0(OUT)
+	beqz	$at, .Lchacha_mips_xor_done
+	/* Second byte */
+	lbu	T1, 1(IN)
+	addiu	$at, BYTES, 2
+	rotr	SAVED_X, 8
+	xor	T1, SAVED_X
+	sb	T1, 1(OUT)
+	beqz	$at, .Lchacha_mips_xor_done
+	/* Third byte */
+	lbu	T1, 2(IN)
+	rotr	SAVED_X, 8
+	xor	T1, SAVED_X
+	sb	T1, 2(OUT)
+	b	.Lchacha_mips_xor_done
+
+.Lchacha_mips_no_full_block_unaligned:
+	/* Restore the offset on BYTES */
+	addiu	BYTES, CHACHA20_BLOCK_SIZE
+
+	/* Get number of full WORDS */
+	andi	$at, BYTES, MASK_U32
+
+	/* Load upper half of jump table addr */
+	lui	T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)
+
+	/* Calculate lower half jump table offset */
+	ins	T0, $at, 1, 6
+
+	/* Add offset to STATE */
+	addu	T1, STATE, $at
+
+	/* Add lower half jump table addr */
+	addiu	T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)
+
+	/* Read value from STATE */
+	lw	SAVED_CA, 0(T1)
+
+	/* Store remaining bytecounter as negative value */
+	subu	BYTES, $at, BYTES
+
+	jr	T0
+
+	/* Jump table */
+	FOR_EACH_WORD(JMPTBL_UNALIGNED)
+.end chacha_crypt_arch
+.set at
+
+/* Input arguments
+ * STATE	$a0
+ * OUT		$a1
+ * NROUND	$a2
+ */
+
+#undef X12
+#undef X13
+#undef X14
+#undef X15
+
+#define X12	$a3
+#define X13	$at
+#define X14	$v0
+#define X15	STATE
+
+.set noat
+.globl	hchacha_block_arch
+.ent	hchacha_block_arch
+hchacha_block_arch:
+	.frame	$sp, STACK_SIZE, $ra
+
+	addiu	$sp, -STACK_SIZE
+
+	/* Save X11(s6) */
+	sw	X11, 0($sp)
+
+	lw	X0,  0(STATE)
+	lw	X1,  4(STATE)
+	lw	X2,  8(STATE)
+	lw	X3,  12(STATE)
+	lw	X4,  16(STATE)
+	lw	X5,  20(STATE)
+	lw	X6,  24(STATE)
+	lw	X7,  28(STATE)
+	lw	X8,  32(STATE)
+	lw	X9,  36(STATE)
+	lw	X10, 40(STATE)
+	lw	X11, 44(STATE)
+	lw	X12, 48(STATE)
+	lw	X13, 52(STATE)
+	lw	X14, 56(STATE)
+	lw	X15, 60(STATE)
+
+.Loop_hchacha_xor_rounds:
+	addiu	$a2, -2
+	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
+	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
+	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
+	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
+	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
+	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
+	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
+	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
+	bnez	$a2, .Loop_hchacha_xor_rounds
+
+	/* Restore used register */
+	lw	X11, 0($sp)
+
+	sw	X0,  0(OUT)
+	sw	X1,  4(OUT)
+	sw	X2,  8(OUT)
+	sw	X3,  12(OUT)
+	sw	X12, 16(OUT)
+	sw	X13, 20(OUT)
+	sw	X14, 24(OUT)
+	sw	X15, 28(OUT)
+
+	addiu	$sp, STACK_SIZE
+	jr	$ra
+.end hchacha_block_arch
+.set at
diff --git a/lib/crypto/mips/chacha.h b/lib/crypto/mips/chacha.h
new file mode 100644
index 000000000000..0c18c0dc2a40
--- /dev/null
+++ b/lib/crypto/mips/chacha.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ChaCha and HChaCha functions (MIPS optimized)
+ *
+ * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/kernel.h>
+
+asmlinkage void chacha_crypt_arch(struct chacha_state *state,
+				  u8 *dst, const u8 *src,
+				  unsigned int bytes, int nrounds);
+asmlinkage void hchacha_block_arch(const struct chacha_state *state,
+				   u32 out[HCHACHA_OUT_WORDS], int nrounds);
diff --git a/lib/crypto/mips/md5.h b/lib/crypto/mips/md5.h
new file mode 100644
index 000000000000..e08e28aeffa4
--- /dev/null
+++ b/lib/crypto/mips/md5.h
@@ -0,0 +1,65 @@
+/*
+ * Cryptographic API.
+ *
+ * MD5 Message Digest Algorithm (RFC1321).
+ *
+ * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
+ *
+ * Based on crypto/md5.c, which is:
+ *
+ * Derived from cryptoapi implementation, originally based on the
+ * public domain implementation written by Colin Plumb in 1993.
+ *
+ * Copyright (c) Cryptoapi developers.
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <asm/octeon/crypto.h>
+#include <asm/octeon/octeon.h>
+
+/*
+ * We pass everything as 64-bit. OCTEON can handle misaligned data.
+ */
+
+static void md5_blocks(struct md5_block_state *state,
+		       const u8 *data, size_t nblocks)
+{
+	struct octeon_cop2_state cop2_state;
+	u64 *state64 = (u64 *)state;
+	unsigned long flags;
+
+	if (!octeon_has_crypto())
+		return md5_blocks_generic(state, data, nblocks);
+
+	cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
+
+	flags = octeon_crypto_enable(&cop2_state);
+	write_octeon_64bit_hash_dword(state64[0], 0);
+	write_octeon_64bit_hash_dword(state64[1], 1);
+
+	do {
+		const u64 *block = (const u64 *)data;
+
+		write_octeon_64bit_block_dword(block[0], 0);
+		write_octeon_64bit_block_dword(block[1], 1);
+		write_octeon_64bit_block_dword(block[2], 2);
+		write_octeon_64bit_block_dword(block[3], 3);
+		write_octeon_64bit_block_dword(block[4], 4);
+		write_octeon_64bit_block_dword(block[5], 5);
+		write_octeon_64bit_block_dword(block[6], 6);
+		octeon_md5_start(block[7]);
+
+		data += MD5_BLOCK_SIZE;
+	} while (--nblocks);
+
+	state64[0] = read_octeon_64bit_hash_dword(0);
+	state64[1] = read_octeon_64bit_hash_dword(1);
+	octeon_crypto_disable(&cop2_state, flags);
+
+	le32_to_cpu_array(state->h, ARRAY_SIZE(state->h));
+}
diff --git a/lib/crypto/mips/poly1305-mips.pl b/lib/crypto/mips/poly1305-mips.pl
new file mode 100644
index 000000000000..71347f34f4f9
--- /dev/null
+++ b/lib/crypto/mips/poly1305-mips.pl
@@ -0,0 +1,1269 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
+# project.
+# ====================================================================
+
+# Poly1305 hash for MIPS.
+#
+# May 2016
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone.
+#
+#		IALU/gcc
+# R1x000	~5.5/+130%	(big-endian)
+# Octeon II	2.50/+70%	(little-endian)
+#
+# March 2019
+#
+# Add 32-bit code path.
+#
+# October 2019
+#
+# Modulo-scheduling reduction allows to omit dependency chain at the
+# end of inner loop and improve performance. Also optimize MIPS32R2
+# code path for MIPS 1004K core. Per René von Dorst's suggestions.
+#
+#		IALU/gcc
+# R1x000	~9.8/?		(big-endian)
+# Octeon II	3.65/+140%	(little-endian)
+# MT7621/1004K	4.75/?		(little-endian)
+#
+######################################################################
+# There is a number of MIPS ABI in use, O32 and N32/64 are most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in ABI neutral
+# manner. Therefore let's stick to NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp [o32 can be
+#   excluded from the rule, because it's specified volatile];
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+#   old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
+
+$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
+
+if ($flavour =~ /64|n32/i) {{{
+######################################################################
+# 64-bit code path
+#
+
+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
+
+$code.=<<___;
+#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
+     defined(_MIPS_ARCH_MIPS64R6)) \\
+     && !defined(_MIPS_ARCH_MIPS64R2)
+# define _MIPS_ARCH_MIPS64R2
+#endif
+
+#if defined(_MIPS_ARCH_MIPS64R6)
+# define dmultu(rs,rt)
+# define mflo(rd,rs,rt)	dmulu	rd,rs,rt
+# define mfhi(rd,rs,rt)	dmuhu	rd,rs,rt
+#else
+# define dmultu(rs,rt)		dmultu	rs,rt
+# define mflo(rd,rs,rt)	mflo	rd
+# define mfhi(rd,rs,rt)	mfhi	rd
+#endif
+
+#ifdef	__KERNEL__
+# define poly1305_init   poly1305_block_init
+#endif
+
+#if defined(__MIPSEB__) && !defined(MIPSEB)
+# define MIPSEB
+#endif
+
+#ifdef MIPSEB
+# define MSB 0
+# define LSB 7
+#else
+# define MSB 7
+# define LSB 0
+#endif
+
+.text
+.set	noat
+.set	noreorder
+
+.align	5
+.globl	poly1305_init
+.ent	poly1305_init
+poly1305_init:
+	.frame	$sp,0,$ra
+	.set	reorder
+
+	sd	$zero,0($ctx)
+	sd	$zero,8($ctx)
+	sd	$zero,16($ctx)
+
+	beqz	$inp,.Lno_key
+
+#if defined(_MIPS_ARCH_MIPS64R6)
+	andi	$tmp0,$inp,7		# $inp % 8
+	dsubu	$inp,$inp,$tmp0		# align $inp
+	sll	$tmp0,$tmp0,3		# byte to bit offset
+	ld	$in0,0($inp)
+	ld	$in1,8($inp)
+	beqz	$tmp0,.Laligned_key
+	ld	$tmp2,16($inp)
+
+	subu	$tmp1,$zero,$tmp0
+# ifdef	MIPSEB
+	dsllv	$in0,$in0,$tmp0
+	dsrlv	$tmp3,$in1,$tmp1
+	dsllv	$in1,$in1,$tmp0
+	dsrlv	$tmp2,$tmp2,$tmp1
+# else
+	dsrlv	$in0,$in0,$tmp0
+	dsllv	$tmp3,$in1,$tmp1
+	dsrlv	$in1,$in1,$tmp0
+	dsllv	$tmp2,$tmp2,$tmp1
+# endif
+	or	$in0,$in0,$tmp3
+	or	$in1,$in1,$tmp2
+.Laligned_key:
+#else
+	ldl	$in0,0+MSB($inp)
+	ldl	$in1,8+MSB($inp)
+	ldr	$in0,0+LSB($inp)
+	ldr	$in1,8+LSB($inp)
+#endif
+#ifdef	MIPSEB
+# if defined(_MIPS_ARCH_MIPS64R2)
+	dsbh	$in0,$in0		# byte swap
+	 dsbh	$in1,$in1
+	dshd	$in0,$in0
+	 dshd	$in1,$in1
+# else
+	ori	$tmp0,$zero,0xFF
+	dsll	$tmp2,$tmp0,32
+	or	$tmp0,$tmp2		# 0x000000FF000000FF
+
+	and	$tmp1,$in0,$tmp0	# byte swap
+	 and	$tmp3,$in1,$tmp0
+	dsrl	$tmp2,$in0,24
+	 dsrl	$tmp4,$in1,24
+	dsll	$tmp1,24
+	 dsll	$tmp3,24
+	and	$tmp2,$tmp0
+	 and	$tmp4,$tmp0
+	dsll	$tmp0,8			# 0x0000FF000000FF00
+	or	$tmp1,$tmp2
+	 or	$tmp3,$tmp4
+	and	$tmp2,$in0,$tmp0
+	 and	$tmp4,$in1,$tmp0
+	dsrl	$in0,8
+	 dsrl	$in1,8
+	dsll	$tmp2,8
+	 dsll	$tmp4,8
+	and	$in0,$tmp0
+	 and	$in1,$tmp0
+	or	$tmp1,$tmp2
+	 or	$tmp3,$tmp4
+	or	$in0,$tmp1
+	 or	$in1,$tmp3
+	dsrl	$tmp1,$in0,32
+	 dsrl	$tmp3,$in1,32
+	dsll	$in0,32
+	 dsll	$in1,32
+	or	$in0,$tmp1
+	 or	$in1,$tmp3
+# endif
+#endif
+	li	$tmp0,1
+	dsll	$tmp0,32		# 0x0000000100000000
+	daddiu	$tmp0,-63		# 0x00000000ffffffc1
+	dsll	$tmp0,28		# 0x0ffffffc10000000
+	daddiu	$tmp0,-1		# 0x0ffffffc0fffffff
+
+	and	$in0,$tmp0
+	daddiu	$tmp0,-3		# 0x0ffffffc0ffffffc
+	and	$in1,$tmp0
+
+	sd	$in0,24($ctx)
+	dsrl	$tmp0,$in1,2
+	sd	$in1,32($ctx)
+	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
+	sd	$tmp0,40($ctx)
+
+.Lno_key:
+	li	$v0,0			# return 0
+	jr	$ra
+.end	poly1305_init
+___
+{
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
+
+my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
+   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
+my ($shr,$shl) = ($s6,$s7);		# used on R6
+
+$code.=<<___;
+.align	5
+.globl	poly1305_blocks
+.ent	poly1305_blocks
+poly1305_blocks:
+	.set	noreorder
+	dsrl	$len,4			# number of complete blocks
+	bnez	$len,poly1305_blocks_internal
+	nop
+	jr	$ra
+	nop
+.end	poly1305_blocks
+
+.align	5
+.ent	poly1305_blocks_internal
+poly1305_blocks_internal:
+	.set	noreorder
+#if defined(_MIPS_ARCH_MIPS64R6)
+	.frame	$sp,8*8,$ra
+	.mask	$SAVED_REGS_MASK|0x000c0000,-8
+	dsubu	$sp,8*8
+	sd	$s7,56($sp)
+	sd	$s6,48($sp)
+#else
+	.frame	$sp,6*8,$ra
+	.mask	$SAVED_REGS_MASK,-8
+	dsubu	$sp,6*8
+#endif
+	sd	$s5,40($sp)
+	sd	$s4,32($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
+	sd	$s3,24($sp)
+	sd	$s2,16($sp)
+	sd	$s1,8($sp)
+	sd	$s0,0($sp)
+___
+$code.=<<___;
+	.set	reorder
+
+#if defined(_MIPS_ARCH_MIPS64R6)
+	andi	$shr,$inp,7
+	dsubu	$inp,$inp,$shr		# align $inp
+	sll	$shr,$shr,3		# byte to bit offset
+	subu	$shl,$zero,$shr
+#endif
+
+	ld	$h0,0($ctx)		# load hash value
+	ld	$h1,8($ctx)
+	ld	$h2,16($ctx)
+
+	ld	$r0,24($ctx)		# load key
+	ld	$r1,32($ctx)
+	ld	$rs1,40($ctx)
+
+	dsll	$len,4
+	daddu	$len,$inp		# end of buffer
+	b	.Loop
+
+.align	4
+.Loop:
+#if defined(_MIPS_ARCH_MIPS64R6)
+	ld	$in0,0($inp)		# load input
+	ld	$in1,8($inp)
+	beqz	$shr,.Laligned_inp
+
+	ld	$tmp2,16($inp)
+# ifdef	MIPSEB
+	dsllv	$in0,$in0,$shr
+	dsrlv	$tmp3,$in1,$shl
+	dsllv	$in1,$in1,$shr
+	dsrlv	$tmp2,$tmp2,$shl
+# else
+	dsrlv	$in0,$in0,$shr
+	dsllv	$tmp3,$in1,$shl
+	dsrlv	$in1,$in1,$shr
+	dsllv	$tmp2,$tmp2,$shl
+# endif
+	or	$in0,$in0,$tmp3
+	or	$in1,$in1,$tmp2
+.Laligned_inp:
+#else
+	ldl	$in0,0+MSB($inp)	# load input
+	ldl	$in1,8+MSB($inp)
+	ldr	$in0,0+LSB($inp)
+	ldr	$in1,8+LSB($inp)
+#endif
+	daddiu	$inp,16
+#ifdef	MIPSEB
+# if defined(_MIPS_ARCH_MIPS64R2)
+	dsbh	$in0,$in0		# byte swap
+	 dsbh	$in1,$in1
+	dshd	$in0,$in0
+	 dshd	$in1,$in1
+# else
+	ori	$tmp0,$zero,0xFF
+	dsll	$tmp2,$tmp0,32
+	or	$tmp0,$tmp2		# 0x000000FF000000FF
+
+	and	$tmp1,$in0,$tmp0	# byte swap
+	 and	$tmp3,$in1,$tmp0
+	dsrl	$tmp2,$in0,24
+	 dsrl	$tmp4,$in1,24
+	dsll	$tmp1,24
+	 dsll	$tmp3,24
+	and	$tmp2,$tmp0
+	 and	$tmp4,$tmp0
+	dsll	$tmp0,8			# 0x0000FF000000FF00
+	or	$tmp1,$tmp2
+	 or	$tmp3,$tmp4
+	and	$tmp2,$in0,$tmp0
+	 and	$tmp4,$in1,$tmp0
+	dsrl	$in0,8
+	 dsrl	$in1,8
+	dsll	$tmp2,8
+	 dsll	$tmp4,8
+	and	$in0,$tmp0
+	 and	$in1,$tmp0
+	or	$tmp1,$tmp2
+	 or	$tmp3,$tmp4
+	or	$in0,$tmp1
+	 or	$in1,$tmp3
+	dsrl	$tmp1,$in0,32
+	 dsrl	$tmp3,$in1,32
+	dsll	$in0,32
+	 dsll	$in1,32
+	or	$in0,$tmp1
+	 or	$in1,$tmp3
+# endif
+#endif
+	dsrl	$tmp1,$h2,2		# modulo-scheduled reduction
+	andi	$h2,$h2,3
+	dsll	$tmp0,$tmp1,2
+
+	daddu	$d0,$h0,$in0		# accumulate input
+	 daddu	$tmp1,$tmp0
+	sltu	$tmp0,$d0,$h0
+	daddu	$d0,$d0,$tmp1		# ... and residue
+	sltu	$tmp1,$d0,$tmp1
+	daddu	$d1,$h1,$in1
+	daddu	$tmp0,$tmp1
+	sltu	$tmp1,$d1,$h1
+	daddu	$d1,$tmp0
+
+	dmultu	($r0,$d0)		# h0*r0
+	 daddu	$d2,$h2,$padbit
+	 sltu	$tmp0,$d1,$tmp0
+	mflo	($h0,$r0,$d0)
+	mfhi	($h1,$r0,$d0)
+
+	dmultu	($rs1,$d1)		# h1*5*r1
+	 daddu	$d2,$tmp1
+	 daddu	$d2,$tmp0
+	mflo	($tmp0,$rs1,$d1)
+	mfhi	($tmp1,$rs1,$d1)
+
+	dmultu	($r1,$d0)		# h0*r1
+	mflo	($tmp2,$r1,$d0)
+	mfhi	($h2,$r1,$d0)
+	 daddu	$h0,$tmp0
+	 daddu	$h1,$tmp1
+	 sltu	$tmp0,$h0,$tmp0
+
+	dmultu	($r0,$d1)		# h1*r0
+	 daddu	$h1,$tmp0
+	 daddu	$h1,$tmp2
+	mflo	($tmp0,$r0,$d1)
+	mfhi	($tmp1,$r0,$d1)
+
+	dmultu	($rs1,$d2)		# h2*5*r1
+	 sltu	$tmp2,$h1,$tmp2
+	 daddu	$h2,$tmp2
+	mflo	($tmp2,$rs1,$d2)
+
+	dmultu	($r0,$d2)		# h2*r0
+	 daddu	$h1,$tmp0
+	 daddu	$h2,$tmp1
+	mflo	($tmp3,$r0,$d2)
+	 sltu	$tmp0,$h1,$tmp0
+	 daddu	$h2,$tmp0
+
+	daddu	$h1,$tmp2
+	sltu	$tmp2,$h1,$tmp2
+	daddu	$h2,$tmp2
+	daddu	$h2,$tmp3
+
+	bne	$inp,$len,.Loop
+
+	sd	$h0,0($ctx)		# store hash value
+	sd	$h1,8($ctx)
+	sd	$h2,16($ctx)
+
+	.set	noreorder
+#if defined(_MIPS_ARCH_MIPS64R6)
+	ld	$s7,56($sp)
+	ld	$s6,48($sp)
+#endif
+	ld	$s5,40($sp)		# epilogue
+	ld	$s4,32($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
+	ld	$s3,24($sp)
+	ld	$s2,16($sp)
+	ld	$s1,8($sp)
+	ld	$s0,0($sp)
+___
+$code.=<<___;
+	jr	$ra
+#if defined(_MIPS_ARCH_MIPS64R6)
+	daddu	$sp,8*8
+#else
+	daddu	$sp,6*8
+#endif
+.end	poly1305_blocks_internal
+___
+}
+{
+my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
+
+$code.=<<___;
+.align	5
+.globl	poly1305_emit
+.ent	poly1305_emit
+poly1305_emit:
+	.frame	$sp,0,$ra
+	.set	reorder
+
+	ld	$tmp2,16($ctx)
+	ld	$tmp0,0($ctx)
+	ld	$tmp1,8($ctx)
+
+	li	$in0,-4			# final reduction
+	dsrl	$in1,$tmp2,2
+	and	$in0,$tmp2
+	andi	$tmp2,$tmp2,3
+	daddu	$in0,$in1
+
+	daddu	$tmp0,$tmp0,$in0
+	sltu	$in1,$tmp0,$in0
+	 daddiu	$in0,$tmp0,5		# compare to modulus
+	daddu	$tmp1,$tmp1,$in1
+	 sltiu	$tmp3,$in0,5
+	sltu	$tmp4,$tmp1,$in1
+	 daddu	$in1,$tmp1,$tmp3
+	daddu	$tmp2,$tmp2,$tmp4
+	 sltu	$tmp3,$in1,$tmp3
+	 daddu	$tmp2,$tmp2,$tmp3
+
+	dsrl	$tmp2,2			# see if it carried/borrowed
+	dsubu	$tmp2,$zero,$tmp2
+
+	xor	$in0,$tmp0
+	xor	$in1,$tmp1
+	and	$in0,$tmp2
+	and	$in1,$tmp2
+	xor	$in0,$tmp0
+	xor	$in1,$tmp1
+
+	lwu	$tmp0,0($nonce)		# load nonce
+	lwu	$tmp1,4($nonce)
+	lwu	$tmp2,8($nonce)
+	lwu	$tmp3,12($nonce)
+	dsll	$tmp1,32
+	dsll	$tmp3,32
+	or	$tmp0,$tmp1
+	or	$tmp2,$tmp3
+
+	daddu	$in0,$tmp0		# accumulate nonce
+	daddu	$in1,$tmp2
+	sltu	$tmp0,$in0,$tmp0
+	daddu	$in1,$tmp0
+
+	dsrl	$tmp0,$in0,8		# write mac value
+	dsrl	$tmp1,$in0,16
+	dsrl	$tmp2,$in0,24
+	sb	$in0,0($mac)
+	dsrl	$tmp3,$in0,32
+	sb	$tmp0,1($mac)
+	dsrl	$tmp0,$in0,40
+	sb	$tmp1,2($mac)
+	dsrl	$tmp1,$in0,48
+	sb	$tmp2,3($mac)
+	dsrl	$tmp2,$in0,56
+	sb	$tmp3,4($mac)
+	dsrl	$tmp3,$in1,8
+	sb	$tmp0,5($mac)
+	dsrl	$tmp0,$in1,16
+	sb	$tmp1,6($mac)
+	dsrl	$tmp1,$in1,24
+	sb	$tmp2,7($mac)
+
+	sb	$in1,8($mac)
+	dsrl	$tmp2,$in1,32
+	sb	$tmp3,9($mac)
+	dsrl	$tmp3,$in1,40
+	sb	$tmp0,10($mac)
+	dsrl	$tmp0,$in1,48
+	sb	$tmp1,11($mac)
+	dsrl	$tmp1,$in1,56
+	sb	$tmp2,12($mac)
+	sb	$tmp3,13($mac)
+	sb	$tmp0,14($mac)
+	sb	$tmp1,15($mac)
+
+	jr	$ra
+.end	poly1305_emit
+.rdata
+.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
+.align	2
+___
+}
+}}} else {{{
+######################################################################
+# 32-bit code path
+#
+
+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
+   ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
+
+$code.=<<___;
+#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
+     defined(_MIPS_ARCH_MIPS32R6)) \\
+     && !defined(_MIPS_ARCH_MIPS32R2)
+# define _MIPS_ARCH_MIPS32R2
+#endif
+
+#if defined(_MIPS_ARCH_MIPS32R6)
+# define multu(rs,rt)
+# define mflo(rd,rs,rt)	mulu	rd,rs,rt
+# define mfhi(rd,rs,rt)	muhu	rd,rs,rt
+#else
+# define multu(rs,rt)	multu	rs,rt
+# define mflo(rd,rs,rt)	mflo	rd
+# define mfhi(rd,rs,rt)	mfhi	rd
+#endif
+
+#ifdef	__KERNEL__
+# define poly1305_init   poly1305_block_init
+#endif
+
+#if defined(__MIPSEB__) && !defined(MIPSEB)
+# define MIPSEB
+#endif
+
+#ifdef MIPSEB
+# define MSB 0
+# define LSB 3
+#else
+# define MSB 3
+# define LSB 0
+#endif
+
+.text
+.set	noat
+.set	noreorder
+
+.align	5
+.globl	poly1305_init
+.ent	poly1305_init
+poly1305_init:
+	.frame	$sp,0,$ra
+	.set	reorder
+
+	sw	$zero,0($ctx)
+	sw	$zero,4($ctx)
+	sw	$zero,8($ctx)
+	sw	$zero,12($ctx)
+	sw	$zero,16($ctx)
+
+	beqz	$inp,.Lno_key
+
+#if defined(_MIPS_ARCH_MIPS32R6)
+	andi	$tmp0,$inp,3		# $inp % 4
+	subu	$inp,$inp,$tmp0		# align $inp
+	sll	$tmp0,$tmp0,3		# byte to bit offset
+	lw	$in0,0($inp)
+	lw	$in1,4($inp)
+	lw	$in2,8($inp)
+	lw	$in3,12($inp)
+	beqz	$tmp0,.Laligned_key
+
+	lw	$tmp2,16($inp)
+	subu	$tmp1,$zero,$tmp0
+# ifdef	MIPSEB
+	sllv	$in0,$in0,$tmp0
+	srlv	$tmp3,$in1,$tmp1
+	sllv	$in1,$in1,$tmp0
+	or	$in0,$in0,$tmp3
+	srlv	$tmp3,$in2,$tmp1
+	sllv	$in2,$in2,$tmp0
+	or	$in1,$in1,$tmp3
+	srlv	$tmp3,$in3,$tmp1
+	sllv	$in3,$in3,$tmp0
+	or	$in2,$in2,$tmp3
+	srlv	$tmp2,$tmp2,$tmp1
+	or	$in3,$in3,$tmp2
+# else
+	srlv	$in0,$in0,$tmp0
+	sllv	$tmp3,$in1,$tmp1
+	srlv	$in1,$in1,$tmp0
+	or	$in0,$in0,$tmp3
+	sllv	$tmp3,$in2,$tmp1
+	srlv	$in2,$in2,$tmp0
+	or	$in1,$in1,$tmp3
+	sllv	$tmp3,$in3,$tmp1
+	srlv	$in3,$in3,$tmp0
+	or	$in2,$in2,$tmp3
+	sllv	$tmp2,$tmp2,$tmp1
+	or	$in3,$in3,$tmp2
+# endif
+.Laligned_key:
+#else
+	lwl	$in0,0+MSB($inp)
+	lwl	$in1,4+MSB($inp)
+	lwl	$in2,8+MSB($inp)
+	lwl	$in3,12+MSB($inp)
+	lwr	$in0,0+LSB($inp)
+	lwr	$in1,4+LSB($inp)
+	lwr	$in2,8+LSB($inp)
+	lwr	$in3,12+LSB($inp)
+#endif
+#ifdef	MIPSEB
+# if defined(_MIPS_ARCH_MIPS32R2)
+	wsbh	$in0,$in0		# byte swap
+	wsbh	$in1,$in1
+	wsbh	$in2,$in2
+	wsbh	$in3,$in3
+	rotr	$in0,$in0,16
+	rotr	$in1,$in1,16
+	rotr	$in2,$in2,16
+	rotr	$in3,$in3,16
+# else
+	srl	$tmp0,$in0,24		# byte swap
+	srl	$tmp1,$in0,8
+	andi	$tmp2,$in0,0xFF00
+	sll	$in0,$in0,24
+	andi	$tmp1,0xFF00
+	sll	$tmp2,$tmp2,8
+	or	$in0,$tmp0
+	 srl	$tmp0,$in1,24
+	or	$tmp1,$tmp2
+	 srl	$tmp2,$in1,8
+	or	$in0,$tmp1
+	 andi	$tmp1,$in1,0xFF00
+	 sll	$in1,$in1,24
+	 andi	$tmp2,0xFF00
+	 sll	$tmp1,$tmp1,8
+	 or	$in1,$tmp0
+	srl	$tmp0,$in2,24
+	 or	$tmp2,$tmp1
+	srl	$tmp1,$in2,8
+	 or	$in1,$tmp2
+	andi	$tmp2,$in2,0xFF00
+	sll	$in2,$in2,24
+	andi	$tmp1,0xFF00
+	sll	$tmp2,$tmp2,8
+	or	$in2,$tmp0
+	 srl	$tmp0,$in3,24
+	or	$tmp1,$tmp2
+	 srl	$tmp2,$in3,8
+	or	$in2,$tmp1
+	 andi	$tmp1,$in3,0xFF00
+	 sll	$in3,$in3,24
+	 andi	$tmp2,0xFF00
+	 sll	$tmp1,$tmp1,8
+	 or	$in3,$tmp0
+	 or	$tmp2,$tmp1
+	 or	$in3,$tmp2
+# endif
+#endif
+	lui	$tmp0,0x0fff
+	ori	$tmp0,0xffff		# 0x0fffffff
+	and	$in0,$in0,$tmp0
+	subu	$tmp0,3			# 0x0ffffffc
+	and	$in1,$in1,$tmp0
+	and	$in2,$in2,$tmp0
+	and	$in3,$in3,$tmp0
+
+	sw	$in0,20($ctx)
+	sw	$in1,24($ctx)
+	sw	$in2,28($ctx)
+	sw	$in3,32($ctx)
+
+	srl	$tmp1,$in1,2
+	srl	$tmp2,$in2,2
+	srl	$tmp3,$in3,2
+	addu	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
+	addu	$in2,$in2,$tmp2
+	addu	$in3,$in3,$tmp3
+	sw	$in1,36($ctx)
+	sw	$in2,40($ctx)
+	sw	$in3,44($ctx)
+.Lno_key:
+	li	$v0,0
+	jr	$ra
+.end	poly1305_init
+___
+{
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
+
+my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
+   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
+my ($d0,$d1,$d2,$d3) =
+   ($a4,$a5,$a6,$a7);
+my $shr = $t2;		# used on R6
+my $one = $t2;		# used on R2
+
+$code.=<<___;
+.globl	poly1305_blocks
+.align	5
+.ent	poly1305_blocks
+poly1305_blocks:
+	.frame	$sp,16*4,$ra
+	.mask	$SAVED_REGS_MASK,-4
+	.set	noreorder
+	subu	$sp, $sp,4*12
+	sw	$s11,4*11($sp)
+	sw	$s10,4*10($sp)
+	sw	$s9, 4*9($sp)
+	sw	$s8, 4*8($sp)
+	sw	$s7, 4*7($sp)
+	sw	$s6, 4*6($sp)
+	sw	$s5, 4*5($sp)
+	sw	$s4, 4*4($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
+	sw	$s3, 4*3($sp)
+	sw	$s2, 4*2($sp)
+	sw	$s1, 4*1($sp)
+	sw	$s0, 4*0($sp)
+___
+$code.=<<___;
+	.set	reorder
+
+	srl	$len,4			# number of complete blocks
+	li	$one,1
+	beqz	$len,.Labort
+
+#if defined(_MIPS_ARCH_MIPS32R6)
+	andi	$shr,$inp,3
+	subu	$inp,$inp,$shr		# align $inp
+	sll	$shr,$shr,3		# byte to bit offset
+#endif
+
+	lw	$h0,0($ctx)		# load hash value
+	lw	$h1,4($ctx)
+	lw	$h2,8($ctx)
+	lw	$h3,12($ctx)
+	lw	$h4,16($ctx)
+
+	lw	$r0,20($ctx)		# load key
+	lw	$r1,24($ctx)
+	lw	$r2,28($ctx)
+	lw	$r3,32($ctx)
+	lw	$rs1,36($ctx)
+	lw	$rs2,40($ctx)
+	lw	$rs3,44($ctx)
+
+	sll	$len,4
+	addu	$len,$len,$inp		# end of buffer
+	b	.Loop
+
+.align	4
+.Loop:
+#if defined(_MIPS_ARCH_MIPS32R6)
+	lw	$d0,0($inp)		# load input
+	lw	$d1,4($inp)
+	lw	$d2,8($inp)
+	lw	$d3,12($inp)
+	beqz	$shr,.Laligned_inp
+
+	lw	$t0,16($inp)
+	subu	$t1,$zero,$shr
+# ifdef	MIPSEB
+	sllv	$d0,$d0,$shr
+	srlv	$at,$d1,$t1
+	sllv	$d1,$d1,$shr
+	or	$d0,$d0,$at
+	srlv	$at,$d2,$t1
+	sllv	$d2,$d2,$shr
+	or	$d1,$d1,$at
+	srlv	$at,$d3,$t1
+	sllv	$d3,$d3,$shr
+	or	$d2,$d2,$at
+	srlv	$t0,$t0,$t1
+	or	$d3,$d3,$t0
+# else
+	srlv	$d0,$d0,$shr
+	sllv	$at,$d1,$t1
+	srlv	$d1,$d1,$shr
+	or	$d0,$d0,$at
+	sllv	$at,$d2,$t1
+	srlv	$d2,$d2,$shr
+	or	$d1,$d1,$at
+	sllv	$at,$d3,$t1
+	srlv	$d3,$d3,$shr
+	or	$d2,$d2,$at
+	sllv	$t0,$t0,$t1
+	or	$d3,$d3,$t0
+# endif
+.Laligned_inp:
+#else
+	lwl	$d0,0+MSB($inp)		# load input
+	lwl	$d1,4+MSB($inp)
+	lwl	$d2,8+MSB($inp)
+	lwl	$d3,12+MSB($inp)
+	lwr	$d0,0+LSB($inp)
+	lwr	$d1,4+LSB($inp)
+	lwr	$d2,8+LSB($inp)
+	lwr	$d3,12+LSB($inp)
+#endif
+#ifdef	MIPSEB
+# if defined(_MIPS_ARCH_MIPS32R2)
+	wsbh	$d0,$d0			# byte swap
+	wsbh	$d1,$d1
+	wsbh	$d2,$d2
+	wsbh	$d3,$d3
+	rotr	$d0,$d0,16
+	rotr	$d1,$d1,16
+	rotr	$d2,$d2,16
+	rotr	$d3,$d3,16
+# else
+	srl	$at,$d0,24		# byte swap
+	srl	$t0,$d0,8
+	andi	$t1,$d0,0xFF00
+	sll	$d0,$d0,24
+	andi	$t0,0xFF00
+	sll	$t1,$t1,8
+	or	$d0,$at
+	 srl	$at,$d1,24
+	or	$t0,$t1
+	 srl	$t1,$d1,8
+	or	$d0,$t0
+	 andi	$t0,$d1,0xFF00
+	 sll	$d1,$d1,24
+	 andi	$t1,0xFF00
+	 sll	$t0,$t0,8
+	 or	$d1,$at
+	srl	$at,$d2,24
+	 or	$t1,$t0
+	srl	$t0,$d2,8
+	 or	$d1,$t1
+	andi	$t1,$d2,0xFF00
+	sll	$d2,$d2,24
+	andi	$t0,0xFF00
+	sll	$t1,$t1,8
+	or	$d2,$at
+	 srl	$at,$d3,24
+	or	$t0,$t1
+	 srl	$t1,$d3,8
+	or	$d2,$t0
+	 andi	$t0,$d3,0xFF00
+	 sll	$d3,$d3,24
+	 andi	$t1,0xFF00
+	 sll	$t0,$t0,8
+	 or	$d3,$at
+	 or	$t1,$t0
+	 or	$d3,$t1
+# endif
+#endif
+	srl	$t0,$h4,2		# modulo-scheduled reduction
+	andi	$h4,$h4,3
+	sll	$at,$t0,2
+
+	addu	$d0,$d0,$h0		# accumulate input
+	 addu	$t0,$t0,$at
+	sltu	$h0,$d0,$h0
+	addu	$d0,$d0,$t0		# ... and residue
+	sltu	$at,$d0,$t0
+
+	addu	$d1,$d1,$h1
+	 addu	$h0,$h0,$at		# carry
+	sltu	$h1,$d1,$h1
+	addu	$d1,$d1,$h0
+	sltu	$h0,$d1,$h0
+
+	addu	$d2,$d2,$h2
+	 addu	$h1,$h1,$h0		# carry
+	sltu	$h2,$d2,$h2
+	addu	$d2,$d2,$h1
+	sltu	$h1,$d2,$h1
+
+	addu	$d3,$d3,$h3
+	 addu	$h2,$h2,$h1		# carry
+	sltu	$h3,$d3,$h3
+	addu	$d3,$d3,$h2
+
+#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
+	multu	$r0,$d0			# d0*r0
+	 sltu	$h2,$d3,$h2
+	maddu	$rs3,$d1		# d1*s3
+	 addu	$h3,$h3,$h2		# carry
+	maddu	$rs2,$d2		# d2*s2
+	 addu	$h4,$h4,$padbit
+	maddu	$rs1,$d3		# d3*s1
+	 addu	$h4,$h4,$h3
+	mfhi	$at
+	mflo	$h0
+
+	multu	$r1,$d0			# d0*r1
+	maddu	$r0,$d1			# d1*r0
+	maddu	$rs3,$d2		# d2*s3
+	maddu	$rs2,$d3		# d3*s2
+	maddu	$rs1,$h4		# h4*s1
+	maddu	$at,$one		# hi*1
+	mfhi	$at
+	mflo	$h1
+
+	multu	$r2,$d0			# d0*r2
+	maddu	$r1,$d1			# d1*r1
+	maddu	$r0,$d2			# d2*r0
+	maddu	$rs3,$d3		# d3*s3
+	maddu	$rs2,$h4		# h4*s2
+	maddu	$at,$one		# hi*1
+	mfhi	$at
+	mflo	$h2
+
+	mul	$t0,$r0,$h4		# h4*r0
+
+	multu	$r3,$d0			# d0*r3
+	maddu	$r2,$d1			# d1*r2
+	maddu	$r1,$d2			# d2*r1
+	maddu	$r0,$d3			# d3*r0
+	maddu	$rs3,$h4		# h4*s3
+	maddu	$at,$one		# hi*1
+	mfhi	$at
+	mflo	$h3
+
+	 addiu	$inp,$inp,16
+
+	addu	$h4,$t0,$at
+#else
+	multu	($r0,$d0)		# d0*r0
+	mflo	($h0,$r0,$d0)
+	mfhi	($h1,$r0,$d0)
+
+	 sltu	$h2,$d3,$h2
+	 addu	$h3,$h3,$h2		# carry
+
+	multu	($rs3,$d1)		# d1*s3
+	mflo	($at,$rs3,$d1)
+	mfhi	($t0,$rs3,$d1)
+
+	 addu	$h4,$h4,$padbit
+	 addiu	$inp,$inp,16
+	 addu	$h4,$h4,$h3
+
+	multu	($rs2,$d2)		# d2*s2
+	mflo	($a3,$rs2,$d2)
+	mfhi	($t1,$rs2,$d2)
+	 addu	$h0,$h0,$at
+	 addu	$h1,$h1,$t0
+	multu	($rs1,$d3)		# d3*s1
+	 sltu	$at,$h0,$at
+	 addu	$h1,$h1,$at
+
+	mflo	($at,$rs1,$d3)
+	mfhi	($t0,$rs1,$d3)
+	 addu	$h0,$h0,$a3
+	 addu	$h1,$h1,$t1
+	multu	($r1,$d0)		# d0*r1
+	 sltu	$a3,$h0,$a3
+	 addu	$h1,$h1,$a3
+
+
+	mflo	($a3,$r1,$d0)
+	mfhi	($h2,$r1,$d0)
+	 addu	$h0,$h0,$at
+	 addu	$h1,$h1,$t0
+	multu	($r0,$d1)		# d1*r0
+	 sltu	$at,$h0,$at
+	 addu	$h1,$h1,$at
+
+	mflo	($at,$r0,$d1)
+	mfhi	($t0,$r0,$d1)
+	 addu	$h1,$h1,$a3
+	 sltu	$a3,$h1,$a3
+	multu	($rs3,$d2)		# d2*s3
+	 addu	$h2,$h2,$a3
+
+	mflo	($a3,$rs3,$d2)
+	mfhi	($t1,$rs3,$d2)
+	 addu	$h1,$h1,$at
+	 addu	$h2,$h2,$t0
+	multu	($rs2,$d3)		# d3*s2
+	 sltu	$at,$h1,$at
+	 addu	$h2,$h2,$at
+
+	mflo	($at,$rs2,$d3)
+	mfhi	($t0,$rs2,$d3)
+	 addu	$h1,$h1,$a3
+	 addu	$h2,$h2,$t1
+	multu	($rs1,$h4)		# h4*s1
+	 sltu	$a3,$h1,$a3
+	 addu	$h2,$h2,$a3
+
+	mflo	($a3,$rs1,$h4)
+	 addu	$h1,$h1,$at
+	 addu	$h2,$h2,$t0
+	multu	($r2,$d0)		# d0*r2
+	 sltu	$at,$h1,$at
+	 addu	$h2,$h2,$at
+
+
+	mflo	($at,$r2,$d0)
+	mfhi	($h3,$r2,$d0)
+	 addu	$h1,$h1,$a3
+	 sltu	$a3,$h1,$a3
+	multu	($r1,$d1)		# d1*r1
+	 addu	$h2,$h2,$a3
+
+	mflo	($a3,$r1,$d1)
+	mfhi	($t1,$r1,$d1)
+	 addu	$h2,$h2,$at
+	 sltu	$at,$h2,$at
+	multu	($r0,$d2)		# d2*r0
+	 addu	$h3,$h3,$at
+
+	mflo	($at,$r0,$d2)
+	mfhi	($t0,$r0,$d2)
+	 addu	$h2,$h2,$a3
+	 addu	$h3,$h3,$t1
+	multu	($rs3,$d3)		# d3*s3
+	 sltu	$a3,$h2,$a3
+	 addu	$h3,$h3,$a3
+
+	mflo	($a3,$rs3,$d3)
+	mfhi	($t1,$rs3,$d3)
+	 addu	$h2,$h2,$at
+	 addu	$h3,$h3,$t0
+	multu	($rs2,$h4)		# h4*s2
+	 sltu	$at,$h2,$at
+	 addu	$h3,$h3,$at
+
+	mflo	($at,$rs2,$h4)
+	 addu	$h2,$h2,$a3
+	 addu	$h3,$h3,$t1
+	multu	($r3,$d0)		# d0*r3
+	 sltu	$a3,$h2,$a3
+	 addu	$h3,$h3,$a3
+
+
+	mflo	($a3,$r3,$d0)
+	mfhi	($t1,$r3,$d0)
+	 addu	$h2,$h2,$at
+	 sltu	$at,$h2,$at
+	multu	($r2,$d1)		# d1*r2
+	 addu	$h3,$h3,$at
+
+	mflo	($at,$r2,$d1)
+	mfhi	($t0,$r2,$d1)
+	 addu	$h3,$h3,$a3
+	 sltu	$a3,$h3,$a3
+	multu	($r0,$d3)		# d3*r0
+	 addu	$t1,$t1,$a3
+
+	mflo	($a3,$r0,$d3)
+	mfhi	($d3,$r0,$d3)
+	 addu	$h3,$h3,$at
+	 addu	$t1,$t1,$t0
+	multu	($r1,$d2)		# d2*r1
+	 sltu	$at,$h3,$at
+	 addu	$t1,$t1,$at
+
+	mflo	($at,$r1,$d2)
+	mfhi	($t0,$r1,$d2)
+	 addu	$h3,$h3,$a3
+	 addu	$t1,$t1,$d3
+	multu	($rs3,$h4)		# h4*s3
+	 sltu	$a3,$h3,$a3
+	 addu	$t1,$t1,$a3
+
+	mflo	($a3,$rs3,$h4)
+	 addu	$h3,$h3,$at
+	 addu	$t1,$t1,$t0
+	multu	($r0,$h4)		# h4*r0
+	 sltu	$at,$h3,$at
+	 addu	$t1,$t1,$at
+
+
+	mflo	($h4,$r0,$h4)
+	 addu	$h3,$h3,$a3
+	 sltu	$a3,$h3,$a3
+	 addu	$t1,$t1,$a3
+	addu	$h4,$h4,$t1
+
+	li	$padbit,1		# if we loop, padbit is 1
+#endif
+	bne	$inp,$len,.Loop
+
+	sw	$h0,0($ctx)		# store hash value
+	sw	$h1,4($ctx)
+	sw	$h2,8($ctx)
+	sw	$h3,12($ctx)
+	sw	$h4,16($ctx)
+
+	.set	noreorder
+.Labort:
+	lw	$s11,4*11($sp)
+	lw	$s10,4*10($sp)
+	lw	$s9, 4*9($sp)
+	lw	$s8, 4*8($sp)
+	lw	$s7, 4*7($sp)
+	lw	$s6, 4*6($sp)
+	lw	$s5, 4*5($sp)
+	lw	$s4, 4*4($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
+	lw	$s3, 4*3($sp)
+	lw	$s2, 4*2($sp)
+	lw	$s1, 4*1($sp)
+	lw	$s0, 4*0($sp)
+___
+$code.=<<___;
+	jr	$ra
+	addu	$sp,$sp,4*12
+.end	poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
+
+$code.=<<___;
+.align	5
+.globl	poly1305_emit
+.ent	poly1305_emit
+poly1305_emit:
+	.frame	$sp,0,$ra
+	.set	reorder
+
+	lw	$tmp4,16($ctx)
+	lw	$tmp0,0($ctx)
+	lw	$tmp1,4($ctx)
+	lw	$tmp2,8($ctx)
+	lw	$tmp3,12($ctx)
+
+	li	$in0,-4			# final reduction
+	srl	$ctx,$tmp4,2
+	and	$in0,$in0,$tmp4
+	andi	$tmp4,$tmp4,3
+	addu	$ctx,$ctx,$in0
+
+	addu	$tmp0,$tmp0,$ctx
+	sltu	$ctx,$tmp0,$ctx
+	 addiu	$in0,$tmp0,5		# compare to modulus
+	addu	$tmp1,$tmp1,$ctx
+	 sltiu	$in1,$in0,5
+	sltu	$ctx,$tmp1,$ctx
+	 addu	$in1,$in1,$tmp1
+	addu	$tmp2,$tmp2,$ctx
+	 sltu	$in2,$in1,$tmp1
+	sltu	$ctx,$tmp2,$ctx
+	 addu	$in2,$in2,$tmp2
+	addu	$tmp3,$tmp3,$ctx
+	 sltu	$in3,$in2,$tmp2
+	sltu	$ctx,$tmp3,$ctx
+	 addu	$in3,$in3,$tmp3
+	addu	$tmp4,$tmp4,$ctx
+	 sltu	$ctx,$in3,$tmp3
+	 addu	$ctx,$tmp4
+
+	srl	$ctx,2			# see if it carried/borrowed
+	subu	$ctx,$zero,$ctx
+
+	xor	$in0,$tmp0
+	xor	$in1,$tmp1
+	xor	$in2,$tmp2
+	xor	$in3,$tmp3
+	and	$in0,$ctx
+	and	$in1,$ctx
+	and	$in2,$ctx
+	and	$in3,$ctx
+	xor	$in0,$tmp0
+	xor	$in1,$tmp1
+	xor	$in2,$tmp2
+	xor	$in3,$tmp3
+
+	lw	$tmp0,0($nonce)		# load nonce
+	lw	$tmp1,4($nonce)
+	lw	$tmp2,8($nonce)
+	lw	$tmp3,12($nonce)
+
+	addu	$in0,$tmp0		# accumulate nonce
+	sltu	$ctx,$in0,$tmp0
+
+	addu	$in1,$tmp1
+	sltu	$tmp1,$in1,$tmp1
+	addu	$in1,$ctx
+	sltu	$ctx,$in1,$ctx
+	addu	$ctx,$tmp1
+
+	addu	$in2,$tmp2
+	sltu	$tmp2,$in2,$tmp2
+	addu	$in2,$ctx
+	sltu	$ctx,$in2,$ctx
+	addu	$ctx,$tmp2
+
+	addu	$in3,$tmp3
+	addu	$in3,$ctx
+
+	srl	$tmp0,$in0,8		# write mac value
+	srl	$tmp1,$in0,16
+	srl	$tmp2,$in0,24
+	sb	$in0, 0($mac)
+	sb	$tmp0,1($mac)
+	srl	$tmp0,$in1,8
+	sb	$tmp1,2($mac)
+	srl	$tmp1,$in1,16
+	sb	$tmp2,3($mac)
+	srl	$tmp2,$in1,24
+	sb	$in1, 4($mac)
+	sb	$tmp0,5($mac)
+	srl	$tmp0,$in2,8
+	sb	$tmp1,6($mac)
+	srl	$tmp1,$in2,16
+	sb	$tmp2,7($mac)
+	srl	$tmp2,$in2,24
+	sb	$in2, 8($mac)
+	sb	$tmp0,9($mac)
+	srl	$tmp0,$in3,8
+	sb	$tmp1,10($mac)
+	srl	$tmp1,$in3,16
+	sb	$tmp2,11($mac)
+	srl	$tmp2,$in3,24
+	sb	$in3, 12($mac)
+	sb	$tmp0,13($mac)
+	sb	$tmp1,14($mac)
+	sb	$tmp2,15($mac)
+
+	jr	$ra
+.end	poly1305_emit
+.rdata
+.asciiz	"Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
+.align	2
+___
+}
+}}}
+
+$output=pop and open STDOUT,">$output";
+print $code;
+close STDOUT;
diff --git a/lib/crypto/mips/poly1305.h b/lib/crypto/mips/poly1305.h
new file mode 100644
index 000000000000..85de450f1a93
--- /dev/null
+++ b/lib/crypto/mips/poly1305.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+asmlinkage void poly1305_block_init(struct poly1305_block_state *state,
+				    const u8 raw_key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks(struct poly1305_block_state *state,
+				const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit(const struct poly1305_state *state,
+			      u8 digest[POLY1305_DIGEST_SIZE],
+			      const u32 nonce[4]);
diff --git a/lib/crypto/mips/sha1.h b/lib/crypto/mips/sha1.h
new file mode 100644
index 000000000000..ba1965002e4a
--- /dev/null
+++ b/lib/crypto/mips/sha1.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Cryptographic API.
+ *
+ * SHA1 Secure Hash Algorithm.
+ *
+ * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
+ *
+ * Based on crypto/sha1_generic.c, which is:
+ *
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ */
+
+#include <asm/octeon/crypto.h>
+#include <asm/octeon/octeon.h>
+
+/*
+ * We pass everything as 64-bit. OCTEON can handle misaligned data.
+ */
+
+static void octeon_sha1_store_hash(struct sha1_block_state *state)
+{
+	u64 *hash = (u64 *)&state->h[0];
+	union {
+		u32 word[2];
+		u64 dword;
+	} hash_tail = { { state->h[4], } };
+
+	write_octeon_64bit_hash_dword(hash[0], 0);
+	write_octeon_64bit_hash_dword(hash[1], 1);
+	write_octeon_64bit_hash_dword(hash_tail.dword, 2);
+	memzero_explicit(&hash_tail.word[0], sizeof(hash_tail.word[0]));
+}
+
+static void octeon_sha1_read_hash(struct sha1_block_state *state)
+{
+	u64 *hash = (u64 *)&state->h[0];
+	union {
+		u32 word[2];
+		u64 dword;
+	} hash_tail;
+
+	hash[0]		= read_octeon_64bit_hash_dword(0);
+	hash[1]		= read_octeon_64bit_hash_dword(1);
+	hash_tail.dword	= read_octeon_64bit_hash_dword(2);
+	state->h[4]	= hash_tail.word[0];
+	memzero_explicit(&hash_tail.dword, sizeof(hash_tail.dword));
+}
+
+static void sha1_blocks(struct sha1_block_state *state,
+			const u8 *data, size_t nblocks)
+{
+	struct octeon_cop2_state cop2_state;
+	unsigned long flags;
+
+	if (!octeon_has_crypto())
+		return sha1_blocks_generic(state, data, nblocks);
+
+	flags = octeon_crypto_enable(&cop2_state);
+	octeon_sha1_store_hash(state);
+
+	do {
+		const u64 *block = (const u64 *)data;
+
+		write_octeon_64bit_block_dword(block[0], 0);
+		write_octeon_64bit_block_dword(block[1], 1);
+		write_octeon_64bit_block_dword(block[2], 2);
+		write_octeon_64bit_block_dword(block[3], 3);
+		write_octeon_64bit_block_dword(block[4], 4);
+		write_octeon_64bit_block_dword(block[5], 5);
+		write_octeon_64bit_block_dword(block[6], 6);
+		octeon_sha1_start(block[7]);
+
+		data += SHA1_BLOCK_SIZE;
+	} while (--nblocks);
+
+	octeon_sha1_read_hash(state);
+	octeon_crypto_disable(&cop2_state, flags);
+}
diff --git a/lib/crypto/mips/sha256.h b/lib/crypto/mips/sha256.h
new file mode 100644
index 000000000000..ccccfd131634
--- /dev/null
+++ b/lib/crypto/mips/sha256.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 Secure Hash Algorithm.
+ *
+ * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
+ *
+ * Based on crypto/sha256_generic.c, which is:
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
+ */
+
+#include <asm/octeon/crypto.h>
+#include <asm/octeon/octeon.h>
+
+/*
+ * We pass everything as 64-bit. OCTEON can handle misaligned data.
+ */
+
+static void sha256_blocks(struct sha256_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	struct octeon_cop2_state cop2_state;
+	u64 *state64 = (u64 *)state;
+	unsigned long flags;
+
+	if (!octeon_has_crypto())
+		return sha256_blocks_generic(state, data, nblocks);
+
+	flags = octeon_crypto_enable(&cop2_state);
+	write_octeon_64bit_hash_dword(state64[0], 0);
+	write_octeon_64bit_hash_dword(state64[1], 1);
+	write_octeon_64bit_hash_dword(state64[2], 2);
+	write_octeon_64bit_hash_dword(state64[3], 3);
+
+	do {
+		const u64 *block = (const u64 *)data;
+
+		write_octeon_64bit_block_dword(block[0], 0);
+		write_octeon_64bit_block_dword(block[1], 1);
+		write_octeon_64bit_block_dword(block[2], 2);
+		write_octeon_64bit_block_dword(block[3], 3);
+		write_octeon_64bit_block_dword(block[4], 4);
+		write_octeon_64bit_block_dword(block[5], 5);
+		write_octeon_64bit_block_dword(block[6], 6);
+		octeon_sha256_start(block[7]);
+
+		data += SHA256_BLOCK_SIZE;
+	} while (--nblocks);
+
+	state64[0] = read_octeon_64bit_hash_dword(0);
+	state64[1] = read_octeon_64bit_hash_dword(1);
+	state64[2] = read_octeon_64bit_hash_dword(2);
+	state64[3] = read_octeon_64bit_hash_dword(3);
+	octeon_crypto_disable(&cop2_state, flags);
+}
diff --git a/lib/crypto/mips/sha512.h b/lib/crypto/mips/sha512.h
new file mode 100644
index 000000000000..b3ffbc1e8ca8
--- /dev/null
+++ b/lib/crypto/mips/sha512.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Cryptographic API.
+ *
+ * SHA-512 and SHA-384 Secure Hash Algorithm.
+ *
+ * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
+ *
+ * Based on crypto/sha512_generic.c, which is:
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2003 Kyle McMartin <kyle@debian.org>
+ */
+
+#include <asm/octeon/crypto.h>
+#include <asm/octeon/octeon.h>
+
+/*
+ * We pass everything as 64-bit. OCTEON can handle misaligned data.
+ */
+
+static void sha512_blocks(struct sha512_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	struct octeon_cop2_state cop2_state;
+	unsigned long flags;
+
+	if (!octeon_has_crypto())
+		return sha512_blocks_generic(state, data, nblocks);
+
+	flags = octeon_crypto_enable(&cop2_state);
+	write_octeon_64bit_hash_sha512(state->h[0], 0);
+	write_octeon_64bit_hash_sha512(state->h[1], 1);
+	write_octeon_64bit_hash_sha512(state->h[2], 2);
+	write_octeon_64bit_hash_sha512(state->h[3], 3);
+	write_octeon_64bit_hash_sha512(state->h[4], 4);
+	write_octeon_64bit_hash_sha512(state->h[5], 5);
+	write_octeon_64bit_hash_sha512(state->h[6], 6);
+	write_octeon_64bit_hash_sha512(state->h[7], 7);
+
+	do {
+		const u64 *block = (const u64 *)data;
+
+		write_octeon_64bit_block_sha512(block[0], 0);
+		write_octeon_64bit_block_sha512(block[1], 1);
+		write_octeon_64bit_block_sha512(block[2], 2);
+		write_octeon_64bit_block_sha512(block[3], 3);
+		write_octeon_64bit_block_sha512(block[4], 4);
+		write_octeon_64bit_block_sha512(block[5], 5);
+		write_octeon_64bit_block_sha512(block[6], 6);
+		write_octeon_64bit_block_sha512(block[7], 7);
+		write_octeon_64bit_block_sha512(block[8], 8);
+		write_octeon_64bit_block_sha512(block[9], 9);
+		write_octeon_64bit_block_sha512(block[10], 10);
+		write_octeon_64bit_block_sha512(block[11], 11);
+		write_octeon_64bit_block_sha512(block[12], 12);
+		write_octeon_64bit_block_sha512(block[13], 13);
+		write_octeon_64bit_block_sha512(block[14], 14);
+		octeon_sha512_start(block[15]);
+
+		data += SHA512_BLOCK_SIZE;
+	} while (--nblocks);
+
+	state->h[0] = read_octeon_64bit_hash_sha512(0);
+	state->h[1] = read_octeon_64bit_hash_sha512(1);
+	state->h[2] = read_octeon_64bit_hash_sha512(2);
+	state->h[3] = read_octeon_64bit_hash_sha512(3);
+	state->h[4] = read_octeon_64bit_hash_sha512(4);
+	state->h[5] = read_octeon_64bit_hash_sha512(5);
+	state->h[6] = read_octeon_64bit_hash_sha512(6);
+	state->h[7] = read_octeon_64bit_hash_sha512(7);
+	octeon_crypto_disable(&cop2_state, flags);
+}
diff --git a/lib/crypto/mpi/Makefile b/lib/crypto/mpi/Makefile
new file mode 100644
index 000000000000..9ad84079025a
--- /dev/null
+++ b/lib/crypto/mpi/Makefile
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# MPI multiprecision maths library (from gpg)
+#
+
+obj-$(CONFIG_MPILIB) = mpi.o
+
+mpi-y = \
+	generic_mpih-lshift.o		\
+	generic_mpih-mul1.o		\
+	generic_mpih-mul2.o		\
+	generic_mpih-mul3.o		\
+	generic_mpih-rshift.o		\
+	generic_mpih-sub1.o		\
+	generic_mpih-add1.o		\
+	mpicoder.o			\
+	mpi-add.o			\
+	mpi-bit.o			\
+	mpi-cmp.o			\
+	mpi-sub-ui.o			\
+	mpi-div.o			\
+	mpi-mod.o			\
+	mpi-mul.o			\
+	mpih-cmp.o			\
+	mpih-div.o			\
+	mpih-mul.o			\
+	mpi-pow.o			\
+	mpiutil.o
diff --git a/lib/crypto/mpi/generic_mpih-add1.c b/lib/crypto/mpi/generic_mpih-add1.c
new file mode 100644
index 000000000000..299308b5461c
--- /dev/null
+++ b/lib/crypto/mpi/generic_mpih-add1.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpihelp-add_1.c  -  MPI helper functions
+ * Copyright (C) 1994, 1996, 1997, 1998,
+ *               2000 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+mpi_limb_t
+mpihelp_add_n(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+	      mpi_ptr_t s2_ptr, mpi_size_t size)
+{
+	mpi_limb_t x, y, cy;
+	mpi_size_t j;
+
+	/* The loop counter and index J goes from -SIZE to -1.  This way
+	   the loop becomes faster.  */
+	j = -size;
+
+	/* Offset the base pointers to compensate for the negative indices. */
+	s1_ptr -= j;
+	s2_ptr -= j;
+	res_ptr -= j;
+
+	cy = 0;
+	do {
+		y = s2_ptr[j];
+		x = s1_ptr[j];
+		y += cy;	/* add previous carry to one addend */
+		cy = y < cy;	/* get out carry from that addition */
+		y += x;		/* add other addend */
+		cy += y < x;	/* get out carry from that add, combine */
+		res_ptr[j] = y;
+	} while (++j);
+
+	return cy;
+}
diff --git a/lib/crypto/mpi/generic_mpih-lshift.c b/lib/crypto/mpi/generic_mpih-lshift.c
new file mode 100644
index 000000000000..7b21f5938a50
--- /dev/null
+++ b/lib/crypto/mpi/generic_mpih-lshift.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpihelp-lshift.c  -	MPI helper functions
+ * Copyright (C) 1994, 1996, 1998, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+
+/* Shift U (pointed to by UP and USIZE digits long) CNT bits to the left
+ * and store the USIZE least significant digits of the result at WP.
+ * Return the bits shifted out from the most significant digit.
+ *
+ * Argument constraints:
+ * 1. 0 < CNT < BITS_PER_MP_LIMB
+ * 2. If the result is to be written over the input, WP must be >= UP.
+ */
+
+mpi_limb_t
+mpihelp_lshift(mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize, unsigned int cnt)
+{
+	mpi_limb_t high_limb, low_limb;
+	unsigned sh_1, sh_2;
+	mpi_size_t i;
+	mpi_limb_t retval;
+
+	sh_1 = cnt;
+	wp += 1;
+	sh_2 = BITS_PER_MPI_LIMB - sh_1;
+	i = usize - 1;
+	low_limb = up[i];
+	retval = low_limb >> sh_2;
+	high_limb = low_limb;
+	while (--i >= 0) {
+		low_limb = up[i];
+		wp[i] = (high_limb << sh_1) | (low_limb >> sh_2);
+		high_limb = low_limb;
+	}
+	wp[i] = high_limb << sh_1;
+
+	return retval;
+}
diff --git a/lib/crypto/mpi/generic_mpih-mul1.c b/lib/crypto/mpi/generic_mpih-mul1.c
new file mode 100644
index 000000000000..e020e61d47b9
--- /dev/null
+++ b/lib/crypto/mpi/generic_mpih-mul1.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpihelp-mul_1.c  -  MPI helper functions
+ * Copyright (C) 1994, 1996, 1997, 1998, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+mpi_limb_t
+mpihelp_mul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size,
+	      mpi_limb_t s2_limb)
+{
+	mpi_limb_t cy_limb;
+	mpi_size_t j;
+	mpi_limb_t prod_high, prod_low;
+
+	/* The loop counter and index J goes from -S1_SIZE to -1.  This way
+	 * the loop becomes faster.  */
+	j = -s1_size;
+
+	/* Offset the base pointers to compensate for the negative indices.  */
+	s1_ptr -= j;
+	res_ptr -= j;
+
+	cy_limb = 0;
+	do {
+		umul_ppmm(prod_high, prod_low, s1_ptr[j], s2_limb);
+		prod_low += cy_limb;
+		cy_limb = (prod_low < cy_limb ? 1 : 0) + prod_high;
+		res_ptr[j] = prod_low;
+	} while (++j);
+
+	return cy_limb;
+}
diff --git a/lib/crypto/mpi/generic_mpih-mul2.c b/lib/crypto/mpi/generic_mpih-mul2.c
new file mode 100644
index 000000000000..9484d8528243
--- /dev/null
+++ b/lib/crypto/mpi/generic_mpih-mul2.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpihelp-mul_2.c  -  MPI helper functions
+ * Copyright (C) 1994, 1996, 1997, 1998, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+mpi_limb_t
+mpihelp_addmul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+		 mpi_size_t s1_size, mpi_limb_t s2_limb)
+{
+	mpi_limb_t cy_limb;
+	mpi_size_t j;
+	mpi_limb_t prod_high, prod_low;
+	mpi_limb_t x;
+
+	/* The loop counter and index J goes from -SIZE to -1.  This way
+	 * the loop becomes faster.  */
+	j = -s1_size;
+	res_ptr -= j;
+	s1_ptr -= j;
+
+	cy_limb = 0;
+	do {
+		umul_ppmm(prod_high, prod_low, s1_ptr[j], s2_limb);
+
+		prod_low += cy_limb;
+		cy_limb = (prod_low < cy_limb ? 1 : 0) + prod_high;
+
+		x = res_ptr[j];
+		prod_low = x + prod_low;
+		cy_limb += prod_low < x ? 1 : 0;
+		res_ptr[j] = prod_low;
+	} while (++j);
+	return cy_limb;
+}
diff --git a/lib/crypto/mpi/generic_mpih-mul3.c b/lib/crypto/mpi/generic_mpih-mul3.c
new file mode 100644
index 000000000000..ccdbab4121e0
--- /dev/null
+++ b/lib/crypto/mpi/generic_mpih-mul3.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpihelp-mul_3.c  -  MPI helper functions
+ * Copyright (C) 1994, 1996, 1997, 1998, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+mpi_limb_t
+mpihelp_submul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+		 mpi_size_t s1_size, mpi_limb_t s2_limb)
+{
+	mpi_limb_t cy_limb;
+	mpi_size_t j;
+	mpi_limb_t prod_high, prod_low;
+	mpi_limb_t x;
+
+	/* The loop counter and index J goes from -SIZE to -1.  This way
+	 * the loop becomes faster.  */
+	j = -s1_size;
+	res_ptr -= j;
+	s1_ptr -= j;
+
+	cy_limb = 0;
+	do {
+		umul_ppmm(prod_high, prod_low, s1_ptr[j], s2_limb);
+
+		prod_low += cy_limb;
+		cy_limb = (prod_low < cy_limb ? 1 : 0) + prod_high;
+
+		x = res_ptr[j];
+		prod_low = x - prod_low;
+		cy_limb += prod_low > x ? 1 : 0;
+		res_ptr[j] = prod_low;
+	} while (++j);
+
+	return cy_limb;
+}
diff --git a/lib/crypto/mpi/generic_mpih-rshift.c b/lib/crypto/mpi/generic_mpih-rshift.c
new file mode 100644
index 000000000000..e07bc69aa898
--- /dev/null
+++ b/lib/crypto/mpi/generic_mpih-rshift.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpih-rshift.c  -  MPI helper functions
+ * Copyright (C) 1994, 1996, 1998, 1999,
+ *               2000, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GNUPG
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+
+/* Shift U (pointed to by UP and USIZE limbs long) CNT bits to the right
+ * and store the USIZE least significant limbs of the result at WP.
+ * The bits shifted out to the right are returned.
+ *
+ * Argument constraints:
+ * 1. 0 < CNT < BITS_PER_MP_LIMB
+ * 2. If the result is to be written over the input, WP must be <= UP.
+ */
+
+mpi_limb_t
+mpihelp_rshift(mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize, unsigned cnt)
+{
+	mpi_limb_t high_limb, low_limb;
+	unsigned sh_1, sh_2;
+	mpi_size_t i;
+	mpi_limb_t retval;
+
+	sh_1 = cnt;
+	wp -= 1;
+	sh_2 = BITS_PER_MPI_LIMB - sh_1;
+	high_limb = up[0];
+	retval = high_limb << sh_2;
+	low_limb = high_limb;
+	for (i = 1; i < usize; i++) {
+		high_limb = up[i];
+		wp[i] = (low_limb >> sh_1) | (high_limb << sh_2);
+		low_limb = high_limb;
+	}
+	wp[i] = low_limb >> sh_1;
+
+	return retval;
+}
diff --git a/lib/crypto/mpi/generic_mpih-sub1.c b/lib/crypto/mpi/generic_mpih-sub1.c
new file mode 100644
index 000000000000..eea4382aad5f
--- /dev/null
+++ b/lib/crypto/mpi/generic_mpih-sub1.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpihelp-add_2.c  -  MPI helper functions
+ * Copyright (C) 1994, 1996, 1997, 1998, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+mpi_limb_t
+mpihelp_sub_n(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+	      mpi_ptr_t s2_ptr, mpi_size_t size)
+{
+	mpi_limb_t x, y, cy;
+	mpi_size_t j;
+
+	/* The loop counter and index J goes from -SIZE to -1.  This way
+	   the loop becomes faster.  */
+	j = -size;
+
+	/* Offset the base pointers to compensate for the negative indices.  */
+	s1_ptr -= j;
+	s2_ptr -= j;
+	res_ptr -= j;
+
+	cy = 0;
+	do {
+		y = s2_ptr[j];
+		x = s1_ptr[j];
+		y += cy;	/* add previous carry to subtrahend */
+		cy = y < cy;	/* get out carry from that addition */
+		y = x - y;	/* main subtract */
+		cy += y > x;	/* get out carry from the subtract, combine */
+		res_ptr[j] = y;
+	} while (++j);
+
+	return cy;
+}
diff --git a/lib/crypto/mpi/longlong.h b/lib/crypto/mpi/longlong.h
new file mode 100644
index 000000000000..b6fa1d08fb55
--- /dev/null
+++ b/lib/crypto/mpi/longlong.h
@@ -0,0 +1,1361 @@
+/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
+ * Note: I added some stuff for use with gnupg
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994, 1996, 1998,
+ *	2000, 2001, 2002, 2003 Free Software Foundation, Inc.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Library General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public License
+ * along with this file; see the file COPYING.LIB.  If not, write to
+ * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ * MA 02111-1307, USA. */
+
+#include <linux/count_zeros.h>
+
+/* You have to define the following before including this file:
+ *
+ * UWtype -- An unsigned type, default type for operations (typically a "word")
+ * UHWtype -- An unsigned type, at least half the size of UWtype.
+ * UDWtype -- An unsigned type, at least twice as large a UWtype
+ * W_TYPE_SIZE -- size in bits of UWtype
+ *
+ * SItype, USItype -- Signed and unsigned 32 bit types.
+ * DItype, UDItype -- Signed and unsigned 64 bit types.
+ *
+ * On a 32 bit machine UWtype should typically be USItype;
+ * on a 64 bit machine, UWtype should typically be UDItype.
+*/
+
+#define __BITS4 (W_TYPE_SIZE / 4)
+#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
+#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
+#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
+
+/* This is used to make sure no undesirable sharing between different libraries
+	that use this file takes place.  */
+#ifndef __MPN
+#define __MPN(x) __##x
+#endif
+
+/* Define auxiliary asm macros.
+ *
+ * 1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
+ * UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
+ * word product in HIGH_PROD and LOW_PROD.
+ *
+ * 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
+ * UDWtype product.  This is just a variant of umul_ppmm.
+
+ * 3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
+ * denominator) divides a UDWtype, composed by the UWtype integers
+ * HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
+ * in QUOTIENT and the remainder in REMAINDER.	HIGH_NUMERATOR must be less
+ * than DENOMINATOR for correct operation.  If, in addition, the most
+ * significant bit of DENOMINATOR must be 1, then the pre-processor symbol
+ * UDIV_NEEDS_NORMALIZATION is defined to 1.
+ * 4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
+ * denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
+ * is rounded towards 0.
+ *
+ * 5) count_leading_zeros(count, x) counts the number of zero-bits from the
+ * msb to the first non-zero bit in the UWtype X.  This is the number of
+ * steps X needs to be shifted left to set the msb.  Undefined for X == 0,
+ * unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
+ *
+ * 6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
+ * from the least significant end.
+ *
+ * 7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
+ * high_addend_2, low_addend_2) adds two UWtype integers, composed by
+ * HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
+ * respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
+ * (i.e. carry out) is not stored anywhere, and is lost.
+ *
+ * 8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
+ * high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
+ * composed by HIGH_MINUEND_1 and LOW_MINUEND_1, and HIGH_SUBTRAHEND_2 and
+ * LOW_SUBTRAHEND_2 respectively.  The result is placed in HIGH_DIFFERENCE
+ * and LOW_DIFFERENCE.	Overflow (i.e. carry out) is not stored anywhere,
+ * and is lost.
+ *
+ * If any of these macros are left undefined for a particular CPU,
+ * C macros are used.  */
+
+/* The CPUs come in alphabetical order below.
+ *
+ * Please add support for more CPUs here, or improve the current support
+ * for the CPUs below!	*/
+
+#if defined(__GNUC__) && !defined(NO_ASM)
+
+/* We sometimes need to clobber "cc" with gcc2, but that would not be
+	understood by gcc1.	Use cpp to avoid major code duplication.  */
+#if __GNUC__ < 2
+#define __CLOBBER_CC
+#define __AND_CLOBBER_CC
+#else /* __GNUC__ >= 2 */
+#define __CLOBBER_CC : "cc"
+#define __AND_CLOBBER_CC , "cc"
+#endif /* __GNUC__ < 2 */
+
+/***************************************
+	**************  A29K  *****************
+	***************************************/
+#if (defined(__a29k__) || defined(_AM29K)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+	__asm__ ("add %1,%4,%5\n" \
+		"addc %0,%2,%3" \
+	: "=r" ((USItype)(sh)), \
+		"=&r" ((USItype)(sl)) \
+	: "%r" ((USItype)(ah)), \
+		"rI" ((USItype)(bh)), \
+		"%r" ((USItype)(al)), \
+		"rI" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+	__asm__ ("sub %1,%4,%5\n" \
+		"subc %0,%2,%3" \
+	: "=r" ((USItype)(sh)), \
+		"=&r" ((USItype)(sl)) \
+	: "r" ((USItype)(ah)), \
+		"rI" ((USItype)(bh)), \
+		"r" ((USItype)(al)), \
+		"rI" ((USItype)(bl)))
+#define umul_ppmm(xh, xl, m0, m1) \
+do { \
+		USItype __m0 = (m0), __m1 = (m1); \
+		__asm__ ("multiplu %0,%1,%2" \
+		: "=r" ((USItype)(xl)) \
+		: "r" (__m0), \
+			"r" (__m1)); \
+		__asm__ ("multmu %0,%1,%2" \
+		: "=r" ((USItype)(xh)) \
+		: "r" (__m0), \
+			"r" (__m1)); \
+} while (0)
+#define udiv_qrnnd(q, r, n1, n0, d) \
+	__asm__ ("dividu %0,%3,%4" \
+	: "=r" ((USItype)(q)), \
+		"=q" ((USItype)(r)) \
+	: "1" ((USItype)(n1)), \
+		"r" ((USItype)(n0)), \
+		"r" ((USItype)(d)))
+#endif /* __a29k__ */
+
+#if defined(__alpha) && W_TYPE_SIZE == 64
+#define umul_ppmm(ph, pl, m0, m1)			\
+do {							\
+	UDItype __m0 = (m0), __m1 = (m1);		\
+	(ph) = __builtin_alpha_umulh(__m0, __m1);	\
+	(pl) = __m0 * __m1;                             \
+} while (0)
+#define UMUL_TIME 46
+#ifndef LONGLONG_STANDALONE
+#define udiv_qrnnd(q, r, n1, n0, d) \
+do { UDItype __r; \
+	(q) = __udiv_qrnnd(&__r, (n1), (n0), (d)); \
+	(r) = __r; \
+} while (0)
+extern UDItype __udiv_qrnnd(UDItype *, UDItype, UDItype, UDItype);
+#define UDIV_TIME 220
+#endif /* LONGLONG_STANDALONE */
+#endif /* __alpha */
+
+/***************************************
+	**************  ARM  ******************
+	***************************************/
+#if defined(__arm__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+	__asm__ ("adds %1, %4, %5\n" \
+		"adc  %0, %2, %3" \
+	: "=r" (sh), \
+		"=&r" (sl) \
+	: "%r" ((USItype)(ah)), \
+		"rI" ((USItype)(bh)), \
+		"%r" ((USItype)(al)), \
+		"rI" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+	__asm__ ("subs %1, %4, %5\n" \
+		"sbc  %0, %2, %3" \
+	: "=r" (sh), \
+		"=&r" (sl) \
+	: "r" ((USItype)(ah)), \
+		"rI" ((USItype)(bh)), \
+		"r" ((USItype)(al)), \
+		"rI" ((USItype)(bl)))
+#if defined __ARM_ARCH_2__ || defined __ARM_ARCH_3__
+#define umul_ppmm(xh, xl, a, b) \
+	__asm__ ("@ Inlined umul_ppmm\n" \
+		"mov	%|r0, %2, lsr #16		@ AAAA\n" \
+		"mov	%|r2, %3, lsr #16		@ BBBB\n" \
+		"bic	%|r1, %2, %|r0, lsl #16		@ aaaa\n" \
+		"bic	%0, %3, %|r2, lsl #16		@ bbbb\n" \
+		"mul	%1, %|r1, %|r2			@ aaaa * BBBB\n" \
+		"mul	%|r2, %|r0, %|r2		@ AAAA * BBBB\n" \
+		"mul	%|r1, %0, %|r1			@ aaaa * bbbb\n" \
+		"mul	%0, %|r0, %0			@ AAAA * bbbb\n" \
+		"adds	%|r0, %1, %0			@ central sum\n" \
+		"addcs	%|r2, %|r2, #65536\n" \
+		"adds	%1, %|r1, %|r0, lsl #16\n" \
+		"adc	%0, %|r2, %|r0, lsr #16" \
+	: "=&r" (xh), \
+		"=r" (xl) \
+	: "r" ((USItype)(a)), \
+		"r" ((USItype)(b)) \
+	: "r0", "r1", "r2")
+#else
+#define umul_ppmm(xh, xl, a, b) \
+	__asm__ ("@ Inlined umul_ppmm\n" \
+		"umull %1, %0, %2, %3" \
+	: "=&r" (xh), \
+		"=&r" (xl) \
+	: "r" ((USItype)(a)), \
+		"r" ((USItype)(b)) \
+	: "r0", "r1")
+#endif
+#define UMUL_TIME 20
+#define UDIV_TIME 100
+#endif /* __arm__ */
+
+/***************************************
+	**************  CLIPPER  **************
+	***************************************/
+#if defined(__clipper__) && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+	({union {UDItype __ll; \
+		struct {USItype __l, __h; } __i; \
+	} __xx; \
+	__asm__ ("mulwux %2,%0" \
+	: "=r" (__xx.__ll) \
+	: "%0" ((USItype)(u)), \
+		"r" ((USItype)(v))); \
+	(w1) = __xx.__i.__h; (w0) = __xx.__i.__l; })
+#define smul_ppmm(w1, w0, u, v) \
+	({union {DItype __ll; \
+		struct {SItype __l, __h; } __i; \
+	} __xx; \
+	__asm__ ("mulwx %2,%0" \
+	: "=r" (__xx.__ll) \
+	: "%0" ((SItype)(u)), \
+		"r" ((SItype)(v))); \
+	(w1) = __xx.__i.__h; (w0) = __xx.__i.__l; })
+#define __umulsidi3(u, v) \
+	({UDItype __w; \
+	__asm__ ("mulwux %2,%0" \
+	: "=r" (__w) \
+	: "%0" ((USItype)(u)), \
+		"r" ((USItype)(v))); \
+	__w; })
+#endif /* __clipper__ */
+
+/***************************************
+	**************  GMICRO  ***************
+	***************************************/
+#if defined(__gmicro__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+	__asm__ ("add.w %5,%1\n" \
+		"addx %3,%0" \
+	: "=g" ((USItype)(sh)), \
+		"=&g" ((USItype)(sl)) \
+	: "%0" ((USItype)(ah)), \
+		"g" ((USItype)(bh)), \
+		"%1" ((USItype)(al)), \
+		"g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+	__asm__ ("sub.w %5,%1\n" \
+		"subx %3,%0" \
+	: "=g" ((USItype)(sh)), \
+		"=&g" ((USItype)(sl)) \
+	: "0" ((USItype)(ah)), \
+		"g" ((USItype)(bh)), \
+		"1" ((USItype)(al)), \
+		"g" ((USItype)(bl)))
+#define umul_ppmm(ph, pl, m0, m1) \
+	__asm__ ("mulx %3,%0,%1" \
+	: "=g" ((USItype)(ph)), \
+		"=r" ((USItype)(pl)) \
+	: "%0" ((USItype)(m0)), \
+		"g" ((USItype)(m1)))
+#define udiv_qrnnd(q, r, nh, nl, d) \
+	__asm__ ("divx %4,%0,%1" \
+	: "=g" ((USItype)(q)), \
+		"=r" ((USItype)(r)) \
+	: "1" ((USItype)(nh)), \
+		"0" ((USItype)(nl)), \
+		"g" ((USItype)(d)))
+#endif
+
+/***************************************
+	**************  HPPA  *****************
+	***************************************/
+#if defined(__hppa) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+	__asm__ ("add %4,%5,%1\n" \
+		   "addc %2,%3,%0" \
+	: "=r" ((USItype)(sh)), \
+	     "=&r" ((USItype)(sl)) \
+	: "%rM" ((USItype)(ah)), \
+	     "rM" ((USItype)(bh)), \
+	     "%rM" ((USItype)(al)), \
+	     "rM" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+	__asm__ ("sub %4,%5,%1\n" \
+	   "subb %2,%3,%0" \
+	: "=r" ((USItype)(sh)), \
+	     "=&r" ((USItype)(sl)) \
+	: "rM" ((USItype)(ah)), \
+	     "rM" ((USItype)(bh)), \
+	     "rM" ((USItype)(al)), \
+	     "rM" ((USItype)(bl)))
+#if 0 && defined(_PA_RISC1_1)
+/* xmpyu uses floating point register which is not allowed in Linux kernel. */
+#define umul_ppmm(wh, wl, u, v) \
+do { \
+	union {UDItype __ll; \
+	struct {USItype __h, __l; } __i; \
+	} __xx; \
+	__asm__ ("xmpyu %1,%2,%0" \
+	: "=*f" (__xx.__ll) \
+	: "*f" ((USItype)(u)), \
+	       "*f" ((USItype)(v))); \
+	(wh) = __xx.__i.__h; \
+	(wl) = __xx.__i.__l; \
+} while (0)
+#define UMUL_TIME 8
+#define UDIV_TIME 60
+#else
+#define UMUL_TIME 40
+#define UDIV_TIME 80
+#endif
+#if 0 /* #ifndef LONGLONG_STANDALONE */
+#define udiv_qrnnd(q, r, n1, n0, d) \
+do { USItype __r; \
+	(q) = __udiv_qrnnd(&__r, (n1), (n0), (d)); \
+	(r) = __r; \
+} while (0)
+extern USItype __udiv_qrnnd();
+#endif /* LONGLONG_STANDALONE */
+#endif /* hppa */
+
+/***************************************
+	**************  I370  *****************
+	***************************************/
+#if (defined(__i370__) || defined(__mvs__)) && W_TYPE_SIZE == 32
+#define umul_ppmm(xh, xl, m0, m1) \
+do { \
+	union {UDItype __ll; \
+	   struct {USItype __h, __l; } __i; \
+	} __xx; \
+	USItype __m0 = (m0), __m1 = (m1); \
+	__asm__ ("mr %0,%3" \
+	: "=r" (__xx.__i.__h), \
+	       "=r" (__xx.__i.__l) \
+	: "%1" (__m0), \
+	       "r" (__m1)); \
+	(xh) = __xx.__i.__h; (xl) = __xx.__i.__l; \
+	(xh) += ((((SItype) __m0 >> 31) & __m1) \
+	     + (((SItype) __m1 >> 31) & __m0)); \
+} while (0)
+#define smul_ppmm(xh, xl, m0, m1) \
+do { \
+	union {DItype __ll; \
+	   struct {USItype __h, __l; } __i; \
+	} __xx; \
+	__asm__ ("mr %0,%3" \
+	: "=r" (__xx.__i.__h), \
+	       "=r" (__xx.__i.__l) \
+	: "%1" (m0), \
+	       "r" (m1)); \
+	(xh) = __xx.__i.__h; (xl) = __xx.__i.__l; \
+} while (0)
+#define sdiv_qrnnd(q, r, n1, n0, d) \
+do { \
+	union {DItype __ll; \
+	   struct {USItype __h, __l; } __i; \
+	} __xx; \
+	__xx.__i.__h = n1; __xx.__i.__l = n0; \
+	__asm__ ("dr %0,%2" \
+	: "=r" (__xx.__ll) \
+	: "0" (__xx.__ll), "r" (d)); \
+	(q) = __xx.__i.__l; (r) = __xx.__i.__h; \
+} while (0)
+#endif
+
+/***************************************
+	**************  I386  *****************
+	***************************************/
+#undef __i386__
+#if (defined(__i386__) || defined(__i486__)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+	__asm__ ("addl %5,%1\n" \
+	   "adcl %3,%0" \
+	: "=r" (sh), \
+	     "=&r" (sl) \
+	: "%0" ((USItype)(ah)), \
+	     "g" ((USItype)(bh)), \
+	     "%1" ((USItype)(al)), \
+	     "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+	__asm__ ("subl %5,%1\n" \
+	   "sbbl %3,%0" \
+	: "=r" (sh), \
+	     "=&r" (sl) \
+	: "0" ((USItype)(ah)), \
+	     "g" ((USItype)(bh)), \
+	     "1" ((USItype)(al)), \
+	     "g" ((USItype)(bl)))
+#define umul_ppmm(w1, w0, u, v) \
+	__asm__ ("mull %3" \
+	: "=a" (w0), \
+	     "=d" (w1) \
+	: "%0" ((USItype)(u)), \
+	     "rm" ((USItype)(v)))
+#define udiv_qrnnd(q, r, n1, n0, d) \
+	__asm__ ("divl %4" \
+	: "=a" (q), \
+	     "=d" (r) \
+	: "0" ((USItype)(n0)), \
+	     "1" ((USItype)(n1)), \
+	     "rm" ((USItype)(d)))
+#ifndef UMUL_TIME
+#define UMUL_TIME 40
+#endif
+#ifndef UDIV_TIME
+#define UDIV_TIME 40
+#endif
+#endif /* 80x86 */
+
+/***************************************
+	**************  I860  *****************
+	***************************************/
+#if defined(__i860__) && W_TYPE_SIZE == 32
+#define rshift_rhlc(r, h, l, c) \
+	__asm__ ("shr %3,r0,r0\n" \
+	"shrd %1,%2,%0" \
+	   "=r" (r) : "r" (h), "r" (l), "rn" (c))
+#endif /* i860 */
+
+/***************************************
+	**************  I960  *****************
+	***************************************/
+#if defined(__i960__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+	__asm__ ("cmpo 1,0\n" \
+	"addc %5,%4,%1\n" \
+	"addc %3,%2,%0" \
+	: "=r" ((USItype)(sh)), \
+	     "=&r" ((USItype)(sl)) \
+	: "%dI" ((USItype)(ah)), \
+	     "dI" ((USItype)(bh)), \
+	     "%dI" ((USItype)(al)), \
+	     "dI" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+	__asm__ ("cmpo 0,0\n" \
+	"subc %5,%4,%1\n" \
+	"subc %3,%2,%0" \
+	: "=r" ((USItype)(sh)), \
+	     "=&r" ((USItype)(sl)) \
+	: "dI" ((USItype)(ah)), \
+	     "dI" ((USItype)(bh)), \
+	     "dI" ((USItype)(al)), \
+	     "dI" ((USItype)(bl)))
+#define umul_ppmm(w1, w0, u, v) \
+	({union {UDItype __ll; \
+	   struct {USItype __l, __h; } __i; \
+	} __xx; \
+	__asm__ ("emul        %2,%1,%0" \
+	: "=d" (__xx.__ll) \
+	: "%dI" ((USItype)(u)), \
+	     "dI" ((USItype)(v))); \
+	(w1) = __xx.__i.__h; (w0) = __xx.__i.__l; })
+#define __umulsidi3(u, v) \
+	({UDItype __w; \
+	__asm__ ("emul      %2,%1,%0" \
+	: "=d" (__w) \
+	: "%dI" ((USItype)(u)), \
+	       "dI" ((USItype)(v))); \
+	__w; })
+#define udiv_qrnnd(q, r, nh, nl, d) \
+do { \
+	union {UDItype __ll; \
+	   struct {USItype __l, __h; } __i; \
+	} __nn; \
+	__nn.__i.__h = (nh); __nn.__i.__l = (nl); \
+	__asm__ ("ediv %d,%n,%0" \
+	: "=d" (__rq.__ll) \
+	: "dI" (__nn.__ll), \
+	     "dI" ((USItype)(d))); \
+	(r) = __rq.__i.__l; (q) = __rq.__i.__h; \
+} while (0)
+#if defined(__i960mx)		/* what is the proper symbol to test??? */
+#define rshift_rhlc(r, h, l, c) \
+do { \
+	union {UDItype __ll; \
+	   struct {USItype __l, __h; } __i; \
+	} __nn; \
+	__nn.__i.__h = (h); __nn.__i.__l = (l); \
+	__asm__ ("shre %2,%1,%0" \
+	: "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \
+}
+#endif /* i960mx */
+#endif /* i960 */
+
+/***************************************
+	**************  68000	****************
+	***************************************/
+#if (defined(__mc68000__) || defined(__mc68020__) || defined(__NeXT__) || defined(mc68020)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+	__asm__ ("add%.l %5,%1\n" \
+	   "addx%.l %3,%0" \
+	: "=d" ((USItype)(sh)), \
+	     "=&d" ((USItype)(sl)) \
+	: "%0" ((USItype)(ah)), \
+	     "d" ((USItype)(bh)), \
+	     "%1" ((USItype)(al)), \
+	     "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+	__asm__ ("sub%.l %5,%1\n" \
+	   "subx%.l %3,%0" \
+	: "=d" ((USItype)(sh)), \
+	     "=&d" ((USItype)(sl)) \
+	: "0" ((USItype)(ah)), \
+	     "d" ((USItype)(bh)), \
+	     "1" ((USItype)(al)), \
+	     "g" ((USItype)(bl)))
+#if (defined(__mc68020__) || defined(__NeXT__) || defined(mc68020))
+#define umul_ppmm(w1, w0, u, v) \
+	__asm__ ("mulu%.l %3,%1:%0" \
+	: "=d" ((USItype)(w0)), \
+	     "=d" ((USItype)(w1)) \
+	: "%0" ((USItype)(u)), \
+	     "dmi" ((USItype)(v)))
+#define UMUL_TIME 45
+#define udiv_qrnnd(q, r, n1, n0, d) \
+	__asm__ ("divu%.l %4,%1:%0" \
+	: "=d" ((USItype)(q)), \
+	     "=d" ((USItype)(r)) \
+	: "0" ((USItype)(n0)), \
+	     "1" ((USItype)(n1)), \
+	     "dmi" ((USItype)(d)))
+#define UDIV_TIME 90
+#define sdiv_qrnnd(q, r, n1, n0, d) \
+	__asm__ ("divs%.l %4,%1:%0" \
+	: "=d" ((USItype)(q)), \
+	     "=d" ((USItype)(r)) \
+	: "0" ((USItype)(n0)), \
+	     "1" ((USItype)(n1)), \
+	     "dmi" ((USItype)(d)))
+#else /* not mc68020 */
+#define umul_ppmm(xh, xl, a, b) \
+do { USItype __umul_tmp1, __umul_tmp2; \
+	__asm__ ("| Inlined umul_ppmm\n" \
+	"move%.l %5,%3\n" \
+	"move%.l %2,%0\n" \
+	"move%.w %3,%1\n" \
+	"swap	%3\n" \
+	"swap	%0\n" \
+	"mulu	%2,%1\n" \
+	"mulu	%3,%0\n" \
+	"mulu	%2,%3\n" \
+	"swap	%2\n" \
+	"mulu	%5,%2\n" \
+	"add%.l	%3,%2\n" \
+	"jcc	1f\n" \
+	"add%.l	%#0x10000,%0\n" \
+	"1:	move%.l %2,%3\n" \
+	"clr%.w	%2\n" \
+	"swap	%2\n" \
+	"swap	%3\n" \
+	"clr%.w	%3\n" \
+	"add%.l	%3,%1\n" \
+	"addx%.l %2,%0\n" \
+	"| End inlined umul_ppmm" \
+	: "=&d" ((USItype)(xh)), "=&d" ((USItype)(xl)), \
+		"=d" (__umul_tmp1), "=&d" (__umul_tmp2) \
+	: "%2" ((USItype)(a)), "d" ((USItype)(b))); \
+} while (0)
+#define UMUL_TIME 100
+#define UDIV_TIME 400
+#endif /* not mc68020 */
+#endif /* mc68000 */
+
+/***************************************
+	**************  88000	****************
+	***************************************/
+#if defined(__m88000__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+	__asm__ ("addu.co %1,%r4,%r5\n" \
+	   "addu.ci %0,%r2,%r3" \
+	: "=r" ((USItype)(sh)), \
+	     "=&r" ((USItype)(sl)) \
+	: "%rJ" ((USItype)(ah)), \
+	     "rJ" ((USItype)(bh)), \
+	     "%rJ" ((USItype)(al)), \
+	     "rJ" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+	__asm__ ("subu.co %1,%r4,%r5\n" \
+	   "subu.ci %0,%r2,%r3" \
+	: "=r" ((USItype)(sh)), \
+	     "=&r" ((USItype)(sl)) \
+	: "rJ" ((USItype)(ah)), \
+	     "rJ" ((USItype)(bh)), \
+	     "rJ" ((USItype)(al)), \
+	     "rJ" ((USItype)(bl)))
+#if defined(__m88110__)
+#define umul_ppmm(wh, wl, u, v) \
+do { \
+	union {UDItype __ll; \
+	   struct {USItype __h, __l; } __i; \
+	} __x; \
+	__asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \
+	(wh) = __x.__i.__h; \
+	(wl) = __x.__i.__l; \
+} while (0)
+#define udiv_qrnnd(q, r, n1, n0, d) \
+	({union {UDItype __ll; \
+	   struct {USItype __h, __l; } __i; \
+	} __x, __q; \
+	__x.__i.__h = (n1); __x.__i.__l = (n0); \
+	__asm__ ("divu.d %0,%1,%2" \
+	: "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \
+	(r) = (n0) - __q.__l * (d); (q) = __q.__l; })
+#define UMUL_TIME 5
+#define UDIV_TIME 25
+#else
+#define UMUL_TIME 17
+#define UDIV_TIME 150
+#endif /* __m88110__ */
+#endif /* __m88000__ */
+
+/***************************************
+	**************  MIPS  *****************
+	***************************************/
+#if defined(__mips__) && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v)			\
+do {						\
+	UDItype __ll = (UDItype)(u) * (v);	\
+	w1 = __ll >> 32;			\
+	w0 = __ll;				\
+} while (0)
+#define UMUL_TIME 10
+#define UDIV_TIME 100
+#endif /* __mips__ */
+
+/***************************************
+	**************  MIPS/64  **************
+	***************************************/
+#if (defined(__mips) && __mips >= 3) && W_TYPE_SIZE == 64
+#if defined(__mips_isa_rev) && __mips_isa_rev >= 6 && defined(CONFIG_CC_IS_GCC)
+/*
+ * GCC ends up emitting a __multi3 intrinsic call for MIPS64r6 with the plain C
+ * code below, so we special case MIPS64r6 until the compiler can do better.
+ */
+#define umul_ppmm(w1, w0, u, v)						\
+do {									\
+	__asm__ ("dmulu %0,%1,%2"					\
+		 : "=d" ((UDItype)(w0))					\
+		 : "d" ((UDItype)(u)),					\
+		   "d" ((UDItype)(v)));					\
+	__asm__ ("dmuhu %0,%1,%2"					\
+		 : "=d" ((UDItype)(w1))					\
+		 : "d" ((UDItype)(u)),					\
+		   "d" ((UDItype)(v)));					\
+} while (0)
+#else
+#define umul_ppmm(w1, w0, u, v) \
+do {									\
+	typedef unsigned int __ll_UTItype __attribute__((mode(TI)));	\
+	__ll_UTItype __ll = (__ll_UTItype)(u) * (v);			\
+	w1 = __ll >> 64;						\
+	w0 = __ll;							\
+} while (0)
+#endif
+#define UMUL_TIME 20
+#define UDIV_TIME 140
+#endif /* __mips__ */
+
+/***************************************
+	**************  32000	****************
+	***************************************/
+#if defined(__ns32000__) && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+	({union {UDItype __ll; \
+	   struct {USItype __l, __h; } __i; \
+	} __xx; \
+	__asm__ ("meid %2,%0" \
+	: "=g" (__xx.__ll) \
+	: "%0" ((USItype)(u)), \
+	     "g" ((USItype)(v))); \
+	(w1) = __xx.__i.__h; (w0) = __xx.__i.__l; })
+#define __umulsidi3(u, v) \
+	({UDItype __w; \
+	__asm__ ("meid %2,%0" \
+	: "=g" (__w) \
+	: "%0" ((USItype)(u)), \
+	       "g" ((USItype)(v))); \
+	__w; })
+#define udiv_qrnnd(q, r, n1, n0, d) \
+	({union {UDItype __ll; \
+	   struct {USItype __l, __h; } __i; \
+	} __xx; \
+	__xx.__i.__h = (n1); __xx.__i.__l = (n0); \
+	__asm__ ("deid %2,%0" \
+	: "=g" (__xx.__ll) \
+	: "0" (__xx.__ll), \
+	     "g" ((USItype)(d))); \
+	(r) = __xx.__i.__l; (q) = __xx.__i.__h; })
+#endif /* __ns32000__ */
+
+/***************************************
+	**************  PPC  ******************
+	***************************************/
+#if (defined(_ARCH_PPC) || defined(_IBMR2)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+do { \
+	if (__builtin_constant_p(bh) && (bh) == 0) \
+		__asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \
+		: "=r" (sh), \
+		"=&r" (sl) \
+		: "%r" ((USItype)(ah)), \
+		"%r" ((USItype)(al)), \
+		"rI" ((USItype)(bl))); \
+	else if (__builtin_constant_p(bh) && (bh) == ~(USItype) 0) \
+		__asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \
+		: "=r" (sh), \
+		"=&r" (sl) \
+		: "%r" ((USItype)(ah)), \
+		"%r" ((USItype)(al)), \
+		"rI" ((USItype)(bl))); \
+	else \
+		__asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \
+		: "=r" (sh), \
+		"=&r" (sl) \
+		: "%r" ((USItype)(ah)), \
+		"r" ((USItype)(bh)), \
+		"%r" ((USItype)(al)), \
+		"rI" ((USItype)(bl))); \
+} while (0)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+do { \
+	if (__builtin_constant_p(ah) && (ah) == 0) \
+		__asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \
+		: "=r" (sh), \
+		"=&r" (sl) \
+		: "r" ((USItype)(bh)), \
+		"rI" ((USItype)(al)), \
+		"r" ((USItype)(bl))); \
+	else if (__builtin_constant_p(ah) && (ah) == ~(USItype) 0) \
+		__asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \
+		: "=r" (sh), \
+		"=&r" (sl) \
+		: "r" ((USItype)(bh)), \
+		"rI" ((USItype)(al)), \
+		"r" ((USItype)(bl))); \
+	else if (__builtin_constant_p(bh) && (bh) == 0) \
+		__asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \
+		: "=r" (sh), \
+		"=&r" (sl) \
+		: "r" ((USItype)(ah)), \
+		"rI" ((USItype)(al)), \
+		"r" ((USItype)(bl))); \
+	else if (__builtin_constant_p(bh) && (bh) == ~(USItype) 0) \
+		__asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \
+		: "=r" (sh), \
+		"=&r" (sl) \
+		: "r" ((USItype)(ah)), \
+		"rI" ((USItype)(al)), \
+		"r" ((USItype)(bl))); \
+	else \
+		__asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \
+		: "=r" (sh), \
+		"=&r" (sl) \
+		: "r" ((USItype)(ah)), \
+		"r" ((USItype)(bh)), \
+		"rI" ((USItype)(al)), \
+		"r" ((USItype)(bl))); \
+} while (0)
+#if defined(_ARCH_PPC)
+#define umul_ppmm(ph, pl, m0, m1) \
+do { \
+	USItype __m0 = (m0), __m1 = (m1); \
+	__asm__ ("mulhwu %0,%1,%2" \
+	: "=r" (ph) \
+	: "%r" (__m0), \
+	"r" (__m1)); \
+	(pl) = __m0 * __m1; \
+} while (0)
+#define UMUL_TIME 15
+#define smul_ppmm(ph, pl, m0, m1) \
+do { \
+	SItype __m0 = (m0), __m1 = (m1); \
+	__asm__ ("mulhw %0,%1,%2" \
+	: "=r" ((SItype) ph) \
+	: "%r" (__m0), \
+	"r" (__m1)); \
+	(pl) = __m0 * __m1; \
+} while (0)
+#define SMUL_TIME 14
+#define UDIV_TIME 120
+#else
+#define umul_ppmm(xh, xl, m0, m1) \
+do { \
+	USItype __m0 = (m0), __m1 = (m1); \
+	__asm__ ("mul %0,%2,%3" \
+	: "=r" ((USItype)(xh)), \
+	"=q" ((USItype)(xl)) \
+	: "r" (__m0), \
+	"r" (__m1)); \
+	(xh) += ((((SItype) __m0 >> 31) & __m1) \
+	+ (((SItype) __m1 >> 31) & __m0)); \
+} while (0)
+#define UMUL_TIME 8
+#define smul_ppmm(xh, xl, m0, m1) \
+	__asm__ ("mul %0,%2,%3" \
+	: "=r" ((SItype)(xh)), \
+	"=q" ((SItype)(xl)) \
+	: "r" (m0), \
+	"r" (m1))
+#define SMUL_TIME 4
+#define sdiv_qrnnd(q, r, nh, nl, d) \
+	__asm__ ("div %0,%2,%4" \
+	: "=r" ((SItype)(q)), "=q" ((SItype)(r)) \
+	: "r" ((SItype)(nh)), "1" ((SItype)(nl)), "r" ((SItype)(d)))
+#define UDIV_TIME 100
+#endif
+#endif /* Power architecture variants.  */
+
+/***************************************
+	**************  PYR  ******************
+	***************************************/
+#if defined(__pyr__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+	__asm__ ("addw        %5,%1\n" \
+	"addwc	%3,%0" \
+	: "=r" ((USItype)(sh)), \
+	"=&r" ((USItype)(sl)) \
+	: "%0" ((USItype)(ah)), \
+	"g" ((USItype)(bh)), \
+	"%1" ((USItype)(al)), \
+	"g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+	__asm__ ("subw        %5,%1\n" \
+	"subwb	%3,%0" \
+	: "=r" ((USItype)(sh)), \
+	"=&r" ((USItype)(sl)) \
+	: "0" ((USItype)(ah)), \
+	"g" ((USItype)(bh)), \
+	"1" ((USItype)(al)), \
+	"g" ((USItype)(bl)))
+	/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
+#define umul_ppmm(w1, w0, u, v) \
+	({union {UDItype __ll; \
+	struct {USItype __h, __l; } __i; \
+	} __xx; \
+	__asm__ ("movw %1,%R0\n" \
+	"uemul %2,%0" \
+	: "=&r" (__xx.__ll) \
+	: "g" ((USItype) (u)), \
+	"g" ((USItype)(v))); \
+	(w1) = __xx.__i.__h; (w0) = __xx.__i.__l; })
+#endif /* __pyr__ */
+
+/***************************************
+	**************  RT/ROMP  **************
+	***************************************/
+#if defined(__ibm032__) /* RT/ROMP */	&& W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+	__asm__ ("a %1,%5\n" \
+	"ae %0,%3" \
+	: "=r" ((USItype)(sh)), \
+	"=&r" ((USItype)(sl)) \
+	: "%0" ((USItype)(ah)), \
+	"r" ((USItype)(bh)), \
+	"%1" ((USItype)(al)), \
+	"r" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+	__asm__ ("s %1,%5\n" \
+	"se %0,%3" \
+	: "=r" ((USItype)(sh)), \
+	"=&r" ((USItype)(sl)) \
+	: "0" ((USItype)(ah)), \
+	"r" ((USItype)(bh)), \
+	"1" ((USItype)(al)), \
+	"r" ((USItype)(bl)))
+#define umul_ppmm(ph, pl, m0, m1) \
+do { \
+	USItype __m0 = (m0), __m1 = (m1); \
+	__asm__ ( \
+	"s       r2,r2\n" \
+	"mts	r10,%2\n" \
+	"m	r2,%3\n" \
+	"m	r2,%3\n" \
+	"m	r2,%3\n" \
+	"m	r2,%3\n" \
+	"m	r2,%3\n" \
+	"m	r2,%3\n" \
+	"m	r2,%3\n" \
+	"m	r2,%3\n" \
+	"m	r2,%3\n" \
+	"m	r2,%3\n" \
+	"m	r2,%3\n" \
+	"m	r2,%3\n" \
+	"m	r2,%3\n" \
+	"m	r2,%3\n" \
+	"m	r2,%3\n" \
+	"m	r2,%3\n" \
+	"cas	%0,r2,r0\n" \
+	"mfs	r10,%1" \
+	: "=r" ((USItype)(ph)), \
+	"=r" ((USItype)(pl)) \
+	: "%r" (__m0), \
+	"r" (__m1) \
+	: "r2"); \
+	(ph) += ((((SItype) __m0 >> 31) & __m1) \
+	+ (((SItype) __m1 >> 31) & __m0)); \
+} while (0)
+#define UMUL_TIME 20
+#define UDIV_TIME 200
+#endif /* RT/ROMP */
+
+/***************************************
+	**************  SH2  ******************
+	***************************************/
+#if (defined(__sh2__) || defined(__sh3__) || defined(__SH4__)) \
+	&& W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+	__asm__ ( \
+	"dmulu.l %2,%3\n" \
+	"sts	macl,%1\n" \
+	"sts	mach,%0" \
+	: "=r" ((USItype)(w1)), \
+	"=r" ((USItype)(w0)) \
+	: "r" ((USItype)(u)), \
+	"r" ((USItype)(v)) \
+	: "macl", "mach")
+#define UMUL_TIME 5
+#endif
+
+/***************************************
+	**************  SPARC	****************
+	***************************************/
+#if defined(__sparc__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+	__asm__ ("addcc %r4,%5,%1\n" \
+	"addx %r2,%3,%0" \
+	: "=r" ((USItype)(sh)), \
+	"=&r" ((USItype)(sl)) \
+	: "%rJ" ((USItype)(ah)), \
+	"rI" ((USItype)(bh)), \
+	"%rJ" ((USItype)(al)), \
+	"rI" ((USItype)(bl)) \
+	__CLOBBER_CC)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+	__asm__ ("subcc %r4,%5,%1\n" \
+	"subx %r2,%3,%0" \
+	: "=r" ((USItype)(sh)), \
+	"=&r" ((USItype)(sl)) \
+	: "rJ" ((USItype)(ah)), \
+	"rI" ((USItype)(bh)), \
+	"rJ" ((USItype)(al)), \
+	"rI" ((USItype)(bl)) \
+	__CLOBBER_CC)
+#if defined(__sparc_v8__)
+/* Don't match immediate range because, 1) it is not often useful,
+	2) the 'I' flag thinks of the range as a 13 bit signed interval,
+	while we want to match a 13 bit interval, sign extended to 32 bits,
+	but INTERPRETED AS UNSIGNED.  */
+#define umul_ppmm(w1, w0, u, v) \
+	__asm__ ("umul %2,%3,%1;rd %%y,%0" \
+	: "=r" ((USItype)(w1)), \
+	"=r" ((USItype)(w0)) \
+	: "r" ((USItype)(u)), \
+	"r" ((USItype)(v)))
+#define UMUL_TIME 5
+#ifndef SUPERSPARC		/* SuperSPARC's udiv only handles 53 bit dividends */
+#define udiv_qrnnd(q, r, n1, n0, d) \
+do { \
+	USItype __q; \
+	__asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
+	: "=r" ((USItype)(__q)) \
+	: "r" ((USItype)(n1)), \
+	"r" ((USItype)(n0)), \
+	"r" ((USItype)(d))); \
+	(r) = (n0) - __q * (d); \
+	(q) = __q; \
+} while (0)
+#define UDIV_TIME 25
+#endif /* SUPERSPARC */
+#else /* ! __sparc_v8__ */
+#if defined(__sparclite__)
+/* This has hardware multiply but not divide.  It also has two additional
+	instructions scan (ffs from high bit) and divscc.  */
+#define umul_ppmm(w1, w0, u, v) \
+	__asm__ ("umul %2,%3,%1;rd %%y,%0" \
+	: "=r" ((USItype)(w1)), \
+	"=r" ((USItype)(w0)) \
+	: "r" ((USItype)(u)), \
+	"r" ((USItype)(v)))
+#define UMUL_TIME 5
+#define udiv_qrnnd(q, r, n1, n0, d) \
+	__asm__ ("! Inlined udiv_qrnnd\n" \
+	"wr	%%g0,%2,%%y	! Not a delayed write for sparclite\n" \
+	"tst	%%g0\n" \
+	"divscc	%3,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%%g1\n" \
+	"divscc	%%g1,%4,%0\n" \
+	"rd	%%y,%1\n" \
+	"bl,a 1f\n" \
+	"add	%1,%4,%1\n" \
+	"1:	! End of inline udiv_qrnnd" \
+	: "=r" ((USItype)(q)), \
+	"=r" ((USItype)(r)) \
+	: "r" ((USItype)(n1)), \
+	"r" ((USItype)(n0)), \
+	"rI" ((USItype)(d)) \
+	: "%g1" __AND_CLOBBER_CC)
+#define UDIV_TIME 37
+#endif /* __sparclite__ */
+#endif /* __sparc_v8__ */
+	/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
+#ifndef umul_ppmm
+#define umul_ppmm(w1, w0, u, v) \
+	__asm__ ("! Inlined umul_ppmm\n" \
+	"wr	%%g0,%2,%%y	! SPARC has 0-3 delay insn after a wr\n" \
+	"sra	%3,31,%%g2	! Don't move this insn\n" \
+	"and	%2,%%g2,%%g2	! Don't move this insn\n" \
+	"andcc	%%g0,0,%%g1	! Don't move this insn\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,%3,%%g1\n" \
+	"mulscc	%%g1,0,%%g1\n" \
+	"add	%%g1,%%g2,%0\n" \
+	"rd	%%y,%1" \
+	: "=r" ((USItype)(w1)), \
+	"=r" ((USItype)(w0)) \
+	: "%rI" ((USItype)(u)), \
+	"r" ((USItype)(v)) \
+	: "%g1", "%g2" __AND_CLOBBER_CC)
+#define UMUL_TIME 39		/* 39 instructions */
+/* It's quite necessary to add this much assembler for the sparc.
+   The default udiv_qrnnd (in C) is more than 10 times slower!  */
+#define udiv_qrnnd(q, r, n1, n0, d) \
+  __asm__ ("! Inlined udiv_qrnnd\n\t"					\
+	   "mov	32,%%g1\n\t"						\
+	   "subcc	%1,%2,%%g0\n\t"					\
+	   "1:	bcs	5f\n\t"						\
+	   "addxcc %0,%0,%0	! shift n1n0 and a q-bit in lsb\n\t"	\
+	   "sub	%1,%2,%1	! this kills msb of n\n\t"		\
+	   "addx	%1,%1,%1	! so this can't give carry\n\t"	\
+	   "subcc	%%g1,1,%%g1\n\t"				\
+	   "2:	bne	1b\n\t"						\
+	   "subcc	%1,%2,%%g0\n\t"					\
+	   "bcs	3f\n\t"							\
+	   "addxcc %0,%0,%0	! shift n1n0 and a q-bit in lsb\n\t"	\
+	   "b		3f\n\t"						\
+	   "sub	%1,%2,%1	! this kills msb of n\n\t"		\
+	   "4:	sub	%1,%2,%1\n\t"					\
+	   "5:	addxcc	%1,%1,%1\n\t"					\
+	   "bcc	2b\n\t"							\
+	   "subcc	%%g1,1,%%g1\n\t"				\
+	   "! Got carry from n.  Subtract next step to cancel this carry.\n\t" \
+	   "bne	4b\n\t"							\
+	   "addcc	%0,%0,%0	! shift n1n0 and a 0-bit in lsb\n\t" \
+	   "sub	%1,%2,%1\n\t"						\
+	   "3:	xnor	%0,0,%0\n\t"					\
+	   "! End of inline udiv_qrnnd\n"				\
+	   : "=&r" ((USItype)(q)),					\
+	     "=&r" ((USItype)(r))					\
+	   : "r" ((USItype)(d)),					\
+	     "1" ((USItype)(n1)),					\
+	     "0" ((USItype)(n0)) : "%g1", "cc")
+#define UDIV_TIME (3+7*32)      /* 7 instructions/iteration. 32 iterations.  */
+#endif
+#endif /* __sparc__ */
+
+/***************************************
+	**************  VAX  ******************
+	***************************************/
+#if defined(__vax__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+	__asm__ ("addl2 %5,%1\n" \
+	"adwc %3,%0" \
+	: "=g" ((USItype)(sh)), \
+	"=&g" ((USItype)(sl)) \
+	: "%0" ((USItype)(ah)), \
+	"g" ((USItype)(bh)), \
+	"%1" ((USItype)(al)), \
+	"g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+	__asm__ ("subl2 %5,%1\n" \
+	"sbwc %3,%0" \
+	: "=g" ((USItype)(sh)), \
+	"=&g" ((USItype)(sl)) \
+	: "0" ((USItype)(ah)), \
+	"g" ((USItype)(bh)), \
+	"1" ((USItype)(al)), \
+	"g" ((USItype)(bl)))
+#define umul_ppmm(xh, xl, m0, m1) \
+do { \
+	union {UDItype __ll; \
+	struct {USItype __l, __h; } __i; \
+	} __xx; \
+	USItype __m0 = (m0), __m1 = (m1); \
+	__asm__ ("emul %1,%2,$0,%0" \
+	: "=g" (__xx.__ll) \
+	: "g" (__m0), \
+	"g" (__m1)); \
+	(xh) = __xx.__i.__h; (xl) = __xx.__i.__l; \
+	(xh) += ((((SItype) __m0 >> 31) & __m1) \
+	+ (((SItype) __m1 >> 31) & __m0)); \
+} while (0)
+#define sdiv_qrnnd(q, r, n1, n0, d) \
+do { \
+	union {DItype __ll; \
+	struct {SItype __l, __h; } __i; \
+	} __xx; \
+	__xx.__i.__h = n1; __xx.__i.__l = n0; \
+	__asm__ ("ediv %3,%2,%0,%1" \
+	: "=g" (q), "=g" (r) \
+	: "g" (__xx.__ll), "g" (d)); \
+} while (0)
+#endif /* __vax__ */
+
+/***************************************
+	**************  Z8000	****************
+	***************************************/
+#if defined(__z8000__) && W_TYPE_SIZE == 16
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+	__asm__ ("add %H1,%H5\n\tadc  %H0,%H3" \
+	: "=r" ((unsigned int)(sh)), \
+	"=&r" ((unsigned int)(sl)) \
+	: "%0" ((unsigned int)(ah)), \
+	"r" ((unsigned int)(bh)), \
+	"%1" ((unsigned int)(al)), \
+	"rQR" ((unsigned int)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+	__asm__ ("sub %H1,%H5\n\tsbc  %H0,%H3" \
+	: "=r" ((unsigned int)(sh)), \
+	"=&r" ((unsigned int)(sl)) \
+	: "0" ((unsigned int)(ah)), \
+	"r" ((unsigned int)(bh)), \
+	"1" ((unsigned int)(al)), \
+	"rQR" ((unsigned int)(bl)))
+#define umul_ppmm(xh, xl, m0, m1) \
+do { \
+	union {long int __ll; \
+	struct {unsigned int __h, __l; } __i; \
+	} __xx; \
+	unsigned int __m0 = (m0), __m1 = (m1); \
+	__asm__ ("mult      %S0,%H3" \
+	: "=r" (__xx.__i.__h), \
+	"=r" (__xx.__i.__l) \
+	: "%1" (__m0), \
+	"rQR" (__m1)); \
+	(xh) = __xx.__i.__h; (xl) = __xx.__i.__l; \
+	(xh) += ((((signed int) __m0 >> 15) & __m1) \
+	+ (((signed int) __m1 >> 15) & __m0)); \
+} while (0)
+#endif /* __z8000__ */
+
+#endif /* __GNUC__ */
+
+/***************************************
+	***********  Generic Versions	********
+	***************************************/
+#if !defined(umul_ppmm) && defined(__umulsidi3)
+#define umul_ppmm(ph, pl, m0, m1) \
+{ \
+	UDWtype __ll = __umulsidi3(m0, m1); \
+	ph = (UWtype) (__ll >> W_TYPE_SIZE); \
+	pl = (UWtype) __ll; \
+}
+#endif
+
+#if !defined(__umulsidi3)
+#define __umulsidi3(u, v) \
+	({UWtype __hi, __lo; \
+	umul_ppmm(__hi, __lo, u, v); \
+	((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
+#endif
+
+	/* If this machine has no inline assembler, use C macros.  */
+
+#if !defined(add_ssaaaa)
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+do { \
+	UWtype __x; \
+	__x = (al) + (bl); \
+	(sh) = (ah) + (bh) + (__x < (al)); \
+	(sl) = __x; \
+} while (0)
+#endif
+
+#if !defined(sub_ddmmss)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+do { \
+	UWtype __x; \
+	__x = (al) - (bl); \
+	(sh) = (ah) - (bh) - (__x > (al)); \
+	(sl) = __x; \
+} while (0)
+#endif
+
+#if !defined(umul_ppmm)
+#define umul_ppmm(w1, w0, u, v) \
+do { \
+	UWtype __x0, __x1, __x2, __x3; \
+	UHWtype __ul, __vl, __uh, __vh; \
+	UWtype __u = (u), __v = (v); \
+	\
+	__ul = __ll_lowpart(__u); \
+	__uh = __ll_highpart(__u); \
+	__vl = __ll_lowpart(__v); \
+	__vh = __ll_highpart(__v); \
+	\
+	__x0 = (UWtype) __ul * __vl; \
+	__x1 = (UWtype) __ul * __vh; \
+	__x2 = (UWtype) __uh * __vl; \
+	__x3 = (UWtype) __uh * __vh; \
+	\
+	__x1 += __ll_highpart(__x0);/* this can't give carry */ \
+	__x1 += __x2;		/* but this indeed can */ \
+	if (__x1 < __x2)		/* did we get it? */ \
+	__x3 += __ll_B;		/* yes, add it in the proper pos. */ \
+	\
+	(w1) = __x3 + __ll_highpart(__x1); \
+	(w0) = (__ll_lowpart(__x1) << W_TYPE_SIZE/2) + __ll_lowpart(__x0); \
+} while (0)
+#endif
+
+#if !defined(umul_ppmm)
+#define smul_ppmm(w1, w0, u, v) \
+do { \
+	UWtype __w1; \
+	UWtype __m0 = (u), __m1 = (v); \
+	umul_ppmm(__w1, w0, __m0, __m1); \
+	(w1) = __w1 - (-(__m0 >> (W_TYPE_SIZE - 1)) & __m1) \
+	- (-(__m1 >> (W_TYPE_SIZE - 1)) & __m0); \
+} while (0)
+#endif
+
+	/* Define this unconditionally, so it can be used for debugging.  */
+#define __udiv_qrnnd_c(q, r, n1, n0, d) \
+do { \
+	UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
+	__d1 = __ll_highpart(d); \
+	__d0 = __ll_lowpart(d); \
+	\
+	__r1 = (n1) % __d1; \
+	__q1 = (n1) / __d1; \
+	__m = (UWtype) __q1 * __d0; \
+	__r1 = __r1 * __ll_B | __ll_highpart(n0); \
+	if (__r1 < __m) { \
+		__q1--, __r1 += (d); \
+		if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */ \
+		if (__r1 < __m) \
+			__q1--, __r1 += (d); \
+	} \
+	__r1 -= __m; \
+	\
+	__r0 = __r1 % __d1; \
+	__q0 = __r1 / __d1; \
+	__m = (UWtype) __q0 * __d0; \
+	__r0 = __r0 * __ll_B | __ll_lowpart(n0); \
+	if (__r0 < __m) { \
+		__q0--, __r0 += (d); \
+		if (__r0 >= (d)) \
+			if (__r0 < __m) \
+				__q0--, __r0 += (d); \
+	} \
+	__r0 -= __m; \
+	\
+	(q) = (UWtype) __q1 * __ll_B | __q0; \
+	(r) = __r0; \
+} while (0)
+
+/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
+	__udiv_w_sdiv (defined in libgcc or elsewhere).  */
+#if !defined(udiv_qrnnd) && defined(sdiv_qrnnd)
+#define udiv_qrnnd(q, r, nh, nl, d) \
+do { \
+	UWtype __r; \
+	(q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \
+	(r) = __r; \
+} while (0)
+#endif
+
+	/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
+#if !defined(udiv_qrnnd)
+#define UDIV_NEEDS_NORMALIZATION 1
+#define udiv_qrnnd __udiv_qrnnd_c
+#endif
+
+#ifndef UDIV_NEEDS_NORMALIZATION
+#define UDIV_NEEDS_NORMALIZATION 0
+#endif
diff --git a/lib/crypto/mpi/mpi-add.c b/lib/crypto/mpi/mpi-add.c
new file mode 100644
index 000000000000..c0375c1672fa
--- /dev/null
+++ b/lib/crypto/mpi/mpi-add.c
@@ -0,0 +1,120 @@
+/* mpi-add.c  -  MPI functions
+ * Copyright (C) 1994, 1996, 1998, 2001, 2002,
+ *               2003 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ */
+
+#include <linux/export.h>
+
+#include "mpi-internal.h"
+
+int mpi_add(MPI w, MPI u, MPI v)
+{
+	mpi_ptr_t wp, up, vp;
+	mpi_size_t usize, vsize, wsize;
+	int usign, vsign, wsign;
+	int err;
+
+	if (u->nlimbs < v->nlimbs) { /* Swap U and V. */
+		usize = v->nlimbs;
+		usign = v->sign;
+		vsize = u->nlimbs;
+		vsign = u->sign;
+		wsize = usize + 1;
+		err = RESIZE_IF_NEEDED(w, wsize);
+		if (err)
+			return err;
+		/* These must be after realloc (u or v may be the same as w).  */
+		up = v->d;
+		vp = u->d;
+	} else {
+		usize = u->nlimbs;
+		usign = u->sign;
+		vsize = v->nlimbs;
+		vsign = v->sign;
+		wsize = usize + 1;
+		err = RESIZE_IF_NEEDED(w, wsize);
+		if (err)
+			return err;
+		/* These must be after realloc (u or v may be the same as w).  */
+		up = u->d;
+		vp = v->d;
+	}
+	wp = w->d;
+	wsign = 0;
+
+	if (!vsize) {  /* simple */
+		MPN_COPY(wp, up, usize);
+		wsize = usize;
+		wsign = usign;
+	} else if (usign != vsign) { /* different sign */
+		/* This test is right since USIZE >= VSIZE */
+		if (usize != vsize) {
+			mpihelp_sub(wp, up, usize, vp, vsize);
+			wsize = usize;
+			MPN_NORMALIZE(wp, wsize);
+			wsign = usign;
+		} else if (mpihelp_cmp(up, vp, usize) < 0) {
+			mpihelp_sub_n(wp, vp, up, usize);
+			wsize = usize;
+			MPN_NORMALIZE(wp, wsize);
+			if (!usign)
+				wsign = 1;
+		} else {
+			mpihelp_sub_n(wp, up, vp, usize);
+			wsize = usize;
+			MPN_NORMALIZE(wp, wsize);
+			if (usign)
+				wsign = 1;
+		}
+	} else { /* U and V have same sign. Add them. */
+		mpi_limb_t cy = mpihelp_add(wp, up, usize, vp, vsize);
+		wp[usize] = cy;
+		wsize = usize + cy;
+		if (usign)
+			wsign = 1;
+	}
+
+	w->nlimbs = wsize;
+	w->sign = wsign;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mpi_add);
+
+int mpi_sub(MPI w, MPI u, MPI v)
+{
+	int err;
+	MPI vv;
+
+	vv = mpi_copy(v);
+	if (!vv)
+		return -ENOMEM;
+
+	vv->sign = !vv->sign;
+	err = mpi_add(w, u, vv);
+	mpi_free(vv);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mpi_sub);
+
+int mpi_addm(MPI w, MPI u, MPI v, MPI m)
+{
+	return mpi_add(w, u, v) ?:
+	       mpi_mod(w, w, m);
+}
+EXPORT_SYMBOL_GPL(mpi_addm);
+
+int mpi_subm(MPI w, MPI u, MPI v, MPI m)
+{
+	return mpi_sub(w, u, v) ?:
+	       mpi_mod(w, w, m);
+}
+EXPORT_SYMBOL_GPL(mpi_subm);
diff --git a/lib/crypto/mpi/mpi-bit.c b/lib/crypto/mpi/mpi-bit.c
new file mode 100644
index 000000000000..b3d0e7ddbc03
--- /dev/null
+++ b/lib/crypto/mpi/mpi-bit.c
@@ -0,0 +1,177 @@
+/* mpi-bit.c  -  MPI bit level functions
+ * Copyright (C) 1998, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include <linux/export.h>
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+#define A_LIMB_1 ((mpi_limb_t) 1)
+
+/****************
+ * Sometimes we have MSL (most significant limbs) which are 0;
+ * this is for some reasons not good, so this function removes them.
+ */
+void mpi_normalize(MPI a)
+{
+	for (; a->nlimbs && !a->d[a->nlimbs - 1]; a->nlimbs--)
+		;
+}
+
+/****************
+ * Return the number of bits in A.
+ */
+unsigned mpi_get_nbits(MPI a)
+{
+	unsigned n;
+
+	mpi_normalize(a);
+
+	if (a->nlimbs) {
+		mpi_limb_t alimb = a->d[a->nlimbs - 1];
+		if (alimb)
+			n = count_leading_zeros(alimb);
+		else
+			n = BITS_PER_MPI_LIMB;
+		n = BITS_PER_MPI_LIMB - n + (a->nlimbs - 1) * BITS_PER_MPI_LIMB;
+	} else
+		n = 0;
+	return n;
+}
+EXPORT_SYMBOL_GPL(mpi_get_nbits);
+
+/****************
+ * Test whether bit N is set.
+ */
+int mpi_test_bit(MPI a, unsigned int n)
+{
+	unsigned int limbno, bitno;
+	mpi_limb_t limb;
+
+	limbno = n / BITS_PER_MPI_LIMB;
+	bitno  = n % BITS_PER_MPI_LIMB;
+
+	if (limbno >= a->nlimbs)
+		return 0; /* too far left: this is a 0 */
+	limb = a->d[limbno];
+	return (limb & (A_LIMB_1 << bitno)) ? 1 : 0;
+}
+EXPORT_SYMBOL_GPL(mpi_test_bit);
+
+/****************
+ * Set bit N of A.
+ */
+int mpi_set_bit(MPI a, unsigned int n)
+{
+	unsigned int i, limbno, bitno;
+	int err;
+
+	limbno = n / BITS_PER_MPI_LIMB;
+	bitno  = n % BITS_PER_MPI_LIMB;
+
+	if (limbno >= a->nlimbs) {
+		for (i = a->nlimbs; i < a->alloced; i++)
+			a->d[i] = 0;
+		err = mpi_resize(a, limbno+1);
+		if (err)
+			return err;
+		a->nlimbs = limbno+1;
+	}
+	a->d[limbno] |= (A_LIMB_1<<bitno);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mpi_set_bit);
+
+/*
+ * Shift A by N bits to the right.
+ */
+int mpi_rshift(MPI x, MPI a, unsigned int n)
+{
+	mpi_size_t xsize;
+	unsigned int i;
+	unsigned int nlimbs = (n/BITS_PER_MPI_LIMB);
+	unsigned int nbits = (n%BITS_PER_MPI_LIMB);
+	int err;
+
+	if (x == a) {
+		/* In-place operation.  */
+		if (nlimbs >= x->nlimbs) {
+			x->nlimbs = 0;
+			return 0;
+		}
+
+		if (nlimbs) {
+			for (i = 0; i < x->nlimbs - nlimbs; i++)
+				x->d[i] = x->d[i+nlimbs];
+			x->d[i] = 0;
+			x->nlimbs -= nlimbs;
+		}
+		if (x->nlimbs && nbits)
+			mpihelp_rshift(x->d, x->d, x->nlimbs, nbits);
+	} else if (nlimbs) {
+		/* Copy and shift by more or equal bits than in a limb. */
+		xsize = a->nlimbs;
+		x->sign = a->sign;
+		err = RESIZE_IF_NEEDED(x, xsize);
+		if (err)
+			return err;
+		x->nlimbs = xsize;
+		for (i = 0; i < a->nlimbs; i++)
+			x->d[i] = a->d[i];
+		x->nlimbs = i;
+
+		if (nlimbs >= x->nlimbs) {
+			x->nlimbs = 0;
+			return 0;
+		}
+
+		for (i = 0; i < x->nlimbs - nlimbs; i++)
+			x->d[i] = x->d[i+nlimbs];
+		x->d[i] = 0;
+		x->nlimbs -= nlimbs;
+
+		if (x->nlimbs && nbits)
+			mpihelp_rshift(x->d, x->d, x->nlimbs, nbits);
+	} else {
+		/* Copy and shift by less than bits in a limb.  */
+		xsize = a->nlimbs;
+		x->sign = a->sign;
+		err = RESIZE_IF_NEEDED(x, xsize);
+		if (err)
+			return err;
+		x->nlimbs = xsize;
+
+		if (xsize) {
+			if (nbits)
+				mpihelp_rshift(x->d, a->d, x->nlimbs, nbits);
+			else {
+				/* The rshift helper function is not specified for
+				 * NBITS==0, thus we do a plain copy here.
+				 */
+				for (i = 0; i < x->nlimbs; i++)
+					x->d[i] = a->d[i];
+			}
+		}
+	}
+	MPN_NORMALIZE(x->d, x->nlimbs);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mpi_rshift);
diff --git a/lib/crypto/mpi/mpi-cmp.c b/lib/crypto/mpi/mpi-cmp.c
new file mode 100644
index 000000000000..b42929296bce
--- /dev/null
+++ b/lib/crypto/mpi/mpi-cmp.c
@@ -0,0 +1,74 @@
+/* mpi-cmp.c  -  MPI functions
+ * Copyright (C) 1998, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include <linux/export.h>
+
+#include "mpi-internal.h"
+
+int mpi_cmp_ui(MPI u, unsigned long v)
+{
+	mpi_limb_t limb = v;
+
+	mpi_normalize(u);
+	if (u->nlimbs == 0) {
+		if (v == 0)
+			return 0;
+		else
+			return -1;
+	}
+	if (u->sign)
+		return -1;
+	if (u->nlimbs > 1)
+		return 1;
+
+	if (u->d[0] == limb)
+		return 0;
+	else if (u->d[0] > limb)
+		return 1;
+	else
+		return -1;
+}
+EXPORT_SYMBOL_GPL(mpi_cmp_ui);
+
+int mpi_cmp(MPI u, MPI v)
+{
+	mpi_size_t usize, vsize;
+	int cmp;
+
+	mpi_normalize(u);
+	mpi_normalize(v);
+	usize = u->nlimbs;
+	vsize = v->nlimbs;
+	if (!u->sign && v->sign)
+		return 1;
+	if (u->sign && !v->sign)
+		return -1;
+	if (usize != vsize && !u->sign && !v->sign)
+		return usize - vsize;
+	if (usize != vsize && u->sign && v->sign)
+		return vsize - usize;
+	if (!usize)
+		return 0;
+	cmp = mpihelp_cmp(u->d, v->d, usize);
+	if (u->sign)
+		return -cmp;
+	return cmp;
+}
+EXPORT_SYMBOL_GPL(mpi_cmp);
diff --git a/lib/crypto/mpi/mpi-div.c b/lib/crypto/mpi/mpi-div.c
new file mode 100644
index 000000000000..6e5044e72595
--- /dev/null
+++ b/lib/crypto/mpi/mpi-div.c
@@ -0,0 +1,230 @@
+/* mpi-div.c  -  MPI functions
+ * Copyright (C) 1994, 1996, 1998, 2001, 2002,
+ *               2003 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ */
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+int mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den);
+
+int mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor)
+{
+	int divisor_sign = divisor->sign;
+	MPI temp_divisor = NULL;
+	int err;
+
+	/* We need the original value of the divisor after the remainder has been
+	 * preliminary calculated.	We have to copy it to temporary space if it's
+	 * the same variable as REM.
+	 */
+	if (rem == divisor) {
+		temp_divisor = mpi_copy(divisor);
+		if (!temp_divisor)
+			return -ENOMEM;
+		divisor = temp_divisor;
+	}
+
+	err = mpi_tdiv_r(rem, dividend, divisor);
+	if (err)
+		goto free_temp_divisor;
+
+	if (((divisor_sign?1:0) ^ (dividend->sign?1:0)) && rem->nlimbs)
+		err = mpi_add(rem, rem, divisor);
+
+free_temp_divisor:
+	mpi_free(temp_divisor);
+
+	return err;
+}
+
+/* If den == quot, den needs temporary storage.
+ * If den == rem, den needs temporary storage.
+ * If num == quot, num needs temporary storage.
+ * If den has temporary storage, it can be normalized while being copied,
+ *   i.e no extra storage should be allocated.
+ */
+
+int mpi_tdiv_r(MPI rem, MPI num, MPI den)
+{
+	return mpi_tdiv_qr(NULL, rem, num, den);
+}
+
+int mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den)
+{
+	mpi_ptr_t np, dp;
+	mpi_ptr_t qp, rp;
+	mpi_size_t nsize = num->nlimbs;
+	mpi_size_t dsize = den->nlimbs;
+	mpi_size_t qsize, rsize;
+	mpi_size_t sign_remainder = num->sign;
+	mpi_size_t sign_quotient = num->sign ^ den->sign;
+	unsigned int normalization_steps;
+	mpi_limb_t q_limb;
+	mpi_ptr_t marker[5];
+	int markidx = 0;
+	int err;
+
+	/* Ensure space is enough for quotient and remainder.
+	 * We need space for an extra limb in the remainder, because it's
+	 * up-shifted (normalized) below.
+	 */
+	rsize = nsize + 1;
+	err = mpi_resize(rem, rsize);
+	if (err)
+		return err;
+
+	qsize = rsize - dsize;	  /* qsize cannot be bigger than this.	*/
+	if (qsize <= 0) {
+		if (num != rem) {
+			rem->nlimbs = num->nlimbs;
+			rem->sign = num->sign;
+			MPN_COPY(rem->d, num->d, nsize);
+		}
+		if (quot) {
+			/* This needs to follow the assignment to rem, in case the
+			 * numerator and quotient are the same.
+			 */
+			quot->nlimbs = 0;
+			quot->sign = 0;
+		}
+		return 0;
+	}
+
+	if (quot) {
+		err = mpi_resize(quot, qsize);
+		if (err)
+			return err;
+	}
+
+	/* Read pointers here, when reallocation is finished.  */
+	np = num->d;
+	dp = den->d;
+	rp = rem->d;
+
+	/* Optimize division by a single-limb divisor.  */
+	if (dsize == 1) {
+		mpi_limb_t rlimb;
+		if (quot) {
+			qp = quot->d;
+			rlimb = mpihelp_divmod_1(qp, np, nsize, dp[0]);
+			qsize -= qp[qsize - 1] == 0;
+			quot->nlimbs = qsize;
+			quot->sign = sign_quotient;
+		} else
+			rlimb = mpihelp_mod_1(np, nsize, dp[0]);
+		rp[0] = rlimb;
+		rsize = rlimb != 0?1:0;
+		rem->nlimbs = rsize;
+		rem->sign = sign_remainder;
+		return 0;
+	}
+
+	err = -ENOMEM;
+	if (quot) {
+		qp = quot->d;
+		/* Make sure QP and NP point to different objects.  Otherwise the
+		 * numerator would be gradually overwritten by the quotient limbs.
+		 */
+		if (qp == np) { /* Copy NP object to temporary space.  */
+			np = marker[markidx++] = mpi_alloc_limb_space(nsize);
+			if (!np)
+				goto out_free_marker;
+			MPN_COPY(np, qp, nsize);
+		}
+	} else /* Put quotient at top of remainder. */
+		qp = rp + dsize;
+
+	normalization_steps = count_leading_zeros(dp[dsize - 1]);
+
+	/* Normalize the denominator, i.e. make its most significant bit set by
+	 * shifting it NORMALIZATION_STEPS bits to the left.  Also shift the
+	 * numerator the same number of steps (to keep the quotient the same!).
+	 */
+	if (normalization_steps) {
+		mpi_ptr_t tp;
+		mpi_limb_t nlimb;
+
+		/* Shift up the denominator setting the most significant bit of
+		 * the most significant word.  Use temporary storage not to clobber
+		 * the original contents of the denominator.
+		 */
+		tp = marker[markidx++] = mpi_alloc_limb_space(dsize);
+		if (!tp)
+			goto out_free_marker;
+		mpihelp_lshift(tp, dp, dsize, normalization_steps);
+		dp = tp;
+
+		/* Shift up the numerator, possibly introducing a new most
+		 * significant word.  Move the shifted numerator in the remainder
+		 * meanwhile.
+		 */
+		nlimb = mpihelp_lshift(rp, np, nsize, normalization_steps);
+		if (nlimb) {
+			rp[nsize] = nlimb;
+			rsize = nsize + 1;
+		} else
+			rsize = nsize;
+	} else {
+		/* The denominator is already normalized, as required.	Copy it to
+		 * temporary space if it overlaps with the quotient or remainder.
+		 */
+		if (dp == rp || (quot && (dp == qp))) {
+			mpi_ptr_t tp;
+
+			tp = marker[markidx++] = mpi_alloc_limb_space(dsize);
+			if (!tp)
+				goto out_free_marker;
+			MPN_COPY(tp, dp, dsize);
+			dp = tp;
+		}
+
+		/* Move the numerator to the remainder.  */
+		if (rp != np)
+			MPN_COPY(rp, np, nsize);
+
+		rsize = nsize;
+	}
+
+	q_limb = mpihelp_divrem(qp, 0, rp, rsize, dp, dsize);
+
+	if (quot) {
+		qsize = rsize - dsize;
+		if (q_limb) {
+			qp[qsize] = q_limb;
+			qsize += 1;
+		}
+
+		quot->nlimbs = qsize;
+		quot->sign = sign_quotient;
+	}
+
+	rsize = dsize;
+	MPN_NORMALIZE(rp, rsize);
+
+	if (normalization_steps && rsize) {
+		mpihelp_rshift(rp, rp, rsize, normalization_steps);
+		rsize -= rp[rsize - 1] == 0?1:0;
+	}
+
+	rem->nlimbs = rsize;
+	rem->sign	= sign_remainder;
+
+	err = 0;
+
+out_free_marker:
+	while (markidx) {
+		markidx--;
+		mpi_free_limb_space(marker[markidx]);
+	}
+
+	return err;
+}
diff --git a/lib/crypto/mpi/mpi-inline.h b/lib/crypto/mpi/mpi-inline.h
new file mode 100644
index 000000000000..980b6b940953
--- /dev/null
+++ b/lib/crypto/mpi/mpi-inline.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* mpi-inline.h  -  Internal to the Multi Precision Integers
+ *	Copyright (C) 1994, 1996, 1998, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+#ifndef G10_MPI_INLINE_H
+#define G10_MPI_INLINE_H
+
+#ifndef G10_MPI_INLINE_DECL
+#define G10_MPI_INLINE_DECL  static inline
+#endif
+
+G10_MPI_INLINE_DECL mpi_limb_t
+mpihelp_add_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+	      mpi_size_t s1_size, mpi_limb_t s2_limb)
+{
+	mpi_limb_t x;
+
+	x = *s1_ptr++;
+	s2_limb += x;
+	*res_ptr++ = s2_limb;
+	if (s2_limb < x) {	/* sum is less than the left operand: handle carry */
+		while (--s1_size) {
+			x = *s1_ptr++ + 1;	/* add carry */
+			*res_ptr++ = x;	/* and store */
+			if (x)	/* not 0 (no overflow): we can stop */
+				goto leave;
+		}
+		return 1;	/* return carry (size of s1 to small) */
+	}
+
+leave:
+	if (res_ptr != s1_ptr) {	/* not the same variable */
+		mpi_size_t i;	/* copy the rest */
+		for (i = 0; i < s1_size - 1; i++)
+			res_ptr[i] = s1_ptr[i];
+	}
+	return 0;		/* no carry */
+}
+
+G10_MPI_INLINE_DECL mpi_limb_t
+mpihelp_add(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size,
+	    mpi_ptr_t s2_ptr, mpi_size_t s2_size)
+{
+	mpi_limb_t cy = 0;
+
+	if (s2_size)
+		cy = mpihelp_add_n(res_ptr, s1_ptr, s2_ptr, s2_size);
+
+	if (s1_size - s2_size)
+		cy = mpihelp_add_1(res_ptr + s2_size, s1_ptr + s2_size,
+				   s1_size - s2_size, cy);
+	return cy;
+}
+
+G10_MPI_INLINE_DECL mpi_limb_t
+mpihelp_sub_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+	      mpi_size_t s1_size, mpi_limb_t s2_limb)
+{
+	mpi_limb_t x;
+
+	x = *s1_ptr++;
+	s2_limb = x - s2_limb;
+	*res_ptr++ = s2_limb;
+	if (s2_limb > x) {
+		while (--s1_size) {
+			x = *s1_ptr++;
+			*res_ptr++ = x - 1;
+			if (x)
+				goto leave;
+		}
+		return 1;
+	}
+
+leave:
+	if (res_ptr != s1_ptr) {
+		mpi_size_t i;
+		for (i = 0; i < s1_size - 1; i++)
+			res_ptr[i] = s1_ptr[i];
+	}
+	return 0;
+}
+
+G10_MPI_INLINE_DECL mpi_limb_t
+mpihelp_sub(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size,
+	    mpi_ptr_t s2_ptr, mpi_size_t s2_size)
+{
+	mpi_limb_t cy = 0;
+
+	if (s2_size)
+		cy = mpihelp_sub_n(res_ptr, s1_ptr, s2_ptr, s2_size);
+
+	if (s1_size - s2_size)
+		cy = mpihelp_sub_1(res_ptr + s2_size, s1_ptr + s2_size,
+				   s1_size - s2_size, cy);
+	return cy;
+}
+
+#endif /*G10_MPI_INLINE_H */
diff --git a/lib/crypto/mpi/mpi-internal.h b/lib/crypto/mpi/mpi-internal.h
new file mode 100644
index 000000000000..8a4f49e3043c
--- /dev/null
+++ b/lib/crypto/mpi/mpi-internal.h
@@ -0,0 +1,223 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* mpi-internal.h  -  Internal to the Multi Precision Integers
+ *	Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+ *	Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+#ifndef G10_MPI_INTERNAL_H
+#define G10_MPI_INTERNAL_H
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/mpi.h>
+#include <linux/errno.h>
+
+#define log_debug printk
+#define log_bug printk
+
+#define assert(x) \
+	do { \
+		if (!x) \
+			log_bug("failed assertion\n"); \
+	} while (0);
+
+/* If KARATSUBA_THRESHOLD is not already defined, define it to a
+ * value which is good on most machines.  */
+
+/* tested 4, 16, 32 and 64, where 16 gave the best performance when
+ * checking a 768 and a 1024 bit ElGamal signature.
+ * (wk 22.12.97) */
+#ifndef KARATSUBA_THRESHOLD
+#define KARATSUBA_THRESHOLD 16
+#endif
+
+/* The code can't handle KARATSUBA_THRESHOLD smaller than 2.  */
+#if KARATSUBA_THRESHOLD < 2
+#undef KARATSUBA_THRESHOLD
+#define KARATSUBA_THRESHOLD 2
+#endif
+
+typedef mpi_limb_t *mpi_ptr_t;	/* pointer to a limb */
+typedef int mpi_size_t;		/* (must be a signed type) */
+
+static inline int RESIZE_IF_NEEDED(MPI a, unsigned b)
+{
+	if (a->alloced < b)
+		return mpi_resize(a, b);
+	return 0;
+}
+
+/* Copy N limbs from S to D.  */
+#define MPN_COPY(d, s, n) \
+	do {					\
+		mpi_size_t _i;			\
+		for (_i = 0; _i < (n); _i++)	\
+			(d)[_i] = (s)[_i];	\
+	} while (0)
+
+#define MPN_COPY_DECR(d, s, n) \
+	do {					\
+		mpi_size_t _i;			\
+		for (_i = (n)-1; _i >= 0; _i--) \
+			(d)[_i] = (s)[_i];	\
+	} while (0)
+
+/* Zero N limbs at D */
+#define MPN_ZERO(d, n) \
+	do {					\
+		int  _i;			\
+		for (_i = 0; _i < (n); _i++)	\
+			(d)[_i] = 0;		\
+	} while (0)
+
+#define MPN_NORMALIZE(d, n)  \
+	do {					\
+		while ((n) > 0) {		\
+			if ((d)[(n)-1])		\
+				break;		\
+			(n)--;			\
+		}				\
+	} while (0)
+
+#define MPN_MUL_N_RECURSE(prodp, up, vp, size, tspace) \
+	do {							\
+		if ((size) < KARATSUBA_THRESHOLD)		\
+			mul_n_basecase(prodp, up, vp, size);	\
+		else						\
+			mul_n(prodp, up, vp, size, tspace);	\
+	} while (0);
+
+/* Divide the two-limb number in (NH,,NL) by D, with DI being the largest
+ * limb not larger than (2**(2*BITS_PER_MP_LIMB))/D - (2**BITS_PER_MP_LIMB).
+ * If this would yield overflow, DI should be the largest possible number
+ * (i.e., only ones).  For correct operation, the most significant bit of D
+ * has to be set.  Put the quotient in Q and the remainder in R.
+ */
+#define UDIV_QRNND_PREINV(q, r, nh, nl, d, di)				\
+	do {								\
+		mpi_limb_t _ql __maybe_unused;				\
+		mpi_limb_t _q, _r;					\
+		mpi_limb_t _xh, _xl;					\
+		umul_ppmm(_q, _ql, (nh), (di));				\
+		_q += (nh);	/* DI is 2**BITS_PER_MPI_LIMB too small */ \
+		umul_ppmm(_xh, _xl, _q, (d));				\
+		sub_ddmmss(_xh, _r, (nh), (nl), _xh, _xl);		\
+		if (_xh) {						\
+			sub_ddmmss(_xh, _r, _xh, _r, 0, (d));		\
+			_q++;						\
+			if (_xh) {					\
+				sub_ddmmss(_xh, _r, _xh, _r, 0, (d));	\
+				_q++;					\
+			}						\
+		}							\
+		if (_r >= (d)) {					\
+			_r -= (d);					\
+			_q++;						\
+		}							\
+		(r) = _r;						\
+		(q) = _q;						\
+	} while (0)
+
+
+/*-- mpiutil.c --*/
+mpi_ptr_t mpi_alloc_limb_space(unsigned nlimbs);
+void mpi_free_limb_space(mpi_ptr_t a);
+void mpi_assign_limb_space(MPI a, mpi_ptr_t ap, unsigned nlimbs);
+
+static inline mpi_limb_t mpihelp_add_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+			 mpi_size_t s1_size, mpi_limb_t s2_limb);
+mpi_limb_t mpihelp_add_n(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+			 mpi_ptr_t s2_ptr, mpi_size_t size);
+static inline mpi_limb_t mpihelp_add(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size,
+		       mpi_ptr_t s2_ptr, mpi_size_t s2_size);
+
+static inline mpi_limb_t mpihelp_sub_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+			 mpi_size_t s1_size, mpi_limb_t s2_limb);
+mpi_limb_t mpihelp_sub_n(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+			 mpi_ptr_t s2_ptr, mpi_size_t size);
+static inline mpi_limb_t mpihelp_sub(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size,
+		       mpi_ptr_t s2_ptr, mpi_size_t s2_size);
+
+/*-- mpih-cmp.c --*/
+int mpihelp_cmp(mpi_ptr_t op1_ptr, mpi_ptr_t op2_ptr, mpi_size_t size);
+
+/*-- mpih-mul.c --*/
+
+struct karatsuba_ctx {
+	struct karatsuba_ctx *next;
+	mpi_ptr_t tspace;
+	mpi_size_t tspace_size;
+	mpi_ptr_t tp;
+	mpi_size_t tp_size;
+};
+
+void mpihelp_release_karatsuba_ctx(struct karatsuba_ctx *ctx);
+
+mpi_limb_t mpihelp_addmul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+			    mpi_size_t s1_size, mpi_limb_t s2_limb);
+mpi_limb_t mpihelp_submul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+			    mpi_size_t s1_size, mpi_limb_t s2_limb);
+int mpihelp_mul(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize,
+		mpi_ptr_t vp, mpi_size_t vsize, mpi_limb_t *_result);
+void mpih_sqr_n_basecase(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size);
+void mpih_sqr_n(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size,
+		mpi_ptr_t tspace);
+
+int mpihelp_mul_karatsuba_case(mpi_ptr_t prodp,
+			       mpi_ptr_t up, mpi_size_t usize,
+			       mpi_ptr_t vp, mpi_size_t vsize,
+			       struct karatsuba_ctx *ctx);
+
+/*-- generic_mpih-mul1.c --*/
+mpi_limb_t mpihelp_mul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+			 mpi_size_t s1_size, mpi_limb_t s2_limb);
+
+/*-- mpih-div.c --*/
+mpi_limb_t mpihelp_mod_1(mpi_ptr_t dividend_ptr, mpi_size_t dividend_size,
+			 mpi_limb_t divisor_limb);
+mpi_limb_t mpihelp_divrem(mpi_ptr_t qp, mpi_size_t qextra_limbs,
+			  mpi_ptr_t np, mpi_size_t nsize,
+			  mpi_ptr_t dp, mpi_size_t dsize);
+mpi_limb_t mpihelp_divmod_1(mpi_ptr_t quot_ptr,
+			    mpi_ptr_t dividend_ptr, mpi_size_t dividend_size,
+			    mpi_limb_t divisor_limb);
+
+/*-- generic_mpih-[lr]shift.c --*/
+mpi_limb_t mpihelp_lshift(mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize,
+			  unsigned cnt);
+mpi_limb_t mpihelp_rshift(mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize,
+			  unsigned cnt);
+
+/* Define stuff for longlong.h.  */
+#define W_TYPE_SIZE BITS_PER_MPI_LIMB
+typedef mpi_limb_t UWtype;
+typedef unsigned int UHWtype;
+#if defined(__GNUC__)
+typedef unsigned int UQItype __attribute__ ((mode(QI)));
+typedef int SItype __attribute__ ((mode(SI)));
+typedef unsigned int USItype __attribute__ ((mode(SI)));
+typedef int DItype __attribute__ ((mode(DI)));
+typedef unsigned int UDItype __attribute__ ((mode(DI)));
+#else
+typedef unsigned char UQItype;
+typedef long SItype;
+typedef unsigned long USItype;
+#endif
+
+#ifdef __GNUC__
+#include "mpi-inline.h"
+#endif
+
+#endif /*G10_MPI_INTERNAL_H */
diff --git a/lib/crypto/mpi/mpi-mod.c b/lib/crypto/mpi/mpi-mod.c
new file mode 100644
index 000000000000..d5fdaec3d0b6
--- /dev/null
+++ b/lib/crypto/mpi/mpi-mod.c
@@ -0,0 +1,13 @@
+/* mpi-mod.c -  Modular reduction
+ * Copyright (C) 1998, 1999, 2001, 2002, 2003,
+ *               2007  Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ */
+
+#include "mpi-internal.h"
+
+int mpi_mod(MPI rem, MPI dividend, MPI divisor)
+{
+	return mpi_fdiv_r(rem, dividend, divisor);
+}
diff --git a/lib/crypto/mpi/mpi-mul.c b/lib/crypto/mpi/mpi-mul.c
new file mode 100644
index 000000000000..d79f186ad90b
--- /dev/null
+++ b/lib/crypto/mpi/mpi-mul.c
@@ -0,0 +1,111 @@
+/* mpi-mul.c  -  MPI functions
+ * Copyright (C) 1994, 1996, 1998, 2001, 2002,
+ *               2003 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ */
+
+#include <linux/export.h>
+
+#include "mpi-internal.h"
+
+int mpi_mul(MPI w, MPI u, MPI v)
+{
+	mpi_size_t usize, vsize, wsize;
+	mpi_ptr_t up, vp, wp;
+	mpi_limb_t cy;
+	int usign, vsign, sign_product;
+	int assign_wp = 0;
+	mpi_ptr_t tmp_limb = NULL;
+	int err = 0;
+
+	if (u->nlimbs < v->nlimbs) {
+		/* Swap U and V. */
+		usize = v->nlimbs;
+		usign = v->sign;
+		up    = v->d;
+		vsize = u->nlimbs;
+		vsign = u->sign;
+		vp    = u->d;
+	} else {
+		usize = u->nlimbs;
+		usign = u->sign;
+		up    = u->d;
+		vsize = v->nlimbs;
+		vsign = v->sign;
+		vp    = v->d;
+	}
+	sign_product = usign ^ vsign;
+	wp = w->d;
+
+	/* Ensure W has space enough to store the result.  */
+	wsize = usize + vsize;
+	if (w->alloced < wsize) {
+		if (wp == up || wp == vp) {
+			wp = mpi_alloc_limb_space(wsize);
+			if (!wp)
+				return -ENOMEM;
+			assign_wp = 1;
+		} else {
+			err = mpi_resize(w, wsize);
+			if (err)
+				return err;
+			wp = w->d;
+		}
+	} else { /* Make U and V not overlap with W.	*/
+		if (wp == up) {
+			/* W and U are identical.  Allocate temporary space for U. */
+			up = tmp_limb = mpi_alloc_limb_space(usize);
+			if (!up)
+				return -ENOMEM;
+			/* Is V identical too?  Keep it identical with U.  */
+			if (wp == vp)
+				vp = up;
+			/* Copy to the temporary space.  */
+			MPN_COPY(up, wp, usize);
+		} else if (wp == vp) {
+			/* W and V are identical.  Allocate temporary space for V. */
+			vp = tmp_limb = mpi_alloc_limb_space(vsize);
+			if (!vp)
+				return -ENOMEM;
+			/* Copy to the temporary space.  */
+			MPN_COPY(vp, wp, vsize);
+		}
+	}
+
+	if (!vsize)
+		wsize = 0;
+	else {
+		err = mpihelp_mul(wp, up, usize, vp, vsize, &cy);
+		if (err) {
+			if (assign_wp)
+				mpi_free_limb_space(wp);
+			goto free_tmp_limb;
+		}
+		wsize -= cy ? 0:1;
+	}
+
+	if (assign_wp)
+		mpi_assign_limb_space(w, wp, wsize);
+	w->nlimbs = wsize;
+	w->sign = sign_product;
+
+free_tmp_limb:
+	if (tmp_limb)
+		mpi_free_limb_space(tmp_limb);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mpi_mul);
+
+int mpi_mulm(MPI w, MPI u, MPI v, MPI m)
+{
+	return mpi_mul(w, u, v) ?:
+	       mpi_tdiv_r(w, w, m);
+}
+EXPORT_SYMBOL_GPL(mpi_mulm);
diff --git a/lib/crypto/mpi/mpi-pow.c b/lib/crypto/mpi/mpi-pow.c
new file mode 100644
index 000000000000..9e695a3bda3a
--- /dev/null
+++ b/lib/crypto/mpi/mpi-pow.c
@@ -0,0 +1,311 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpi-pow.c  -  MPI functions
+ *	Copyright (C) 1994, 1996, 1998, 2000 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+/****************
+ * RES = BASE ^ EXP mod MOD
+ */
+int mpi_powm(MPI res, MPI base, MPI exp, MPI mod)
+{
+	mpi_ptr_t mp_marker = NULL, bp_marker = NULL, ep_marker = NULL;
+	struct karatsuba_ctx karactx = {};
+	mpi_ptr_t xp_marker = NULL;
+	mpi_ptr_t tspace = NULL;
+	mpi_ptr_t rp, ep, mp, bp;
+	mpi_size_t esize, msize, bsize, rsize;
+	int msign, bsign, rsign;
+	mpi_size_t size;
+	int mod_shift_cnt;
+	int negative_result;
+	int assign_rp = 0;
+	mpi_size_t tsize = 0;	/* to avoid compiler warning */
+	/* fixme: we should check that the warning is void */
+	int rc = -ENOMEM;
+
+	esize = exp->nlimbs;
+	msize = mod->nlimbs;
+	size = 2 * msize;
+	msign = mod->sign;
+
+	rp = res->d;
+	ep = exp->d;
+
+	if (!msize)
+		return -EINVAL;
+
+	if (!esize) {
+		/* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0
+		 * depending on if MOD equals 1.  */
+		res->nlimbs = (msize == 1 && mod->d[0] == 1) ? 0 : 1;
+		if (res->nlimbs) {
+			if (mpi_resize(res, 1) < 0)
+				goto enomem;
+			rp = res->d;
+			rp[0] = 1;
+		}
+		res->sign = 0;
+		goto leave;
+	}
+
+	/* Normalize MOD (i.e. make its most significant bit set) as required by
+	 * mpn_divrem.  This will make the intermediate values in the calculation
+	 * slightly larger, but the correct result is obtained after a final
+	 * reduction using the original MOD value.  */
+	mp = mp_marker = mpi_alloc_limb_space(msize);
+	if (!mp)
+		goto enomem;
+	mod_shift_cnt = count_leading_zeros(mod->d[msize - 1]);
+	if (mod_shift_cnt)
+		mpihelp_lshift(mp, mod->d, msize, mod_shift_cnt);
+	else
+		MPN_COPY(mp, mod->d, msize);
+
+	bsize = base->nlimbs;
+	bsign = base->sign;
+	if (bsize > msize) {	/* The base is larger than the module. Reduce it. */
+		/* Allocate (BSIZE + 1) with space for remainder and quotient.
+		 * (The quotient is (bsize - msize + 1) limbs.)  */
+		bp = bp_marker = mpi_alloc_limb_space(bsize + 1);
+		if (!bp)
+			goto enomem;
+		MPN_COPY(bp, base->d, bsize);
+		/* We don't care about the quotient, store it above the remainder,
+		 * at BP + MSIZE.  */
+		mpihelp_divrem(bp + msize, 0, bp, bsize, mp, msize);
+		bsize = msize;
+		/* Canonicalize the base, since we are going to multiply with it
+		 * quite a few times.  */
+		MPN_NORMALIZE(bp, bsize);
+	} else
+		bp = base->d;
+
+	if (!bsize) {
+		res->nlimbs = 0;
+		res->sign = 0;
+		goto leave;
+	}
+
+	if (res->alloced < size) {
+		/* We have to allocate more space for RES.  If any of the input
+		 * parameters are identical to RES, defer deallocation of the old
+		 * space.  */
+		if (rp == ep || rp == mp || rp == bp) {
+			rp = mpi_alloc_limb_space(size);
+			if (!rp)
+				goto enomem;
+			assign_rp = 1;
+		} else {
+			if (mpi_resize(res, size) < 0)
+				goto enomem;
+			rp = res->d;
+		}
+	} else {		/* Make BASE, EXP and MOD not overlap with RES.  */
+		if (rp == bp) {
+			/* RES and BASE are identical.  Allocate temp. space for BASE.  */
+			BUG_ON(bp_marker);
+			bp = bp_marker = mpi_alloc_limb_space(bsize);
+			if (!bp)
+				goto enomem;
+			MPN_COPY(bp, rp, bsize);
+		}
+		if (rp == ep) {
+			/* RES and EXP are identical.  Allocate temp. space for EXP.  */
+			ep = ep_marker = mpi_alloc_limb_space(esize);
+			if (!ep)
+				goto enomem;
+			MPN_COPY(ep, rp, esize);
+		}
+		if (rp == mp) {
+			/* RES and MOD are identical.  Allocate temporary space for MOD. */
+			BUG_ON(mp_marker);
+			mp = mp_marker = mpi_alloc_limb_space(msize);
+			if (!mp)
+				goto enomem;
+			MPN_COPY(mp, rp, msize);
+		}
+	}
+
+	MPN_COPY(rp, bp, bsize);
+	rsize = bsize;
+	rsign = bsign;
+
+	{
+		mpi_size_t i;
+		mpi_ptr_t xp;
+		int c;
+		mpi_limb_t e;
+		mpi_limb_t carry_limb;
+
+		xp = xp_marker = mpi_alloc_limb_space(2 * (msize + 1));
+		if (!xp)
+			goto enomem;
+
+		negative_result = (ep[0] & 1) && base->sign;
+
+		i = esize - 1;
+		e = ep[i];
+		c = count_leading_zeros(e);
+		e = (e << c) << 1;	/* shift the exp bits to the left, lose msb */
+		c = BITS_PER_MPI_LIMB - 1 - c;
+
+		/* Main loop.
+		 *
+		 * Make the result be pointed to alternately by XP and RP.  This
+		 * helps us avoid block copying, which would otherwise be necessary
+		 * with the overlap restrictions of mpihelp_divmod. With 50% probability
+		 * the result after this loop will be in the area originally pointed
+		 * by RP (==RES->d), and with 50% probability in the area originally
+		 * pointed to by XP.
+		 */
+
+		for (;;) {
+			while (c) {
+				mpi_size_t xsize;
+
+				/*if (mpihelp_mul_n(xp, rp, rp, rsize) < 0) goto enomem */
+				if (rsize < KARATSUBA_THRESHOLD)
+					mpih_sqr_n_basecase(xp, rp, rsize);
+				else {
+					if (!tspace) {
+						tsize = 2 * rsize;
+						tspace =
+						    mpi_alloc_limb_space(tsize);
+						if (!tspace)
+							goto enomem;
+					} else if (tsize < (2 * rsize)) {
+						mpi_free_limb_space(tspace);
+						tsize = 2 * rsize;
+						tspace =
+						    mpi_alloc_limb_space(tsize);
+						if (!tspace)
+							goto enomem;
+					}
+					mpih_sqr_n(xp, rp, rsize, tspace);
+				}
+
+				xsize = 2 * rsize;
+				if (xsize > msize) {
+					mpihelp_divrem(xp + msize, 0, xp, xsize,
+						       mp, msize);
+					xsize = msize;
+				}
+
+				swap(rp, xp);
+				rsize = xsize;
+
+				if ((mpi_limb_signed_t) e < 0) {
+					/*mpihelp_mul( xp, rp, rsize, bp, bsize ); */
+					if (bsize < KARATSUBA_THRESHOLD) {
+						mpi_limb_t tmp;
+						if (mpihelp_mul
+						    (xp, rp, rsize, bp, bsize,
+						     &tmp) < 0)
+							goto enomem;
+					} else {
+						if (mpihelp_mul_karatsuba_case
+						    (xp, rp, rsize, bp, bsize,
+						     &karactx) < 0)
+							goto enomem;
+					}
+
+					xsize = rsize + bsize;
+					if (xsize > msize) {
+						mpihelp_divrem(xp + msize, 0,
+							       xp, xsize, mp,
+							       msize);
+						xsize = msize;
+					}
+
+					swap(rp, xp);
+					rsize = xsize;
+				}
+				e <<= 1;
+				c--;
+				cond_resched();
+			}
+
+			i--;
+			if (i < 0)
+				break;
+			e = ep[i];
+			c = BITS_PER_MPI_LIMB;
+		}
+
+		/* We shifted MOD, the modulo reduction argument, left MOD_SHIFT_CNT
+		 * steps.  Adjust the result by reducing it with the original MOD.
+		 *
+		 * Also make sure the result is put in RES->d (where it already
+		 * might be, see above).
+		 */
+		if (mod_shift_cnt) {
+			carry_limb =
+			    mpihelp_lshift(res->d, rp, rsize, mod_shift_cnt);
+			rp = res->d;
+			if (carry_limb) {
+				rp[rsize] = carry_limb;
+				rsize++;
+			}
+		} else {
+			MPN_COPY(res->d, rp, rsize);
+			rp = res->d;
+		}
+
+		if (rsize >= msize) {
+			mpihelp_divrem(rp + msize, 0, rp, rsize, mp, msize);
+			rsize = msize;
+		}
+
+		/* Remove any leading zero words from the result.  */
+		if (mod_shift_cnt)
+			mpihelp_rshift(rp, rp, rsize, mod_shift_cnt);
+		MPN_NORMALIZE(rp, rsize);
+	}
+
+	if (negative_result && rsize) {
+		if (mod_shift_cnt)
+			mpihelp_rshift(mp, mp, msize, mod_shift_cnt);
+		mpihelp_sub(rp, mp, msize, rp, rsize);
+		rsize = msize;
+		rsign = msign;
+		MPN_NORMALIZE(rp, rsize);
+	}
+	res->nlimbs = rsize;
+	res->sign = rsign;
+
+leave:
+	rc = 0;
+enomem:
+	mpihelp_release_karatsuba_ctx(&karactx);
+	if (assign_rp)
+		mpi_assign_limb_space(res, rp, size);
+	if (mp_marker)
+		mpi_free_limb_space(mp_marker);
+	if (bp_marker)
+		mpi_free_limb_space(bp_marker);
+	if (ep_marker)
+		mpi_free_limb_space(ep_marker);
+	if (xp_marker)
+		mpi_free_limb_space(xp_marker);
+	if (tspace)
+		mpi_free_limb_space(tspace);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(mpi_powm);
diff --git a/lib/crypto/mpi/mpi-sub-ui.c b/lib/crypto/mpi/mpi-sub-ui.c
new file mode 100644
index 000000000000..0edcdbd24833
--- /dev/null
+++ b/lib/crypto/mpi/mpi-sub-ui.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpi-sub-ui.c - Subtract an unsigned integer from an MPI.
+ *
+ * Copyright 1991, 1993, 1994, 1996, 1999-2002, 2004, 2012, 2013, 2015
+ * Free Software Foundation, Inc.
+ *
+ * This file was based on the GNU MP Library source file:
+ * https://gmplib.org/repo/gmp-6.2/file/510b83519d1c/mpz/aors_ui.h
+ *
+ * The GNU MP Library is free software; you can redistribute it and/or modify
+ * it under the terms of either:
+ *
+ *   * the GNU Lesser General Public License as published by the Free
+ *     Software Foundation; either version 3 of the License, or (at your
+ *     option) any later version.
+ *
+ * or
+ *
+ *   * the GNU General Public License as published by the Free Software
+ *     Foundation; either version 2 of the License, or (at your option) any
+ *     later version.
+ *
+ * or both in parallel, as here.
+ *
+ * The GNU MP Library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ *
+ * You should have received copies of the GNU General Public License and the
+ * GNU Lesser General Public License along with the GNU MP Library.  If not,
+ * see https://www.gnu.org/licenses/.
+ */
+
+#include <linux/export.h>
+
+#include "mpi-internal.h"
+
+int mpi_sub_ui(MPI w, MPI u, unsigned long vval)
+{
+	if (u->nlimbs == 0) {
+		if (mpi_resize(w, 1) < 0)
+			return -ENOMEM;
+		w->d[0] = vval;
+		w->nlimbs = (vval != 0);
+		w->sign = (vval != 0);
+		return 0;
+	}
+
+	/* If not space for W (and possible carry), increase space. */
+	if (mpi_resize(w, u->nlimbs + 1))
+		return -ENOMEM;
+
+	if (u->sign) {
+		mpi_limb_t cy;
+
+		cy = mpihelp_add_1(w->d, u->d, u->nlimbs, (mpi_limb_t) vval);
+		w->d[u->nlimbs] = cy;
+		w->nlimbs = u->nlimbs + cy;
+		w->sign = 1;
+	} else {
+		/* The signs are different.  Need exact comparison to determine
+		 * which operand to subtract from which.
+		 */
+		if (u->nlimbs == 1 && u->d[0] < vval) {
+			w->d[0] = vval - u->d[0];
+			w->nlimbs = 1;
+			w->sign = 1;
+		} else {
+			mpihelp_sub_1(w->d, u->d, u->nlimbs, (mpi_limb_t) vval);
+			/* Size can decrease with at most one limb. */
+			w->nlimbs = (u->nlimbs - (w->d[u->nlimbs - 1] == 0));
+			w->sign = 0;
+		}
+	}
+
+	mpi_normalize(w);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mpi_sub_ui);
diff --git a/lib/crypto/mpi/mpicoder.c b/lib/crypto/mpi/mpicoder.c
new file mode 100644
index 000000000000..bf716a03c704
--- /dev/null
+++ b/lib/crypto/mpi/mpicoder.c
@@ -0,0 +1,417 @@
+/* mpicoder.c  -  Coder for the external representation of MPIs
+ * Copyright (C) 1998, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include <linux/bitops.h>
+#include <linux/byteorder/generic.h>
+#include <linux/count_zeros.h>
+#include <linux/export.h>
+#include <linux/scatterlist.h>
+#include <linux/string.h>
+#include "mpi-internal.h"
+
+#define MAX_EXTERN_MPI_BITS 16384
+
+/**
+ * mpi_read_raw_data - Read a raw byte stream as a positive integer
+ * @xbuffer: The data to read
+ * @nbytes: The amount of data to read
+ */
+MPI mpi_read_raw_data(const void *xbuffer, size_t nbytes)
+{
+	const uint8_t *buffer = xbuffer;
+	int i, j;
+	unsigned nbits, nlimbs;
+	mpi_limb_t a;
+	MPI val = NULL;
+
+	while (nbytes > 0 && buffer[0] == 0) {
+		buffer++;
+		nbytes--;
+	}
+
+	nbits = nbytes * 8;
+	if (nbits > MAX_EXTERN_MPI_BITS) {
+		pr_info("MPI: mpi too large (%u bits)\n", nbits);
+		return NULL;
+	}
+	if (nbytes > 0)
+		nbits -= count_leading_zeros(buffer[0]) - (BITS_PER_LONG - 8);
+
+	nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB);
+	val = mpi_alloc(nlimbs);
+	if (!val)
+		return NULL;
+	val->nbits = nbits;
+	val->sign = 0;
+	val->nlimbs = nlimbs;
+
+	if (nbytes > 0) {
+		i = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB;
+		i %= BYTES_PER_MPI_LIMB;
+		for (j = nlimbs; j > 0; j--) {
+			a = 0;
+			for (; i < BYTES_PER_MPI_LIMB; i++) {
+				a <<= 8;
+				a |= *buffer++;
+			}
+			i = 0;
+			val->d[j - 1] = a;
+		}
+	}
+	return val;
+}
+EXPORT_SYMBOL_GPL(mpi_read_raw_data);
+
+MPI mpi_read_from_buffer(const void *xbuffer, unsigned *ret_nread)
+{
+	const uint8_t *buffer = xbuffer;
+	unsigned int nbits, nbytes;
+	MPI val;
+
+	if (*ret_nread < 2)
+		return ERR_PTR(-EINVAL);
+	nbits = buffer[0] << 8 | buffer[1];
+
+	if (nbits > MAX_EXTERN_MPI_BITS) {
+		pr_info("MPI: mpi too large (%u bits)\n", nbits);
+		return ERR_PTR(-EINVAL);
+	}
+
+	nbytes = DIV_ROUND_UP(nbits, 8);
+	if (nbytes + 2 > *ret_nread) {
+		pr_info("MPI: mpi larger than buffer nbytes=%u ret_nread=%u\n",
+				nbytes, *ret_nread);
+		return ERR_PTR(-EINVAL);
+	}
+
+	val = mpi_read_raw_data(buffer + 2, nbytes);
+	if (!val)
+		return ERR_PTR(-ENOMEM);
+
+	*ret_nread = nbytes + 2;
+	return val;
+}
+EXPORT_SYMBOL_GPL(mpi_read_from_buffer);
+
+static int count_lzeros(MPI a)
+{
+	mpi_limb_t alimb;
+	int i, lzeros = 0;
+
+	for (i = a->nlimbs - 1; i >= 0; i--) {
+		alimb = a->d[i];
+		if (alimb == 0) {
+			lzeros += sizeof(mpi_limb_t);
+		} else {
+			lzeros += count_leading_zeros(alimb) / 8;
+			break;
+		}
+	}
+	return lzeros;
+}
+
+/**
+ * mpi_read_buffer() - read MPI to a buffer provided by user (msb first)
+ *
+ * @a:		a multi precision integer
+ * @buf:	buffer to which the output will be written to. Needs to be at
+ *		least mpi_get_size(a) long.
+ * @buf_len:	size of the buf.
+ * @nbytes:	receives the actual length of the data written on success and
+ *		the data to-be-written on -EOVERFLOW in case buf_len was too
+ *		small.
+ * @sign:	if not NULL, it will be set to the sign of a.
+ *
+ * Return:	0 on success or error code in case of error
+ */
+int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes,
+		    int *sign)
+{
+	uint8_t *p;
+#if BYTES_PER_MPI_LIMB == 4
+	__be32 alimb;
+#elif BYTES_PER_MPI_LIMB == 8
+	__be64 alimb;
+#else
+#error please implement for this limb size.
+#endif
+	unsigned int n = mpi_get_size(a);
+	int i, lzeros;
+
+	if (!buf || !nbytes)
+		return -EINVAL;
+
+	if (sign)
+		*sign = a->sign;
+
+	lzeros = count_lzeros(a);
+
+	if (buf_len < n - lzeros) {
+		*nbytes = n - lzeros;
+		return -EOVERFLOW;
+	}
+
+	p = buf;
+	*nbytes = n - lzeros;
+
+	for (i = a->nlimbs - 1 - lzeros / BYTES_PER_MPI_LIMB,
+			lzeros %= BYTES_PER_MPI_LIMB;
+		i >= 0; i--) {
+#if BYTES_PER_MPI_LIMB == 4
+		alimb = cpu_to_be32(a->d[i]);
+#elif BYTES_PER_MPI_LIMB == 8
+		alimb = cpu_to_be64(a->d[i]);
+#else
+#error please implement for this limb size.
+#endif
+		memcpy(p, (u8 *)&alimb + lzeros, BYTES_PER_MPI_LIMB - lzeros);
+		p += BYTES_PER_MPI_LIMB - lzeros;
+		lzeros = 0;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mpi_read_buffer);
+
+/*
+ * mpi_get_buffer() - Returns an allocated buffer with the MPI (msb first).
+ * Caller must free the return string.
+ * This function does return a 0 byte buffer with nbytes set to zero if the
+ * value of A is zero.
+ *
+ * @a:		a multi precision integer.
+ * @nbytes:	receives the length of this buffer.
+ * @sign:	if not NULL, it will be set to the sign of the a.
+ *
+ * Return:	Pointer to MPI buffer or NULL on error
+ */
+void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign)
+{
+	uint8_t *buf;
+	unsigned int n;
+	int ret;
+
+	if (!nbytes)
+		return NULL;
+
+	n = mpi_get_size(a);
+
+	if (!n)
+		n++;
+
+	buf = kmalloc(n, GFP_KERNEL);
+
+	if (!buf)
+		return NULL;
+
+	ret = mpi_read_buffer(a, buf, n, nbytes, sign);
+
+	if (ret) {
+		kfree(buf);
+		return NULL;
+	}
+	return buf;
+}
+EXPORT_SYMBOL_GPL(mpi_get_buffer);
+
+/**
+ * mpi_write_to_sgl() - Funnction exports MPI to an sgl (msb first)
+ *
+ * This function works in the same way as the mpi_read_buffer, but it
+ * takes an sgl instead of u8 * buf.
+ *
+ * @a:		a multi precision integer
+ * @sgl:	scatterlist to write to. Needs to be at least
+ *		mpi_get_size(a) long.
+ * @nbytes:	the number of bytes to write.  Leading bytes will be
+ *		filled with zero.
+ * @sign:	if not NULL, it will be set to the sign of a.
+ *
+ * Return:	0 on success or error code in case of error
+ */
+int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned nbytes,
+		     int *sign)
+{
+	u8 *p, *p2;
+#if BYTES_PER_MPI_LIMB == 4
+	__be32 alimb;
+#elif BYTES_PER_MPI_LIMB == 8
+	__be64 alimb;
+#else
+#error please implement for this limb size.
+#endif
+	unsigned int n = mpi_get_size(a);
+	struct sg_mapping_iter miter;
+	int i, x, buf_len;
+	int nents;
+
+	if (sign)
+		*sign = a->sign;
+
+	if (nbytes < n)
+		return -EOVERFLOW;
+
+	nents = sg_nents_for_len(sgl, nbytes);
+	if (nents < 0)
+		return -EINVAL;
+
+	sg_miter_start(&miter, sgl, nents, SG_MITER_ATOMIC | SG_MITER_TO_SG);
+	sg_miter_next(&miter);
+	buf_len = miter.length;
+	p2 = miter.addr;
+
+	while (nbytes > n) {
+		i = min_t(unsigned, nbytes - n, buf_len);
+		memset(p2, 0, i);
+		p2 += i;
+		nbytes -= i;
+
+		buf_len -= i;
+		if (!buf_len) {
+			sg_miter_next(&miter);
+			buf_len = miter.length;
+			p2 = miter.addr;
+		}
+	}
+
+	for (i = a->nlimbs - 1; i >= 0; i--) {
+#if BYTES_PER_MPI_LIMB == 4
+		alimb = a->d[i] ? cpu_to_be32(a->d[i]) : 0;
+#elif BYTES_PER_MPI_LIMB == 8
+		alimb = a->d[i] ? cpu_to_be64(a->d[i]) : 0;
+#else
+#error please implement for this limb size.
+#endif
+		p = (u8 *)&alimb;
+
+		for (x = 0; x < sizeof(alimb); x++) {
+			*p2++ = *p++;
+			if (!--buf_len) {
+				sg_miter_next(&miter);
+				buf_len = miter.length;
+				p2 = miter.addr;
+			}
+		}
+	}
+
+	sg_miter_stop(&miter);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mpi_write_to_sgl);
+
+/*
+ * mpi_read_raw_from_sgl() - Function allocates an MPI and populates it with
+ *			     data from the sgl
+ *
+ * This function works in the same way as the mpi_read_raw_data, but it
+ * takes an sgl instead of void * buffer. i.e. it allocates
+ * a new MPI and reads the content of the sgl to the MPI.
+ *
+ * @sgl:	scatterlist to read from
+ * @nbytes:	number of bytes to read
+ *
+ * Return:	Pointer to a new MPI or NULL on error
+ */
+MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes)
+{
+	struct sg_mapping_iter miter;
+	unsigned int nbits, nlimbs;
+	int x, j, z, lzeros, ents;
+	unsigned int len;
+	const u8 *buff;
+	mpi_limb_t a;
+	MPI val = NULL;
+
+	ents = sg_nents_for_len(sgl, nbytes);
+	if (ents < 0)
+		return NULL;
+
+	sg_miter_start(&miter, sgl, ents, SG_MITER_ATOMIC | SG_MITER_FROM_SG);
+
+	lzeros = 0;
+	len = 0;
+	while (nbytes > 0) {
+		while (len && !*buff) {
+			lzeros++;
+			len--;
+			buff++;
+		}
+
+		if (len && *buff)
+			break;
+
+		sg_miter_next(&miter);
+		buff = miter.addr;
+		len = miter.length;
+
+		nbytes -= lzeros;
+		lzeros = 0;
+	}
+
+	miter.consumed = lzeros;
+
+	nbytes -= lzeros;
+	nbits = nbytes * 8;
+	if (nbits > MAX_EXTERN_MPI_BITS) {
+		sg_miter_stop(&miter);
+		pr_info("MPI: mpi too large (%u bits)\n", nbits);
+		return NULL;
+	}
+
+	if (nbytes > 0)
+		nbits -= count_leading_zeros(*buff) - (BITS_PER_LONG - 8);
+
+	sg_miter_stop(&miter);
+
+	nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB);
+	val = mpi_alloc(nlimbs);
+	if (!val)
+		return NULL;
+
+	val->nbits = nbits;
+	val->sign = 0;
+	val->nlimbs = nlimbs;
+
+	if (nbytes == 0)
+		return val;
+
+	j = nlimbs - 1;
+	a = 0;
+	z = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB;
+	z %= BYTES_PER_MPI_LIMB;
+
+	while (sg_miter_next(&miter)) {
+		buff = miter.addr;
+		len = min(miter.length, nbytes);
+		nbytes -= len;
+
+		for (x = 0; x < len; x++) {
+			a <<= 8;
+			a |= *buff++;
+			if (((z + x + 1) % BYTES_PER_MPI_LIMB) == 0) {
+				val->d[j--] = a;
+				a = 0;
+			}
+		}
+		z += x;
+	}
+
+	return val;
+}
+EXPORT_SYMBOL_GPL(mpi_read_raw_from_sgl);
diff --git a/lib/crypto/mpi/mpih-cmp.c b/lib/crypto/mpi/mpih-cmp.c
new file mode 100644
index 000000000000..f23709114a65
--- /dev/null
+++ b/lib/crypto/mpi/mpih-cmp.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpihelp-sub.c  -  MPI helper functions
+ *	Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+ *	Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+
+/****************
+ * Compare OP1_PTR/OP1_SIZE with OP2_PTR/OP2_SIZE.
+ * There are no restrictions on the relative sizes of
+ * the two arguments.
+ * Return 1 if OP1 > OP2, 0 if they are equal, and -1 if OP1 < OP2.
+ */
+int mpihelp_cmp(mpi_ptr_t op1_ptr, mpi_ptr_t op2_ptr, mpi_size_t size)
+{
+	mpi_size_t i;
+	mpi_limb_t op1_word, op2_word;
+
+	for (i = size - 1; i >= 0; i--) {
+		op1_word = op1_ptr[i];
+		op2_word = op2_ptr[i];
+		if (op1_word != op2_word)
+			goto diff;
+	}
+	return 0;
+
+diff:
+	/* This can *not* be simplified to
+	 *   op2_word - op2_word
+	 * since that expression might give signed overflow.  */
+	return (op1_word > op2_word) ? 1 : -1;
+}
diff --git a/lib/crypto/mpi/mpih-div.c b/lib/crypto/mpi/mpih-div.c
new file mode 100644
index 000000000000..be70ee2e42d3
--- /dev/null
+++ b/lib/crypto/mpi/mpih-div.c
@@ -0,0 +1,517 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpihelp-div.c  -  MPI helper functions
+ *	Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+ *	Copyright (C) 1998, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 1
+#endif
+#ifndef UDIV_TIME
+#define UDIV_TIME UMUL_TIME
+#endif
+
+
+mpi_limb_t
+mpihelp_mod_1(mpi_ptr_t dividend_ptr, mpi_size_t dividend_size,
+			mpi_limb_t divisor_limb)
+{
+	mpi_size_t i;
+	mpi_limb_t n1, n0, r;
+	mpi_limb_t dummy __maybe_unused;
+
+	/* Botch: Should this be handled at all?  Rely on callers?	*/
+	if (!dividend_size)
+		return 0;
+
+	/* If multiplication is much faster than division, and the
+	 * dividend is large, pre-invert the divisor, and use
+	 * only multiplications in the inner loop.
+	 *
+	 * This test should be read:
+	 *	 Does it ever help to use udiv_qrnnd_preinv?
+	 *	   && Does what we save compensate for the inversion overhead?
+	 */
+	if (UDIV_TIME > (2 * UMUL_TIME + 6)
+			&& (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME) {
+		int normalization_steps;
+
+		normalization_steps = count_leading_zeros(divisor_limb);
+		if (normalization_steps) {
+			mpi_limb_t divisor_limb_inverted;
+
+			divisor_limb <<= normalization_steps;
+
+			/* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB.  The
+			 * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
+			 * most significant bit (with weight 2**N) implicit.
+			 *
+			 * Special case for DIVISOR_LIMB == 100...000.
+			 */
+			if (!(divisor_limb << 1))
+				divisor_limb_inverted = ~(mpi_limb_t)0;
+			else
+				udiv_qrnnd(divisor_limb_inverted, dummy,
+						-divisor_limb, 0, divisor_limb);
+
+			n1 = dividend_ptr[dividend_size - 1];
+			r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps);
+
+			/* Possible optimization:
+			 * if (r == 0
+			 * && divisor_limb > ((n1 << normalization_steps)
+			 *		       | (dividend_ptr[dividend_size - 2] >> ...)))
+			 * ...one division less...
+			 */
+			for (i = dividend_size - 2; i >= 0; i--) {
+				n0 = dividend_ptr[i];
+				UDIV_QRNND_PREINV(dummy, r, r,
+						((n1 << normalization_steps)
+						 | (n0 >> (BITS_PER_MPI_LIMB - normalization_steps))),
+						divisor_limb, divisor_limb_inverted);
+				n1 = n0;
+			}
+			UDIV_QRNND_PREINV(dummy, r, r,
+					n1 << normalization_steps,
+					divisor_limb, divisor_limb_inverted);
+			return r >> normalization_steps;
+		} else {
+			mpi_limb_t divisor_limb_inverted;
+
+			/* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB.  The
+			 * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
+			 * most significant bit (with weight 2**N) implicit.
+			 *
+			 * Special case for DIVISOR_LIMB == 100...000.
+			 */
+			if (!(divisor_limb << 1))
+				divisor_limb_inverted = ~(mpi_limb_t)0;
+			else
+				udiv_qrnnd(divisor_limb_inverted, dummy,
+						-divisor_limb, 0, divisor_limb);
+
+			i = dividend_size - 1;
+			r = dividend_ptr[i];
+
+			if (r >= divisor_limb)
+				r = 0;
+			else
+				i--;
+
+			for ( ; i >= 0; i--) {
+				n0 = dividend_ptr[i];
+				UDIV_QRNND_PREINV(dummy, r, r,
+						n0, divisor_limb, divisor_limb_inverted);
+			}
+			return r;
+		}
+	} else {
+		if (UDIV_NEEDS_NORMALIZATION) {
+			int normalization_steps;
+
+			normalization_steps = count_leading_zeros(divisor_limb);
+			if (normalization_steps) {
+				divisor_limb <<= normalization_steps;
+
+				n1 = dividend_ptr[dividend_size - 1];
+				r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps);
+
+				/* Possible optimization:
+				 * if (r == 0
+				 * && divisor_limb > ((n1 << normalization_steps)
+				 *		   | (dividend_ptr[dividend_size - 2] >> ...)))
+				 * ...one division less...
+				 */
+				for (i = dividend_size - 2; i >= 0; i--) {
+					n0 = dividend_ptr[i];
+					udiv_qrnnd(dummy, r, r,
+						((n1 << normalization_steps)
+						 | (n0 >> (BITS_PER_MPI_LIMB - normalization_steps))),
+						divisor_limb);
+					n1 = n0;
+				}
+				udiv_qrnnd(dummy, r, r,
+						n1 << normalization_steps,
+						divisor_limb);
+				return r >> normalization_steps;
+			}
+		}
+		/* No normalization needed, either because udiv_qrnnd doesn't require
+		 * it, or because DIVISOR_LIMB is already normalized.
+		 */
+		i = dividend_size - 1;
+		r = dividend_ptr[i];
+
+		if (r >= divisor_limb)
+			r = 0;
+		else
+			i--;
+
+		for (; i >= 0; i--) {
+			n0 = dividend_ptr[i];
+			udiv_qrnnd(dummy, r, r, n0, divisor_limb);
+		}
+		return r;
+	}
+}
+
+/* Divide num (NP/NSIZE) by den (DP/DSIZE) and write
+ * the NSIZE-DSIZE least significant quotient limbs at QP
+ * and the DSIZE long remainder at NP.	If QEXTRA_LIMBS is
+ * non-zero, generate that many fraction bits and append them after the
+ * other quotient limbs.
+ * Return the most significant limb of the quotient, this is always 0 or 1.
+ *
+ * Preconditions:
+ * 0. NSIZE >= DSIZE.
+ * 1. The most significant bit of the divisor must be set.
+ * 2. QP must either not overlap with the input operands at all, or
+ *    QP + DSIZE >= NP must hold true.	(This means that it's
+ *    possible to put the quotient in the high part of NUM, right after the
+ *    remainder in NUM.
+ * 3. NSIZE >= DSIZE, even if QEXTRA_LIMBS is non-zero.
+ */
+
+mpi_limb_t
+mpihelp_divrem(mpi_ptr_t qp, mpi_size_t qextra_limbs,
+	       mpi_ptr_t np, mpi_size_t nsize, mpi_ptr_t dp, mpi_size_t dsize)
+{
+	mpi_limb_t most_significant_q_limb = 0;
+
+	switch (dsize) {
+	case 0:
+		/* We are asked to divide by zero, so go ahead and do it!  (To make
+		   the compiler not remove this statement, return the value.)  */
+		/*
+		 * existing clients of this function have been modified
+		 * not to call it with dsize == 0, so this should not happen
+		 */
+		return 1 / dsize;
+
+	case 1:
+		{
+			mpi_size_t i;
+			mpi_limb_t n1;
+			mpi_limb_t d;
+
+			d = dp[0];
+			n1 = np[nsize - 1];
+
+			if (n1 >= d) {
+				n1 -= d;
+				most_significant_q_limb = 1;
+			}
+
+			qp += qextra_limbs;
+			for (i = nsize - 2; i >= 0; i--)
+				udiv_qrnnd(qp[i], n1, n1, np[i], d);
+			qp -= qextra_limbs;
+
+			for (i = qextra_limbs - 1; i >= 0; i--)
+				udiv_qrnnd(qp[i], n1, n1, 0, d);
+
+			np[0] = n1;
+		}
+		break;
+
+	case 2:
+		{
+			mpi_size_t i;
+			mpi_limb_t n1, n0, n2;
+			mpi_limb_t d1, d0;
+
+			np += nsize - 2;
+			d1 = dp[1];
+			d0 = dp[0];
+			n1 = np[1];
+			n0 = np[0];
+
+			if (n1 >= d1 && (n1 > d1 || n0 >= d0)) {
+				sub_ddmmss(n1, n0, n1, n0, d1, d0);
+				most_significant_q_limb = 1;
+			}
+
+			for (i = qextra_limbs + nsize - 2 - 1; i >= 0; i--) {
+				mpi_limb_t q;
+				mpi_limb_t r;
+
+				if (i >= qextra_limbs)
+					np--;
+				else
+					np[0] = 0;
+
+				if (n1 == d1) {
+					/* Q should be either 111..111 or 111..110.  Need special
+					 * treatment of this rare case as normal division would
+					 * give overflow.  */
+					q = ~(mpi_limb_t) 0;
+
+					r = n0 + d1;
+					if (r < d1) {	/* Carry in the addition? */
+						add_ssaaaa(n1, n0, r - d0,
+							   np[0], 0, d0);
+						qp[i] = q;
+						continue;
+					}
+					n1 = d0 - (d0 != 0 ? 1 : 0);
+					n0 = -d0;
+				} else {
+					udiv_qrnnd(q, r, n1, n0, d1);
+					umul_ppmm(n1, n0, d0, q);
+				}
+
+				n2 = np[0];
+q_test:
+				if (n1 > r || (n1 == r && n0 > n2)) {
+					/* The estimated Q was too large.  */
+					q--;
+					sub_ddmmss(n1, n0, n1, n0, 0, d0);
+					r += d1;
+					if (r >= d1)	/* If not carry, test Q again.  */
+						goto q_test;
+				}
+
+				qp[i] = q;
+				sub_ddmmss(n1, n0, r, n2, n1, n0);
+			}
+			np[1] = n1;
+			np[0] = n0;
+		}
+		break;
+
+	default:
+		{
+			mpi_size_t i;
+			mpi_limb_t dX, d1, n0;
+
+			np += nsize - dsize;
+			dX = dp[dsize - 1];
+			d1 = dp[dsize - 2];
+			n0 = np[dsize - 1];
+
+			if (n0 >= dX) {
+				if (n0 > dX
+				    || mpihelp_cmp(np, dp, dsize - 1) >= 0) {
+					mpihelp_sub_n(np, np, dp, dsize);
+					n0 = np[dsize - 1];
+					most_significant_q_limb = 1;
+				}
+			}
+
+			for (i = qextra_limbs + nsize - dsize - 1; i >= 0; i--) {
+				mpi_limb_t q;
+				mpi_limb_t n1, n2;
+				mpi_limb_t cy_limb;
+
+				if (i >= qextra_limbs) {
+					np--;
+					n2 = np[dsize];
+				} else {
+					n2 = np[dsize - 1];
+					MPN_COPY_DECR(np + 1, np, dsize - 1);
+					np[0] = 0;
+				}
+
+				if (n0 == dX) {
+					/* This might over-estimate q, but it's probably not worth
+					 * the extra code here to find out.  */
+					q = ~(mpi_limb_t) 0;
+				} else {
+					mpi_limb_t r;
+
+					udiv_qrnnd(q, r, n0, np[dsize - 1], dX);
+					umul_ppmm(n1, n0, d1, q);
+
+					while (n1 > r
+					       || (n1 == r
+						   && n0 > np[dsize - 2])) {
+						q--;
+						r += dX;
+						if (r < dX)	/* I.e. "carry in previous addition?" */
+							break;
+						n1 -= n0 < d1;
+						n0 -= d1;
+					}
+				}
+
+				/* Possible optimization: We already have (q * n0) and (1 * n1)
+				 * after the calculation of q.  Taking advantage of that, we
+				 * could make this loop make two iterations less.  */
+				cy_limb = mpihelp_submul_1(np, dp, dsize, q);
+
+				if (n2 != cy_limb) {
+					mpihelp_add_n(np, np, dp, dsize);
+					q--;
+				}
+
+				qp[i] = q;
+				n0 = np[dsize - 1];
+			}
+		}
+	}
+
+	return most_significant_q_limb;
+}
+
+/****************
+ * Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
+ * Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR.
+ * Return the single-limb remainder.
+ * There are no constraints on the value of the divisor.
+ *
+ * QUOT_PTR and DIVIDEND_PTR might point to the same limb.
+ */
+
+mpi_limb_t
+mpihelp_divmod_1(mpi_ptr_t quot_ptr,
+		mpi_ptr_t dividend_ptr, mpi_size_t dividend_size,
+		mpi_limb_t divisor_limb)
+{
+	mpi_size_t i;
+	mpi_limb_t n1, n0, r;
+	mpi_limb_t dummy __maybe_unused;
+
+	if (!dividend_size)
+		return 0;
+
+	/* If multiplication is much faster than division, and the
+	 * dividend is large, pre-invert the divisor, and use
+	 * only multiplications in the inner loop.
+	 *
+	 * This test should be read:
+	 * Does it ever help to use udiv_qrnnd_preinv?
+	 * && Does what we save compensate for the inversion overhead?
+	 */
+	if (UDIV_TIME > (2 * UMUL_TIME + 6)
+			&& (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME) {
+		int normalization_steps;
+
+		normalization_steps = count_leading_zeros(divisor_limb);
+		if (normalization_steps) {
+			mpi_limb_t divisor_limb_inverted;
+
+			divisor_limb <<= normalization_steps;
+
+			/* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB.  The
+			 * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
+			 * most significant bit (with weight 2**N) implicit.
+			 */
+			/* Special case for DIVISOR_LIMB == 100...000.  */
+			if (!(divisor_limb << 1))
+				divisor_limb_inverted = ~(mpi_limb_t)0;
+			else
+				udiv_qrnnd(divisor_limb_inverted, dummy,
+						-divisor_limb, 0, divisor_limb);
+
+			n1 = dividend_ptr[dividend_size - 1];
+			r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps);
+
+			/* Possible optimization:
+			 * if (r == 0
+			 * && divisor_limb > ((n1 << normalization_steps)
+			 *		       | (dividend_ptr[dividend_size - 2] >> ...)))
+			 * ...one division less...
+			 */
+			for (i = dividend_size - 2; i >= 0; i--) {
+				n0 = dividend_ptr[i];
+				UDIV_QRNND_PREINV(quot_ptr[i + 1], r, r,
+						((n1 << normalization_steps)
+						 | (n0 >> (BITS_PER_MPI_LIMB - normalization_steps))),
+						divisor_limb, divisor_limb_inverted);
+				n1 = n0;
+			}
+			UDIV_QRNND_PREINV(quot_ptr[0], r, r,
+					n1 << normalization_steps,
+					divisor_limb, divisor_limb_inverted);
+			return r >> normalization_steps;
+		} else {
+			mpi_limb_t divisor_limb_inverted;
+
+			/* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB.  The
+			 * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
+			 * most significant bit (with weight 2**N) implicit.
+			 */
+			/* Special case for DIVISOR_LIMB == 100...000.  */
+			if (!(divisor_limb << 1))
+				divisor_limb_inverted = ~(mpi_limb_t) 0;
+			else
+				udiv_qrnnd(divisor_limb_inverted, dummy,
+						-divisor_limb, 0, divisor_limb);
+
+			i = dividend_size - 1;
+			r = dividend_ptr[i];
+
+			if (r >= divisor_limb)
+				r = 0;
+			else
+				quot_ptr[i--] = 0;
+
+			for ( ; i >= 0; i--) {
+				n0 = dividend_ptr[i];
+				UDIV_QRNND_PREINV(quot_ptr[i], r, r,
+						n0, divisor_limb, divisor_limb_inverted);
+			}
+			return r;
+		}
+	} else {
+		if (UDIV_NEEDS_NORMALIZATION) {
+			int normalization_steps;
+
+			normalization_steps = count_leading_zeros(divisor_limb);
+			if (normalization_steps) {
+				divisor_limb <<= normalization_steps;
+
+				n1 = dividend_ptr[dividend_size - 1];
+				r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps);
+
+				/* Possible optimization:
+				 * if (r == 0
+				 * && divisor_limb > ((n1 << normalization_steps)
+				 *		   | (dividend_ptr[dividend_size - 2] >> ...)))
+				 * ...one division less...
+				 */
+				for (i = dividend_size - 2; i >= 0; i--) {
+					n0 = dividend_ptr[i];
+					udiv_qrnnd(quot_ptr[i + 1], r, r,
+						((n1 << normalization_steps)
+						 | (n0 >> (BITS_PER_MPI_LIMB - normalization_steps))),
+						divisor_limb);
+					n1 = n0;
+				}
+				udiv_qrnnd(quot_ptr[0], r, r,
+						n1 << normalization_steps,
+						divisor_limb);
+				return r >> normalization_steps;
+			}
+		}
+		/* No normalization needed, either because udiv_qrnnd doesn't require
+		 * it, or because DIVISOR_LIMB is already normalized.
+		 */
+		i = dividend_size - 1;
+		r = dividend_ptr[i];
+
+		if (r >= divisor_limb)
+			r = 0;
+		else
+			quot_ptr[i--] = 0;
+
+		for (; i >= 0; i--) {
+			n0 = dividend_ptr[i];
+			udiv_qrnnd(quot_ptr[i], r, r, n0, divisor_limb);
+		}
+		return r;
+	}
+}
diff --git a/lib/crypto/mpi/mpih-mul.c b/lib/crypto/mpi/mpih-mul.c
new file mode 100644
index 000000000000..a93647564054
--- /dev/null
+++ b/lib/crypto/mpi/mpih-mul.c
@@ -0,0 +1,484 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpihelp-mul.c  -  MPI helper functions
+ * Copyright (C) 1994, 1996, 1998, 1999,
+ *               2000 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+#include <linux/string.h>
+#include "mpi-internal.h"
+#include "longlong.h"
+
+#define MPN_MUL_N_RECURSE(prodp, up, vp, size, tspace)		\
+	do {							\
+		if ((size) < KARATSUBA_THRESHOLD)		\
+			mul_n_basecase(prodp, up, vp, size);	\
+		else						\
+			mul_n(prodp, up, vp, size, tspace);	\
+	} while (0);
+
+#define MPN_SQR_N_RECURSE(prodp, up, size, tspace)		\
+	do {							\
+		if ((size) < KARATSUBA_THRESHOLD)		\
+			mpih_sqr_n_basecase(prodp, up, size);	\
+		else						\
+			mpih_sqr_n(prodp, up, size, tspace);	\
+	} while (0);
+
+/* Multiply the natural numbers u (pointed to by UP) and v (pointed to by VP),
+ * both with SIZE limbs, and store the result at PRODP.  2 * SIZE limbs are
+ * always stored.  Return the most significant limb.
+ *
+ * Argument constraints:
+ * 1. PRODP != UP and PRODP != VP, i.e. the destination
+ *    must be distinct from the multiplier and the multiplicand.
+ *
+ *
+ * Handle simple cases with traditional multiplication.
+ *
+ * This is the most critical code of multiplication.  All multiplies rely
+ * on this, both small and huge.  Small ones arrive here immediately.  Huge
+ * ones arrive here as this is the base case for Karatsuba's recursive
+ * algorithm below.
+ */
+
+static mpi_limb_t
+mul_n_basecase(mpi_ptr_t prodp, mpi_ptr_t up, mpi_ptr_t vp, mpi_size_t size)
+{
+	mpi_size_t i;
+	mpi_limb_t cy;
+	mpi_limb_t v_limb;
+
+	/* Multiply by the first limb in V separately, as the result can be
+	 * stored (not added) to PROD.  We also avoid a loop for zeroing.  */
+	v_limb = vp[0];
+	if (v_limb <= 1) {
+		if (v_limb == 1)
+			MPN_COPY(prodp, up, size);
+		else
+			MPN_ZERO(prodp, size);
+		cy = 0;
+	} else
+		cy = mpihelp_mul_1(prodp, up, size, v_limb);
+
+	prodp[size] = cy;
+	prodp++;
+
+	/* For each iteration in the outer loop, multiply one limb from
+	 * U with one limb from V, and add it to PROD.  */
+	for (i = 1; i < size; i++) {
+		v_limb = vp[i];
+		if (v_limb <= 1) {
+			cy = 0;
+			if (v_limb == 1)
+				cy = mpihelp_add_n(prodp, prodp, up, size);
+		} else
+			cy = mpihelp_addmul_1(prodp, up, size, v_limb);
+
+		prodp[size] = cy;
+		prodp++;
+	}
+
+	return cy;
+}
+
+static void
+mul_n(mpi_ptr_t prodp, mpi_ptr_t up, mpi_ptr_t vp,
+		mpi_size_t size, mpi_ptr_t tspace)
+{
+	if (size & 1) {
+		/* The size is odd, and the code below doesn't handle that.
+		 * Multiply the least significant (size - 1) limbs with a recursive
+		 * call, and handle the most significant limb of S1 and S2
+		 * separately.
+		 * A slightly faster way to do this would be to make the Karatsuba
+		 * code below behave as if the size were even, and let it check for
+		 * odd size in the end.  I.e., in essence move this code to the end.
+		 * Doing so would save us a recursive call, and potentially make the
+		 * stack grow a lot less.
+		 */
+		mpi_size_t esize = size - 1;	/* even size */
+		mpi_limb_t cy_limb;
+
+		MPN_MUL_N_RECURSE(prodp, up, vp, esize, tspace);
+		cy_limb = mpihelp_addmul_1(prodp + esize, up, esize, vp[esize]);
+		prodp[esize + esize] = cy_limb;
+		cy_limb = mpihelp_addmul_1(prodp + esize, vp, size, up[esize]);
+		prodp[esize + size] = cy_limb;
+	} else {
+		/* Anatolij Alekseevich Karatsuba's divide-and-conquer algorithm.
+		 *
+		 * Split U in two pieces, U1 and U0, such that
+		 * U = U0 + U1*(B**n),
+		 * and V in V1 and V0, such that
+		 * V = V0 + V1*(B**n).
+		 *
+		 * UV is then computed recursively using the identity
+		 *
+		 *        2n   n          n                     n
+		 * UV = (B  + B )U V  +  B (U -U )(V -V )  +  (B + 1)U V
+		 *                1 1        1  0   0  1              0 0
+		 *
+		 * Where B = 2**BITS_PER_MP_LIMB.
+		 */
+		mpi_size_t hsize = size >> 1;
+		mpi_limb_t cy;
+		int negflg;
+
+		/* Product H.      ________________  ________________
+		 *                |_____U1 x V1____||____U0 x V0_____|
+		 * Put result in upper part of PROD and pass low part of TSPACE
+		 * as new TSPACE.
+		 */
+		MPN_MUL_N_RECURSE(prodp + size, up + hsize, vp + hsize, hsize,
+				  tspace);
+
+		/* Product M.      ________________
+		 *                |_(U1-U0)(V0-V1)_|
+		 */
+		if (mpihelp_cmp(up + hsize, up, hsize) >= 0) {
+			mpihelp_sub_n(prodp, up + hsize, up, hsize);
+			negflg = 0;
+		} else {
+			mpihelp_sub_n(prodp, up, up + hsize, hsize);
+			negflg = 1;
+		}
+		if (mpihelp_cmp(vp + hsize, vp, hsize) >= 0) {
+			mpihelp_sub_n(prodp + hsize, vp + hsize, vp, hsize);
+			negflg ^= 1;
+		} else {
+			mpihelp_sub_n(prodp + hsize, vp, vp + hsize, hsize);
+			/* No change of NEGFLG.  */
+		}
+		/* Read temporary operands from low part of PROD.
+		 * Put result in low part of TSPACE using upper part of TSPACE
+		 * as new TSPACE.
+		 */
+		MPN_MUL_N_RECURSE(tspace, prodp, prodp + hsize, hsize,
+				  tspace + size);
+
+		/* Add/copy product H. */
+		MPN_COPY(prodp + hsize, prodp + size, hsize);
+		cy = mpihelp_add_n(prodp + size, prodp + size,
+				   prodp + size + hsize, hsize);
+
+		/* Add product M (if NEGFLG M is a negative number) */
+		if (negflg)
+			cy -=
+			    mpihelp_sub_n(prodp + hsize, prodp + hsize, tspace,
+					  size);
+		else
+			cy +=
+			    mpihelp_add_n(prodp + hsize, prodp + hsize, tspace,
+					  size);
+
+		/* Product L.      ________________  ________________
+		 *                |________________||____U0 x V0_____|
+		 * Read temporary operands from low part of PROD.
+		 * Put result in low part of TSPACE using upper part of TSPACE
+		 * as new TSPACE.
+		 */
+		MPN_MUL_N_RECURSE(tspace, up, vp, hsize, tspace + size);
+
+		/* Add/copy Product L (twice) */
+
+		cy += mpihelp_add_n(prodp + hsize, prodp + hsize, tspace, size);
+		if (cy)
+			mpihelp_add_1(prodp + hsize + size,
+				      prodp + hsize + size, hsize, cy);
+
+		MPN_COPY(prodp, tspace, hsize);
+		cy = mpihelp_add_n(prodp + hsize, prodp + hsize, tspace + hsize,
+				   hsize);
+		if (cy)
+			mpihelp_add_1(prodp + size, prodp + size, size, 1);
+	}
+}
+
+void mpih_sqr_n_basecase(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size)
+{
+	mpi_size_t i;
+	mpi_limb_t cy_limb;
+	mpi_limb_t v_limb;
+
+	/* Multiply by the first limb in V separately, as the result can be
+	 * stored (not added) to PROD.  We also avoid a loop for zeroing.  */
+	v_limb = up[0];
+	if (v_limb <= 1) {
+		if (v_limb == 1)
+			MPN_COPY(prodp, up, size);
+		else
+			MPN_ZERO(prodp, size);
+		cy_limb = 0;
+	} else
+		cy_limb = mpihelp_mul_1(prodp, up, size, v_limb);
+
+	prodp[size] = cy_limb;
+	prodp++;
+
+	/* For each iteration in the outer loop, multiply one limb from
+	 * U with one limb from V, and add it to PROD.  */
+	for (i = 1; i < size; i++) {
+		v_limb = up[i];
+		if (v_limb <= 1) {
+			cy_limb = 0;
+			if (v_limb == 1)
+				cy_limb = mpihelp_add_n(prodp, prodp, up, size);
+		} else
+			cy_limb = mpihelp_addmul_1(prodp, up, size, v_limb);
+
+		prodp[size] = cy_limb;
+		prodp++;
+	}
+}
+
+void
+mpih_sqr_n(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size, mpi_ptr_t tspace)
+{
+	if (size & 1) {
+		/* The size is odd, and the code below doesn't handle that.
+		 * Multiply the least significant (size - 1) limbs with a recursive
+		 * call, and handle the most significant limb of S1 and S2
+		 * separately.
+		 * A slightly faster way to do this would be to make the Karatsuba
+		 * code below behave as if the size were even, and let it check for
+		 * odd size in the end.  I.e., in essence move this code to the end.
+		 * Doing so would save us a recursive call, and potentially make the
+		 * stack grow a lot less.
+		 */
+		mpi_size_t esize = size - 1;	/* even size */
+		mpi_limb_t cy_limb;
+
+		MPN_SQR_N_RECURSE(prodp, up, esize, tspace);
+		cy_limb = mpihelp_addmul_1(prodp + esize, up, esize, up[esize]);
+		prodp[esize + esize] = cy_limb;
+		cy_limb = mpihelp_addmul_1(prodp + esize, up, size, up[esize]);
+
+		prodp[esize + size] = cy_limb;
+	} else {
+		mpi_size_t hsize = size >> 1;
+		mpi_limb_t cy;
+
+		/* Product H.      ________________  ________________
+		 *                |_____U1 x U1____||____U0 x U0_____|
+		 * Put result in upper part of PROD and pass low part of TSPACE
+		 * as new TSPACE.
+		 */
+		MPN_SQR_N_RECURSE(prodp + size, up + hsize, hsize, tspace);
+
+		/* Product M.      ________________
+		 *                |_(U1-U0)(U0-U1)_|
+		 */
+		if (mpihelp_cmp(up + hsize, up, hsize) >= 0)
+			mpihelp_sub_n(prodp, up + hsize, up, hsize);
+		else
+			mpihelp_sub_n(prodp, up, up + hsize, hsize);
+
+		/* Read temporary operands from low part of PROD.
+		 * Put result in low part of TSPACE using upper part of TSPACE
+		 * as new TSPACE.  */
+		MPN_SQR_N_RECURSE(tspace, prodp, hsize, tspace + size);
+
+		/* Add/copy product H  */
+		MPN_COPY(prodp + hsize, prodp + size, hsize);
+		cy = mpihelp_add_n(prodp + size, prodp + size,
+				   prodp + size + hsize, hsize);
+
+		/* Add product M (if NEGFLG M is a negative number).  */
+		cy -= mpihelp_sub_n(prodp + hsize, prodp + hsize, tspace, size);
+
+		/* Product L.      ________________  ________________
+		 *                |________________||____U0 x U0_____|
+		 * Read temporary operands from low part of PROD.
+		 * Put result in low part of TSPACE using upper part of TSPACE
+		 * as new TSPACE.  */
+		MPN_SQR_N_RECURSE(tspace, up, hsize, tspace + size);
+
+		/* Add/copy Product L (twice).  */
+		cy += mpihelp_add_n(prodp + hsize, prodp + hsize, tspace, size);
+		if (cy)
+			mpihelp_add_1(prodp + hsize + size,
+				      prodp + hsize + size, hsize, cy);
+
+		MPN_COPY(prodp, tspace, hsize);
+		cy = mpihelp_add_n(prodp + hsize, prodp + hsize, tspace + hsize,
+				   hsize);
+		if (cy)
+			mpihelp_add_1(prodp + size, prodp + size, size, 1);
+	}
+}
+
+int
+mpihelp_mul_karatsuba_case(mpi_ptr_t prodp,
+			   mpi_ptr_t up, mpi_size_t usize,
+			   mpi_ptr_t vp, mpi_size_t vsize,
+			   struct karatsuba_ctx *ctx)
+{
+	mpi_limb_t cy;
+
+	if (!ctx->tspace || ctx->tspace_size < vsize) {
+		if (ctx->tspace)
+			mpi_free_limb_space(ctx->tspace);
+		ctx->tspace = mpi_alloc_limb_space(2 * vsize);
+		if (!ctx->tspace)
+			return -ENOMEM;
+		ctx->tspace_size = vsize;
+	}
+
+	MPN_MUL_N_RECURSE(prodp, up, vp, vsize, ctx->tspace);
+
+	prodp += vsize;
+	up += vsize;
+	usize -= vsize;
+	if (usize >= vsize) {
+		if (!ctx->tp || ctx->tp_size < vsize) {
+			if (ctx->tp)
+				mpi_free_limb_space(ctx->tp);
+			ctx->tp = mpi_alloc_limb_space(2 * vsize);
+			if (!ctx->tp) {
+				if (ctx->tspace)
+					mpi_free_limb_space(ctx->tspace);
+				ctx->tspace = NULL;
+				return -ENOMEM;
+			}
+			ctx->tp_size = vsize;
+		}
+
+		do {
+			MPN_MUL_N_RECURSE(ctx->tp, up, vp, vsize, ctx->tspace);
+			cy = mpihelp_add_n(prodp, prodp, ctx->tp, vsize);
+			mpihelp_add_1(prodp + vsize, ctx->tp + vsize, vsize,
+				      cy);
+			prodp += vsize;
+			up += vsize;
+			usize -= vsize;
+		} while (usize >= vsize);
+	}
+
+	if (usize) {
+		if (usize < KARATSUBA_THRESHOLD) {
+			mpi_limb_t tmp;
+			if (mpihelp_mul(ctx->tspace, vp, vsize, up, usize, &tmp)
+			    < 0)
+				return -ENOMEM;
+		} else {
+			if (!ctx->next) {
+				ctx->next = kzalloc(sizeof *ctx, GFP_KERNEL);
+				if (!ctx->next)
+					return -ENOMEM;
+			}
+			if (mpihelp_mul_karatsuba_case(ctx->tspace,
+						       vp, vsize,
+						       up, usize,
+						       ctx->next) < 0)
+				return -ENOMEM;
+		}
+
+		cy = mpihelp_add_n(prodp, prodp, ctx->tspace, vsize);
+		mpihelp_add_1(prodp + vsize, ctx->tspace + vsize, usize, cy);
+	}
+
+	return 0;
+}
+
+void mpihelp_release_karatsuba_ctx(struct karatsuba_ctx *ctx)
+{
+	struct karatsuba_ctx *ctx2;
+
+	if (ctx->tp)
+		mpi_free_limb_space(ctx->tp);
+	if (ctx->tspace)
+		mpi_free_limb_space(ctx->tspace);
+	for (ctx = ctx->next; ctx; ctx = ctx2) {
+		ctx2 = ctx->next;
+		if (ctx->tp)
+			mpi_free_limb_space(ctx->tp);
+		if (ctx->tspace)
+			mpi_free_limb_space(ctx->tspace);
+		kfree(ctx);
+	}
+}
+
+/* Multiply the natural numbers u (pointed to by UP, with USIZE limbs)
+ * and v (pointed to by VP, with VSIZE limbs), and store the result at
+ * PRODP.  USIZE + VSIZE limbs are always stored, but if the input
+ * operands are normalized.  Return the most significant limb of the
+ * result.
+ *
+ * NOTE: The space pointed to by PRODP is overwritten before finished
+ * with U and V, so overlap is an error.
+ *
+ * Argument constraints:
+ * 1. USIZE >= VSIZE.
+ * 2. PRODP != UP and PRODP != VP, i.e. the destination
+ *    must be distinct from the multiplier and the multiplicand.
+ */
+
+int
+mpihelp_mul(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize,
+	    mpi_ptr_t vp, mpi_size_t vsize, mpi_limb_t *_result)
+{
+	mpi_ptr_t prod_endp = prodp + usize + vsize - 1;
+	mpi_limb_t cy;
+	struct karatsuba_ctx ctx;
+
+	if (vsize < KARATSUBA_THRESHOLD) {
+		mpi_size_t i;
+		mpi_limb_t v_limb;
+
+		if (!vsize) {
+			*_result = 0;
+			return 0;
+		}
+
+		/* Multiply by the first limb in V separately, as the result can be
+		 * stored (not added) to PROD.  We also avoid a loop for zeroing.  */
+		v_limb = vp[0];
+		if (v_limb <= 1) {
+			if (v_limb == 1)
+				MPN_COPY(prodp, up, usize);
+			else
+				MPN_ZERO(prodp, usize);
+			cy = 0;
+		} else
+			cy = mpihelp_mul_1(prodp, up, usize, v_limb);
+
+		prodp[usize] = cy;
+		prodp++;
+
+		/* For each iteration in the outer loop, multiply one limb from
+		 * U with one limb from V, and add it to PROD.  */
+		for (i = 1; i < vsize; i++) {
+			v_limb = vp[i];
+			if (v_limb <= 1) {
+				cy = 0;
+				if (v_limb == 1)
+					cy = mpihelp_add_n(prodp, prodp, up,
+							   usize);
+			} else
+				cy = mpihelp_addmul_1(prodp, up, usize, v_limb);
+
+			prodp[usize] = cy;
+			prodp++;
+		}
+
+		*_result = cy;
+		return 0;
+	}
+
+	memset(&ctx, 0, sizeof ctx);
+	if (mpihelp_mul_karatsuba_case(prodp, up, usize, vp, vsize, &ctx) < 0)
+		return -ENOMEM;
+	mpihelp_release_karatsuba_ctx(&ctx);
+	*_result = *prod_endp;
+	return 0;
+}
diff --git a/lib/crypto/mpi/mpiutil.c b/lib/crypto/mpi/mpiutil.c
new file mode 100644
index 000000000000..7f2db830f404
--- /dev/null
+++ b/lib/crypto/mpi/mpiutil.c
@@ -0,0 +1,152 @@
+/* mpiutil.ac  -  Utility functions for MPI
+ * Copyright (C) 1998, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include <linux/export.h>
+
+#include "mpi-internal.h"
+
+/****************
+ * Note:  It was a bad idea to use the number of limbs to allocate
+ *	  because on a alpha the limbs are large but we normally need
+ *	  integers of n bits - So we should change this to bits (or bytes).
+ *
+ *	  But mpi_alloc is used in a lot of places :-)
+ */
+MPI mpi_alloc(unsigned nlimbs)
+{
+	MPI a;
+
+	a = kmalloc(sizeof *a, GFP_KERNEL);
+	if (!a)
+		return a;
+
+	if (nlimbs) {
+		a->d = mpi_alloc_limb_space(nlimbs);
+		if (!a->d) {
+			kfree(a);
+			return NULL;
+		}
+	} else {
+		a->d = NULL;
+	}
+
+	a->alloced = nlimbs;
+	a->nlimbs = 0;
+	a->sign = 0;
+	a->flags = 0;
+	a->nbits = 0;
+	return a;
+}
+EXPORT_SYMBOL_GPL(mpi_alloc);
+
+mpi_ptr_t mpi_alloc_limb_space(unsigned nlimbs)
+{
+	size_t len = nlimbs * sizeof(mpi_limb_t);
+
+	if (!len)
+		return NULL;
+
+	return kmalloc(len, GFP_KERNEL);
+}
+
+void mpi_free_limb_space(mpi_ptr_t a)
+{
+	if (!a)
+		return;
+
+	kfree_sensitive(a);
+}
+
+void mpi_assign_limb_space(MPI a, mpi_ptr_t ap, unsigned nlimbs)
+{
+	mpi_free_limb_space(a->d);
+	a->d = ap;
+	a->alloced = nlimbs;
+}
+
+/****************
+ * Resize the array of A to NLIMBS. the additional space is cleared
+ * (set to 0) [done by m_realloc()]
+ */
+int mpi_resize(MPI a, unsigned nlimbs)
+{
+	void *p;
+
+	if (nlimbs <= a->alloced)
+		return 0;	/* no need to do it */
+
+	if (a->d) {
+		p = kcalloc(nlimbs, sizeof(mpi_limb_t), GFP_KERNEL);
+		if (!p)
+			return -ENOMEM;
+		memcpy(p, a->d, a->alloced * sizeof(mpi_limb_t));
+		kfree_sensitive(a->d);
+		a->d = p;
+	} else {
+		a->d = kcalloc(nlimbs, sizeof(mpi_limb_t), GFP_KERNEL);
+		if (!a->d)
+			return -ENOMEM;
+	}
+	a->alloced = nlimbs;
+	return 0;
+}
+
+void mpi_free(MPI a)
+{
+	if (!a)
+		return;
+
+	if (a->flags & 4)
+		kfree_sensitive(a->d);
+	else
+		mpi_free_limb_space(a->d);
+
+	if (a->flags & ~7)
+		pr_info("invalid flag value in mpi\n");
+	kfree(a);
+}
+EXPORT_SYMBOL_GPL(mpi_free);
+
+/****************
+ * Note: This copy function should not interpret the MPI
+ *	 but copy it transparently.
+ */
+MPI mpi_copy(MPI a)
+{
+	int i;
+	MPI b;
+
+	if (a) {
+		b = mpi_alloc(a->nlimbs);
+		if (!b)
+			return NULL;
+		b->nlimbs = a->nlimbs;
+		b->sign = a->sign;
+		b->flags = a->flags;
+		b->flags &= ~(16|32); /* Reset the immutable and constant flags. */
+		for (i = 0; i < b->nlimbs; i++)
+			b->d[i] = a->d[i];
+	} else
+		b = NULL;
+	return b;
+}
+
+MODULE_DESCRIPTION("Multiprecision maths library");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/poly1305-donna32.c b/lib/crypto/poly1305-donna32.c
index 7fb71845cc84..b66131b3f6d4 100644
--- a/lib/crypto/poly1305-donna32.c
+++ b/lib/crypto/poly1305-donna32.c
@@ -6,9 +6,10 @@
  * public domain.
  */
 
-#include <linux/kernel.h>
-#include <asm/unaligned.h>
 #include <crypto/internal/poly1305.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/unaligned.h>
 
 void poly1305_core_setkey(struct poly1305_core_key *key,
 			  const u8 raw_key[POLY1305_BLOCK_SIZE])
diff --git a/lib/crypto/poly1305-donna64.c b/lib/crypto/poly1305-donna64.c
index d34cf4053668..8a72a5a84944 100644
--- a/lib/crypto/poly1305-donna64.c
+++ b/lib/crypto/poly1305-donna64.c
@@ -6,11 +6,10 @@
  * public domain.
  */
 
-#include <linux/kernel.h>
-#include <asm/unaligned.h>
 #include <crypto/internal/poly1305.h>
-
-typedef __uint128_t u128;
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/unaligned.h>
 
 void poly1305_core_setkey(struct poly1305_core_key *key,
 			  const u8 raw_key[POLY1305_BLOCK_SIZE])
diff --git a/lib/crypto/poly1305.c b/lib/crypto/poly1305.c
index 26d87fc3823e..f313ccc4b4dd 100644
--- a/lib/crypto/poly1305.c
+++ b/lib/crypto/poly1305.c
@@ -8,71 +8,93 @@
  */
 
 #include <crypto/internal/poly1305.h>
+#include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <asm/unaligned.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
 
-void poly1305_init_generic(struct poly1305_desc_ctx *desc,
-			   const u8 key[POLY1305_KEY_SIZE])
+#ifdef CONFIG_CRYPTO_LIB_POLY1305_ARCH
+#include "poly1305.h" /* $(SRCARCH)/poly1305.h */
+#else
+#define poly1305_block_init	poly1305_block_init_generic
+#define poly1305_blocks		poly1305_blocks_generic
+#define poly1305_emit		poly1305_emit_generic
+#endif
+
+void poly1305_init(struct poly1305_desc_ctx *desc,
+		   const u8 key[POLY1305_KEY_SIZE])
 {
-	poly1305_core_setkey(&desc->core_r, key);
 	desc->s[0] = get_unaligned_le32(key + 16);
 	desc->s[1] = get_unaligned_le32(key + 20);
 	desc->s[2] = get_unaligned_le32(key + 24);
 	desc->s[3] = get_unaligned_le32(key + 28);
-	poly1305_core_init(&desc->h);
 	desc->buflen = 0;
-	desc->sset = true;
-	desc->rset = 2;
+	poly1305_block_init(&desc->state, key);
 }
-EXPORT_SYMBOL_GPL(poly1305_init_generic);
+EXPORT_SYMBOL(poly1305_init);
 
-void poly1305_update_generic(struct poly1305_desc_ctx *desc, const u8 *src,
-			     unsigned int nbytes)
+void poly1305_update(struct poly1305_desc_ctx *desc,
+		     const u8 *src, unsigned int nbytes)
 {
-	unsigned int bytes;
+	if (desc->buflen + nbytes >= POLY1305_BLOCK_SIZE) {
+		unsigned int bulk_len;
 
-	if (unlikely(desc->buflen)) {
-		bytes = min(nbytes, POLY1305_BLOCK_SIZE - desc->buflen);
-		memcpy(desc->buf + desc->buflen, src, bytes);
-		src += bytes;
-		nbytes -= bytes;
-		desc->buflen += bytes;
+		if (desc->buflen) {
+			unsigned int l = POLY1305_BLOCK_SIZE - desc->buflen;
 
-		if (desc->buflen == POLY1305_BLOCK_SIZE) {
-			poly1305_core_blocks(&desc->h, &desc->core_r, desc->buf,
-					     1, 1);
+			memcpy(&desc->buf[desc->buflen], src, l);
+			src += l;
+			nbytes -= l;
+
+			poly1305_blocks(&desc->state, desc->buf,
+					POLY1305_BLOCK_SIZE, 1);
 			desc->buflen = 0;
 		}
-	}
 
-	if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
-		poly1305_core_blocks(&desc->h, &desc->core_r, src,
-				     nbytes / POLY1305_BLOCK_SIZE, 1);
-		src += nbytes - (nbytes % POLY1305_BLOCK_SIZE);
+		bulk_len = round_down(nbytes, POLY1305_BLOCK_SIZE);
 		nbytes %= POLY1305_BLOCK_SIZE;
-	}
 
-	if (unlikely(nbytes)) {
-		desc->buflen = nbytes;
-		memcpy(desc->buf, src, nbytes);
+		if (bulk_len) {
+			poly1305_blocks(&desc->state, src, bulk_len, 1);
+			src += bulk_len;
+		}
+	}
+	if (nbytes) {
+		memcpy(&desc->buf[desc->buflen], src, nbytes);
+		desc->buflen += nbytes;
 	}
 }
-EXPORT_SYMBOL_GPL(poly1305_update_generic);
+EXPORT_SYMBOL(poly1305_update);
 
-void poly1305_final_generic(struct poly1305_desc_ctx *desc, u8 *dst)
+void poly1305_final(struct poly1305_desc_ctx *desc, u8 *dst)
 {
 	if (unlikely(desc->buflen)) {
 		desc->buf[desc->buflen++] = 1;
 		memset(desc->buf + desc->buflen, 0,
 		       POLY1305_BLOCK_SIZE - desc->buflen);
-		poly1305_core_blocks(&desc->h, &desc->core_r, desc->buf, 1, 0);
+		poly1305_blocks(&desc->state, desc->buf, POLY1305_BLOCK_SIZE,
+				0);
 	}
 
-	poly1305_core_emit(&desc->h, desc->s, dst);
+	poly1305_emit(&desc->state.h, dst, desc->s);
 	*desc = (struct poly1305_desc_ctx){};
 }
-EXPORT_SYMBOL_GPL(poly1305_final_generic);
+EXPORT_SYMBOL(poly1305_final);
+
+#ifdef poly1305_mod_init_arch
+static int __init poly1305_mod_init(void)
+{
+	poly1305_mod_init_arch();
+	return 0;
+}
+subsys_initcall(poly1305_mod_init);
+
+static void __exit poly1305_mod_exit(void)
+{
+}
+module_exit(poly1305_mod_exit);
+#endif
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_DESCRIPTION("Poly1305 authenticator algorithm, RFC7539");
diff --git a/lib/crypto/polyval.c b/lib/crypto/polyval.c
new file mode 100644
index 000000000000..5796275f574a
--- /dev/null
+++ b/lib/crypto/polyval.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * POLYVAL library functions
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include <crypto/polyval.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+/*
+ * POLYVAL is an almost-XOR-universal hash function.  Similar to GHASH, POLYVAL
+ * interprets the message as the coefficients of a polynomial in GF(2^128) and
+ * evaluates that polynomial at a secret point.  POLYVAL has a simple
+ * mathematical relationship with GHASH, but it uses a better field convention
+ * which makes it easier and faster to implement.
+ *
+ * POLYVAL is not a cryptographic hash function, and it should be used only by
+ * algorithms that are specifically designed to use it.
+ *
+ * POLYVAL is specified by "AES-GCM-SIV: Nonce Misuse-Resistant Authenticated
+ * Encryption" (https://datatracker.ietf.org/doc/html/rfc8452)
+ *
+ * POLYVAL is also used by HCTR2.  See "Length-preserving encryption with HCTR2"
+ * (https://eprint.iacr.org/2021/1441.pdf).
+ *
+ * This file provides a library API for POLYVAL.  This API can delegate to
+ * either a generic implementation or an architecture-optimized implementation.
+ *
+ * For the generic implementation, we don't use the traditional table approach
+ * to GF(2^128) multiplication.  That approach is not constant-time and requires
+ * a lot of memory.  Instead, we use a different approach which emulates
+ * carryless multiplication using standard multiplications by spreading the data
+ * bits apart using "holes".  This allows the carries to spill harmlessly.  This
+ * approach is borrowed from BoringSSL, which in turn credits BearSSL's
+ * documentation (https://bearssl.org/constanttime.html#ghash-for-gcm) for the
+ * "holes" trick and a presentation by Shay Gueron
+ * (https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf) for the
+ * 256-bit => 128-bit reduction algorithm.
+ */
+
+#ifdef CONFIG_ARCH_SUPPORTS_INT128
+
+/* Do a 64 x 64 => 128 bit carryless multiplication. */
+static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
+{
+	/*
+	 * With 64-bit multiplicands and one term every 4 bits, there would be
+	 * up to 64 / 4 = 16 one bits per column when each multiplication is
+	 * written out as a series of additions in the schoolbook manner.
+	 * Unfortunately, that doesn't work since the value 16 is 1 too large to
+	 * fit in 4 bits.  Carries would sometimes overflow into the next term.
+	 *
+	 * Using one term every 5 bits would work.  However, that would cost
+	 * 5 x 5 = 25 multiplications instead of 4 x 4 = 16.
+	 *
+	 * Instead, mask off 4 bits from one multiplicand, giving a max of 15
+	 * one bits per column.  Then handle those 4 bits separately.
+	 */
+	u64 a0 = a & 0x1111111111111110;
+	u64 a1 = a & 0x2222222222222220;
+	u64 a2 = a & 0x4444444444444440;
+	u64 a3 = a & 0x8888888888888880;
+
+	u64 b0 = b & 0x1111111111111111;
+	u64 b1 = b & 0x2222222222222222;
+	u64 b2 = b & 0x4444444444444444;
+	u64 b3 = b & 0x8888888888888888;
+
+	/* Multiply the high 60 bits of @a by @b. */
+	u128 c0 = (a0 * (u128)b0) ^ (a1 * (u128)b3) ^
+		  (a2 * (u128)b2) ^ (a3 * (u128)b1);
+	u128 c1 = (a0 * (u128)b1) ^ (a1 * (u128)b0) ^
+		  (a2 * (u128)b3) ^ (a3 * (u128)b2);
+	u128 c2 = (a0 * (u128)b2) ^ (a1 * (u128)b1) ^
+		  (a2 * (u128)b0) ^ (a3 * (u128)b3);
+	u128 c3 = (a0 * (u128)b3) ^ (a1 * (u128)b2) ^
+		  (a2 * (u128)b1) ^ (a3 * (u128)b0);
+
+	/* Multiply the low 4 bits of @a by @b. */
+	u64 e0 = -(a & 1) & b;
+	u64 e1 = -((a >> 1) & 1) & b;
+	u64 e2 = -((a >> 2) & 1) & b;
+	u64 e3 = -((a >> 3) & 1) & b;
+	u64 extra_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3);
+	u64 extra_hi = (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61);
+
+	/* Add all the intermediate products together. */
+	*out_lo = (((u64)c0) & 0x1111111111111111) ^
+		  (((u64)c1) & 0x2222222222222222) ^
+		  (((u64)c2) & 0x4444444444444444) ^
+		  (((u64)c3) & 0x8888888888888888) ^ extra_lo;
+	*out_hi = (((u64)(c0 >> 64)) & 0x1111111111111111) ^
+		  (((u64)(c1 >> 64)) & 0x2222222222222222) ^
+		  (((u64)(c2 >> 64)) & 0x4444444444444444) ^
+		  (((u64)(c3 >> 64)) & 0x8888888888888888) ^ extra_hi;
+}
+
+#else /* CONFIG_ARCH_SUPPORTS_INT128 */
+
+/* Do a 32 x 32 => 64 bit carryless multiplication. */
+static u64 clmul32(u32 a, u32 b)
+{
+	/*
+	 * With 32-bit multiplicands and one term every 4 bits, there are up to
+	 * 32 / 4 = 8 one bits per column when each multiplication is written
+	 * out as a series of additions in the schoolbook manner.  The value 8
+	 * fits in 4 bits, so the carries don't overflow into the next term.
+	 */
+	u32 a0 = a & 0x11111111;
+	u32 a1 = a & 0x22222222;
+	u32 a2 = a & 0x44444444;
+	u32 a3 = a & 0x88888888;
+
+	u32 b0 = b & 0x11111111;
+	u32 b1 = b & 0x22222222;
+	u32 b2 = b & 0x44444444;
+	u32 b3 = b & 0x88888888;
+
+	u64 c0 = (a0 * (u64)b0) ^ (a1 * (u64)b3) ^
+		 (a2 * (u64)b2) ^ (a3 * (u64)b1);
+	u64 c1 = (a0 * (u64)b1) ^ (a1 * (u64)b0) ^
+		 (a2 * (u64)b3) ^ (a3 * (u64)b2);
+	u64 c2 = (a0 * (u64)b2) ^ (a1 * (u64)b1) ^
+		 (a2 * (u64)b0) ^ (a3 * (u64)b3);
+	u64 c3 = (a0 * (u64)b3) ^ (a1 * (u64)b2) ^
+		 (a2 * (u64)b1) ^ (a3 * (u64)b0);
+
+	/* Add all the intermediate products together. */
+	return (c0 & 0x1111111111111111) ^
+	       (c1 & 0x2222222222222222) ^
+	       (c2 & 0x4444444444444444) ^
+	       (c3 & 0x8888888888888888);
+}
+
+/* Do a 64 x 64 => 128 bit carryless multiplication. */
+static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
+{
+	u32 a_lo = (u32)a;
+	u32 a_hi = a >> 32;
+	u32 b_lo = (u32)b;
+	u32 b_hi = b >> 32;
+
+	/* Karatsuba multiplication */
+	u64 lo = clmul32(a_lo, b_lo);
+	u64 hi = clmul32(a_hi, b_hi);
+	u64 mi = clmul32(a_lo ^ a_hi, b_lo ^ b_hi) ^ lo ^ hi;
+
+	*out_lo = lo ^ (mi << 32);
+	*out_hi = hi ^ (mi >> 32);
+}
+#endif /* !CONFIG_ARCH_SUPPORTS_INT128 */
+
+/* Compute @a = @a * @b * x^-128 in the POLYVAL field. */
+static void __maybe_unused
+polyval_mul_generic(struct polyval_elem *a, const struct polyval_elem *b)
+{
+	u64 c0, c1, c2, c3, mi0, mi1;
+
+	/*
+	 * Carryless-multiply @a by @b using Karatsuba multiplication.  Store
+	 * the 256-bit product in @c0 (low) through @c3 (high).
+	 */
+	clmul64(le64_to_cpu(a->lo), le64_to_cpu(b->lo), &c0, &c1);
+	clmul64(le64_to_cpu(a->hi), le64_to_cpu(b->hi), &c2, &c3);
+	clmul64(le64_to_cpu(a->lo ^ a->hi), le64_to_cpu(b->lo ^ b->hi),
+		&mi0, &mi1);
+	mi0 ^= c0 ^ c2;
+	mi1 ^= c1 ^ c3;
+	c1 ^= mi0;
+	c2 ^= mi1;
+
+	/*
+	 * Cancel out the low 128 bits of the product by adding multiples of
+	 * G(x) = x^128 + x^127 + x^126 + x^121 + 1.  Do this in two steps, each
+	 * of which cancels out 64 bits.  Note that we break G(x) into three
+	 * parts: 1, x^64 * (x^63 + x^62 + x^57), and x^128 * 1.
+	 */
+
+	/*
+	 * First, add G(x) times c0 as follows:
+	 *
+	 * (c0, c1, c2) = (0,
+	 *                 c1 + (c0 * (x^63 + x^62 + x^57) mod x^64),
+	 *		   c2 + c0 + floor((c0 * (x^63 + x^62 + x^57)) / x^64))
+	 */
+	c1 ^= (c0 << 63) ^ (c0 << 62) ^ (c0 << 57);
+	c2 ^= c0 ^ (c0 >> 1) ^ (c0 >> 2) ^ (c0 >> 7);
+
+	/*
+	 * Second, add G(x) times the new c1:
+	 *
+	 * (c1, c2, c3) = (0,
+	 *                 c2 + (c1 * (x^63 + x^62 + x^57) mod x^64),
+	 *		   c3 + c1 + floor((c1 * (x^63 + x^62 + x^57)) / x^64))
+	 */
+	c2 ^= (c1 << 63) ^ (c1 << 62) ^ (c1 << 57);
+	c3 ^= c1 ^ (c1 >> 1) ^ (c1 >> 2) ^ (c1 >> 7);
+
+	/* Return (c2, c3).  This implicitly multiplies by x^-128. */
+	a->lo = cpu_to_le64(c2);
+	a->hi = cpu_to_le64(c3);
+}
+
+static void __maybe_unused
+polyval_blocks_generic(struct polyval_elem *acc, const struct polyval_elem *key,
+		       const u8 *data, size_t nblocks)
+{
+	do {
+		acc->lo ^= get_unaligned((__le64 *)data);
+		acc->hi ^= get_unaligned((__le64 *)(data + 8));
+		polyval_mul_generic(acc, key);
+		data += POLYVAL_BLOCK_SIZE;
+	} while (--nblocks);
+}
+
+/* Include the arch-optimized implementation of POLYVAL, if one is available. */
+#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+#include "polyval.h" /* $(SRCARCH)/polyval.h */
+void polyval_preparekey(struct polyval_key *key,
+			const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+	polyval_preparekey_arch(key, raw_key);
+}
+EXPORT_SYMBOL_GPL(polyval_preparekey);
+#endif /* Else, polyval_preparekey() is an inline function. */
+
+/*
+ * polyval_mul_generic() and polyval_blocks_generic() take the key as a
+ * polyval_elem rather than a polyval_key, so that arch-optimized
+ * implementations with a different key format can use it as a fallback (if they
+ * have H^1 stored somewhere in their struct).  Thus, the following dispatch
+ * code is needed to pass the appropriate key argument.
+ */
+
+static void polyval_mul(struct polyval_ctx *ctx)
+{
+#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+	polyval_mul_arch(&ctx->acc, ctx->key);
+#else
+	polyval_mul_generic(&ctx->acc, &ctx->key->h);
+#endif
+}
+
+static void polyval_blocks(struct polyval_ctx *ctx,
+			   const u8 *data, size_t nblocks)
+{
+#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+	polyval_blocks_arch(&ctx->acc, ctx->key, data, nblocks);
+#else
+	polyval_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks);
+#endif
+}
+
+void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len)
+{
+	if (unlikely(ctx->partial)) {
+		size_t n = min(len, POLYVAL_BLOCK_SIZE - ctx->partial);
+
+		len -= n;
+		while (n--)
+			ctx->acc.bytes[ctx->partial++] ^= *data++;
+		if (ctx->partial < POLYVAL_BLOCK_SIZE)
+			return;
+		polyval_mul(ctx);
+	}
+	if (len >= POLYVAL_BLOCK_SIZE) {
+		size_t nblocks = len / POLYVAL_BLOCK_SIZE;
+
+		polyval_blocks(ctx, data, nblocks);
+		data += len & ~(POLYVAL_BLOCK_SIZE - 1);
+		len &= POLYVAL_BLOCK_SIZE - 1;
+	}
+	for (size_t i = 0; i < len; i++)
+		ctx->acc.bytes[i] ^= data[i];
+	ctx->partial = len;
+}
+EXPORT_SYMBOL_GPL(polyval_update);
+
+void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE])
+{
+	if (unlikely(ctx->partial))
+		polyval_mul(ctx);
+	memcpy(out, &ctx->acc, POLYVAL_BLOCK_SIZE);
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(polyval_final);
+
+#ifdef polyval_mod_init_arch
+static int __init polyval_mod_init(void)
+{
+	polyval_mod_init_arch();
+	return 0;
+}
+subsys_initcall(polyval_mod_init);
+
+static void __exit polyval_mod_exit(void)
+{
+}
+module_exit(polyval_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("POLYVAL almost-XOR-universal hash function");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/powerpc/chacha-p10le-8x.S b/lib/crypto/powerpc/chacha-p10le-8x.S
new file mode 100644
index 000000000000..b29562bd5d40
--- /dev/null
+++ b/lib/crypto/powerpc/chacha-p10le-8x.S
@@ -0,0 +1,840 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# Accelerated chacha20 implementation for ppc64le.
+#
+# Copyright 2023- IBM Corp. All rights reserved
+#
+#===================================================================================
+# Written by Danny Tsen <dtsen@us.ibm.com>
+#
+# do rounds,  8 quarter rounds
+# 1.  a += b; d ^= a; d <<<= 16;
+# 2.  c += d; b ^= c; b <<<= 12;
+# 3.  a += b; d ^= a; d <<<= 8;
+# 4.  c += d; b ^= c; b <<<= 7
+#
+# row1 = (row1 + row2),  row4 = row1 xor row4,  row4 rotate each word by 16
+# row3 = (row3 + row4),  row2 = row3 xor row2,  row2 rotate each word by 12
+# row1 = (row1 + row2), row4 = row1 xor row4,  row4 rotate each word by 8
+# row3 = (row3 + row4), row2 = row3 xor row2,  row2 rotate each word by 7
+#
+# 4 blocks (a b c d)
+#
+# a0 b0 c0 d0
+# a1 b1 c1 d1
+# ...
+# a4 b4 c4 d4
+# ...
+# a8 b8 c8 d8
+# ...
+# a12 b12 c12 d12
+# a13 ...
+# a14 ...
+# a15 b15 c15 d15
+#
+# Column round (v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+# Diagnal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
+#
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+#include <linux/linkage.h>
+
+.machine	"any"
+.text
+
+.macro	SAVE_GPR GPR OFFSET FRAME
+	std	\GPR,\OFFSET(\FRAME)
+.endm
+
+.macro	SAVE_VRS VRS OFFSET FRAME
+	li	16, \OFFSET
+	stvx	\VRS, 16, \FRAME
+.endm
+
+.macro	SAVE_VSX VSX OFFSET FRAME
+	li	16, \OFFSET
+	stxvx	\VSX, 16, \FRAME
+.endm
+
+.macro	RESTORE_GPR GPR OFFSET FRAME
+	ld	\GPR,\OFFSET(\FRAME)
+.endm
+
+.macro	RESTORE_VRS VRS OFFSET FRAME
+	li	16, \OFFSET
+	lvx	\VRS, 16, \FRAME
+.endm
+
+.macro	RESTORE_VSX VSX OFFSET FRAME
+	li	16, \OFFSET
+	lxvx	\VSX, 16, \FRAME
+.endm
+
+.macro SAVE_REGS
+	mflr 0
+	std 0, 16(1)
+	stdu 1,-752(1)
+
+	SAVE_GPR 14, 112, 1
+	SAVE_GPR 15, 120, 1
+	SAVE_GPR 16, 128, 1
+	SAVE_GPR 17, 136, 1
+	SAVE_GPR 18, 144, 1
+	SAVE_GPR 19, 152, 1
+	SAVE_GPR 20, 160, 1
+	SAVE_GPR 21, 168, 1
+	SAVE_GPR 22, 176, 1
+	SAVE_GPR 23, 184, 1
+	SAVE_GPR 24, 192, 1
+	SAVE_GPR 25, 200, 1
+	SAVE_GPR 26, 208, 1
+	SAVE_GPR 27, 216, 1
+	SAVE_GPR 28, 224, 1
+	SAVE_GPR 29, 232, 1
+	SAVE_GPR 30, 240, 1
+	SAVE_GPR 31, 248, 1
+
+	addi	9, 1, 256
+	SAVE_VRS 20, 0, 9
+	SAVE_VRS 21, 16, 9
+	SAVE_VRS 22, 32, 9
+	SAVE_VRS 23, 48, 9
+	SAVE_VRS 24, 64, 9
+	SAVE_VRS 25, 80, 9
+	SAVE_VRS 26, 96, 9
+	SAVE_VRS 27, 112, 9
+	SAVE_VRS 28, 128, 9
+	SAVE_VRS 29, 144, 9
+	SAVE_VRS 30, 160, 9
+	SAVE_VRS 31, 176, 9
+
+	SAVE_VSX 14, 192, 9
+	SAVE_VSX 15, 208, 9
+	SAVE_VSX 16, 224, 9
+	SAVE_VSX 17, 240, 9
+	SAVE_VSX 18, 256, 9
+	SAVE_VSX 19, 272, 9
+	SAVE_VSX 20, 288, 9
+	SAVE_VSX 21, 304, 9
+	SAVE_VSX 22, 320, 9
+	SAVE_VSX 23, 336, 9
+	SAVE_VSX 24, 352, 9
+	SAVE_VSX 25, 368, 9
+	SAVE_VSX 26, 384, 9
+	SAVE_VSX 27, 400, 9
+	SAVE_VSX 28, 416, 9
+	SAVE_VSX 29, 432, 9
+	SAVE_VSX 30, 448, 9
+	SAVE_VSX 31, 464, 9
+.endm # SAVE_REGS
+
+.macro RESTORE_REGS
+	addi	9, 1, 256
+	RESTORE_VRS 20, 0, 9
+	RESTORE_VRS 21, 16, 9
+	RESTORE_VRS 22, 32, 9
+	RESTORE_VRS 23, 48, 9
+	RESTORE_VRS 24, 64, 9
+	RESTORE_VRS 25, 80, 9
+	RESTORE_VRS 26, 96, 9
+	RESTORE_VRS 27, 112, 9
+	RESTORE_VRS 28, 128, 9
+	RESTORE_VRS 29, 144, 9
+	RESTORE_VRS 30, 160, 9
+	RESTORE_VRS 31, 176, 9
+
+	RESTORE_VSX 14, 192, 9
+	RESTORE_VSX 15, 208, 9
+	RESTORE_VSX 16, 224, 9
+	RESTORE_VSX 17, 240, 9
+	RESTORE_VSX 18, 256, 9
+	RESTORE_VSX 19, 272, 9
+	RESTORE_VSX 20, 288, 9
+	RESTORE_VSX 21, 304, 9
+	RESTORE_VSX 22, 320, 9
+	RESTORE_VSX 23, 336, 9
+	RESTORE_VSX 24, 352, 9
+	RESTORE_VSX 25, 368, 9
+	RESTORE_VSX 26, 384, 9
+	RESTORE_VSX 27, 400, 9
+	RESTORE_VSX 28, 416, 9
+	RESTORE_VSX 29, 432, 9
+	RESTORE_VSX 30, 448, 9
+	RESTORE_VSX 31, 464, 9
+
+	RESTORE_GPR 14, 112, 1
+	RESTORE_GPR 15, 120, 1
+	RESTORE_GPR 16, 128, 1
+	RESTORE_GPR 17, 136, 1
+	RESTORE_GPR 18, 144, 1
+	RESTORE_GPR 19, 152, 1
+	RESTORE_GPR 20, 160, 1
+	RESTORE_GPR 21, 168, 1
+	RESTORE_GPR 22, 176, 1
+	RESTORE_GPR 23, 184, 1
+	RESTORE_GPR 24, 192, 1
+	RESTORE_GPR 25, 200, 1
+	RESTORE_GPR 26, 208, 1
+	RESTORE_GPR 27, 216, 1
+	RESTORE_GPR 28, 224, 1
+	RESTORE_GPR 29, 232, 1
+	RESTORE_GPR 30, 240, 1
+	RESTORE_GPR 31, 248, 1
+
+	addi    1, 1, 752
+	ld 0, 16(1)
+	mtlr 0
+.endm # RESTORE_REGS
+
+.macro QT_loop_8x
+	# QR(v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+	xxlor	0, 32+25, 32+25
+	xxlor	32+25, 20, 20
+	vadduwm 0, 0, 4
+	vadduwm 1, 1, 5
+	vadduwm 2, 2, 6
+	vadduwm 3, 3, 7
+	  vadduwm 16, 16, 20
+	  vadduwm 17, 17, 21
+	  vadduwm 18, 18, 22
+	  vadduwm 19, 19, 23
+
+	  vpermxor 12, 12, 0, 25
+	  vpermxor 13, 13, 1, 25
+	  vpermxor 14, 14, 2, 25
+	  vpermxor 15, 15, 3, 25
+	  vpermxor 28, 28, 16, 25
+	  vpermxor 29, 29, 17, 25
+	  vpermxor 30, 30, 18, 25
+	  vpermxor 31, 31, 19, 25
+	xxlor	32+25, 0, 0
+	vadduwm 8, 8, 12
+	vadduwm 9, 9, 13
+	vadduwm 10, 10, 14
+	vadduwm 11, 11, 15
+	  vadduwm 24, 24, 28
+	  vadduwm 25, 25, 29
+	  vadduwm 26, 26, 30
+	  vadduwm 27, 27, 31
+	vxor 4, 4, 8
+	vxor 5, 5, 9
+	vxor 6, 6, 10
+	vxor 7, 7, 11
+	  vxor 20, 20, 24
+	  vxor 21, 21, 25
+	  vxor 22, 22, 26
+	  vxor 23, 23, 27
+
+	xxlor	0, 32+25, 32+25
+	xxlor	32+25, 21, 21
+	vrlw 4, 4, 25  #
+	vrlw 5, 5, 25
+	vrlw 6, 6, 25
+	vrlw 7, 7, 25
+	  vrlw 20, 20, 25  #
+	  vrlw 21, 21, 25
+	  vrlw 22, 22, 25
+	  vrlw 23, 23, 25
+	xxlor	32+25, 0, 0
+	vadduwm 0, 0, 4
+	vadduwm 1, 1, 5
+	vadduwm 2, 2, 6
+	vadduwm 3, 3, 7
+	  vadduwm 16, 16, 20
+	  vadduwm 17, 17, 21
+	  vadduwm 18, 18, 22
+	  vadduwm 19, 19, 23
+
+	xxlor	0, 32+25, 32+25
+	xxlor	32+25, 22, 22
+	  vpermxor 12, 12, 0, 25
+	  vpermxor 13, 13, 1, 25
+	  vpermxor 14, 14, 2, 25
+	  vpermxor 15, 15, 3, 25
+	  vpermxor 28, 28, 16, 25
+	  vpermxor 29, 29, 17, 25
+	  vpermxor 30, 30, 18, 25
+	  vpermxor 31, 31, 19, 25
+	xxlor	32+25, 0, 0
+	vadduwm 8, 8, 12
+	vadduwm 9, 9, 13
+	vadduwm 10, 10, 14
+	vadduwm 11, 11, 15
+	  vadduwm 24, 24, 28
+	  vadduwm 25, 25, 29
+	  vadduwm 26, 26, 30
+	  vadduwm 27, 27, 31
+	xxlor	0, 32+28, 32+28
+	xxlor	32+28, 23, 23
+	vxor 4, 4, 8
+	vxor 5, 5, 9
+	vxor 6, 6, 10
+	vxor 7, 7, 11
+	  vxor 20, 20, 24
+	  vxor 21, 21, 25
+	  vxor 22, 22, 26
+	  vxor 23, 23, 27
+	vrlw 4, 4, 28  #
+	vrlw 5, 5, 28
+	vrlw 6, 6, 28
+	vrlw 7, 7, 28
+	  vrlw 20, 20, 28  #
+	  vrlw 21, 21, 28
+	  vrlw 22, 22, 28
+	  vrlw 23, 23, 28
+	xxlor	32+28, 0, 0
+
+	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
+	xxlor	0, 32+25, 32+25
+	xxlor	32+25, 20, 20
+	vadduwm 0, 0, 5
+	vadduwm 1, 1, 6
+	vadduwm 2, 2, 7
+	vadduwm 3, 3, 4
+	  vadduwm 16, 16, 21
+	  vadduwm 17, 17, 22
+	  vadduwm 18, 18, 23
+	  vadduwm 19, 19, 20
+
+	  vpermxor 15, 15, 0, 25
+	  vpermxor 12, 12, 1, 25
+	  vpermxor 13, 13, 2, 25
+	  vpermxor 14, 14, 3, 25
+	  vpermxor 31, 31, 16, 25
+	  vpermxor 28, 28, 17, 25
+	  vpermxor 29, 29, 18, 25
+	  vpermxor 30, 30, 19, 25
+
+	xxlor	32+25, 0, 0
+	vadduwm 10, 10, 15
+	vadduwm 11, 11, 12
+	vadduwm 8, 8, 13
+	vadduwm 9, 9, 14
+	  vadduwm 26, 26, 31
+	  vadduwm 27, 27, 28
+	  vadduwm 24, 24, 29
+	  vadduwm 25, 25, 30
+	vxor 5, 5, 10
+	vxor 6, 6, 11
+	vxor 7, 7, 8
+	vxor 4, 4, 9
+	  vxor 21, 21, 26
+	  vxor 22, 22, 27
+	  vxor 23, 23, 24
+	  vxor 20, 20, 25
+
+	xxlor	0, 32+25, 32+25
+	xxlor	32+25, 21, 21
+	vrlw 5, 5, 25
+	vrlw 6, 6, 25
+	vrlw 7, 7, 25
+	vrlw 4, 4, 25
+	  vrlw 21, 21, 25
+	  vrlw 22, 22, 25
+	  vrlw 23, 23, 25
+	  vrlw 20, 20, 25
+	xxlor	32+25, 0, 0
+
+	vadduwm 0, 0, 5
+	vadduwm 1, 1, 6
+	vadduwm 2, 2, 7
+	vadduwm 3, 3, 4
+	  vadduwm 16, 16, 21
+	  vadduwm 17, 17, 22
+	  vadduwm 18, 18, 23
+	  vadduwm 19, 19, 20
+
+	xxlor	0, 32+25, 32+25
+	xxlor	32+25, 22, 22
+	  vpermxor 15, 15, 0, 25
+	  vpermxor 12, 12, 1, 25
+	  vpermxor 13, 13, 2, 25
+	  vpermxor 14, 14, 3, 25
+	  vpermxor 31, 31, 16, 25
+	  vpermxor 28, 28, 17, 25
+	  vpermxor 29, 29, 18, 25
+	  vpermxor 30, 30, 19, 25
+	xxlor	32+25, 0, 0
+
+	vadduwm 10, 10, 15
+	vadduwm 11, 11, 12
+	vadduwm 8, 8, 13
+	vadduwm 9, 9, 14
+	  vadduwm 26, 26, 31
+	  vadduwm 27, 27, 28
+	  vadduwm 24, 24, 29
+	  vadduwm 25, 25, 30
+
+	xxlor	0, 32+28, 32+28
+	xxlor	32+28, 23, 23
+	vxor 5, 5, 10
+	vxor 6, 6, 11
+	vxor 7, 7, 8
+	vxor 4, 4, 9
+	  vxor 21, 21, 26
+	  vxor 22, 22, 27
+	  vxor 23, 23, 24
+	  vxor 20, 20, 25
+	vrlw 5, 5, 28
+	vrlw 6, 6, 28
+	vrlw 7, 7, 28
+	vrlw 4, 4, 28
+	  vrlw 21, 21, 28
+	  vrlw 22, 22, 28
+	  vrlw 23, 23, 28
+	  vrlw 20, 20, 28
+	xxlor	32+28, 0, 0
+.endm
+
+.macro QT_loop_4x
+	# QR(v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+	vadduwm 0, 0, 4
+	vadduwm 1, 1, 5
+	vadduwm 2, 2, 6
+	vadduwm 3, 3, 7
+	  vpermxor 12, 12, 0, 20
+	  vpermxor 13, 13, 1, 20
+	  vpermxor 14, 14, 2, 20
+	  vpermxor 15, 15, 3, 20
+	vadduwm 8, 8, 12
+	vadduwm 9, 9, 13
+	vadduwm 10, 10, 14
+	vadduwm 11, 11, 15
+	vxor 4, 4, 8
+	vxor 5, 5, 9
+	vxor 6, 6, 10
+	vxor 7, 7, 11
+	vrlw 4, 4, 21
+	vrlw 5, 5, 21
+	vrlw 6, 6, 21
+	vrlw 7, 7, 21
+	vadduwm 0, 0, 4
+	vadduwm 1, 1, 5
+	vadduwm 2, 2, 6
+	vadduwm 3, 3, 7
+	  vpermxor 12, 12, 0, 22
+	  vpermxor 13, 13, 1, 22
+	  vpermxor 14, 14, 2, 22
+	  vpermxor 15, 15, 3, 22
+	vadduwm 8, 8, 12
+	vadduwm 9, 9, 13
+	vadduwm 10, 10, 14
+	vadduwm 11, 11, 15
+	vxor 4, 4, 8
+	vxor 5, 5, 9
+	vxor 6, 6, 10
+	vxor 7, 7, 11
+	vrlw 4, 4, 23
+	vrlw 5, 5, 23
+	vrlw 6, 6, 23
+	vrlw 7, 7, 23
+
+	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
+	vadduwm 0, 0, 5
+	vadduwm 1, 1, 6
+	vadduwm 2, 2, 7
+	vadduwm 3, 3, 4
+	  vpermxor 15, 15, 0, 20
+	  vpermxor 12, 12, 1, 20
+	  vpermxor 13, 13, 2, 20
+	  vpermxor 14, 14, 3, 20
+	vadduwm 10, 10, 15
+	vadduwm 11, 11, 12
+	vadduwm 8, 8, 13
+	vadduwm 9, 9, 14
+	vxor 5, 5, 10
+	vxor 6, 6, 11
+	vxor 7, 7, 8
+	vxor 4, 4, 9
+	vrlw 5, 5, 21
+	vrlw 6, 6, 21
+	vrlw 7, 7, 21
+	vrlw 4, 4, 21
+	vadduwm 0, 0, 5
+	vadduwm 1, 1, 6
+	vadduwm 2, 2, 7
+	vadduwm 3, 3, 4
+	  vpermxor 15, 15, 0, 22
+	  vpermxor 12, 12, 1, 22
+	  vpermxor 13, 13, 2, 22
+	  vpermxor 14, 14, 3, 22
+	vadduwm 10, 10, 15
+	vadduwm 11, 11, 12
+	vadduwm 8, 8, 13
+	vadduwm 9, 9, 14
+	vxor 5, 5, 10
+	vxor 6, 6, 11
+	vxor 7, 7, 8
+	vxor 4, 4, 9
+	vrlw 5, 5, 23
+	vrlw 6, 6, 23
+	vrlw 7, 7, 23
+	vrlw 4, 4, 23
+.endm
+
+# Transpose
+.macro TP_4x a0 a1 a2 a3
+	xxmrghw  10, 32+\a0, 32+\a1	# a0, a1, b0, b1
+	xxmrghw  11, 32+\a2, 32+\a3	# a2, a3, b2, b3
+	xxmrglw  12, 32+\a0, 32+\a1	# c0, c1, d0, d1
+	xxmrglw  13, 32+\a2, 32+\a3	# c2, c3, d2, d3
+	xxpermdi	32+\a0, 10, 11, 0	# a0, a1, a2, a3
+	xxpermdi	32+\a1, 10, 11, 3	# b0, b1, b2, b3
+	xxpermdi	32+\a2, 12, 13, 0	# c0, c1, c2, c3
+	xxpermdi	32+\a3, 12, 13, 3	# d0, d1, d2, d3
+.endm
+
+# key stream = working state + state
+.macro Add_state S
+	vadduwm \S+0, \S+0, 16-\S
+	vadduwm \S+4, \S+4, 17-\S
+	vadduwm \S+8, \S+8, 18-\S
+	vadduwm \S+12, \S+12, 19-\S
+
+	vadduwm \S+1, \S+1, 16-\S
+	vadduwm \S+5, \S+5, 17-\S
+	vadduwm \S+9, \S+9, 18-\S
+	vadduwm \S+13, \S+13, 19-\S
+
+	vadduwm \S+2, \S+2, 16-\S
+	vadduwm \S+6, \S+6, 17-\S
+	vadduwm \S+10, \S+10, 18-\S
+	vadduwm \S+14, \S+14, 19-\S
+
+	vadduwm	\S+3, \S+3, 16-\S
+	vadduwm	\S+7, \S+7, 17-\S
+	vadduwm	\S+11, \S+11, 18-\S
+	vadduwm	\S+15, \S+15, 19-\S
+.endm
+
+#
+# write 256 bytes
+#
+.macro Write_256 S
+	add 9, 14, 5
+	add 16, 14, 4
+	lxvw4x 0, 0, 9
+	lxvw4x 1, 17, 9
+	lxvw4x 2, 18, 9
+	lxvw4x 3, 19, 9
+	lxvw4x 4, 20, 9
+	lxvw4x 5, 21, 9
+	lxvw4x 6, 22, 9
+	lxvw4x 7, 23, 9
+	lxvw4x 8, 24, 9
+	lxvw4x 9, 25, 9
+	lxvw4x 10, 26, 9
+	lxvw4x 11, 27, 9
+	lxvw4x 12, 28, 9
+	lxvw4x 13, 29, 9
+	lxvw4x 14, 30, 9
+	lxvw4x 15, 31, 9
+
+	xxlxor \S+32, \S+32, 0
+	xxlxor \S+36, \S+36, 1
+	xxlxor \S+40, \S+40, 2
+	xxlxor \S+44, \S+44, 3
+	xxlxor \S+33, \S+33, 4
+	xxlxor \S+37, \S+37, 5
+	xxlxor \S+41, \S+41, 6
+	xxlxor \S+45, \S+45, 7
+	xxlxor \S+34, \S+34, 8
+	xxlxor \S+38, \S+38, 9
+	xxlxor \S+42, \S+42, 10
+	xxlxor \S+46, \S+46, 11
+	xxlxor \S+35, \S+35, 12
+	xxlxor \S+39, \S+39, 13
+	xxlxor \S+43, \S+43, 14
+	xxlxor \S+47, \S+47, 15
+
+	stxvw4x \S+32, 0, 16
+	stxvw4x \S+36, 17, 16
+	stxvw4x \S+40, 18, 16
+	stxvw4x \S+44, 19, 16
+
+	stxvw4x \S+33, 20, 16
+	stxvw4x \S+37, 21, 16
+	stxvw4x \S+41, 22, 16
+	stxvw4x \S+45, 23, 16
+
+	stxvw4x \S+34, 24, 16
+	stxvw4x \S+38, 25, 16
+	stxvw4x \S+42, 26, 16
+	stxvw4x \S+46, 27, 16
+
+	stxvw4x \S+35, 28, 16
+	stxvw4x \S+39, 29, 16
+	stxvw4x \S+43, 30, 16
+	stxvw4x \S+47, 31, 16
+
+.endm
+
+#
+# void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, const u8 *src,
+#		       unsigned int len, int nrounds);
+#
+SYM_FUNC_START(chacha_p10le_8x)
+.align 5
+	cmpdi	6, 0
+	ble	Out_no_chacha
+
+	SAVE_REGS
+
+	# r17 - r31 mainly for Write_256 macro.
+	li	17, 16
+	li	18, 32
+	li	19, 48
+	li	20, 64
+	li	21, 80
+	li	22, 96
+	li	23, 112
+	li	24, 128
+	li	25, 144
+	li	26, 160
+	li	27, 176
+	li	28, 192
+	li	29, 208
+	li	30, 224
+	li	31, 240
+
+	mr 15, 6			# len
+	li 14, 0			# offset to inp and outp
+
+        lxvw4x	48, 0, 3		#  vr16, constants
+	lxvw4x	49, 17, 3		#  vr17, key 1
+	lxvw4x	50, 18, 3		#  vr18, key 2
+	lxvw4x	51, 19, 3		#  vr19, counter, nonce
+
+	# create (0, 1, 2, 3) counters
+	vspltisw 0, 0
+	vspltisw 1, 1
+	vspltisw 2, 2
+	vspltisw 3, 3
+	vmrghw	4, 0, 1
+	vmrglw	5, 2, 3
+	vsldoi	30, 4, 5, 8		# vr30 counter, 4 (0, 1, 2, 3)
+
+	vspltisw 21, 12
+	vspltisw 23, 7
+
+	addis	11, 2, permx@toc@ha
+	addi	11, 11, permx@toc@l
+	lxvw4x	32+20, 0, 11
+	lxvw4x	32+22, 17, 11
+
+	sradi	8, 7, 1
+
+	mtctr 8
+
+	# save constants to vsx
+	xxlor	16, 48, 48
+	xxlor	17, 49, 49
+	xxlor	18, 50, 50
+	xxlor	19, 51, 51
+
+	vspltisw 25, 4
+	vspltisw 26, 8
+
+	xxlor	25, 32+26, 32+26
+	xxlor	24, 32+25, 32+25
+
+	vadduwm	31, 30, 25		# counter = (0, 1, 2, 3) + (4, 4, 4, 4)
+	xxlor	30, 32+30, 32+30
+	xxlor	31, 32+31, 32+31
+
+	xxlor	20, 32+20, 32+20
+	xxlor	21, 32+21, 32+21
+	xxlor	22, 32+22, 32+22
+	xxlor	23, 32+23, 32+23
+
+	cmpdi	6, 512
+	blt	Loop_last
+
+Loop_8x:
+	xxspltw  32+0, 16, 0
+	xxspltw  32+1, 16, 1
+	xxspltw  32+2, 16, 2
+	xxspltw  32+3, 16, 3
+
+	xxspltw  32+4, 17, 0
+	xxspltw  32+5, 17, 1
+	xxspltw  32+6, 17, 2
+	xxspltw  32+7, 17, 3
+	xxspltw  32+8, 18, 0
+	xxspltw  32+9, 18, 1
+	xxspltw  32+10, 18, 2
+	xxspltw  32+11, 18, 3
+	xxspltw  32+12, 19, 0
+	xxspltw  32+13, 19, 1
+	xxspltw  32+14, 19, 2
+	xxspltw  32+15, 19, 3
+	vadduwm	12, 12, 30	# increase counter
+
+	xxspltw  32+16, 16, 0
+	xxspltw  32+17, 16, 1
+	xxspltw  32+18, 16, 2
+	xxspltw  32+19, 16, 3
+
+	xxspltw  32+20, 17, 0
+	xxspltw  32+21, 17, 1
+	xxspltw  32+22, 17, 2
+	xxspltw  32+23, 17, 3
+	xxspltw  32+24, 18, 0
+	xxspltw  32+25, 18, 1
+	xxspltw  32+26, 18, 2
+	xxspltw  32+27, 18, 3
+	xxspltw  32+28, 19, 0
+	xxspltw  32+29, 19, 1
+	vadduwm	28, 28, 31	# increase counter
+	xxspltw  32+30, 19, 2
+	xxspltw  32+31, 19, 3
+
+.align 5
+quarter_loop_8x:
+	QT_loop_8x
+
+	bdnz	quarter_loop_8x
+
+	xxlor	0, 32+30, 32+30
+	xxlor	32+30, 30, 30
+	vadduwm	12, 12, 30
+	xxlor	32+30, 0, 0
+	TP_4x 0, 1, 2, 3
+	TP_4x 4, 5, 6, 7
+	TP_4x 8, 9, 10, 11
+	TP_4x 12, 13, 14, 15
+
+	xxlor	0, 48, 48
+	xxlor	1, 49, 49
+	xxlor	2, 50, 50
+	xxlor	3, 51, 51
+	xxlor	48, 16, 16
+	xxlor	49, 17, 17
+	xxlor	50, 18, 18
+	xxlor	51, 19, 19
+	Add_state 0
+	xxlor	48, 0, 0
+	xxlor	49, 1, 1
+	xxlor	50, 2, 2
+	xxlor	51, 3, 3
+	Write_256 0
+	addi	14, 14, 256	# offset +=256
+	addi	15, 15, -256	# len -=256
+
+	xxlor	5, 32+31, 32+31
+	xxlor	32+31, 31, 31
+	vadduwm	28, 28, 31
+	xxlor	32+31, 5, 5
+	TP_4x 16+0, 16+1, 16+2, 16+3
+	TP_4x 16+4, 16+5, 16+6, 16+7
+	TP_4x 16+8, 16+9, 16+10, 16+11
+	TP_4x 16+12, 16+13, 16+14, 16+15
+
+	xxlor	32, 16, 16
+	xxlor	33, 17, 17
+	xxlor	34, 18, 18
+	xxlor	35, 19, 19
+	Add_state 16
+	Write_256 16
+	addi	14, 14, 256	# offset +=256
+	addi	15, 15, -256	# len +=256
+
+	xxlor	32+24, 24, 24
+	xxlor	32+25, 25, 25
+	xxlor	32+30, 30, 30
+	vadduwm	30, 30, 25
+	vadduwm	31, 30, 24
+	xxlor	30, 32+30, 32+30
+	xxlor	31, 32+31, 32+31
+
+	cmpdi	15, 0
+	beq	Out_loop
+
+	cmpdi	15, 512
+	blt	Loop_last
+
+	mtctr 8
+	b Loop_8x
+
+Loop_last:
+        lxvw4x	48, 0, 3		#  vr16, constants
+	lxvw4x	49, 17, 3		#  vr17, key 1
+	lxvw4x	50, 18, 3		#  vr18, key 2
+	lxvw4x	51, 19, 3		#  vr19, counter, nonce
+
+	vspltisw 21, 12
+	vspltisw 23, 7
+	addis	11, 2, permx@toc@ha
+	addi	11, 11, permx@toc@l
+	lxvw4x	32+20, 0, 11
+	lxvw4x	32+22, 17, 11
+
+	sradi	8, 7, 1
+	mtctr 8
+
+Loop_4x:
+	vspltw  0, 16, 0
+	vspltw  1, 16, 1
+	vspltw  2, 16, 2
+	vspltw  3, 16, 3
+
+	vspltw  4, 17, 0
+	vspltw  5, 17, 1
+	vspltw  6, 17, 2
+	vspltw  7, 17, 3
+	vspltw  8, 18, 0
+	vspltw  9, 18, 1
+	vspltw  10, 18, 2
+	vspltw  11, 18, 3
+	vspltw  12, 19, 0
+	vadduwm	12, 12, 30	# increase counter
+	vspltw  13, 19, 1
+	vspltw  14, 19, 2
+	vspltw  15, 19, 3
+
+.align 5
+quarter_loop:
+	QT_loop_4x
+
+	bdnz	quarter_loop
+
+	vadduwm	12, 12, 30
+	TP_4x 0, 1, 2, 3
+	TP_4x 4, 5, 6, 7
+	TP_4x 8, 9, 10, 11
+	TP_4x 12, 13, 14, 15
+
+	Add_state 0
+	Write_256 0
+	addi	14, 14, 256	# offset += 256
+	addi	15, 15, -256	# len += 256
+
+	# Update state counter
+	vspltisw 25, 4
+	vadduwm	30, 30, 25
+
+	cmpdi	15, 0
+	beq	Out_loop
+	cmpdi	15, 256
+	blt	Out_loop
+
+	mtctr 8
+	b Loop_4x
+
+Out_loop:
+	RESTORE_REGS
+	blr
+
+Out_no_chacha:
+	li	3, 0
+	blr
+SYM_FUNC_END(chacha_p10le_8x)
+
+SYM_DATA_START_LOCAL(PERMX)
+.align 5
+permx:
+.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
+.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
+SYM_DATA_END(PERMX)
diff --git a/lib/crypto/powerpc/chacha.h b/lib/crypto/powerpc/chacha.h
new file mode 100644
index 000000000000..1df6e1ce31c4
--- /dev/null
+++ b/lib/crypto/powerpc/chacha.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ChaCha stream cipher (P10 accelerated)
+ *
+ * Copyright 2023- IBM Corp. All rights reserved.
+ */
+
+#include <crypto/internal/simd.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
+#include <linux/sizes.h>
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+
+asmlinkage void chacha_p10le_8x(const struct chacha_state *state, u8 *dst,
+				const u8 *src, unsigned int len, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
+
+static void vsx_begin(void)
+{
+	preempt_disable();
+	enable_kernel_vsx();
+}
+
+static void vsx_end(void)
+{
+	disable_kernel_vsx();
+	preempt_enable();
+}
+
+static void chacha_p10_do_8x(struct chacha_state *state, u8 *dst, const u8 *src,
+			     unsigned int bytes, int nrounds)
+{
+	unsigned int l = bytes & ~0x0FF;
+
+	if (l > 0) {
+		chacha_p10le_8x(state, dst, src, l, nrounds);
+		bytes -= l;
+		src += l;
+		dst += l;
+		state->x[12] += l / CHACHA_BLOCK_SIZE;
+	}
+
+	if (bytes > 0)
+		chacha_crypt_generic(state, dst, src, bytes, nrounds);
+}
+
+#define hchacha_block_arch hchacha_block_generic /* not implemented yet */
+
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+			      const u8 *src, unsigned int bytes, int nrounds)
+{
+	if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE ||
+	    !crypto_simd_usable())
+		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+	do {
+		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+		vsx_begin();
+		chacha_p10_do_8x(state, dst, src, todo, nrounds);
+		vsx_end();
+
+		bytes -= todo;
+		src += todo;
+		dst += todo;
+	} while (bytes);
+}
+
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
+{
+	if (cpu_has_feature(CPU_FTR_ARCH_31))
+		static_branch_enable(&have_p10);
+}
diff --git a/lib/crypto/powerpc/curve25519-ppc64le_asm.S b/lib/crypto/powerpc/curve25519-ppc64le_asm.S
new file mode 100644
index 000000000000..06c1febe24b9
--- /dev/null
+++ b/lib/crypto/powerpc/curve25519-ppc64le_asm.S
@@ -0,0 +1,671 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# This code is taken from CRYPTOGAMs[1] and is included here using the option
+# in the license to distribute the code under the GPL. Therefore this program
+# is free software; you can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2 as published by the Free Software
+# Foundation.
+#
+# [1] https://github.com/dot-asm/cryptogams/
+
+# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+#       * Redistributions of source code must retain copyright notices,
+#         this list of conditions and the following disclaimer.
+#
+#       * Redistributions in binary form must reproduce the above
+#         copyright notice, this list of conditions and the following
+#         disclaimer in the documentation and/or other materials
+#         provided with the distribution.
+#
+#       * Neither the name of the CRYPTOGAMS nor the names of its
+#         copyright holder and contributors may be used to endorse or
+#         promote products derived from this software without specific
+#         prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+#
+# ====================================================================
+# Written and Modified by Danny Tsen <dtsen@us.ibm.com>
+# - Added x25519_fe51_sqr_times, x25519_fe51_frombytes, x25519_fe51_tobytes
+#   and x25519_cswap
+#
+# Copyright 2024- IBM Corp.
+#
+# X25519 lower-level primitives for PPC64.
+#
+
+#include <linux/linkage.h>
+
+.text
+
+.align	5
+SYM_FUNC_START(x25519_fe51_mul)
+
+	stdu	1,-144(1)
+	std	21,56(1)
+	std	22,64(1)
+	std	23,72(1)
+	std	24,80(1)
+	std	25,88(1)
+	std	26,96(1)
+	std	27,104(1)
+	std	28,112(1)
+	std	29,120(1)
+	std	30,128(1)
+	std	31,136(1)
+
+	ld	6,0(5)
+	ld	7,0(4)
+	ld	8,8(4)
+	ld	9,16(4)
+	ld	10,24(4)
+	ld	11,32(4)
+
+	mulld	22,7,6
+	mulhdu	23,7,6
+
+	mulld	24,8,6
+	mulhdu	25,8,6
+
+	mulld	30,11,6
+	mulhdu	31,11,6
+	ld	4,8(5)
+	mulli	11,11,19
+
+	mulld	26,9,6
+	mulhdu	27,9,6
+
+	mulld	28,10,6
+	mulhdu	29,10,6
+	mulld	12,11,4
+	mulhdu	21,11,4
+	addc	22,22,12
+	adde	23,23,21
+
+	mulld	12,7,4
+	mulhdu	21,7,4
+	addc	24,24,12
+	adde	25,25,21
+
+	mulld	12,10,4
+	mulhdu	21,10,4
+	ld	6,16(5)
+	mulli	10,10,19
+	addc	30,30,12
+	adde	31,31,21
+
+	mulld	12,8,4
+	mulhdu	21,8,4
+	addc	26,26,12
+	adde	27,27,21
+
+	mulld	12,9,4
+	mulhdu	21,9,4
+	addc	28,28,12
+	adde	29,29,21
+	mulld	12,10,6
+	mulhdu	21,10,6
+	addc	22,22,12
+	adde	23,23,21
+
+	mulld	12,11,6
+	mulhdu	21,11,6
+	addc	24,24,12
+	adde	25,25,21
+
+	mulld	12,9,6
+	mulhdu	21,9,6
+	ld	4,24(5)
+	mulli	9,9,19
+	addc	30,30,12
+	adde	31,31,21
+
+	mulld	12,7,6
+	mulhdu	21,7,6
+	addc	26,26,12
+	adde	27,27,21
+
+	mulld	12,8,6
+	mulhdu	21,8,6
+	addc	28,28,12
+	adde	29,29,21
+	mulld	12,9,4
+	mulhdu	21,9,4
+	addc	22,22,12
+	adde	23,23,21
+
+	mulld	12,10,4
+	mulhdu	21,10,4
+	addc	24,24,12
+	adde	25,25,21
+
+	mulld	12,8,4
+	mulhdu	21,8,4
+	ld	6,32(5)
+	mulli	8,8,19
+	addc	30,30,12
+	adde	31,31,21
+
+	mulld	12,11,4
+	mulhdu	21,11,4
+	addc	26,26,12
+	adde	27,27,21
+
+	mulld	12,7,4
+	mulhdu	21,7,4
+	addc	28,28,12
+	adde	29,29,21
+	mulld	12,8,6
+	mulhdu	21,8,6
+	addc	22,22,12
+	adde	23,23,21
+
+	mulld	12,9,6
+	mulhdu	21,9,6
+	addc	24,24,12
+	adde	25,25,21
+
+	mulld	12,10,6
+	mulhdu	21,10,6
+	addc	26,26,12
+	adde	27,27,21
+
+	mulld	12,11,6
+	mulhdu	21,11,6
+	addc	28,28,12
+	adde	29,29,21
+
+	mulld	12,7,6
+	mulhdu	21,7,6
+	addc	30,30,12
+	adde	31,31,21
+
+.Lfe51_reduce:
+	li	0,-1
+	srdi	0,0,13
+
+	srdi	12,26,51
+	and	9,26,0
+	insrdi	12,27,51,0
+	srdi	21,22,51
+	and	7,22,0
+	insrdi	21,23,51,0
+	addc	28,28,12
+	addze	29,29
+	addc	24,24,21
+	addze	25,25
+
+	srdi	12,28,51
+	and	10,28,0
+	insrdi	12,29,51,0
+	srdi	21,24,51
+	and	8,24,0
+	insrdi	21,25,51,0
+	addc	30,30,12
+	addze	31,31
+	add	9,9,21
+
+	srdi	12,30,51
+	and	11,30,0
+	insrdi	12,31,51,0
+	mulli	12,12,19
+
+	add	7,7,12
+
+	srdi	21,9,51
+	and	9,9,0
+	add	10,10,21
+
+	srdi	12,7,51
+	and	7,7,0
+	add	8,8,12
+
+	std	9,16(3)
+	std	10,24(3)
+	std	11,32(3)
+	std	7,0(3)
+	std	8,8(3)
+
+	ld	21,56(1)
+	ld	22,64(1)
+	ld	23,72(1)
+	ld	24,80(1)
+	ld	25,88(1)
+	ld	26,96(1)
+	ld	27,104(1)
+	ld	28,112(1)
+	ld	29,120(1)
+	ld	30,128(1)
+	ld	31,136(1)
+	addi	1,1,144
+	blr
+SYM_FUNC_END(x25519_fe51_mul)
+
+.align	5
+SYM_FUNC_START(x25519_fe51_sqr)
+
+	stdu	1,-144(1)
+	std	21,56(1)
+	std	22,64(1)
+	std	23,72(1)
+	std	24,80(1)
+	std	25,88(1)
+	std	26,96(1)
+	std	27,104(1)
+	std	28,112(1)
+	std	29,120(1)
+	std	30,128(1)
+	std	31,136(1)
+
+	ld	7,0(4)
+	ld	8,8(4)
+	ld	9,16(4)
+	ld	10,24(4)
+	ld	11,32(4)
+
+	add	6,7,7
+	mulli	21,11,19
+
+	mulld	22,7,7
+	mulhdu	23,7,7
+	mulld	24,8,6
+	mulhdu	25,8,6
+	mulld	26,9,6
+	mulhdu	27,9,6
+	mulld	28,10,6
+	mulhdu	29,10,6
+	mulld	30,11,6
+	mulhdu	31,11,6
+	add	6,8,8
+	mulld	12,11,21
+	mulhdu	11,11,21
+	addc	28,28,12
+	adde	29,29,11
+
+	mulli	5,10,19
+
+	mulld	12,8,8
+	mulhdu	11,8,8
+	addc	26,26,12
+	adde	27,27,11
+	mulld	12,9,6
+	mulhdu	11,9,6
+	addc	28,28,12
+	adde	29,29,11
+	mulld	12,10,6
+	mulhdu	11,10,6
+	addc	30,30,12
+	adde	31,31,11
+	mulld	12,21,6
+	mulhdu	11,21,6
+	add	6,10,10
+	addc	22,22,12
+	adde	23,23,11
+	mulld	12,10,5
+	mulhdu	10,10,5
+	addc	24,24,12
+	adde	25,25,10
+	mulld	12,6,21
+	mulhdu	10,6,21
+	add	6,9,9
+	addc	26,26,12
+	adde	27,27,10
+
+	mulld	12,9,9
+	mulhdu	10,9,9
+	addc	30,30,12
+	adde	31,31,10
+	mulld	12,5,6
+	mulhdu	10,5,6
+	addc	22,22,12
+	adde	23,23,10
+	mulld	12,21,6
+	mulhdu	10,21,6
+	addc	24,24,12
+	adde	25,25,10
+
+	b	.Lfe51_reduce
+SYM_FUNC_END(x25519_fe51_sqr)
+
+.align	5
+SYM_FUNC_START(x25519_fe51_mul121666)
+
+	stdu	1,-144(1)
+	std	21,56(1)
+	std	22,64(1)
+	std	23,72(1)
+	std	24,80(1)
+	std	25,88(1)
+	std	26,96(1)
+	std	27,104(1)
+	std	28,112(1)
+	std	29,120(1)
+	std	30,128(1)
+	std	31,136(1)
+
+	lis	6,1
+	ori	6,6,56130
+	ld	7,0(4)
+	ld	8,8(4)
+	ld	9,16(4)
+	ld	10,24(4)
+	ld	11,32(4)
+
+	mulld	22,7,6
+	mulhdu	23,7,6
+	mulld	24,8,6
+	mulhdu	25,8,6
+	mulld	26,9,6
+	mulhdu	27,9,6
+	mulld	28,10,6
+	mulhdu	29,10,6
+	mulld	30,11,6
+	mulhdu	31,11,6
+
+	b	.Lfe51_reduce
+SYM_FUNC_END(x25519_fe51_mul121666)
+
+.align	5
+SYM_FUNC_START(x25519_fe51_sqr_times)
+
+	stdu	1,-144(1)
+	std	21,56(1)
+	std	22,64(1)
+	std	23,72(1)
+	std	24,80(1)
+	std	25,88(1)
+	std	26,96(1)
+	std	27,104(1)
+	std	28,112(1)
+	std	29,120(1)
+	std	30,128(1)
+	std	31,136(1)
+
+	ld	7,0(4)
+	ld	8,8(4)
+	ld	9,16(4)
+	ld	10,24(4)
+	ld	11,32(4)
+
+	mtctr	5
+
+.Lsqr_times_loop:
+	add	6,7,7
+	mulli	21,11,19
+
+	mulld	22,7,7
+	mulhdu	23,7,7
+	mulld	24,8,6
+	mulhdu	25,8,6
+	mulld	26,9,6
+	mulhdu	27,9,6
+	mulld	28,10,6
+	mulhdu	29,10,6
+	mulld	30,11,6
+	mulhdu	31,11,6
+	add	6,8,8
+	mulld	12,11,21
+	mulhdu	11,11,21
+	addc	28,28,12
+	adde	29,29,11
+
+	mulli	5,10,19
+
+	mulld	12,8,8
+	mulhdu	11,8,8
+	addc	26,26,12
+	adde	27,27,11
+	mulld	12,9,6
+	mulhdu	11,9,6
+	addc	28,28,12
+	adde	29,29,11
+	mulld	12,10,6
+	mulhdu	11,10,6
+	addc	30,30,12
+	adde	31,31,11
+	mulld	12,21,6
+	mulhdu	11,21,6
+	add	6,10,10
+	addc	22,22,12
+	adde	23,23,11
+	mulld	12,10,5
+	mulhdu	10,10,5
+	addc	24,24,12
+	adde	25,25,10
+	mulld	12,6,21
+	mulhdu	10,6,21
+	add	6,9,9
+	addc	26,26,12
+	adde	27,27,10
+
+	mulld	12,9,9
+	mulhdu	10,9,9
+	addc	30,30,12
+	adde	31,31,10
+	mulld	12,5,6
+	mulhdu	10,5,6
+	addc	22,22,12
+	adde	23,23,10
+	mulld	12,21,6
+	mulhdu	10,21,6
+	addc	24,24,12
+	adde	25,25,10
+
+	# fe51_reduce
+	li	0,-1
+	srdi	0,0,13
+
+	srdi	12,26,51
+	and	9,26,0
+	insrdi	12,27,51,0
+	srdi	21,22,51
+	and	7,22,0
+	insrdi	21,23,51,0
+	addc	28,28,12
+	addze	29,29
+	addc	24,24,21
+	addze	25,25
+
+	srdi	12,28,51
+	and	10,28,0
+	insrdi	12,29,51,0
+	srdi	21,24,51
+	and	8,24,0
+	insrdi	21,25,51,0
+	addc	30,30,12
+	addze	31,31
+	add	9,9,21
+
+	srdi	12,30,51
+	and	11,30,0
+	insrdi	12,31,51,0
+	mulli	12,12,19
+
+	add	7,7,12
+
+	srdi	21,9,51
+	and	9,9,0
+	add	10,10,21
+
+	srdi	12,7,51
+	and	7,7,0
+	add	8,8,12
+
+	bdnz	.Lsqr_times_loop
+
+	std	9,16(3)
+	std	10,24(3)
+	std	11,32(3)
+	std	7,0(3)
+	std	8,8(3)
+
+	ld	21,56(1)
+	ld	22,64(1)
+	ld	23,72(1)
+	ld	24,80(1)
+	ld	25,88(1)
+	ld	26,96(1)
+	ld	27,104(1)
+	ld	28,112(1)
+	ld	29,120(1)
+	ld	30,128(1)
+	ld	31,136(1)
+	addi	1,1,144
+	blr
+SYM_FUNC_END(x25519_fe51_sqr_times)
+
+.align	5
+SYM_FUNC_START(x25519_fe51_frombytes)
+
+	li	12, -1
+	srdi	12, 12, 13	# 0x7ffffffffffff
+
+	ld	5, 0(4)
+	ld	6, 8(4)
+	ld	7, 16(4)
+	ld	8, 24(4)
+
+	srdi	10, 5, 51
+	and	5, 5, 12	# h0
+
+	sldi	11, 6, 13
+	or	11, 10, 11	# h1t
+	srdi	10, 6, 38
+	and	6, 11, 12	# h1
+
+	sldi	11, 7, 26
+	or	10, 10, 11	# h2t
+
+	srdi	11, 7, 25
+	and	7, 10, 12	# h2
+	sldi	10, 8, 39
+	or	11, 11, 10	# h3t
+
+	srdi	9, 8, 12
+	and	8, 11, 12	# h3
+	and	9, 9, 12	# h4
+
+	std	5, 0(3)
+	std	6, 8(3)
+	std	7, 16(3)
+	std	8, 24(3)
+	std	9, 32(3)
+
+	blr
+SYM_FUNC_END(x25519_fe51_frombytes)
+
+.align	5
+SYM_FUNC_START(x25519_fe51_tobytes)
+
+	ld	5, 0(4)
+	ld	6, 8(4)
+	ld	7, 16(4)
+	ld	8, 24(4)
+	ld	9, 32(4)
+
+	li	12, -1
+	srdi	12, 12, 13	# 0x7ffffffffffff
+
+	# Full reducuction
+	addi	10, 5, 19
+	srdi	10, 10, 51
+	add	10, 10, 6
+	srdi	10, 10, 51
+	add	10, 10, 7
+	srdi	10, 10, 51
+	add	10, 10, 8
+	srdi	10, 10, 51
+	add	10, 10, 9
+	srdi	10, 10, 51
+
+	mulli	10, 10, 19
+	add	5, 5, 10
+	srdi	11, 5, 51
+	add	6, 6, 11
+	srdi	11, 6, 51
+	add	7, 7, 11
+	srdi	11, 7, 51
+	add	8, 8, 11
+	srdi	11, 8, 51
+	add	9, 9, 11
+
+	and	5, 5, 12
+	and	6, 6, 12
+	and	7, 7, 12
+	and	8, 8, 12
+	and	9, 9, 12
+
+	sldi	10, 6, 51
+	or	5, 5, 10	# s0
+
+	srdi	11, 6, 13
+	sldi	10, 7, 38
+	or	6, 11, 10	# s1
+
+	srdi	11, 7, 26
+	sldi	10, 8, 25
+	or	7, 11, 10	# s2
+
+	srdi	11, 8, 39
+	sldi	10, 9, 12
+	or	8, 11, 10	# s4
+
+	std	5, 0(3)
+	std	6, 8(3)
+	std	7, 16(3)
+	std	8, 24(3)
+
+	blr
+SYM_FUNC_END(x25519_fe51_tobytes)
+
+.align	5
+SYM_FUNC_START(x25519_cswap)
+
+	li	7, 5
+	neg	6, 5
+	mtctr	7
+
+.Lswap_loop:
+	ld	8, 0(3)
+	ld	9, 0(4)
+	xor	10, 8, 9
+	and	10, 10, 6
+	xor	11, 8, 10
+	xor	12, 9, 10
+	std	11, 0(3)
+	addi	3, 3, 8
+	std	12, 0(4)
+	addi	4, 4, 8
+	bdnz	.Lswap_loop
+
+	blr
+SYM_FUNC_END(x25519_cswap)
diff --git a/lib/crypto/powerpc/curve25519.h b/lib/crypto/powerpc/curve25519.h
new file mode 100644
index 000000000000..dee6234c48e9
--- /dev/null
+++ b/lib/crypto/powerpc/curve25519.h
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2024- IBM Corp.
+ *
+ * X25519 scalar multiplication with 51 bits limbs for PPC64le.
+ *   Based on RFC7748 and AArch64 optimized implementation for X25519
+ *     - Algorithm 1 Scalar multiplication of a variable point
+ */
+
+#include <linux/types.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+#include <linux/cpufeature.h>
+#include <linux/processor.h>
+
+typedef uint64_t fe51[5];
+
+asmlinkage void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
+asmlinkage void x25519_fe51_sqr(fe51 h, const fe51 f);
+asmlinkage void x25519_fe51_mul121666(fe51 h, fe51 f);
+asmlinkage void x25519_fe51_sqr_times(fe51 h, const fe51 f, int n);
+asmlinkage void x25519_fe51_frombytes(fe51 h, const uint8_t *s);
+asmlinkage void x25519_fe51_tobytes(uint8_t *s, const fe51 h);
+asmlinkage void x25519_cswap(fe51 p, fe51 q, unsigned int bit);
+
+#define fmul x25519_fe51_mul
+#define fsqr x25519_fe51_sqr
+#define fmul121666 x25519_fe51_mul121666
+#define fe51_tobytes x25519_fe51_tobytes
+
+static void fadd(fe51 h, const fe51 f, const fe51 g)
+{
+	h[0] = f[0] + g[0];
+	h[1] = f[1] + g[1];
+	h[2] = f[2] + g[2];
+	h[3] = f[3] + g[3];
+	h[4] = f[4] + g[4];
+}
+
+/*
+ * Prime = 2 ** 255 - 19, 255 bits
+ *    (0x7fffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffed)
+ *
+ * Prime in 5 51-bit limbs
+ */
+static fe51 prime51 = { 0x7ffffffffffed, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff};
+
+static void fsub(fe51 h, const fe51 f, const fe51 g)
+{
+	h[0] = (f[0] + ((prime51[0] * 2))) - g[0];
+	h[1] = (f[1] + ((prime51[1] * 2))) - g[1];
+	h[2] = (f[2] + ((prime51[2] * 2))) - g[2];
+	h[3] = (f[3] + ((prime51[3] * 2))) - g[3];
+	h[4] = (f[4] + ((prime51[4] * 2))) - g[4];
+}
+
+static void fe51_frombytes(fe51 h, const uint8_t *s)
+{
+	/*
+	 * Make sure 64-bit aligned.
+	 */
+	unsigned char sbuf[32+8];
+	unsigned char *sb = PTR_ALIGN((void *)sbuf, 8);
+
+	memcpy(sb, s, 32);
+	x25519_fe51_frombytes(h, sb);
+}
+
+static void finv(fe51 o, const fe51 i)
+{
+	fe51 a0, b, c, t00;
+
+	fsqr(a0, i);
+	x25519_fe51_sqr_times(t00, a0, 2);
+
+	fmul(b, t00, i);
+	fmul(a0, b, a0);
+
+	fsqr(t00, a0);
+
+	fmul(b, t00, b);
+	x25519_fe51_sqr_times(t00, b, 5);
+
+	fmul(b, t00, b);
+	x25519_fe51_sqr_times(t00, b, 10);
+
+	fmul(c, t00, b);
+	x25519_fe51_sqr_times(t00, c, 20);
+
+	fmul(t00, t00, c);
+	x25519_fe51_sqr_times(t00, t00, 10);
+
+	fmul(b, t00, b);
+	x25519_fe51_sqr_times(t00, b, 50);
+
+	fmul(c, t00, b);
+	x25519_fe51_sqr_times(t00, c, 100);
+
+	fmul(t00, t00, c);
+	x25519_fe51_sqr_times(t00, t00, 50);
+
+	fmul(t00, t00, b);
+	x25519_fe51_sqr_times(t00, t00, 5);
+
+	fmul(o, t00, a0);
+}
+
+static void curve25519_fe51(uint8_t out[32], const uint8_t scalar[32],
+			    const uint8_t point[32])
+{
+	fe51 x1, x2, z2, x3, z3;
+	uint8_t s[32];
+	unsigned int swap = 0;
+	int i;
+
+	memcpy(s, scalar, 32);
+	s[0]  &= 0xf8;
+	s[31] &= 0x7f;
+	s[31] |= 0x40;
+	fe51_frombytes(x1, point);
+
+	z2[0] = z2[1] = z2[2] = z2[3] = z2[4] = 0;
+	x3[0] = x1[0];
+	x3[1] = x1[1];
+	x3[2] = x1[2];
+	x3[3] = x1[3];
+	x3[4] = x1[4];
+
+	x2[0] = z3[0] = 1;
+	x2[1] = z3[1] = 0;
+	x2[2] = z3[2] = 0;
+	x2[3] = z3[3] = 0;
+	x2[4] = z3[4] = 0;
+
+	for (i = 254; i >= 0; --i) {
+		unsigned int k_t = 1 & (s[i / 8] >> (i & 7));
+		fe51 a, b, c, d, e;
+		fe51 da, cb, aa, bb;
+		fe51 dacb_p, dacb_m;
+
+		swap ^= k_t;
+		x25519_cswap(x2, x3, swap);
+		x25519_cswap(z2, z3, swap);
+		swap = k_t;
+
+		fsub(b, x2, z2);		// B = x_2 - z_2
+		fadd(a, x2, z2);		// A = x_2 + z_2
+		fsub(d, x3, z3);		// D = x_3 - z_3
+		fadd(c, x3, z3);		// C = x_3 + z_3
+
+		fsqr(bb, b);			// BB = B^2
+		fsqr(aa, a);			// AA = A^2
+		fmul(da, d, a);			// DA = D * A
+		fmul(cb, c, b);			// CB = C * B
+
+		fsub(e, aa, bb);		// E = AA - BB
+		fmul(x2, aa, bb);		// x2 = AA * BB
+		fadd(dacb_p, da, cb);		// DA + CB
+		fsub(dacb_m, da, cb);		// DA - CB
+
+		fmul121666(z3, e);		// 121666 * E
+		fsqr(z2, dacb_m);		// (DA - CB)^2
+		fsqr(x3, dacb_p);		// x3 = (DA + CB)^2
+		fadd(b, bb, z3);		// BB + 121666 * E
+		fmul(z3, x1, z2);		// z3 = x1 * (DA - CB)^2
+		fmul(z2, e, b);		// z2 = e * (BB + (DA + CB)^2)
+	}
+
+	finv(z2, z2);
+	fmul(x2, x2, z2);
+	fe51_tobytes(out, x2);
+}
+
+static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
+			    const u8 secret[CURVE25519_KEY_SIZE],
+			    const u8 basepoint[CURVE25519_KEY_SIZE])
+{
+	curve25519_fe51(mypublic, secret, basepoint);
+}
+
+static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+				 const u8 secret[CURVE25519_KEY_SIZE])
+{
+	curve25519_fe51(pub, secret, curve25519_base_point);
+}
diff --git a/lib/crypto/powerpc/md5-asm.S b/lib/crypto/powerpc/md5-asm.S
new file mode 100644
index 000000000000..fa6bc440cf4a
--- /dev/null
+++ b/lib/crypto/powerpc/md5-asm.S
@@ -0,0 +1,235 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Fast MD5 implementation for PPC
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+
+#define rHP	r3
+#define rWP	r4
+
+#define rH0	r0
+#define rH1	r6
+#define rH2	r7
+#define rH3	r5
+
+#define rW00	r8
+#define rW01	r9
+#define rW02	r10
+#define rW03	r11
+#define rW04	r12
+#define rW05	r14
+#define rW06	r15
+#define rW07	r16
+#define rW08	r17
+#define rW09	r18
+#define rW10	r19
+#define rW11	r20
+#define rW12	r21
+#define rW13	r22
+#define rW14	r23
+#define rW15	r24
+
+#define rT0	r25
+#define rT1	r26
+
+#define INITIALIZE \
+	PPC_STLU r1,-INT_FRAME_SIZE(r1); \
+	SAVE_GPRS(14, 26, r1)		/* push registers onto stack	*/
+
+#define FINALIZE \
+	REST_GPRS(14, 26, r1);		/* pop registers from stack	*/ \
+	addi	r1,r1,INT_FRAME_SIZE
+
+#ifdef __BIG_ENDIAN__
+#define LOAD_DATA(reg, off) \
+	lwbrx		reg,0,rWP;	/* load data			*/
+#define INC_PTR \
+	addi		rWP,rWP,4;	/* increment per word		*/
+#define NEXT_BLOCK			/* nothing to do		*/
+#else
+#define LOAD_DATA(reg, off) \
+	lwz		reg,off(rWP);	/* load data			*/
+#define INC_PTR				/* nothing to do		*/
+#define NEXT_BLOCK \
+	addi		rWP,rWP,64;	/* increment per block		*/
+#endif
+
+#define R_00_15(a, b, c, d, w0, w1, p, q, off, k0h, k0l, k1h, k1l) \
+	LOAD_DATA(w0, off)		/*    W				*/ \
+	and		rT0,b,c;	/* 1: f = b and c		*/ \
+	INC_PTR				/*    ptr++			*/ \
+	andc		rT1,d,b;	/* 1: f' = ~b and d		*/ \
+	LOAD_DATA(w1, off+4)		/*    W				*/ \
+	or		rT0,rT0,rT1;	/* 1: f = f or f'		*/ \
+	addi		w0,w0,k0l;	/* 1: wk = w + k		*/ \
+	add		a,a,rT0;	/* 1: a = a + f			*/ \
+	addis		w0,w0,k0h;	/* 1: wk = w + k'		*/ \
+	addis		w1,w1,k1h;	/* 2: wk = w + k		*/ \
+	add		a,a,w0;		/* 1: a = a + wk		*/ \
+	addi		w1,w1,k1l;	/* 2: wk = w + k'		*/ \
+	rotrwi		a,a,p;		/* 1: a = a rotl x		*/ \
+	add		d,d,w1;		/* 2: a = a + wk		*/ \
+	add		a,a,b;		/* 1: a = a + b			*/ \
+	and		rT0,a,b;	/* 2: f = b and c		*/ \
+	andc		rT1,c,a;	/* 2: f' = ~b and d		*/ \
+	or		rT0,rT0,rT1;	/* 2: f = f or f'		*/ \
+	add		d,d,rT0;	/* 2: a = a + f			*/ \
+	INC_PTR				/*    ptr++			*/ \
+	rotrwi		d,d,q;		/* 2: a = a rotl x		*/ \
+	add		d,d,a;		/* 2: a = a + b			*/
+
+#define R_16_31(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
+	andc		rT0,c,d;	/* 1: f = c and ~d		*/ \
+	and		rT1,b,d;	/* 1: f' = b and d		*/ \
+	addi		w0,w0,k0l;	/* 1: wk = w + k		*/ \
+	or		rT0,rT0,rT1;	/* 1: f = f or f'		*/ \
+	addis		w0,w0,k0h;	/* 1: wk = w + k'		*/ \
+	add		a,a,rT0;	/* 1: a = a + f			*/ \
+	addi		w1,w1,k1l;	/* 2: wk = w + k		*/ \
+	add		a,a,w0;		/* 1: a = a + wk		*/ \
+	addis		w1,w1,k1h;	/* 2: wk = w + k'		*/ \
+	andc		rT0,b,c;	/* 2: f = c and ~d		*/ \
+	rotrwi		a,a,p;		/* 1: a = a rotl x		*/ \
+	add		a,a,b;		/* 1: a = a + b			*/ \
+	add		d,d,w1;		/* 2: a = a + wk		*/ \
+	and		rT1,a,c;	/* 2: f' = b and d		*/ \
+	or		rT0,rT0,rT1;	/* 2: f = f or f'		*/ \
+	add		d,d,rT0;	/* 2: a = a + f			*/ \
+	rotrwi		d,d,q;		/* 2: a = a rotl x		*/ \
+	add		d,d,a;		/* 2: a = a +b			*/
+
+#define R_32_47(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
+	xor		rT0,b,c;	/* 1: f' = b xor c		*/ \
+	addi		w0,w0,k0l;	/* 1: wk = w + k		*/ \
+	xor		rT1,rT0,d;	/* 1: f = f xor f'		*/ \
+	addis		w0,w0,k0h;	/* 1: wk = w + k'		*/ \
+	add		a,a,rT1;	/* 1: a = a + f			*/ \
+	addi		w1,w1,k1l;	/* 2: wk = w + k		*/ \
+	add		a,a,w0;		/* 1: a = a + wk		*/ \
+	addis		w1,w1,k1h;	/* 2: wk = w + k'		*/ \
+	rotrwi		a,a,p;		/* 1: a = a rotl x		*/ \
+	add		d,d,w1;		/* 2: a = a + wk		*/ \
+	add		a,a,b;		/* 1: a = a + b			*/ \
+	xor		rT1,rT0,a;	/* 2: f = b xor f'		*/ \
+	add		d,d,rT1;	/* 2: a = a + f			*/ \
+	rotrwi		d,d,q;		/* 2: a = a rotl x		*/ \
+	add		d,d,a;		/* 2: a = a + b			*/
+
+#define R_48_63(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
+	addi		w0,w0,k0l;	/* 1: w = w + k			*/ \
+	orc		rT0,b,d;	/* 1: f = b or ~d		*/ \
+	addis		w0,w0,k0h;	/* 1: w = w + k'		*/ \
+	xor		rT0,rT0,c;	/* 1: f = f xor c		*/ \
+	add		a,a,w0;		/* 1: a = a + wk		*/ \
+	addi		w1,w1,k1l;	/* 2: w = w + k			*/ \
+	add		a,a,rT0;	/* 1: a = a + f			*/ \
+	addis		w1,w1,k1h;	/* 2: w = w + k'		*/ \
+	rotrwi		a,a,p;		/* 1: a = a rotl x		*/ \
+	add		a,a,b;		/* 1: a = a + b			*/ \
+	orc		rT0,a,c;	/* 2: f = b or ~d		*/ \
+	add		d,d,w1;		/* 2: a = a + wk		*/ \
+	xor		rT0,rT0,b;	/* 2: f = f xor c		*/ \
+	add		d,d,rT0;	/* 2: a = a + f			*/ \
+	rotrwi		d,d,q;		/* 2: a = a rotl x		*/ \
+	add		d,d,a;		/* 2: a = a + b			*/
+
+_GLOBAL(ppc_md5_transform)
+	INITIALIZE
+
+	mtctr		r5
+	lwz		rH0,0(rHP)
+	lwz		rH1,4(rHP)
+	lwz		rH2,8(rHP)
+	lwz		rH3,12(rHP)
+
+ppc_md5_main:
+	R_00_15(rH0, rH1, rH2, rH3, rW00, rW01, 25, 20, 0,
+		0xd76b, -23432, 0xe8c8, -18602)
+	R_00_15(rH2, rH3, rH0, rH1, rW02, rW03, 15, 10, 8,
+		0x2420, 0x70db, 0xc1be, -12562)
+	R_00_15(rH0, rH1, rH2, rH3, rW04, rW05, 25, 20, 16,
+		0xf57c, 0x0faf, 0x4788, -14806)
+	R_00_15(rH2, rH3, rH0, rH1, rW06, rW07, 15, 10, 24,
+		0xa830, 0x4613, 0xfd47, -27391)
+	R_00_15(rH0, rH1, rH2, rH3, rW08, rW09, 25, 20, 32,
+		0x6981, -26408, 0x8b45,  -2129)
+	R_00_15(rH2, rH3, rH0, rH1, rW10, rW11, 15, 10, 40,
+		0xffff, 0x5bb1, 0x895d, -10306)
+	R_00_15(rH0, rH1, rH2, rH3, rW12, rW13, 25, 20, 48,
+		0x6b90, 0x1122, 0xfd98, 0x7193)
+	R_00_15(rH2, rH3, rH0, rH1, rW14, rW15, 15, 10, 56,
+		0xa679, 0x438e, 0x49b4, 0x0821)
+
+	R_16_31(rH0, rH1, rH2, rH3, rW01, rW06, 27, 23,
+		0x0d56, 0x6e0c, 0x1810, 0x6d2d)
+	R_16_31(rH2, rH3, rH0, rH1, rW11, rW00, 18, 12,
+		0x9d02, -32109, 0x124c, 0x2332)
+	R_16_31(rH0, rH1, rH2, rH3, rW05, rW10, 27, 23,
+		0x8ea7, 0x4a33, 0x0245, -18270)
+	R_16_31(rH2, rH3, rH0, rH1, rW15, rW04, 18, 12,
+		0x8eee,  -8608, 0xf258,  -5095)
+	R_16_31(rH0, rH1, rH2, rH3, rW09, rW14, 27, 23,
+		0x969d, -10697, 0x1cbe, -15288)
+	R_16_31(rH2, rH3, rH0, rH1, rW03, rW08, 18, 12,
+		0x3317, 0x3e99, 0xdbd9, 0x7c15)
+	R_16_31(rH0, rH1, rH2, rH3, rW13, rW02, 27, 23,
+		0xac4b, 0x7772, 0xd8cf, 0x331d)
+	R_16_31(rH2, rH3, rH0, rH1, rW07, rW12, 18, 12,
+		0x6a28, 0x6dd8, 0x219a, 0x3b68)
+
+	R_32_47(rH0, rH1, rH2, rH3, rW05, rW08, 28, 21,
+		0x29cb, 0x28e5, 0x4218,  -7788)
+	R_32_47(rH2, rH3, rH0, rH1, rW11, rW14, 16,  9,
+		0x473f, 0x06d1, 0x3aae, 0x3036)
+	R_32_47(rH0, rH1, rH2, rH3, rW01, rW04, 28, 21,
+		0xaea1, -15134, 0x640b, -11295)
+	R_32_47(rH2, rH3, rH0, rH1, rW07, rW10, 16,  9,
+		0x8f4c, 0x4887, 0xbc7c, -22499)
+	R_32_47(rH0, rH1, rH2, rH3, rW13, rW00, 28, 21,
+		0x7eb8, -27199, 0x00ea, 0x6050)
+	R_32_47(rH2, rH3, rH0, rH1, rW03, rW06, 16,  9,
+		0xe01a, 0x22fe, 0x4447, 0x69c5)
+	R_32_47(rH0, rH1, rH2, rH3, rW09, rW12, 28, 21,
+		0xb7f3, 0x0253, 0x59b1, 0x4d5b)
+	R_32_47(rH2, rH3, rH0, rH1, rW15, rW02, 16,  9,
+		0x4701, -27017, 0xc7bd, -19859)
+
+	R_48_63(rH0, rH1, rH2, rH3, rW00, rW07, 26, 22,
+		0x0988,  -1462, 0x4c70, -19401)
+	R_48_63(rH2, rH3, rH0, rH1, rW14, rW05, 17, 11,
+		0xadaf,  -5221, 0xfc99, 0x66f7)
+	R_48_63(rH0, rH1, rH2, rH3, rW12, rW03, 26, 22,
+		0x7e80, -16418, 0xba1e, -25587)
+	R_48_63(rH2, rH3, rH0, rH1, rW10, rW01, 17, 11,
+		0x4130, 0x380d, 0xe0c5, 0x738d)
+	lwz		rW00,0(rHP)
+	R_48_63(rH0, rH1, rH2, rH3, rW08, rW15, 26, 22,
+		0xe837, -30770, 0xde8a, 0x69e8)
+	lwz		rW14,4(rHP)
+	R_48_63(rH2, rH3, rH0, rH1, rW06, rW13, 17, 11,
+		0x9e79, 0x260f, 0x256d, -27941)
+	lwz		rW12,8(rHP)
+	R_48_63(rH0, rH1, rH2, rH3, rW04, rW11, 26, 22,
+		0xab75, -20775, 0x4f9e, -28397)
+	lwz		rW10,12(rHP)
+	R_48_63(rH2, rH3, rH0, rH1, rW02, rW09, 17, 11,
+		0x662b, 0x7c56, 0x11b2, 0x0358)
+
+	add		rH0,rH0,rW00
+	stw		rH0,0(rHP)
+	add		rH1,rH1,rW14
+	stw		rH1,4(rHP)
+	add		rH2,rH2,rW12
+	stw		rH2,8(rHP)
+	add		rH3,rH3,rW10
+	stw		rH3,12(rHP)
+	NEXT_BLOCK
+
+	bdnz		ppc_md5_main
+
+	FINALIZE
+	blr
diff --git a/lib/crypto/powerpc/md5.h b/lib/crypto/powerpc/md5.h
new file mode 100644
index 000000000000..540b08e34d1d
--- /dev/null
+++ b/lib/crypto/powerpc/md5.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * MD5 optimized for PowerPC
+ */
+
+void ppc_md5_transform(u32 *state, const u8 *data, size_t nblocks);
+
+static void md5_blocks(struct md5_block_state *state,
+		       const u8 *data, size_t nblocks)
+{
+	ppc_md5_transform(state->h, data, nblocks);
+}
diff --git a/lib/crypto/powerpc/poly1305-p10le_64.S b/lib/crypto/powerpc/poly1305-p10le_64.S
new file mode 100644
index 000000000000..a3c1987f1ecd
--- /dev/null
+++ b/lib/crypto/powerpc/poly1305-p10le_64.S
@@ -0,0 +1,1075 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# Accelerated poly1305 implementation for ppc64le.
+#
+# Copyright 2023- IBM Corp. All rights reserved
+#
+#===================================================================================
+# Written by Danny Tsen <dtsen@us.ibm.com>
+#
+# Poly1305 - this version mainly using vector/VSX/Scalar
+#  - 26 bits limbs
+#  - Handle multiple 64 byte blcok.
+#
+# Block size 16 bytes
+# key = (r, s)
+# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
+# p = 2^130 - 5
+# a += m
+# a = (r + a) % p
+# a += s
+#
+# Improve performance by breaking down polynominal to the sum of products with
+#     h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
+#
+#  07/22/21 - this revison based on the above sum of products.  Setup r^4, r^3, r^2, r and s3, s2, s1, s0
+#             to 9 vectors for multiplications.
+#
+# setup r^4, r^3, r^2, r vectors
+#    vs    [r^1, r^3, r^2, r^4]
+#    vs0 = [r0,.....]
+#    vs1 = [r1,.....]
+#    vs2 = [r2,.....]
+#    vs3 = [r3,.....]
+#    vs4 = [r4,.....]
+#    vs5 = [r1*5,...]
+#    vs6 = [r2*5,...]
+#    vs7 = [r2*5,...]
+#    vs8 = [r4*5,...]
+#
+#  Each word in a vector consists a member of a "r/s" in [a * r/s].
+#
+# r0, r4*5, r3*5, r2*5, r1*5;
+# r1, r0,   r4*5, r3*5, r2*5;
+# r2, r1,   r0,   r4*5, r3*5;
+# r3, r2,   r1,   r0,   r4*5;
+# r4, r3,   r2,   r1,   r0  ;
+#
+#
+# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
+#  k = 32 bytes key
+#  r3 = k (r, s)
+#  r4 = mlen
+#  r5 = m
+#
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+#include <linux/linkage.h>
+
+.machine "any"
+
+.text
+
+.macro	SAVE_GPR GPR OFFSET FRAME
+	std	\GPR,\OFFSET(\FRAME)
+.endm
+
+.macro	SAVE_VRS VRS OFFSET FRAME
+	li	16, \OFFSET
+	stvx	\VRS, 16, \FRAME
+.endm
+
+.macro	SAVE_VSX VSX OFFSET FRAME
+	li	16, \OFFSET
+	stxvx	\VSX, 16, \FRAME
+.endm
+
+.macro	RESTORE_GPR GPR OFFSET FRAME
+	ld	\GPR,\OFFSET(\FRAME)
+.endm
+
+.macro	RESTORE_VRS VRS OFFSET FRAME
+	li	16, \OFFSET
+	lvx	\VRS, 16, \FRAME
+.endm
+
+.macro	RESTORE_VSX VSX OFFSET FRAME
+	li	16, \OFFSET
+	lxvx	\VSX, 16, \FRAME
+.endm
+
+.macro SAVE_REGS
+	mflr 0
+	std 0, 16(1)
+	stdu 1,-752(1)
+
+	SAVE_GPR 14, 112, 1
+	SAVE_GPR 15, 120, 1
+	SAVE_GPR 16, 128, 1
+	SAVE_GPR 17, 136, 1
+	SAVE_GPR 18, 144, 1
+	SAVE_GPR 19, 152, 1
+	SAVE_GPR 20, 160, 1
+	SAVE_GPR 21, 168, 1
+	SAVE_GPR 22, 176, 1
+	SAVE_GPR 23, 184, 1
+	SAVE_GPR 24, 192, 1
+	SAVE_GPR 25, 200, 1
+	SAVE_GPR 26, 208, 1
+	SAVE_GPR 27, 216, 1
+	SAVE_GPR 28, 224, 1
+	SAVE_GPR 29, 232, 1
+	SAVE_GPR 30, 240, 1
+	SAVE_GPR 31, 248, 1
+
+	addi	9, 1, 256
+	SAVE_VRS 20, 0, 9
+	SAVE_VRS 21, 16, 9
+	SAVE_VRS 22, 32, 9
+	SAVE_VRS 23, 48, 9
+	SAVE_VRS 24, 64, 9
+	SAVE_VRS 25, 80, 9
+	SAVE_VRS 26, 96, 9
+	SAVE_VRS 27, 112, 9
+	SAVE_VRS 28, 128, 9
+	SAVE_VRS 29, 144, 9
+	SAVE_VRS 30, 160, 9
+	SAVE_VRS 31, 176, 9
+
+	SAVE_VSX 14, 192, 9
+	SAVE_VSX 15, 208, 9
+	SAVE_VSX 16, 224, 9
+	SAVE_VSX 17, 240, 9
+	SAVE_VSX 18, 256, 9
+	SAVE_VSX 19, 272, 9
+	SAVE_VSX 20, 288, 9
+	SAVE_VSX 21, 304, 9
+	SAVE_VSX 22, 320, 9
+	SAVE_VSX 23, 336, 9
+	SAVE_VSX 24, 352, 9
+	SAVE_VSX 25, 368, 9
+	SAVE_VSX 26, 384, 9
+	SAVE_VSX 27, 400, 9
+	SAVE_VSX 28, 416, 9
+	SAVE_VSX 29, 432, 9
+	SAVE_VSX 30, 448, 9
+	SAVE_VSX 31, 464, 9
+.endm # SAVE_REGS
+
+.macro RESTORE_REGS
+	addi	9, 1, 256
+	RESTORE_VRS 20, 0, 9
+	RESTORE_VRS 21, 16, 9
+	RESTORE_VRS 22, 32, 9
+	RESTORE_VRS 23, 48, 9
+	RESTORE_VRS 24, 64, 9
+	RESTORE_VRS 25, 80, 9
+	RESTORE_VRS 26, 96, 9
+	RESTORE_VRS 27, 112, 9
+	RESTORE_VRS 28, 128, 9
+	RESTORE_VRS 29, 144, 9
+	RESTORE_VRS 30, 160, 9
+	RESTORE_VRS 31, 176, 9
+
+	RESTORE_VSX 14, 192, 9
+	RESTORE_VSX 15, 208, 9
+	RESTORE_VSX 16, 224, 9
+	RESTORE_VSX 17, 240, 9
+	RESTORE_VSX 18, 256, 9
+	RESTORE_VSX 19, 272, 9
+	RESTORE_VSX 20, 288, 9
+	RESTORE_VSX 21, 304, 9
+	RESTORE_VSX 22, 320, 9
+	RESTORE_VSX 23, 336, 9
+	RESTORE_VSX 24, 352, 9
+	RESTORE_VSX 25, 368, 9
+	RESTORE_VSX 26, 384, 9
+	RESTORE_VSX 27, 400, 9
+	RESTORE_VSX 28, 416, 9
+	RESTORE_VSX 29, 432, 9
+	RESTORE_VSX 30, 448, 9
+	RESTORE_VSX 31, 464, 9
+
+	RESTORE_GPR 14, 112, 1
+	RESTORE_GPR 15, 120, 1
+	RESTORE_GPR 16, 128, 1
+	RESTORE_GPR 17, 136, 1
+	RESTORE_GPR 18, 144, 1
+	RESTORE_GPR 19, 152, 1
+	RESTORE_GPR 20, 160, 1
+	RESTORE_GPR 21, 168, 1
+	RESTORE_GPR 22, 176, 1
+	RESTORE_GPR 23, 184, 1
+	RESTORE_GPR 24, 192, 1
+	RESTORE_GPR 25, 200, 1
+	RESTORE_GPR 26, 208, 1
+	RESTORE_GPR 27, 216, 1
+	RESTORE_GPR 28, 224, 1
+	RESTORE_GPR 29, 232, 1
+	RESTORE_GPR 30, 240, 1
+	RESTORE_GPR 31, 248, 1
+
+	addi    1, 1, 752
+	ld 0, 16(1)
+	mtlr 0
+.endm # RESTORE_REGS
+
+#
+# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
+# p[1] = a0*r1 + a1*r0   + a2*r4*5 + a3*r3*5 + a4*r2*5;
+# p[2] = a0*r2 + a1*r1   + a2*r0   + a3*r4*5 + a4*r3*5;
+# p[3] = a0*r3 + a1*r2   + a2*r1   + a3*r0   + a4*r4*5;
+# p[4] = a0*r4 + a1*r3   + a2*r2   + a3*r1   + a4*r0  ;
+#
+#    [r^2, r^3, r^1, r^4]
+#    [m3,  m2,  m4,  m1]
+#
+# multiply odd and even words
+.macro mul_odd
+	vmulouw	14, 4, 26
+	vmulouw	10, 5, 3
+	vmulouw	11, 6, 2
+	vmulouw	12, 7, 1
+	vmulouw	13, 8, 0
+	vmulouw	15, 4, 27
+	vaddudm	14, 14, 10
+	vaddudm	14, 14, 11
+	vmulouw	10, 5, 26
+	vmulouw	11, 6, 3
+	vaddudm	14, 14, 12
+	vaddudm	14, 14, 13	# x0
+	vaddudm	15, 15, 10
+	vaddudm	15, 15, 11
+	vmulouw	12, 7, 2
+	vmulouw	13, 8, 1
+	vaddudm	15, 15, 12
+	vaddudm	15, 15, 13	# x1
+	vmulouw	16, 4, 28
+	vmulouw	10, 5, 27
+	vmulouw	11, 6, 26
+	vaddudm	16, 16, 10
+	vaddudm	16, 16, 11
+	vmulouw	12, 7, 3
+	vmulouw	13, 8, 2
+	vaddudm	16, 16, 12
+	vaddudm	16, 16, 13	# x2
+	vmulouw	17, 4, 29
+	vmulouw	10, 5, 28
+	vmulouw	11, 6, 27
+	vaddudm	17, 17, 10
+	vaddudm	17, 17, 11
+	vmulouw	12, 7, 26
+	vmulouw	13, 8, 3
+	vaddudm	17, 17, 12
+	vaddudm	17, 17, 13	# x3
+	vmulouw	18, 4, 30
+	vmulouw	10, 5, 29
+	vmulouw	11, 6, 28
+	vaddudm	18, 18, 10
+	vaddudm	18, 18, 11
+	vmulouw	12, 7, 27
+	vmulouw	13, 8, 26
+	vaddudm	18, 18, 12
+	vaddudm	18, 18, 13	# x4
+.endm
+
+.macro mul_even
+	vmuleuw	9, 4, 26
+	vmuleuw	10, 5, 3
+	vmuleuw	11, 6, 2
+	vmuleuw	12, 7, 1
+	vmuleuw	13, 8, 0
+	vaddudm	14, 14, 9
+	vaddudm	14, 14, 10
+	vaddudm	14, 14, 11
+	vaddudm	14, 14, 12
+	vaddudm	14, 14, 13	# x0
+
+	vmuleuw	9, 4, 27
+	vmuleuw	10, 5, 26
+	vmuleuw	11, 6, 3
+	vmuleuw	12, 7, 2
+	vmuleuw	13, 8, 1
+	vaddudm	15, 15, 9
+	vaddudm	15, 15, 10
+	vaddudm	15, 15, 11
+	vaddudm	15, 15, 12
+	vaddudm	15, 15, 13	# x1
+
+	vmuleuw	9, 4, 28
+	vmuleuw	10, 5, 27
+	vmuleuw	11, 6, 26
+	vmuleuw	12, 7, 3
+	vmuleuw	13, 8, 2
+	vaddudm	16, 16, 9
+	vaddudm	16, 16, 10
+	vaddudm	16, 16, 11
+	vaddudm	16, 16, 12
+	vaddudm	16, 16, 13	# x2
+
+	vmuleuw	9, 4, 29
+	vmuleuw	10, 5, 28
+	vmuleuw	11, 6, 27
+	vmuleuw	12, 7, 26
+	vmuleuw	13, 8, 3
+	vaddudm	17, 17, 9
+	vaddudm	17, 17, 10
+	vaddudm	17, 17, 11
+	vaddudm	17, 17, 12
+	vaddudm	17, 17, 13	# x3
+
+	vmuleuw	9, 4, 30
+	vmuleuw	10, 5, 29
+	vmuleuw	11, 6, 28
+	vmuleuw	12, 7, 27
+	vmuleuw	13, 8, 26
+	vaddudm	18, 18, 9
+	vaddudm	18, 18, 10
+	vaddudm	18, 18, 11
+	vaddudm	18, 18, 12
+	vaddudm	18, 18, 13	# x4
+.endm
+
+#
+# poly1305_setup_r
+#
+# setup r^4, r^3, r^2, r vectors
+#    [r, r^3, r^2, r^4]
+#    vs0 = [r0,...]
+#    vs1 = [r1,...]
+#    vs2 = [r2,...]
+#    vs3 = [r3,...]
+#    vs4 = [r4,...]
+#    vs5 = [r4*5,...]
+#    vs6 = [r3*5,...]
+#    vs7 = [r2*5,...]
+#    vs8 = [r1*5,...]
+#
+# r0, r4*5, r3*5, r2*5, r1*5;
+# r1, r0,   r4*5, r3*5, r2*5;
+# r2, r1,   r0,   r4*5, r3*5;
+# r3, r2,   r1,   r0,   r4*5;
+# r4, r3,   r2,   r1,   r0  ;
+#
+.macro poly1305_setup_r
+
+	# save r
+	xxlor	26, 58, 58
+	xxlor	27, 59, 59
+	xxlor	28, 60, 60
+	xxlor	29, 61, 61
+	xxlor	30, 62, 62
+
+	xxlxor	31, 31, 31
+
+#    [r, r^3, r^2, r^4]
+	# compute r^2
+	vmr	4, 26
+	vmr	5, 27
+	vmr	6, 28
+	vmr	7, 29
+	vmr	8, 30
+	bl	do_mul		# r^2 r^1
+	xxpermdi 58, 58, 36, 0x3		# r0
+	xxpermdi 59, 59, 37, 0x3		# r1
+	xxpermdi 60, 60, 38, 0x3		# r2
+	xxpermdi 61, 61, 39, 0x3		# r3
+	xxpermdi 62, 62, 40, 0x3		# r4
+	xxpermdi 36, 36, 36, 0x3
+	xxpermdi 37, 37, 37, 0x3
+	xxpermdi 38, 38, 38, 0x3
+	xxpermdi 39, 39, 39, 0x3
+	xxpermdi 40, 40, 40, 0x3
+	vspltisb 13, 2
+	vsld	9, 27, 13
+	vsld	10, 28, 13
+	vsld	11, 29, 13
+	vsld	12, 30, 13
+	vaddudm	0, 9, 27
+	vaddudm	1, 10, 28
+	vaddudm	2, 11, 29
+	vaddudm	3, 12, 30
+
+	bl	do_mul		# r^4 r^3
+	vmrgow	26, 26, 4
+	vmrgow	27, 27, 5
+	vmrgow	28, 28, 6
+	vmrgow	29, 29, 7
+	vmrgow	30, 30, 8
+	vspltisb 13, 2
+	vsld	9, 27, 13
+	vsld	10, 28, 13
+	vsld	11, 29, 13
+	vsld	12, 30, 13
+	vaddudm	0, 9, 27
+	vaddudm	1, 10, 28
+	vaddudm	2, 11, 29
+	vaddudm	3, 12, 30
+
+	# r^2 r^4
+	xxlor	0, 58, 58
+	xxlor	1, 59, 59
+	xxlor	2, 60, 60
+	xxlor	3, 61, 61
+	xxlor	4, 62, 62
+	xxlor	5, 32, 32
+	xxlor	6, 33, 33
+	xxlor	7, 34, 34
+	xxlor	8, 35, 35
+
+	vspltw	9, 26, 3
+	vspltw	10, 26, 2
+	vmrgow	26, 10, 9
+	vspltw	9, 27, 3
+	vspltw	10, 27, 2
+	vmrgow	27, 10, 9
+	vspltw	9, 28, 3
+	vspltw	10, 28, 2
+	vmrgow	28, 10, 9
+	vspltw	9, 29, 3
+	vspltw	10, 29, 2
+	vmrgow	29, 10, 9
+	vspltw	9, 30, 3
+	vspltw	10, 30, 2
+	vmrgow	30, 10, 9
+
+	vsld	9, 27, 13
+	vsld	10, 28, 13
+	vsld	11, 29, 13
+	vsld	12, 30, 13
+	vaddudm	0, 9, 27
+	vaddudm	1, 10, 28
+	vaddudm	2, 11, 29
+	vaddudm	3, 12, 30
+.endm
+
+SYM_FUNC_START_LOCAL(do_mul)
+	mul_odd
+
+	# do reduction ( h %= p )
+	# carry reduction
+	vspltisb 9, 2
+	vsrd	10, 14, 31
+	vsrd	11, 17, 31
+	vand	7, 17, 25
+	vand	4, 14, 25
+	vaddudm	18, 18, 11
+	vsrd	12, 18, 31
+	vaddudm	15, 15, 10
+
+	vsrd	11, 15, 31
+	vand	8, 18, 25
+	vand	5, 15, 25
+	vaddudm	4, 4, 12
+	vsld	10, 12, 9
+	vaddudm	6, 16, 11
+
+	vsrd	13, 6, 31
+	vand	6, 6, 25
+	vaddudm	4, 4, 10
+	vsrd	10, 4, 31
+	vaddudm	7, 7, 13
+
+	vsrd	11, 7, 31
+	vand	7, 7, 25
+	vand	4, 4, 25
+	vaddudm	5, 5, 10
+	vaddudm	8, 8, 11
+	blr
+SYM_FUNC_END(do_mul)
+
+#
+# init key
+#
+.macro do_poly1305_init
+	addis	10, 2, rmask@toc@ha
+	addi	10, 10, rmask@toc@l
+
+	ld	11, 0(10)
+	ld	12, 8(10)
+
+	li	14, 16
+	li	15, 32
+	addis	10, 2, cnum@toc@ha
+	addi	10, 10, cnum@toc@l
+	lvx	25, 0, 10	# v25 - mask
+	lvx	31, 14, 10	# v31 = 1a
+	lvx	19, 15, 10	# v19 = 1 << 24
+	lxv	24, 48(10)	# vs24
+	lxv	25, 64(10)	# vs25
+
+	# initialize
+	# load key from r3 to vectors
+	ld	9, 24(3)
+	ld	10, 32(3)
+	and.	9, 9, 11
+	and.	10, 10, 12
+
+	# break 26 bits
+	extrdi	14, 9, 26, 38
+	extrdi	15, 9, 26, 12
+	extrdi	16, 9, 12, 0
+	mtvsrdd	58, 0, 14
+	insrdi	16, 10, 14, 38
+	mtvsrdd	59, 0, 15
+	extrdi	17, 10, 26, 24
+	mtvsrdd	60, 0, 16
+	extrdi	18, 10, 24, 0
+	mtvsrdd	61, 0, 17
+	mtvsrdd	62, 0, 18
+
+	# r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
+	li	9, 5
+	mtvsrdd	36, 0, 9
+	vmulouw	0, 27, 4		# v0 = rr0
+	vmulouw	1, 28, 4		# v1 = rr1
+	vmulouw	2, 29, 4		# v2 = rr2
+	vmulouw	3, 30, 4		# v3 = rr3
+.endm
+
+#
+# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
+#  k = 32 bytes key
+#  r3 = k (r, s)
+#  r4 = mlen
+#  r5 = m
+#
+SYM_FUNC_START(poly1305_p10le_4blocks)
+.align 5
+	cmpdi	5, 64
+	blt	Out_no_poly1305
+
+	SAVE_REGS
+
+	do_poly1305_init
+
+	li	21, 0	# counter to message
+
+	poly1305_setup_r
+
+	# load previous H state
+	# break/convert r6 to 26 bits
+	ld	9, 0(3)
+	ld	10, 8(3)
+	ld	19, 16(3)
+	sldi	19, 19, 24
+	mtvsrdd	41, 0, 19
+	extrdi	14, 9, 26, 38
+	extrdi	15, 9, 26, 12
+	extrdi	16, 9, 12, 0
+	mtvsrdd	36, 0, 14
+	insrdi	16, 10, 14, 38
+	mtvsrdd	37, 0, 15
+	extrdi	17, 10, 26, 24
+	mtvsrdd	38, 0, 16
+	extrdi	18, 10, 24, 0
+	mtvsrdd	39, 0, 17
+	mtvsrdd	40, 0, 18
+	vor	8, 8, 9
+
+	# input m1 m2
+	add	20, 4, 21
+	xxlor	49, 24, 24
+	xxlor	50, 25, 25
+	lxvw4x	43, 0, 20
+	addi	17, 20, 16
+	lxvw4x	44, 0, 17
+	vperm	14, 11, 12, 17
+	vperm	15, 11, 12, 18
+	vand	9, 14, 25	# a0
+	vsrd	10, 14, 31	# >> 26
+	vsrd	11, 10, 31	# 12 bits left
+	vand	10, 10, 25	# a1
+	vspltisb 13, 12
+	vand	16, 15, 25
+	vsld	12, 16, 13
+	vor	11, 11, 12
+	vand	11, 11, 25	# a2
+	vspltisb 13, 14
+	vsrd	12, 15, 13	# >> 14
+	vsrd	13, 12, 31	# >> 26, a4
+	vand	12, 12, 25	# a3
+
+	vaddudm	20, 4, 9
+	vaddudm	21, 5, 10
+	vaddudm	22, 6, 11
+	vaddudm	23, 7, 12
+	vaddudm	24, 8, 13
+
+	# m3 m4
+	addi	17, 17, 16
+	lxvw4x	43, 0, 17
+	addi	17, 17, 16
+	lxvw4x	44, 0, 17
+	vperm	14, 11, 12, 17
+	vperm	15, 11, 12, 18
+	vand	9, 14, 25	# a0
+	vsrd	10, 14, 31	# >> 26
+	vsrd	11, 10, 31	# 12 bits left
+	vand	10, 10, 25	# a1
+	vspltisb 13, 12
+	vand	16, 15, 25
+	vsld	12, 16, 13
+	vspltisb 13, 14
+	vor	11, 11, 12
+	vand	11, 11, 25	# a2
+	vsrd	12, 15, 13	# >> 14
+	vsrd	13, 12, 31	# >> 26, a4
+	vand	12, 12, 25	# a3
+
+	# Smash 4 message blocks into 5 vectors of [m4,  m2,  m3,  m1]
+	vmrgow	4, 9, 20
+	vmrgow	5, 10, 21
+	vmrgow	6, 11, 22
+	vmrgow	7, 12, 23
+	vmrgow	8, 13, 24
+	vaddudm	8, 8, 19
+
+	addi	5, 5, -64	# len -= 64
+	addi	21, 21, 64	# offset += 64
+
+	li      9, 64
+	divdu   31, 5, 9
+
+	cmpdi	31, 0
+	ble	Skip_block_loop
+
+	mtctr	31
+
+# h4 =   m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
+# Rewrite the polynominal sum of product as follows,
+# h1 = (h0 + m1) * r^2,	h2 = (h0 + m2) * r^2
+# h3 = (h1 + m3) * r^2,	h4 = (h2 + m4) * r^2  --> (h0 + m1) r*4 + (h3 + m3) r^2, (h0 + m2) r^4 + (h0 + m4) r^2
+#  .... Repeat
+# h5 = (h3 + m5) * r^2,	h6 = (h4 + m6) * r^2  -->
+# h7 = (h5 + m7) * r^2,	h8 = (h6 + m8) * r^1  --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
+#
+loop_4blocks:
+
+	# Multiply odd words and even words
+	mul_odd
+	mul_even
+	# carry reduction
+	vspltisb 9, 2
+	vsrd	10, 14, 31
+	vsrd	11, 17, 31
+	vand	7, 17, 25
+	vand	4, 14, 25
+	vaddudm	18, 18, 11
+	vsrd	12, 18, 31
+	vaddudm	15, 15, 10
+
+	vsrd	11, 15, 31
+	vand	8, 18, 25
+	vand	5, 15, 25
+	vaddudm	4, 4, 12
+	vsld	10, 12, 9
+	vaddudm	6, 16, 11
+
+	vsrd	13, 6, 31
+	vand	6, 6, 25
+	vaddudm	4, 4, 10
+	vsrd	10, 4, 31
+	vaddudm	7, 7, 13
+
+	vsrd	11, 7, 31
+	vand	7, 7, 25
+	vand	4, 4, 25
+	vaddudm	5, 5, 10
+	vaddudm	8, 8, 11
+
+	# input m1  m2  m3  m4
+	add	20, 4, 21
+	xxlor	49, 24, 24
+	xxlor	50, 25, 25
+	lxvw4x	43, 0, 20
+	addi	17, 20, 16
+	lxvw4x	44, 0, 17
+	vperm	14, 11, 12, 17
+	vperm	15, 11, 12, 18
+	addi	17, 17, 16
+	lxvw4x	43, 0, 17
+	addi	17, 17, 16
+	lxvw4x	44, 0, 17
+	vperm	17, 11, 12, 17
+	vperm	18, 11, 12, 18
+
+	vand	20, 14, 25	# a0
+	vand	9, 17, 25	# a0
+	vsrd	21, 14, 31	# >> 26
+	vsrd	22, 21, 31	# 12 bits left
+	vsrd	10, 17, 31	# >> 26
+	vsrd	11, 10, 31	# 12 bits left
+
+	vand	21, 21, 25	# a1
+	vand	10, 10, 25	# a1
+
+	vspltisb 13, 12
+	vand	16, 15, 25
+	vsld	23, 16, 13
+	vor	22, 22, 23
+	vand	22, 22, 25	# a2
+	vand	16, 18, 25
+	vsld	12, 16, 13
+	vor	11, 11, 12
+	vand	11, 11, 25	# a2
+	vspltisb 13, 14
+	vsrd	23, 15, 13	# >> 14
+	vsrd	24, 23, 31	# >> 26, a4
+	vand	23, 23, 25	# a3
+	vsrd	12, 18, 13	# >> 14
+	vsrd	13, 12, 31	# >> 26, a4
+	vand	12, 12, 25	# a3
+
+	vaddudm	4, 4, 20
+	vaddudm	5, 5, 21
+	vaddudm	6, 6, 22
+	vaddudm	7, 7, 23
+	vaddudm	8, 8, 24
+
+	# Smash 4 message blocks into 5 vectors of [m4,  m2,  m3,  m1]
+	vmrgow	4, 9, 4
+	vmrgow	5, 10, 5
+	vmrgow	6, 11, 6
+	vmrgow	7, 12, 7
+	vmrgow	8, 13, 8
+	vaddudm	8, 8, 19
+
+	addi	5, 5, -64	# len -= 64
+	addi	21, 21, 64	# offset += 64
+
+	bdnz	loop_4blocks
+
+Skip_block_loop:
+	xxlor	58, 0, 0
+	xxlor	59, 1, 1
+	xxlor	60, 2, 2
+	xxlor	61, 3, 3
+	xxlor	62, 4, 4
+	xxlor	32, 5, 5
+	xxlor	33, 6, 6
+	xxlor	34, 7, 7
+	xxlor	35, 8, 8
+
+	# Multiply odd words and even words
+	mul_odd
+	mul_even
+
+	# Sum the products.
+	xxpermdi 41, 31, 46, 0
+	xxpermdi 42, 31, 47, 0
+	vaddudm	4, 14, 9
+	xxpermdi 36, 31, 36, 3
+	vaddudm	5, 15, 10
+	xxpermdi 37, 31, 37, 3
+	xxpermdi 43, 31, 48, 0
+	vaddudm	6, 16, 11
+	xxpermdi 38, 31, 38, 3
+	xxpermdi 44, 31, 49, 0
+	vaddudm	7, 17, 12
+	xxpermdi 39, 31, 39, 3
+	xxpermdi 45, 31, 50, 0
+	vaddudm	8, 18, 13
+	xxpermdi 40, 31, 40, 3
+
+	# carry reduction
+	vspltisb 9, 2
+	vsrd	10, 4, 31
+	vsrd	11, 7, 31
+	vand	7, 7, 25
+	vand	4, 4, 25
+	vaddudm	8, 8, 11
+	vsrd	12, 8, 31
+	vaddudm	5, 5, 10
+
+	vsrd	11, 5, 31
+	vand	8, 8, 25
+	vand	5, 5, 25
+	vaddudm	4, 4, 12
+	vsld	10, 12, 9
+	vaddudm	6, 6, 11
+
+	vsrd	13, 6, 31
+	vand	6, 6, 25
+	vaddudm	4, 4, 10
+	vsrd	10, 4, 31
+	vaddudm	7, 7, 13
+
+	vsrd	11, 7, 31
+	vand	7, 7, 25
+	vand	4, 4, 25
+	vaddudm	5, 5, 10
+	vsrd	10, 5, 31
+	vand	5, 5, 25
+	vaddudm	6, 6, 10
+	vaddudm	8, 8, 11
+
+	b	do_final_update
+
+do_final_update:
+	# combine 26 bit limbs
+	# v4, v5, v6, v7 and v8 are 26 bit vectors
+	vsld	5, 5, 31
+	vor	20, 4, 5
+	vspltisb 11, 12
+	vsrd	12, 6, 11
+	vsld	6, 6, 31
+	vsld	6, 6, 31
+	vor	20, 20, 6
+	vspltisb 11, 14
+	vsld	7, 7, 11
+	vor	21, 7, 12
+	mfvsrld	16, 40		# save last 2 bytes
+	vsld	8, 8, 11
+	vsld	8, 8, 31
+	vor	21, 21, 8
+	mfvsrld	17, 52
+	mfvsrld	19, 53
+	srdi	16, 16, 24
+
+	std	17, 0(3)
+	std	19, 8(3)
+	stw	16, 16(3)
+
+Out_loop:
+	li	3, 0
+
+	RESTORE_REGS
+
+	blr
+
+Out_no_poly1305:
+	li	3, 0
+	blr
+SYM_FUNC_END(poly1305_p10le_4blocks)
+
+#
+# =======================================================================
+# The following functions implement 64 x 64 bits multiplication poly1305.
+#
+SYM_FUNC_START_LOCAL(Poly1305_init_64)
+	#  mask 0x0FFFFFFC0FFFFFFC
+	#  mask 0x0FFFFFFC0FFFFFFF
+	addis	10, 2, rmask@toc@ha
+	addi	10, 10, rmask@toc@l
+	ld	11, 0(10)
+	ld	12, 8(10)
+
+	# initialize
+	# load key from r3
+	ld	9, 24(3)
+	ld	10, 32(3)
+	and.	9, 9, 11	# cramp mask r0
+	and.	10, 10, 12	# cramp mask r1
+
+        srdi    21, 10, 2
+        add     19, 21, 10      # s1: r19 - (r1 >> 2) *5
+
+        # setup r and s
+        li      25, 0
+	mtvsrdd 32+0, 9, 19	# r0, s1
+	mtvsrdd 32+1, 10, 9	# r1, r0
+	mtvsrdd 32+2, 19, 25	# s1
+	mtvsrdd 32+3, 9, 25	# r0
+
+	blr
+SYM_FUNC_END(Poly1305_init_64)
+
+# Poly1305_mult
+# v6 = (h0, h1), v8 = h2
+# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0
+#
+# Output: v7, v10, v11
+#
+SYM_FUNC_START_LOCAL(Poly1305_mult)
+	#
+	#	d0 = h0 * r0 + h1 * s1
+	vmsumudm	7, 6, 0, 9		# h0 * r0, h1 * s1
+
+	#	d1 = h0 * r1 + h1 * r0 + h2 * s1
+	vmsumudm	11, 6, 1, 9		# h0 * r1, h1 * r0
+	vmsumudm	10, 8, 2, 11		# d1 += h2 * s1
+
+	#       d2 = r0
+	vmsumudm	11, 8, 3, 9		# d2 = h2 * r0
+	blr
+SYM_FUNC_END(Poly1305_mult)
+
+#
+# carry reduction
+# h %=p
+#
+# Input: v7, v10, v11
+# Output: r27, r28, r29
+#
+SYM_FUNC_START_LOCAL(Carry_reduction)
+	mfvsrld	27, 32+7
+	mfvsrld	28, 32+10
+	mfvsrld	29, 32+11
+	mfvsrd	20, 32+7	# h0.h
+	mfvsrd	21, 32+10	# h1.h
+
+	addc	28, 28, 20
+	adde	29, 29, 21
+	srdi	22, 29, 0x2
+	sldi	23, 22, 0x2
+	add	23, 23, 22	# (h2 & 3) * 5
+	addc	27, 27, 23	# h0
+	addze	28, 28		# h1
+	andi.	29, 29, 0x3	# h2
+	blr
+SYM_FUNC_END(Carry_reduction)
+
+#
+# poly1305 multiplication
+# h *= r, h %= p
+#	d0 = h0 * r0 + h1 * s1
+#	d1 = h0 * r1 + h1 * r0 + h2 * s1
+#       d2 = h0 * r0
+#
+#
+# unsigned int poly1305_test_64s(unisgned char *state, const byte *src, size_t len, highbit)
+#   - no highbit if final leftover block (highbit = 0)
+#
+SYM_FUNC_START(poly1305_64s)
+	cmpdi	5, 0
+	ble	Out_no_poly1305_64
+
+	mflr 0
+	std 0, 16(1)
+	stdu 1,-400(1)
+
+	SAVE_GPR 14, 112, 1
+	SAVE_GPR 15, 120, 1
+	SAVE_GPR 16, 128, 1
+	SAVE_GPR 17, 136, 1
+	SAVE_GPR 18, 144, 1
+	SAVE_GPR 19, 152, 1
+	SAVE_GPR 20, 160, 1
+	SAVE_GPR 21, 168, 1
+	SAVE_GPR 22, 176, 1
+	SAVE_GPR 23, 184, 1
+	SAVE_GPR 24, 192, 1
+	SAVE_GPR 25, 200, 1
+	SAVE_GPR 26, 208, 1
+	SAVE_GPR 27, 216, 1
+	SAVE_GPR 28, 224, 1
+	SAVE_GPR 29, 232, 1
+	SAVE_GPR 30, 240, 1
+	SAVE_GPR 31, 248, 1
+
+	# Init poly1305
+	bl Poly1305_init_64
+
+	li 25, 0			# offset to inp and outp
+
+	add 11, 25, 4
+
+	# load h
+	# h0, h1, h2?
+        ld	27, 0(3)
+        ld	28, 8(3)
+        lwz	29, 16(3)
+
+        li      30, 16
+        divdu   31, 5, 30
+
+        mtctr   31
+
+        mr      24, 6		# highbit
+
+Loop_block_64:
+	vxor	9, 9, 9
+
+	ld	20, 0(11)
+	ld	21, 8(11)
+	addi	11, 11, 16
+
+	addc	27, 27, 20
+	adde	28, 28, 21
+	adde	29, 29, 24
+
+	li	22, 0
+	mtvsrdd	32+6, 27, 28	# h0, h1
+	mtvsrdd	32+8, 29, 22	# h2
+
+	bl	Poly1305_mult
+
+	bl	Carry_reduction
+
+	bdnz	Loop_block_64
+
+	std	27, 0(3)
+	std	28, 8(3)
+	stw	29, 16(3)
+
+	li	3, 0
+
+	RESTORE_GPR 14, 112, 1
+	RESTORE_GPR 15, 120, 1
+	RESTORE_GPR 16, 128, 1
+	RESTORE_GPR 17, 136, 1
+	RESTORE_GPR 18, 144, 1
+	RESTORE_GPR 19, 152, 1
+	RESTORE_GPR 20, 160, 1
+	RESTORE_GPR 21, 168, 1
+	RESTORE_GPR 22, 176, 1
+	RESTORE_GPR 23, 184, 1
+	RESTORE_GPR 24, 192, 1
+	RESTORE_GPR 25, 200, 1
+	RESTORE_GPR 26, 208, 1
+	RESTORE_GPR 27, 216, 1
+	RESTORE_GPR 28, 224, 1
+	RESTORE_GPR 29, 232, 1
+	RESTORE_GPR 30, 240, 1
+	RESTORE_GPR 31, 248, 1
+
+	addi    1, 1, 400
+	ld 0, 16(1)
+	mtlr 0
+
+	blr
+
+Out_no_poly1305_64:
+	li	3, 0
+	blr
+SYM_FUNC_END(poly1305_64s)
+
+#
+# Input: r3 = h, r4 = s, r5 = mac
+# mac = h + s
+#
+SYM_FUNC_START(poly1305_emit_64)
+	ld	10, 0(3)
+	ld	11, 8(3)
+	ld	12, 16(3)
+
+	# compare modulus
+	# h + 5 + (-p)
+	mr	6, 10
+	mr	7, 11
+	mr	8, 12
+	addic.	6, 6, 5
+	addze	7, 7
+	addze	8, 8
+	srdi	9, 8, 2		# overflow?
+	cmpdi	9, 0
+	beq	Skip_h64
+	mr	10, 6
+	mr	11, 7
+	mr	12, 8
+
+Skip_h64:
+	ld	6, 0(4)
+	ld	7, 8(4)
+	addc	10, 10, 6
+	adde	11, 11, 7
+	addze	12, 12
+
+	std	10, 0(5)
+	std	11, 8(5)
+	blr
+SYM_FUNC_END(poly1305_emit_64)
+
+SYM_DATA_START_LOCAL(RMASK)
+.align 5
+rmask:
+.byte	0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
+cnum:
+.long	0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000
+.long	0x1a, 0x00, 0x1a, 0x00
+.long	0x01000000, 0x01000000, 0x01000000, 0x01000000
+.long	0x00010203, 0x04050607, 0x10111213, 0x14151617
+.long	0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f
+SYM_DATA_END(RMASK)
diff --git a/lib/crypto/powerpc/poly1305.h b/lib/crypto/powerpc/poly1305.h
new file mode 100644
index 000000000000..b8ed098a0e95
--- /dev/null
+++ b/lib/crypto/powerpc/poly1305.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Poly1305 authenticator algorithm, RFC7539.
+ *
+ * Copyright 2023- IBM Corp. All rights reserved.
+ */
+#include <asm/switch_to.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/unaligned.h>
+
+asmlinkage void poly1305_p10le_4blocks(struct poly1305_block_state *state, const u8 *m, u32 mlen);
+asmlinkage void poly1305_64s(struct poly1305_block_state *state, const u8 *m, u32 mlen, int highbit);
+asmlinkage void poly1305_emit_64(const struct poly1305_state *state, const u32 nonce[4], u8 digest[POLY1305_DIGEST_SIZE]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
+
+static void vsx_begin(void)
+{
+	preempt_disable();
+	enable_kernel_vsx();
+}
+
+static void vsx_end(void)
+{
+	disable_kernel_vsx();
+	preempt_enable();
+}
+
+static void poly1305_block_init(struct poly1305_block_state *dctx,
+				const u8 raw_key[POLY1305_BLOCK_SIZE])
+{
+	if (!static_key_enabled(&have_p10))
+		return poly1305_block_init_generic(dctx, raw_key);
+
+	dctx->h = (struct poly1305_state){};
+	dctx->core_r.key.r64[0] = get_unaligned_le64(raw_key + 0);
+	dctx->core_r.key.r64[1] = get_unaligned_le64(raw_key + 8);
+}
+
+static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src,
+			    unsigned int len, u32 padbit)
+{
+	if (!static_key_enabled(&have_p10))
+		return poly1305_blocks_generic(state, src, len, padbit);
+	vsx_begin();
+	if (len >= POLY1305_BLOCK_SIZE * 4) {
+		poly1305_p10le_4blocks(state, src, len);
+		src += len - (len % (POLY1305_BLOCK_SIZE * 4));
+		len %= POLY1305_BLOCK_SIZE * 4;
+	}
+	while (len >= POLY1305_BLOCK_SIZE) {
+		poly1305_64s(state, src, POLY1305_BLOCK_SIZE, padbit);
+		len -= POLY1305_BLOCK_SIZE;
+		src += POLY1305_BLOCK_SIZE;
+	}
+	vsx_end();
+}
+
+static void poly1305_emit(const struct poly1305_state *state,
+			  u8 digest[POLY1305_DIGEST_SIZE], const u32 nonce[4])
+{
+	if (!static_key_enabled(&have_p10))
+		return poly1305_emit_generic(state, digest, nonce);
+	poly1305_emit_64(state, nonce, digest);
+}
+
+#define poly1305_mod_init_arch poly1305_mod_init_arch
+static void poly1305_mod_init_arch(void)
+{
+	if (cpu_has_feature(CPU_FTR_ARCH_31))
+		static_branch_enable(&have_p10);
+}
diff --git a/lib/crypto/powerpc/sha1-powerpc-asm.S b/lib/crypto/powerpc/sha1-powerpc-asm.S
new file mode 100644
index 000000000000..f0d5ed557ab1
--- /dev/null
+++ b/lib/crypto/powerpc/sha1-powerpc-asm.S
@@ -0,0 +1,188 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * SHA-1 implementation for PowerPC.
+ *
+ * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+
+#ifdef __BIG_ENDIAN__
+#define LWZ(rt, d, ra)	\
+	lwz	rt,d(ra)
+#else
+#define LWZ(rt, d, ra)	\
+	li	rt,d;	\
+	lwbrx	rt,rt,ra
+#endif
+
+/*
+ * We roll the registers for T, A, B, C, D, E around on each
+ * iteration; T on iteration t is A on iteration t+1, and so on.
+ * We use registers 7 - 12 for this.
+ */
+#define RT(t)	((((t)+5)%6)+7)
+#define RA(t)	((((t)+4)%6)+7)
+#define RB(t)	((((t)+3)%6)+7)
+#define RC(t)	((((t)+2)%6)+7)
+#define RD(t)	((((t)+1)%6)+7)
+#define RE(t)	((((t)+0)%6)+7)
+
+/* We use registers 16 - 31 for the W values */
+#define W(t)	(((t)%16)+16)
+
+#define LOADW(t)				\
+	LWZ(W(t),(t)*4,r4)
+
+#define STEPD0_LOAD(t)				\
+	andc	r0,RD(t),RB(t);		\
+	and	r6,RB(t),RC(t);		\
+	rotlwi	RT(t),RA(t),5;			\
+	or	r6,r6,r0;			\
+	add	r0,RE(t),r15;			\
+	add	RT(t),RT(t),r6;		\
+	add	r14,r0,W(t);			\
+	LWZ(W((t)+4),((t)+4)*4,r4);	\
+	rotlwi	RB(t),RB(t),30;			\
+	add	RT(t),RT(t),r14
+
+#define STEPD0_UPDATE(t)			\
+	and	r6,RB(t),RC(t);		\
+	andc	r0,RD(t),RB(t);		\
+	rotlwi	RT(t),RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	or	r6,r6,r0;			\
+	add	r0,RE(t),r15;			\
+	xor	r5,W((t)+4-3),W((t)+4-8);		\
+	add	RT(t),RT(t),r6;		\
+	xor	W((t)+4),W((t)+4-16),W((t)+4-14);	\
+	add	r0,r0,W(t);			\
+	xor	W((t)+4),W((t)+4),r5;			\
+	add	RT(t),RT(t),r0;		\
+	rotlwi	W((t)+4),W((t)+4),1
+
+#define STEPD1(t)				\
+	xor	r6,RB(t),RC(t);		\
+	rotlwi	RT(t),RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	xor	r6,r6,RD(t);			\
+	add	r0,RE(t),r15;			\
+	add	RT(t),RT(t),r6;		\
+	add	r0,r0,W(t);			\
+	add	RT(t),RT(t),r0
+
+#define STEPD1_UPDATE(t)				\
+	xor	r6,RB(t),RC(t);		\
+	rotlwi	RT(t),RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	xor	r6,r6,RD(t);			\
+	add	r0,RE(t),r15;			\
+	xor	r5,W((t)+4-3),W((t)+4-8);		\
+	add	RT(t),RT(t),r6;		\
+	xor	W((t)+4),W((t)+4-16),W((t)+4-14);	\
+	add	r0,r0,W(t);			\
+	xor	W((t)+4),W((t)+4),r5;			\
+	add	RT(t),RT(t),r0;		\
+	rotlwi	W((t)+4),W((t)+4),1
+
+#define STEPD2_UPDATE(t)			\
+	and	r6,RB(t),RC(t);		\
+	and	r0,RB(t),RD(t);		\
+	rotlwi	RT(t),RA(t),5;			\
+	or	r6,r6,r0;			\
+	rotlwi	RB(t),RB(t),30;			\
+	and	r0,RC(t),RD(t);		\
+	xor	r5,W((t)+4-3),W((t)+4-8);	\
+	or	r6,r6,r0;			\
+	xor	W((t)+4),W((t)+4-16),W((t)+4-14);	\
+	add	r0,RE(t),r15;			\
+	add	RT(t),RT(t),r6;		\
+	add	r0,r0,W(t);			\
+	xor	W((t)+4),W((t)+4),r5;		\
+	add	RT(t),RT(t),r0;		\
+	rotlwi	W((t)+4),W((t)+4),1
+
+#define STEP0LD4(t)				\
+	STEPD0_LOAD(t);				\
+	STEPD0_LOAD((t)+1);			\
+	STEPD0_LOAD((t)+2);			\
+	STEPD0_LOAD((t)+3)
+
+#define STEPUP4(t, fn)				\
+	STEP##fn##_UPDATE(t);			\
+	STEP##fn##_UPDATE((t)+1);		\
+	STEP##fn##_UPDATE((t)+2);		\
+	STEP##fn##_UPDATE((t)+3)
+
+#define STEPUP20(t, fn)				\
+	STEPUP4(t, fn);				\
+	STEPUP4((t)+4, fn);			\
+	STEPUP4((t)+8, fn);			\
+	STEPUP4((t)+12, fn);			\
+	STEPUP4((t)+16, fn)
+
+_GLOBAL(powerpc_sha_transform)
+	PPC_STLU r1,-INT_FRAME_SIZE(r1)
+	SAVE_GPRS(14, 31, r1)
+
+	/* Load up A - E */
+	lwz	RA(0),0(r3)	/* A */
+	lwz	RB(0),4(r3)	/* B */
+	lwz	RC(0),8(r3)	/* C */
+	lwz	RD(0),12(r3)	/* D */
+	lwz	RE(0),16(r3)	/* E */
+
+	LOADW(0)
+	LOADW(1)
+	LOADW(2)
+	LOADW(3)
+
+	lis	r15,0x5a82	/* K0-19 */
+	ori	r15,r15,0x7999
+	STEP0LD4(0)
+	STEP0LD4(4)
+	STEP0LD4(8)
+	STEPUP4(12, D0)
+	STEPUP4(16, D0)
+
+	lis	r15,0x6ed9	/* K20-39 */
+	ori	r15,r15,0xeba1
+	STEPUP20(20, D1)
+
+	lis	r15,0x8f1b	/* K40-59 */
+	ori	r15,r15,0xbcdc
+	STEPUP20(40, D2)
+
+	lis	r15,0xca62	/* K60-79 */
+	ori	r15,r15,0xc1d6
+	STEPUP4(60, D1)
+	STEPUP4(64, D1)
+	STEPUP4(68, D1)
+	STEPUP4(72, D1)
+	lwz	r20,16(r3)
+	STEPD1(76)
+	lwz	r19,12(r3)
+	STEPD1(77)
+	lwz	r18,8(r3)
+	STEPD1(78)
+	lwz	r17,4(r3)
+	STEPD1(79)
+
+	lwz	r16,0(r3)
+	add	r20,RE(80),r20
+	add	RD(0),RD(80),r19
+	add	RC(0),RC(80),r18
+	add	RB(0),RB(80),r17
+	add	RA(0),RA(80),r16
+	mr	RE(0),r20
+	stw	RA(0),0(r3)
+	stw	RB(0),4(r3)
+	stw	RC(0),8(r3)
+	stw	RD(0),12(r3)
+	stw	RE(0),16(r3)
+
+	REST_GPRS(14, 31, r1)
+	addi	r1,r1,INT_FRAME_SIZE
+	blr
diff --git a/lib/crypto/powerpc/sha1-spe-asm.S b/lib/crypto/powerpc/sha1-spe-asm.S
new file mode 100644
index 000000000000..0f447523be5e
--- /dev/null
+++ b/lib/crypto/powerpc/sha1-spe-asm.S
@@ -0,0 +1,294 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Fast SHA-1 implementation for SPE instruction set (PPC)
+ *
+ * This code makes use of the SPE SIMD instruction set as defined in
+ * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
+ * Implementation is based on optimization guide notes from
+ * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+#define rHP	r3	/* pointer to hash value			*/
+#define rWP	r4	/* pointer to input				*/
+#define rKP	r5	/* pointer to constants				*/
+
+#define rW0	r14	/* 64 bit round words				*/
+#define rW1	r15
+#define rW2	r16
+#define rW3	r17
+#define rW4	r18
+#define rW5	r19
+#define rW6	r20
+#define rW7	r21
+
+#define rH0	r6	/* 32 bit hash values 				*/
+#define rH1	r7
+#define rH2	r8
+#define rH3	r9
+#define rH4	r10
+
+#define rT0	r22	/* 64 bit temporary				*/
+#define rT1	r0	/* 32 bit temporaries				*/
+#define rT2	r11
+#define rT3	r12
+
+#define rK	r23	/* 64 bit constant in volatile register		*/
+
+#define LOAD_K01
+
+#define LOAD_K11 \
+	evlwwsplat	rK,0(rKP);
+
+#define LOAD_K21 \
+	evlwwsplat	rK,4(rKP);
+
+#define LOAD_K31 \
+	evlwwsplat	rK,8(rKP);
+
+#define LOAD_K41 \
+	evlwwsplat	rK,12(rKP);
+
+#define INITIALIZE \
+	stwu		r1,-128(r1);	/* create stack frame		*/ \
+	evstdw		r14,8(r1);	/* We must save non volatile	*/ \
+	evstdw		r15,16(r1);	/* registers. Take the chance	*/ \
+	evstdw		r16,24(r1);	/* and save the SPE part too	*/ \
+	evstdw		r17,32(r1);					   \
+	evstdw		r18,40(r1);					   \
+	evstdw		r19,48(r1);					   \
+	evstdw		r20,56(r1);					   \
+	evstdw		r21,64(r1);					   \
+	evstdw		r22,72(r1);					   \
+	evstdw		r23,80(r1);
+
+
+#define FINALIZE \
+	evldw		r14,8(r1);	/* restore SPE registers	*/ \
+	evldw		r15,16(r1);					   \
+	evldw		r16,24(r1);					   \
+	evldw		r17,32(r1);					   \
+	evldw		r18,40(r1);					   \
+	evldw		r19,48(r1);					   \
+	evldw		r20,56(r1);					   \
+	evldw		r21,64(r1);					   \
+	evldw		r22,72(r1);					   \
+	evldw		r23,80(r1);					   \
+	xor		r0,r0,r0;					   \
+	stw		r0,8(r1);	/* Delete sensitive data	*/ \
+	stw		r0,16(r1);	/* that we might have pushed	*/ \
+	stw		r0,24(r1);	/* from other context that runs	*/ \
+	stw		r0,32(r1);	/* the same code. Assume that	*/ \
+	stw		r0,40(r1);	/* the lower part of the GPRs	*/ \
+	stw		r0,48(r1);	/* were already overwritten on	*/ \
+	stw		r0,56(r1);	/* the way down to here		*/ \
+	stw		r0,64(r1);					   \
+	stw		r0,72(r1);					   \
+	stw		r0,80(r1);					   \
+	addi		r1,r1,128;	/* cleanup stack frame		*/
+
+#ifdef __BIG_ENDIAN__
+#define LOAD_DATA(reg, off) \
+	lwz		reg,off(rWP);	/* load data			*/
+#define NEXT_BLOCK \
+	addi		rWP,rWP,64;	/* increment per block		*/
+#else
+#define LOAD_DATA(reg, off) \
+	lwbrx		reg,0,rWP;	/* load data			*/ \
+	addi		rWP,rWP,4;	/* increment per word		*/
+#define NEXT_BLOCK			/* nothing to do		*/
+#endif
+
+#define	R_00_15(a, b, c, d, e, w0, w1, k, off) \
+	LOAD_DATA(w0, off)		/* 1: W				*/ \
+	and		rT2,b,c;	/* 1: F' = B and C 		*/ \
+	LOAD_K##k##1							   \
+	andc		rT1,d,b;	/* 1: F" = ~B and D 		*/ \
+	rotrwi		rT0,a,27;	/* 1: A' = A rotl 5		*/ \
+	or		rT2,rT2,rT1;	/* 1: F = F' or F"		*/ \
+	add		e,e,rT0;	/* 1: E = E + A'		*/ \
+	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
+	add		e,e,w0;		/* 1: E = E + W			*/ \
+	LOAD_DATA(w1, off+4)		/* 2: W				*/ \
+	add		e,e,rT2;	/* 1: E = E + F			*/ \
+	and		rT1,a,b;	/* 2: F' = B and C 		*/ \
+	add		e,e,rK;		/* 1: E = E + K			*/ \
+	andc		rT2,c,a;	/* 2: F" = ~B and D 		*/ \
+	add		d,d,rK;		/* 2: E = E + K			*/ \
+	or		rT2,rT2,rT1;	/* 2: F = F' or F"		*/ \
+	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
+	add		d,d,w1;		/* 2: E = E + W			*/ \
+	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
+	add		d,d,rT0;	/* 2: E = E + A'		*/ \
+	evmergelo	w1,w1,w0;	/*    mix W[0]/W[1]		*/ \
+	add		d,d,rT2		/* 2: E = E + F			*/
+
+#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+	and		rT2,b,c;	/* 1: F' = B and C 		*/ \
+	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
+	andc		rT1,d,b;	/* 1: F" = ~B and D 		*/ \
+	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
+	or		rT1,rT1,rT2;	/* 1: F = F' or F"		*/ \
+	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
+	add		e,e,rT1;	/* 1: E = E + F			*/ \
+	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
+	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
+	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
+	add		e,e,rT2;	/* 1: E = E + A'		*/ \
+	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
+	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
+	LOAD_K##k##1							   \
+	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
+	add		e,e,rT0;	/* 1: E = E + WK		*/ \
+	add		d,d,rT1;	/* 2: E = E + WK		*/ \
+	and		rT2,a,b;	/* 2: F' = B and C 		*/ \
+	andc		rT1,c,a;	/* 2: F" = ~B and D 		*/ \
+	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
+	or		rT1,rT1,rT2;	/* 2: F = F' or F"		*/ \
+	add		d,d,rT0;	/* 2: E = E + A'		*/ \
+	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
+	add		d,d,rT1		/* 2: E = E + F			*/
+
+#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
+	xor		rT2,b,c;	/* 1: F' = B xor C		*/ \
+	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
+	xor		rT2,rT2,d;	/* 1: F = F' xor D		*/ \
+	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
+	add		e,e,rT2;	/* 1: E = E + F			*/ \
+	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
+	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
+	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
+	add		e,e,rT2;	/* 1: E = E + A'		*/ \
+	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
+	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
+	LOAD_K##k##1							   \
+	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
+	add		e,e,rT0;	/* 1: E = E + WK		*/ \
+	xor		rT2,a,b;	/* 2: F' = B xor C		*/ \
+	add		d,d,rT1;	/* 2: E = E + WK		*/ \
+	xor		rT2,rT2,c;	/* 2: F = F' xor D		*/ \
+	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
+	add		d,d,rT2;	/* 2: E = E + F			*/ \
+	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
+	add		d,d,rT0		/* 2: E = E + A'		*/
+
+#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+	and		rT2,b,c;	/* 1: F' = B and C		*/ \
+	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
+	or		rT1,b,c;	/* 1: F" = B or C		*/ \
+	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
+	and		rT1,d,rT1;	/* 1: F" = F" and D		*/ \
+	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
+	or		rT2,rT2,rT1;	/* 1: F = F' or F"		*/ \
+	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
+	add		e,e,rT2;	/* 1: E = E + F			*/ \
+	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
+	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
+	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
+	add		e,e,rT2;	/* 1: E = E + A'		*/ \
+	LOAD_K##k##1							   \
+	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
+	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
+	add		e,e,rT0;	/* 1: E = E + WK		*/ \
+	and		rT2,a,b;	/* 2: F' = B and C		*/ \
+	or		rT0,a,b;	/* 2: F" = B or C		*/ \
+	add		d,d,rT1;	/* 2: E = E + WK		*/ \
+	and		rT0,c,rT0;	/* 2: F" = F" and D		*/ \
+	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
+	or		rT2,rT2,rT0;	/* 2: F = F' or F"		*/ \
+	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
+	add		d,d,rT2;	/* 2: E = E + F			*/ \
+	add		d,d,rT0		/* 2: E = E + A'		*/
+
+#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+	R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
+
+_GLOBAL(ppc_spe_sha1_transform)
+	INITIALIZE
+
+	lwz		rH0,0(rHP)
+	lwz		rH1,4(rHP)
+	mtctr		r5
+	lwz		rH2,8(rHP)
+	lis		rKP,PPC_SPE_SHA1_K@h
+	lwz		rH3,12(rHP)
+	ori		rKP,rKP,PPC_SPE_SHA1_K@l
+	lwz		rH4,16(rHP)
+
+ppc_spe_sha1_main:
+	R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
+	R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
+	R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
+	R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
+	R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
+	R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
+	R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
+	R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)
+
+	R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
+	R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)
+
+	R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
+	R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
+	R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
+	R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
+	R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
+	R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
+	R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
+	R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
+	R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
+	R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)
+
+	R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
+	R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
+	R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
+	R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
+	R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
+	R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
+	R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
+	R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
+	R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
+	R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)
+
+	R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
+	R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
+	R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
+	R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
+	R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
+	R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
+	R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
+	lwz		rT3,0(rHP)
+	R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
+	lwz		rW1,4(rHP)
+	R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
+	lwz		rW2,8(rHP)
+	R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
+	lwz		rW3,12(rHP)
+	NEXT_BLOCK
+	lwz		rW4,16(rHP)
+
+	add		rH0,rH0,rT3
+	stw		rH0,0(rHP)
+	add		rH1,rH1,rW1
+	stw		rH1,4(rHP)
+	add		rH2,rH2,rW2
+	stw		rH2,8(rHP)
+	add		rH3,rH3,rW3
+	stw		rH3,12(rHP)
+	add		rH4,rH4,rW4
+	stw		rH4,16(rHP)
+
+	bdnz		ppc_spe_sha1_main
+
+	FINALIZE
+	blr
+
+.data
+.align 4
+PPC_SPE_SHA1_K:
+	.long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6
diff --git a/lib/crypto/powerpc/sha1.h b/lib/crypto/powerpc/sha1.h
new file mode 100644
index 000000000000..e2c010f0370b
--- /dev/null
+++ b/lib/crypto/powerpc/sha1.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-1 optimized for PowerPC
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+
+#include <asm/switch_to.h>
+#include <linux/preempt.h>
+
+#ifdef CONFIG_SPE
+/*
+ * MAX_BYTES defines the number of bytes that are allowed to be processed
+ * between preempt_disable() and preempt_enable(). SHA1 takes ~1000
+ * operations per 64 bytes. e500 cores can issue two arithmetic instructions
+ * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2).
+ * Thus 2KB of input data will need an estimated maximum of 18,000 cycles.
+ * Headroom for cache misses included. Even with the low end model clocked
+ * at 667 MHz this equals to a critical time window of less than 27us.
+ *
+ */
+#define MAX_BYTES 2048
+
+asmlinkage void ppc_spe_sha1_transform(struct sha1_block_state *state,
+				       const u8 *data, u32 nblocks);
+
+static void spe_begin(void)
+{
+	/* We just start SPE operations and will save SPE registers later. */
+	preempt_disable();
+	enable_kernel_spe();
+}
+
+static void spe_end(void)
+{
+	disable_kernel_spe();
+	/* reenable preemption */
+	preempt_enable();
+}
+
+static void sha1_blocks(struct sha1_block_state *state,
+			const u8 *data, size_t nblocks)
+{
+	do {
+		u32 unit = min_t(size_t, nblocks, MAX_BYTES / SHA1_BLOCK_SIZE);
+
+		spe_begin();
+		ppc_spe_sha1_transform(state, data, unit);
+		spe_end();
+
+		data += unit * SHA1_BLOCK_SIZE;
+		nblocks -= unit;
+	} while (nblocks);
+}
+#else /* CONFIG_SPE */
+asmlinkage void powerpc_sha_transform(struct sha1_block_state *state,
+				      const u8 data[SHA1_BLOCK_SIZE]);
+
+static void sha1_blocks(struct sha1_block_state *state,
+			const u8 *data, size_t nblocks)
+{
+	do {
+		powerpc_sha_transform(state, data);
+		data += SHA1_BLOCK_SIZE;
+	} while (--nblocks);
+}
+#endif /* !CONFIG_SPE */
diff --git a/lib/crypto/powerpc/sha256-spe-asm.S b/lib/crypto/powerpc/sha256-spe-asm.S
new file mode 100644
index 000000000000..cd99d71dae34
--- /dev/null
+++ b/lib/crypto/powerpc/sha256-spe-asm.S
@@ -0,0 +1,318 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Fast SHA-256 implementation for SPE instruction set (PPC)
+ *
+ * This code makes use of the SPE SIMD instruction set as defined in
+ * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
+ * Implementation is based on optimization guide notes from
+ * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+#define rHP	r3	/* pointer to hash values in memory		*/
+#define rKP	r24	/* pointer to round constants			*/
+#define rWP	r4	/* pointer to input data			*/
+
+#define rH0	r5	/* 8 32 bit hash values in 8 registers		*/
+#define rH1	r6
+#define rH2	r7
+#define rH3	r8
+#define rH4	r9
+#define rH5	r10
+#define rH6	r11
+#define rH7	r12
+
+#define rW0	r14	/* 64 bit registers. 16 words in 8 registers	*/
+#define rW1	r15
+#define rW2	r16
+#define rW3	r17
+#define rW4	r18
+#define rW5	r19
+#define rW6	r20
+#define rW7	r21
+
+#define rT0	r22	/* 64 bit temporaries 				*/
+#define rT1	r23
+#define rT2	r0	/* 32 bit temporaries				*/
+#define rT3	r25
+
+#define CMP_KN_LOOP
+#define CMP_KC_LOOP \
+	cmpwi		rT1,0;
+
+#define INITIALIZE \
+	stwu		r1,-128(r1);	/* create stack frame		*/ \
+	evstdw		r14,8(r1);	/* We must save non volatile	*/ \
+	evstdw		r15,16(r1);	/* registers. Take the chance	*/ \
+	evstdw		r16,24(r1);	/* and save the SPE part too	*/ \
+	evstdw		r17,32(r1);					   \
+	evstdw		r18,40(r1);					   \
+	evstdw		r19,48(r1);					   \
+	evstdw		r20,56(r1);					   \
+	evstdw		r21,64(r1);					   \
+	evstdw		r22,72(r1);					   \
+	evstdw		r23,80(r1);					   \
+	stw		r24,88(r1);	/* save normal registers	*/ \
+	stw		r25,92(r1);
+
+
+#define FINALIZE \
+	evldw		r14,8(r1);	/* restore SPE registers	*/ \
+	evldw		r15,16(r1);					   \
+	evldw		r16,24(r1);					   \
+	evldw		r17,32(r1);					   \
+	evldw		r18,40(r1);					   \
+	evldw		r19,48(r1);					   \
+	evldw		r20,56(r1);					   \
+	evldw		r21,64(r1);					   \
+	evldw		r22,72(r1);					   \
+	evldw		r23,80(r1);					   \
+	lwz		r24,88(r1);	/* restore normal registers	*/ \
+	lwz		r25,92(r1);					   \
+	xor		r0,r0,r0;					   \
+	stw		r0,8(r1);	/* Delete sensitive data	*/ \
+	stw		r0,16(r1);	/* that we might have pushed	*/ \
+	stw		r0,24(r1);	/* from other context that runs	*/ \
+	stw		r0,32(r1);	/* the same code. Assume that	*/ \
+	stw		r0,40(r1);	/* the lower part of the GPRs	*/ \
+	stw		r0,48(r1);	/* was already overwritten on	*/ \
+	stw		r0,56(r1);	/* the way down to here		*/ \
+	stw		r0,64(r1);					   \
+	stw		r0,72(r1);					   \
+	stw		r0,80(r1);					   \
+	addi		r1,r1,128;	/* cleanup stack frame		*/
+
+#ifdef __BIG_ENDIAN__
+#define LOAD_DATA(reg, off) \
+	lwz		reg,off(rWP);	/* load data			*/
+#define NEXT_BLOCK \
+	addi		rWP,rWP,64;	/* increment per block		*/
+#else
+#define LOAD_DATA(reg, off) \
+	lwbrx		reg,0,rWP; 	/* load data			*/ \
+	addi		rWP,rWP,4;	/* increment per word		*/
+#define NEXT_BLOCK			/* nothing to do		*/
+#endif
+
+#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \
+	LOAD_DATA(w, off)		/* 1: W				*/ \
+	rotrwi		rT0,e,6;	/* 1: S1 = e rotr 6		*/ \
+	rotrwi		rT1,e,11;	/* 1: S1' = e rotr 11		*/ \
+	rotrwi		rT2,e,25;	/* 1: S1" = e rotr 25		*/ \
+	xor		rT0,rT0,rT1;	/* 1: S1 = S1 xor S1'		*/ \
+	and		rT3,e,f;	/* 1: ch = e and f		*/ \
+	xor		rT0,rT0,rT2;	/* 1: S1 = S1 xor S1"		*/ \
+	andc		rT1,g,e;	/* 1: ch' = ~e and g		*/ \
+	lwz		rT2,off(rKP);	/* 1: K				*/ \
+	xor		rT3,rT3,rT1;	/* 1: ch = ch xor ch'		*/ \
+	add		h,h,rT0;	/* 1: temp1 = h + S1		*/ \
+	add		rT3,rT3,w;	/* 1: temp1' = ch + w		*/ \
+	rotrwi		rT0,a,2;	/* 1: S0 = a rotr 2		*/ \
+	add		h,h,rT3;	/* 1: temp1 = temp1 + temp1'	*/ \
+	rotrwi		rT1,a,13;	/* 1: S0' = a rotr 13		*/ \
+	add		h,h,rT2;	/* 1: temp1 = temp1 + K		*/ \
+	rotrwi		rT3,a,22;	/* 1: S0" = a rotr 22		*/ \
+	xor		rT0,rT0,rT1;	/* 1: S0 = S0 xor S0'		*/ \
+	add		d,d,h;		/* 1: d = d + temp1		*/ \
+	xor		rT3,rT0,rT3;	/* 1: S0 = S0 xor S0"		*/ \
+	evmergelo	w,w,w;		/*    shift W			*/ \
+	or		rT2,a,b;	/* 1: maj = a or b		*/ \
+	and		rT1,a,b;	/* 1: maj' = a and b		*/ \
+	and		rT2,rT2,c;	/* 1: maj = maj and c		*/ \
+	LOAD_DATA(w, off+4)		/* 2: W				*/ \
+	or		rT2,rT1,rT2;	/* 1: maj = maj or maj'		*/ \
+	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6		*/ \
+	add		rT3,rT3,rT2;	/* 1: temp2 = S0 + maj		*/ \
+	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11		*/ \
+	add		h,h,rT3;	/* 1: h = temp1 + temp2		*/ \
+	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25		*/ \
+	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1'		*/ \
+	and		rT3,d,e;	/* 2: ch = e and f		*/ \
+	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1"		*/ \
+	andc		rT1,f,d;	/* 2: ch' = ~e and g		*/ \
+	lwz		rT2,off+4(rKP);	/* 2: K				*/ \
+	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch'		*/ \
+	add		g,g,rT0;	/* 2: temp1 = h + S1		*/ \
+	add		rT3,rT3,w;	/* 2: temp1' = ch + w		*/ \
+	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2		*/ \
+	add		g,g,rT3;	/* 2: temp1 = temp1 + temp1'	*/ \
+	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13		*/ \
+	add		g,g,rT2;	/* 2: temp1 = temp1 + K		*/ \
+	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22		*/ \
+	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0'		*/ \
+	or		rT2,h,a;	/* 2: maj = a or b		*/ \
+	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0"		*/ \
+	and		rT1,h,a;	/* 2: maj' = a and b		*/ \
+	and		rT2,rT2,b;	/* 2: maj = maj and c		*/ \
+	add		c,c,g;		/* 2: d = d + temp1		*/ \
+	or		rT2,rT1,rT2;	/* 2: maj = maj or maj'		*/ \
+	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj		*/ \
+	add		g,g,rT3		/* 2: h = temp1 + temp2		*/
+
+#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \
+	rotrwi		rT2,e,6;	/* 1: S1 = e rotr 6		*/ \
+	evmergelohi	rT0,w0,w1;	/*    w[-15]			*/ \
+	rotrwi		rT3,e,11;	/* 1: S1' = e rotr 11		*/ \
+	evsrwiu		rT1,rT0,3;	/*    s0 = w[-15] >> 3		*/ \
+	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1'		*/ \
+	evrlwi		rT0,rT0,25;	/*    s0' = w[-15] rotr	7	*/ \
+	rotrwi		rT3,e,25;	/* 1: S1' = e rotr 25		*/ \
+	evxor		rT1,rT1,rT0;	/*    s0 = s0 xor s0'		*/ \
+	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1'		*/ \
+	evrlwi		rT0,rT0,21;	/*    s0' = w[-15] rotr 18	*/ \
+	add		h,h,rT2;	/* 1: temp1 = h + S1		*/ \
+	evxor		rT0,rT0,rT1;	/*    s0 = s0 xor s0'		*/ \
+	and		rT2,e,f;	/* 1: ch = e and f		*/ \
+	evaddw		w0,w0,rT0;	/*    w = w[-16] + s0		*/ \
+	andc		rT3,g,e;	/* 1: ch' = ~e and g		*/ \
+	evsrwiu		rT0,w7,10;	/*    s1 = w[-2] >> 10		*/ \
+	xor		rT2,rT2,rT3;	/* 1: ch = ch xor ch'		*/ \
+	evrlwi		rT1,w7,15;	/*    s1' = w[-2] rotr 17	*/ \
+	add		h,h,rT2;	/* 1: temp1 = temp1 + ch	*/ \
+	evxor		rT0,rT0,rT1;	/*    s1 = s1 xor s1'		*/ \
+	rotrwi		rT2,a,2;	/* 1: S0 = a rotr 2		*/ \
+	evrlwi		rT1,w7,13;	/*    s1' = w[-2] rotr 19	*/ \
+	rotrwi		rT3,a,13;	/* 1: S0' = a rotr 13		*/ \
+	evxor		rT0,rT0,rT1;	/*    s1 = s1 xor s1'		*/ \
+	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0'		*/ \
+	evldw		rT1,off(rKP);	/*    k				*/ \
+	rotrwi		rT3,a,22;	/* 1: S0' = a rotr 22		*/ \
+	evaddw		w0,w0,rT0;	/*    w = w + s1		*/ \
+	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0'		*/ \
+	evmergelohi	rT0,w4,w5;	/*    w[-7]			*/ \
+	and		rT3,a,b;	/* 1: maj = a and b		*/ \
+	evaddw		w0,w0,rT0;	/*    w = w + w[-7]		*/ \
+	CMP_K##k##_LOOP							   \
+	add		rT2,rT2,rT3;	/* 1: temp2 = S0 + maj		*/ \
+	evaddw		rT1,rT1,w0;	/*    wk = w + k		*/ \
+	xor		rT3,a,b;	/* 1: maj = a xor b		*/ \
+	evmergehi	rT0,rT1,rT1;	/*    wk1/wk2			*/ \
+	and		rT3,rT3,c;	/* 1: maj = maj and c		*/ \
+	add		h,h,rT0;	/* 1: temp1 = temp1 + wk	*/ \
+	add		rT2,rT2,rT3;	/* 1: temp2 = temp2 + maj	*/ \
+	add		g,g,rT1;	/* 2: temp1 = temp1 + wk	*/ \
+	add		d,d,h;		/* 1: d = d + temp1		*/ \
+	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6		*/ \
+	add		h,h,rT2;	/* 1: h = temp1 + temp2		*/ \
+	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11		*/ \
+	rotrwi		rT2,d,25;	/* 2: S" = e rotr 25		*/ \
+	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1'		*/ \
+	and		rT3,d,e;	/* 2: ch = e and f		*/ \
+	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1"		*/ \
+	andc		rT1,f,d;	/* 2: ch' = ~e and g		*/ \
+	add		g,g,rT0;	/* 2: temp1 = h + S1		*/ \
+	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch'		*/ \
+	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2		*/ \
+	add		g,g,rT3;	/* 2: temp1 = temp1 + ch	*/ \
+	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13		*/ \
+	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22		*/ \
+	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0'		*/ \
+	or		rT2,h,a;	/* 2: maj = a or b		*/ \
+	and		rT1,h,a;	/* 2: maj' = a and b		*/ \
+	and		rT2,rT2,b;	/* 2: maj = maj and c		*/ \
+	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0"		*/ \
+	or		rT2,rT1,rT2;	/* 2: maj = maj or maj'		*/ \
+	add		c,c,g;		/* 2: d = d + temp1		*/ \
+	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj		*/ \
+	add		g,g,rT3		/* 2: h = temp1 + temp2		*/
+
+_GLOBAL(ppc_spe_sha256_transform)
+	INITIALIZE
+
+	mtctr		r5
+	lwz		rH0,0(rHP)
+	lwz		rH1,4(rHP)
+	lwz		rH2,8(rHP)
+	lwz		rH3,12(rHP)
+	lwz		rH4,16(rHP)
+	lwz		rH5,20(rHP)
+	lwz		rH6,24(rHP)
+	lwz		rH7,28(rHP)
+
+ppc_spe_sha256_main:
+	lis		rKP,PPC_SPE_SHA256_K@ha
+	addi		rKP,rKP,PPC_SPE_SHA256_K@l
+
+	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0)
+	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8)
+	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16)
+	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24)
+	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32)
+	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40)
+	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48)
+	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56)
+ppc_spe_sha256_16_rounds:
+	addi		rKP,rKP,64
+	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
+		 rW0, rW1, rW4, rW5, rW7, N, 0)
+	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
+		 rW1, rW2, rW5, rW6, rW0, N, 8)
+	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
+		 rW2, rW3, rW6, rW7, rW1, N, 16)
+	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
+		 rW3, rW4, rW7, rW0, rW2, N, 24)
+	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
+		 rW4, rW5, rW0, rW1, rW3, N, 32)
+	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
+		 rW5, rW6, rW1, rW2, rW4, N, 40)
+	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
+		 rW6, rW7, rW2, rW3, rW5, N, 48)
+	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
+		 rW7, rW0, rW3, rW4, rW6, C, 56)
+	bt		gt,ppc_spe_sha256_16_rounds
+
+	lwz		rW0,0(rHP)
+	NEXT_BLOCK
+	lwz		rW1,4(rHP)
+	lwz		rW2,8(rHP)
+	lwz		rW3,12(rHP)
+	lwz		rW4,16(rHP)
+	lwz		rW5,20(rHP)
+	lwz		rW6,24(rHP)
+	lwz		rW7,28(rHP)
+
+	add		rH0,rH0,rW0
+	stw		rH0,0(rHP)
+	add		rH1,rH1,rW1
+	stw		rH1,4(rHP)
+	add		rH2,rH2,rW2
+	stw		rH2,8(rHP)
+	add		rH3,rH3,rW3
+	stw		rH3,12(rHP)
+	add		rH4,rH4,rW4
+	stw		rH4,16(rHP)
+	add		rH5,rH5,rW5
+	stw		rH5,20(rHP)
+	add		rH6,rH6,rW6
+	stw		rH6,24(rHP)
+	add		rH7,rH7,rW7
+	stw		rH7,28(rHP)
+
+	bdnz		ppc_spe_sha256_main
+
+	FINALIZE
+	blr
+
+.data
+.align 5
+PPC_SPE_SHA256_K:
+	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
diff --git a/lib/crypto/powerpc/sha256.h b/lib/crypto/powerpc/sha256.h
new file mode 100644
index 000000000000..50d355441c7e
--- /dev/null
+++ b/lib/crypto/powerpc/sha256.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 Secure Hash Algorithm, SPE optimized
+ *
+ * Based on generic implementation. The assembler module takes care
+ * about the SPE registers so it can run from interrupt context.
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+
+#include <asm/switch_to.h>
+#include <linux/preempt.h>
+
+/*
+ * MAX_BYTES defines the number of bytes that are allowed to be processed
+ * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000
+ * operations per 64 bytes. e500 cores can issue two arithmetic instructions
+ * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2).
+ * Thus 1KB of input data will need an estimated maximum of 18,000 cycles.
+ * Headroom for cache misses included. Even with the low end model clocked
+ * at 667 MHz this equals to a critical time window of less than 27us.
+ *
+ */
+#define MAX_BYTES 1024
+
+extern void ppc_spe_sha256_transform(struct sha256_block_state *state,
+				     const u8 *src, u32 blocks);
+
+static void spe_begin(void)
+{
+	/* We just start SPE operations and will save SPE registers later. */
+	preempt_disable();
+	enable_kernel_spe();
+}
+
+static void spe_end(void)
+{
+	disable_kernel_spe();
+	/* reenable preemption */
+	preempt_enable();
+}
+
+static void sha256_blocks(struct sha256_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	do {
+		/* cut input data into smaller blocks */
+		u32 unit = min_t(size_t, nblocks,
+				 MAX_BYTES / SHA256_BLOCK_SIZE);
+
+		spe_begin();
+		ppc_spe_sha256_transform(state, data, unit);
+		spe_end();
+
+		data += unit * SHA256_BLOCK_SIZE;
+		nblocks -= unit;
+	} while (nblocks);
+}
diff --git a/lib/crypto/riscv/chacha-riscv64-zvkb.S b/lib/crypto/riscv/chacha-riscv64-zvkb.S
new file mode 100644
index 000000000000..b777d0b4e379
--- /dev/null
+++ b/lib/crypto/riscv/chacha-riscv64-zvkb.S
@@ -0,0 +1,297 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The generated code of this file depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128
+// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
+
+#include <linux/linkage.h>
+
+.text
+.option arch, +zvkb
+
+#define STATEP		a0
+#define INP		a1
+#define OUTP		a2
+#define NBLOCKS		a3
+#define NROUNDS		a4
+
+#define CONSTS0		a5
+#define CONSTS1		a6
+#define CONSTS2		a7
+#define CONSTS3		t0
+#define TMP		t1
+#define VL		t2
+#define STRIDE		t3
+#define ROUND_CTR	t4
+#define KEY0		s0
+#define KEY1		s1
+#define KEY2		s2
+#define KEY3		s3
+#define KEY4		s4
+#define KEY5		s5
+#define KEY6		s6
+#define KEY7		s7
+#define COUNTER		s8
+#define NONCE0		s9
+#define NONCE1		s10
+#define NONCE2		s11
+
+.macro	chacha_round	a0, b0, c0, d0,  a1, b1, c1, d1, \
+			a2, b2, c2, d2,  a3, b3, c3, d3
+	// a += b; d ^= a; d = rol(d, 16);
+	vadd.vv		\a0, \a0, \b0
+	vadd.vv		\a1, \a1, \b1
+	vadd.vv		\a2, \a2, \b2
+	vadd.vv		\a3, \a3, \b3
+	vxor.vv		\d0, \d0, \a0
+	vxor.vv		\d1, \d1, \a1
+	vxor.vv		\d2, \d2, \a2
+	vxor.vv		\d3, \d3, \a3
+	vror.vi		\d0, \d0, 32 - 16
+	vror.vi		\d1, \d1, 32 - 16
+	vror.vi		\d2, \d2, 32 - 16
+	vror.vi		\d3, \d3, 32 - 16
+
+	// c += d; b ^= c; b = rol(b, 12);
+	vadd.vv		\c0, \c0, \d0
+	vadd.vv		\c1, \c1, \d1
+	vadd.vv		\c2, \c2, \d2
+	vadd.vv		\c3, \c3, \d3
+	vxor.vv		\b0, \b0, \c0
+	vxor.vv		\b1, \b1, \c1
+	vxor.vv		\b2, \b2, \c2
+	vxor.vv		\b3, \b3, \c3
+	vror.vi		\b0, \b0, 32 - 12
+	vror.vi		\b1, \b1, 32 - 12
+	vror.vi		\b2, \b2, 32 - 12
+	vror.vi		\b3, \b3, 32 - 12
+
+	// a += b; d ^= a; d = rol(d, 8);
+	vadd.vv		\a0, \a0, \b0
+	vadd.vv		\a1, \a1, \b1
+	vadd.vv		\a2, \a2, \b2
+	vadd.vv		\a3, \a3, \b3
+	vxor.vv		\d0, \d0, \a0
+	vxor.vv		\d1, \d1, \a1
+	vxor.vv		\d2, \d2, \a2
+	vxor.vv		\d3, \d3, \a3
+	vror.vi		\d0, \d0, 32 - 8
+	vror.vi		\d1, \d1, 32 - 8
+	vror.vi		\d2, \d2, 32 - 8
+	vror.vi		\d3, \d3, 32 - 8
+
+	// c += d; b ^= c; b = rol(b, 7);
+	vadd.vv		\c0, \c0, \d0
+	vadd.vv		\c1, \c1, \d1
+	vadd.vv		\c2, \c2, \d2
+	vadd.vv		\c3, \c3, \d3
+	vxor.vv		\b0, \b0, \c0
+	vxor.vv		\b1, \b1, \c1
+	vxor.vv		\b2, \b2, \c2
+	vxor.vv		\b3, \b3, \c3
+	vror.vi		\b0, \b0, 32 - 7
+	vror.vi		\b1, \b1, 32 - 7
+	vror.vi		\b2, \b2, 32 - 7
+	vror.vi		\b3, \b3, 32 - 7
+.endm
+
+// void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out,
+//		    size_t nblocks, int nrounds);
+//
+// |nblocks| is the number of 64-byte blocks to process, and must be nonzero.
+//
+// |state| gives the ChaCha state matrix, including the 32-bit counter in
+// state->x[12] following the RFC7539 convention; note that this differs from
+// the original Salsa20 paper which uses a 64-bit counter in state->x[12..13].
+// The updated 32-bit counter is written back to state->x[12] before returning.
+SYM_FUNC_START(chacha_zvkb)
+	addi		sp, sp, -96
+	sd		s0, 0(sp)
+	sd		s1, 8(sp)
+	sd		s2, 16(sp)
+	sd		s3, 24(sp)
+	sd		s4, 32(sp)
+	sd		s5, 40(sp)
+	sd		s6, 48(sp)
+	sd		s7, 56(sp)
+	sd		s8, 64(sp)
+	sd		s9, 72(sp)
+	sd		s10, 80(sp)
+	sd		s11, 88(sp)
+
+	li		STRIDE, 64
+
+	// Set up the initial state matrix in scalar registers.
+	lw		CONSTS0, 0(STATEP)
+	lw		CONSTS1, 4(STATEP)
+	lw		CONSTS2, 8(STATEP)
+	lw		CONSTS3, 12(STATEP)
+	lw		KEY0, 16(STATEP)
+	lw		KEY1, 20(STATEP)
+	lw		KEY2, 24(STATEP)
+	lw		KEY3, 28(STATEP)
+	lw		KEY4, 32(STATEP)
+	lw		KEY5, 36(STATEP)
+	lw		KEY6, 40(STATEP)
+	lw		KEY7, 44(STATEP)
+	lw		COUNTER, 48(STATEP)
+	lw		NONCE0, 52(STATEP)
+	lw		NONCE1, 56(STATEP)
+	lw		NONCE2, 60(STATEP)
+
+.Lblock_loop:
+	// Set vl to the number of blocks to process in this iteration.
+	vsetvli		VL, NBLOCKS, e32, m1, ta, ma
+
+	// Set up the initial state matrix for the next VL blocks in v0-v15.
+	// v{i} holds the i'th 32-bit word of the state matrix for all blocks.
+	// Note that only the counter word, at index 12, differs across blocks.
+	vmv.v.x		v0, CONSTS0
+	vmv.v.x		v1, CONSTS1
+	vmv.v.x		v2, CONSTS2
+	vmv.v.x		v3, CONSTS3
+	vmv.v.x		v4, KEY0
+	vmv.v.x		v5, KEY1
+	vmv.v.x		v6, KEY2
+	vmv.v.x		v7, KEY3
+	vmv.v.x		v8, KEY4
+	vmv.v.x		v9, KEY5
+	vmv.v.x		v10, KEY6
+	vmv.v.x		v11, KEY7
+	vid.v		v12
+	vadd.vx		v12, v12, COUNTER
+	vmv.v.x		v13, NONCE0
+	vmv.v.x		v14, NONCE1
+	vmv.v.x		v15, NONCE2
+
+	// Load the first half of the input data for each block into v16-v23.
+	// v{16+i} holds the i'th 32-bit word for all blocks.
+	vlsseg8e32.v	v16, (INP), STRIDE
+
+	mv		ROUND_CTR, NROUNDS
+.Lnext_doubleround:
+	addi		ROUND_CTR, ROUND_CTR, -2
+	// column round
+	chacha_round	v0, v4, v8, v12, v1, v5, v9, v13, \
+			v2, v6, v10, v14, v3, v7, v11, v15
+	// diagonal round
+	chacha_round	v0, v5, v10, v15, v1, v6, v11, v12, \
+			v2, v7, v8, v13, v3, v4, v9, v14
+	bnez		ROUND_CTR, .Lnext_doubleround
+
+	// Load the second half of the input data for each block into v24-v31.
+	// v{24+i} holds the {8+i}'th 32-bit word for all blocks.
+	addi		TMP, INP, 32
+	vlsseg8e32.v	v24, (TMP), STRIDE
+
+	// Finalize the first half of the keystream for each block.
+	vadd.vx		v0, v0, CONSTS0
+	vadd.vx		v1, v1, CONSTS1
+	vadd.vx		v2, v2, CONSTS2
+	vadd.vx		v3, v3, CONSTS3
+	vadd.vx		v4, v4, KEY0
+	vadd.vx		v5, v5, KEY1
+	vadd.vx		v6, v6, KEY2
+	vadd.vx		v7, v7, KEY3
+
+	// Encrypt/decrypt the first half of the data for each block.
+	vxor.vv		v16, v16, v0
+	vxor.vv		v17, v17, v1
+	vxor.vv		v18, v18, v2
+	vxor.vv		v19, v19, v3
+	vxor.vv		v20, v20, v4
+	vxor.vv		v21, v21, v5
+	vxor.vv		v22, v22, v6
+	vxor.vv		v23, v23, v7
+
+	// Store the first half of the output data for each block.
+	vssseg8e32.v	v16, (OUTP), STRIDE
+
+	// Finalize the second half of the keystream for each block.
+	vadd.vx		v8, v8, KEY4
+	vadd.vx		v9, v9, KEY5
+	vadd.vx		v10, v10, KEY6
+	vadd.vx		v11, v11, KEY7
+	vid.v		v0
+	vadd.vx		v12, v12, COUNTER
+	vadd.vx		v13, v13, NONCE0
+	vadd.vx		v14, v14, NONCE1
+	vadd.vx		v15, v15, NONCE2
+	vadd.vv		v12, v12, v0
+
+	// Encrypt/decrypt the second half of the data for each block.
+	vxor.vv		v24, v24, v8
+	vxor.vv		v25, v25, v9
+	vxor.vv		v26, v26, v10
+	vxor.vv		v27, v27, v11
+	vxor.vv		v29, v29, v13
+	vxor.vv		v28, v28, v12
+	vxor.vv		v30, v30, v14
+	vxor.vv		v31, v31, v15
+
+	// Store the second half of the output data for each block.
+	addi		TMP, OUTP, 32
+	vssseg8e32.v	v24, (TMP), STRIDE
+
+	// Update the counter, the remaining number of blocks, and the input and
+	// output pointers according to the number of blocks processed (VL).
+	add		COUNTER, COUNTER, VL
+	sub		NBLOCKS, NBLOCKS, VL
+	slli		TMP, VL, 6
+	add		OUTP, OUTP, TMP
+	add		INP, INP, TMP
+	bnez		NBLOCKS, .Lblock_loop
+
+	sw		COUNTER, 48(STATEP)
+	ld		s0, 0(sp)
+	ld		s1, 8(sp)
+	ld		s2, 16(sp)
+	ld		s3, 24(sp)
+	ld		s4, 32(sp)
+	ld		s5, 40(sp)
+	ld		s6, 48(sp)
+	ld		s7, 56(sp)
+	ld		s8, 64(sp)
+	ld		s9, 72(sp)
+	ld		s10, 80(sp)
+	ld		s11, 88(sp)
+	addi		sp, sp, 96
+	ret
+SYM_FUNC_END(chacha_zvkb)
diff --git a/lib/crypto/riscv/chacha.h b/lib/crypto/riscv/chacha.h
new file mode 100644
index 000000000000..5c000c6aef4b
--- /dev/null
+++ b/lib/crypto/riscv/chacha.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * ChaCha stream cipher (RISC-V optimized)
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+#include <crypto/internal/simd.h>
+#include <linux/linkage.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_zvkb);
+
+asmlinkage void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out,
+			    size_t nblocks, int nrounds);
+
+#define hchacha_block_arch hchacha_block_generic /* not implemented yet */
+
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+			      const u8 *src, unsigned int bytes, int nrounds)
+{
+	u8 block_buffer[CHACHA_BLOCK_SIZE];
+	unsigned int full_blocks = bytes / CHACHA_BLOCK_SIZE;
+	unsigned int tail_bytes = bytes % CHACHA_BLOCK_SIZE;
+
+	if (!static_branch_likely(&use_zvkb) || !crypto_simd_usable())
+		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+	kernel_vector_begin();
+	if (full_blocks) {
+		chacha_zvkb(state, src, dst, full_blocks, nrounds);
+		src += full_blocks * CHACHA_BLOCK_SIZE;
+		dst += full_blocks * CHACHA_BLOCK_SIZE;
+	}
+	if (tail_bytes) {
+		memcpy(block_buffer, src, tail_bytes);
+		chacha_zvkb(state, block_buffer, block_buffer, 1, nrounds);
+		memcpy(dst, block_buffer, tail_bytes);
+	}
+	kernel_vector_end();
+}
+
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
+{
+	if (riscv_isa_extension_available(NULL, ZVKB) &&
+	    riscv_vector_vlen() >= 128)
+		static_branch_enable(&use_zvkb);
+}
diff --git a/lib/crypto/riscv/poly1305-riscv.pl b/lib/crypto/riscv/poly1305-riscv.pl
new file mode 100644
index 000000000000..e25e6338a9ac
--- /dev/null
+++ b/lib/crypto/riscv/poly1305-riscv.pl
@@ -0,0 +1,847 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for use with OpenSSL.
+# ====================================================================
+#
+# Poly1305 hash for RISC-V.
+#
+# February 2019
+#
+# In the essence it's pretty straightforward transliteration of MIPS
+# module [without big-endian option].
+#
+# 1.8 cycles per byte on U74, >100% faster than compiler-generated
+# code. 1.9 cpb on C910, ~75% improvement. 3.3 on Spacemit X60, ~69%
+# improvement.
+#
+# June 2024.
+#
+# Add CHERI support.
+#
+######################################################################
+#
+($zero,$ra,$sp,$gp,$tp)=map("x$_",(0..4));
+($t0,$t1,$t2,$t3,$t4,$t5,$t6)=map("x$_",(5..7,28..31));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(10..17));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("x$_",(8,9,18..27));
+#
+######################################################################
+
+$flavour = shift || "64";
+
+for (@ARGV) {   $output=$_ if (/\w[\w\-]*\.\w+$/);   }
+open STDOUT,">$output";
+
+$code.=<<___;
+#ifdef __KERNEL__
+# ifdef __riscv_zicfilp
+#  undef __riscv_zicfilp // calls are expected to be direct
+# endif
+#endif
+
+#if defined(__CHERI_PURE_CAPABILITY__) && !defined(__riscv_misaligned_fast)
+# define __riscv_misaligned_fast 1
+#endif
+___
+
+if ($flavour =~ /64/) {{{
+######################################################################
+# 64-bit code path...
+#
+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$t0,$t1,$t2);
+
+$code.=<<___;
+#if __riscv_xlen == 64
+# if __SIZEOF_POINTER__ == 16
+#  define PUSH	csc
+#  define POP	clc
+# else
+#  define PUSH	sd
+#  define POP	ld
+# endif
+#else
+# error "unsupported __riscv_xlen"
+#endif
+
+.option	pic
+.text
+
+.globl	poly1305_init
+.type	poly1305_init,\@function
+poly1305_init:
+#ifdef	__riscv_zicfilp
+	lpad	0
+#endif
+	sd	$zero,0($ctx)
+	sd	$zero,8($ctx)
+	sd	$zero,16($ctx)
+
+	beqz	$inp,.Lno_key
+
+#ifndef	__riscv_misaligned_fast
+	andi	$tmp0,$inp,7		# $inp % 8
+	andi	$inp,$inp,-8		# align $inp
+	slli	$tmp0,$tmp0,3		# byte to bit offset
+#endif
+	ld	$in0,0($inp)
+	ld	$in1,8($inp)
+#ifndef	__riscv_misaligned_fast
+	beqz	$tmp0,.Laligned_key
+
+	ld	$tmp2,16($inp)
+	neg	$tmp1,$tmp0		# implicit &63 in sll
+	srl	$in0,$in0,$tmp0
+	sll	$tmp3,$in1,$tmp1
+	srl	$in1,$in1,$tmp0
+	sll	$tmp2,$tmp2,$tmp1
+	or	$in0,$in0,$tmp3
+	or	$in1,$in1,$tmp2
+
+.Laligned_key:
+#endif
+	li	$tmp0,1
+	slli	$tmp0,$tmp0,32		# 0x0000000100000000
+	addi	$tmp0,$tmp0,-63		# 0x00000000ffffffc1
+	slli	$tmp0,$tmp0,28		# 0x0ffffffc10000000
+	addi	$tmp0,$tmp0,-1		# 0x0ffffffc0fffffff
+
+	and	$in0,$in0,$tmp0
+	addi	$tmp0,$tmp0,-3		# 0x0ffffffc0ffffffc
+	and	$in1,$in1,$tmp0
+
+	sd	$in0,24($ctx)
+	srli	$tmp0,$in1,2
+	sd	$in1,32($ctx)
+	add	$tmp0,$tmp0,$in1	# s1 = r1 + (r1 >> 2)
+	sd	$tmp0,40($ctx)
+
+.Lno_key:
+	li	$a0,0			# return 0
+	ret
+.size	poly1305_init,.-poly1305_init
+___
+{
+my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
+   ($s0,$s1,$s2,$s3,$t3,$t4,$in0,$in1,$t2);
+my ($shr,$shl) = ($t5,$t6);		# used on R6
+
+$code.=<<___;
+.globl	poly1305_blocks
+.type	poly1305_blocks,\@function
+poly1305_blocks:
+#ifdef	__riscv_zicfilp
+	lpad	0
+#endif
+	andi	$len,$len,-16		# complete blocks only
+	beqz	$len,.Lno_data
+
+	caddi	$sp,$sp,-4*__SIZEOF_POINTER__
+	PUSH	$s0,3*__SIZEOF_POINTER__($sp)
+	PUSH	$s1,2*__SIZEOF_POINTER__($sp)
+	PUSH	$s2,1*__SIZEOF_POINTER__($sp)
+	PUSH	$s3,0*__SIZEOF_POINTER__($sp)
+
+#ifndef	__riscv_misaligned_fast
+	andi	$shr,$inp,7
+	andi	$inp,$inp,-8		# align $inp
+	slli	$shr,$shr,3		# byte to bit offset
+	neg	$shl,$shr		# implicit &63 in sll
+#endif
+
+	ld	$h0,0($ctx)		# load hash value
+	ld	$h1,8($ctx)
+	ld	$h2,16($ctx)
+
+	ld	$r0,24($ctx)		# load key
+	ld	$r1,32($ctx)
+	ld	$rs1,40($ctx)
+
+	add	$len,$len,$inp		# end of buffer
+
+.Loop:
+	ld	$in0,0($inp)		# load input
+	ld	$in1,8($inp)
+#ifndef	__riscv_misaligned_fast
+	beqz	$shr,.Laligned_inp
+
+	ld	$tmp2,16($inp)
+	srl	$in0,$in0,$shr
+	sll	$tmp3,$in1,$shl
+	srl	$in1,$in1,$shr
+	sll	$tmp2,$tmp2,$shl
+	or	$in0,$in0,$tmp3
+	or	$in1,$in1,$tmp2
+
+.Laligned_inp:
+#endif
+	caddi	$inp,$inp,16
+
+	andi	$tmp0,$h2,-4		# modulo-scheduled reduction
+	srli	$tmp1,$h2,2
+	andi	$h2,$h2,3
+
+	add	$d0,$h0,$in0		# accumulate input
+	 add	$tmp1,$tmp1,$tmp0
+	sltu	$tmp0,$d0,$h0
+	add	$d0,$d0,$tmp1		# ... and residue
+	sltu	$tmp1,$d0,$tmp1
+	add	$d1,$h1,$in1
+	add	$tmp0,$tmp0,$tmp1
+	sltu	$tmp1,$d1,$h1
+	add	$d1,$d1,$tmp0
+
+	 add	$d2,$h2,$padbit
+	 sltu	$tmp0,$d1,$tmp0
+	mulhu	$h1,$r0,$d0		# h0*r0
+	mul	$h0,$r0,$d0
+
+	 add	$d2,$d2,$tmp1
+	 add	$d2,$d2,$tmp0
+	mulhu	$tmp1,$rs1,$d1		# h1*5*r1
+	mul	$tmp0,$rs1,$d1
+
+	mulhu	$h2,$r1,$d0		# h0*r1
+	mul	$tmp2,$r1,$d0
+	 add	$h0,$h0,$tmp0
+	 add	$h1,$h1,$tmp1
+	 sltu	$tmp0,$h0,$tmp0
+
+	 add	$h1,$h1,$tmp0
+	 add	$h1,$h1,$tmp2
+	mulhu	$tmp1,$r0,$d1		# h1*r0
+	mul	$tmp0,$r0,$d1
+
+	 sltu	$tmp2,$h1,$tmp2
+	 add	$h2,$h2,$tmp2
+	mul	$tmp2,$rs1,$d2		# h2*5*r1
+
+	 add	$h1,$h1,$tmp0
+	 add	$h2,$h2,$tmp1
+	mul	$tmp3,$r0,$d2		# h2*r0
+	 sltu	$tmp0,$h1,$tmp0
+	 add	$h2,$h2,$tmp0
+
+	add	$h1,$h1,$tmp2
+	sltu	$tmp2,$h1,$tmp2
+	add	$h2,$h2,$tmp2
+	add	$h2,$h2,$tmp3
+
+	bne	$inp,$len,.Loop
+
+	sd	$h0,0($ctx)		# store hash value
+	sd	$h1,8($ctx)
+	sd	$h2,16($ctx)
+
+	POP	$s0,3*__SIZEOF_POINTER__($sp)		# epilogue
+	POP	$s1,2*__SIZEOF_POINTER__($sp)
+	POP	$s2,1*__SIZEOF_POINTER__($sp)
+	POP	$s3,0*__SIZEOF_POINTER__($sp)
+	caddi	$sp,$sp,4*__SIZEOF_POINTER__
+
+.Lno_data:
+	ret
+.size	poly1305_blocks,.-poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
+
+$code.=<<___;
+.globl	poly1305_emit
+.type	poly1305_emit,\@function
+poly1305_emit:
+#ifdef	__riscv_zicfilp
+	lpad	0
+#endif
+	ld	$tmp2,16($ctx)
+	ld	$tmp0,0($ctx)
+	ld	$tmp1,8($ctx)
+
+	andi	$in0,$tmp2,-4		# final reduction
+	srl	$in1,$tmp2,2
+	andi	$tmp2,$tmp2,3
+	add	$in0,$in0,$in1
+
+	add	$tmp0,$tmp0,$in0
+	sltu	$in1,$tmp0,$in0
+	 addi	$in0,$tmp0,5		# compare to modulus
+	add	$tmp1,$tmp1,$in1
+	 sltiu	$tmp3,$in0,5
+	sltu	$tmp4,$tmp1,$in1
+	 add	$in1,$tmp1,$tmp3
+	add	$tmp2,$tmp2,$tmp4
+	 sltu	$tmp3,$in1,$tmp3
+	 add	$tmp2,$tmp2,$tmp3
+
+	srli	$tmp2,$tmp2,2		# see if it carried/borrowed
+	neg	$tmp2,$tmp2
+
+	xor	$in0,$in0,$tmp0
+	xor	$in1,$in1,$tmp1
+	and	$in0,$in0,$tmp2
+	and	$in1,$in1,$tmp2
+	xor	$in0,$in0,$tmp0
+	xor	$in1,$in1,$tmp1
+
+	lwu	$tmp0,0($nonce)		# load nonce
+	lwu	$tmp1,4($nonce)
+	lwu	$tmp2,8($nonce)
+	lwu	$tmp3,12($nonce)
+	slli	$tmp1,$tmp1,32
+	slli	$tmp3,$tmp3,32
+	or	$tmp0,$tmp0,$tmp1
+	or	$tmp2,$tmp2,$tmp3
+
+	add	$in0,$in0,$tmp0		# accumulate nonce
+	add	$in1,$in1,$tmp2
+	sltu	$tmp0,$in0,$tmp0
+	add	$in1,$in1,$tmp0
+
+#ifdef	__riscv_misaligned_fast
+	sd	$in0,0($mac)		# write mac value
+	sd	$in1,8($mac)
+#else
+	srli	$tmp0,$in0,8		# write mac value
+	srli	$tmp1,$in0,16
+	srli	$tmp2,$in0,24
+	sb	$in0,0($mac)
+	srli	$tmp3,$in0,32
+	sb	$tmp0,1($mac)
+	srli	$tmp0,$in0,40
+	sb	$tmp1,2($mac)
+	srli	$tmp1,$in0,48
+	sb	$tmp2,3($mac)
+	srli	$tmp2,$in0,56
+	sb	$tmp3,4($mac)
+	srli	$tmp3,$in1,8
+	sb	$tmp0,5($mac)
+	srli	$tmp0,$in1,16
+	sb	$tmp1,6($mac)
+	srli	$tmp1,$in1,24
+	sb	$tmp2,7($mac)
+
+	sb	$in1,8($mac)
+	srli	$tmp2,$in1,32
+	sb	$tmp3,9($mac)
+	srli	$tmp3,$in1,40
+	sb	$tmp0,10($mac)
+	srli	$tmp0,$in1,48
+	sb	$tmp1,11($mac)
+	srli	$tmp1,$in1,56
+	sb	$tmp2,12($mac)
+	sb	$tmp3,13($mac)
+	sb	$tmp0,14($mac)
+	sb	$tmp1,15($mac)
+#endif
+
+	ret
+.size	poly1305_emit,.-poly1305_emit
+.string	"Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
+___
+}
+}}} else {{{
+######################################################################
+# 32-bit code path
+#
+
+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
+   ($a4,$a5,$a6,$a7,$t0,$t1,$t2,$t3);
+
+$code.=<<___;
+#if __riscv_xlen == 32
+# if __SIZEOF_POINTER__ == 8
+#  define PUSH	csc
+#  define POP	clc
+# else
+#  define PUSH	sw
+#  define POP	lw
+# endif
+# define MULX(hi,lo,a,b)	mulhu hi,a,b; mul lo,a,b
+# define srliw	srli
+# define srlw	srl
+# define sllw	sll
+# define addw	add
+# define addiw	addi
+# define mulw	mul
+#elif __riscv_xlen == 64
+# if __SIZEOF_POINTER__ == 16
+#  define PUSH	csc
+#  define POP	clc
+# else
+#  define PUSH	sd
+#  define POP	ld
+# endif
+# define MULX(hi,lo,a,b)	slli b,b,32; srli b,b,32; mul hi,a,b; addiw lo,hi,0; srai hi,hi,32
+#else
+# error "unsupported __riscv_xlen"
+#endif
+
+.option	pic
+.text
+
+.globl	poly1305_init
+.type	poly1305_init,\@function
+poly1305_init:
+#ifdef	__riscv_zicfilp
+	lpad	0
+#endif
+	sw	$zero,0($ctx)
+	sw	$zero,4($ctx)
+	sw	$zero,8($ctx)
+	sw	$zero,12($ctx)
+	sw	$zero,16($ctx)
+
+	beqz	$inp,.Lno_key
+
+#ifndef	__riscv_misaligned_fast
+	andi	$tmp0,$inp,3		# $inp % 4
+	sub	$inp,$inp,$tmp0		# align $inp
+	sll	$tmp0,$tmp0,3		# byte to bit offset
+#endif
+	lw	$in0,0($inp)
+	lw	$in1,4($inp)
+	lw	$in2,8($inp)
+	lw	$in3,12($inp)
+#ifndef	__riscv_misaligned_fast
+	beqz	$tmp0,.Laligned_key
+
+	lw	$tmp2,16($inp)
+	sub	$tmp1,$zero,$tmp0
+	srlw	$in0,$in0,$tmp0
+	sllw	$tmp3,$in1,$tmp1
+	srlw	$in1,$in1,$tmp0
+	or	$in0,$in0,$tmp3
+	sllw	$tmp3,$in2,$tmp1
+	srlw	$in2,$in2,$tmp0
+	or	$in1,$in1,$tmp3
+	sllw	$tmp3,$in3,$tmp1
+	srlw	$in3,$in3,$tmp0
+	or	$in2,$in2,$tmp3
+	sllw	$tmp2,$tmp2,$tmp1
+	or	$in3,$in3,$tmp2
+.Laligned_key:
+#endif
+
+	lui	$tmp0,0x10000
+	addi	$tmp0,$tmp0,-1		# 0x0fffffff
+	and	$in0,$in0,$tmp0
+	addi	$tmp0,$tmp0,-3		# 0x0ffffffc
+	and	$in1,$in1,$tmp0
+	and	$in2,$in2,$tmp0
+	and	$in3,$in3,$tmp0
+
+	sw	$in0,20($ctx)
+	sw	$in1,24($ctx)
+	sw	$in2,28($ctx)
+	sw	$in3,32($ctx)
+
+	srlw	$tmp1,$in1,2
+	srlw	$tmp2,$in2,2
+	srlw	$tmp3,$in3,2
+	addw	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
+	addw	$in2,$in2,$tmp2
+	addw	$in3,$in3,$tmp3
+	sw	$in1,36($ctx)
+	sw	$in2,40($ctx)
+	sw	$in3,44($ctx)
+.Lno_key:
+	li	$a0,0
+	ret
+.size	poly1305_init,.-poly1305_init
+___
+{
+my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
+   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $t0,$t1,$t2);
+my ($d0,$d1,$d2,$d3) =
+   ($a4,$a5,$a6,$a7);
+my $shr = $ra;		# used on R6
+
+$code.=<<___;
+.globl	poly1305_blocks
+.type	poly1305_blocks,\@function
+poly1305_blocks:
+#ifdef	__riscv_zicfilp
+	lpad	0
+#endif
+	andi	$len,$len,-16		# complete blocks only
+	beqz	$len,.Labort
+
+#ifdef	__riscv_zcmp
+	cm.push	{ra,s0-s8}, -48
+#else
+	caddi	$sp,$sp,-__SIZEOF_POINTER__*12
+	PUSH	$ra, __SIZEOF_POINTER__*11($sp)
+	PUSH	$s0, __SIZEOF_POINTER__*10($sp)
+	PUSH	$s1, __SIZEOF_POINTER__*9($sp)
+	PUSH	$s2, __SIZEOF_POINTER__*8($sp)
+	PUSH	$s3, __SIZEOF_POINTER__*7($sp)
+	PUSH	$s4, __SIZEOF_POINTER__*6($sp)
+	PUSH	$s5, __SIZEOF_POINTER__*5($sp)
+	PUSH	$s6, __SIZEOF_POINTER__*4($sp)
+	PUSH	$s7, __SIZEOF_POINTER__*3($sp)
+	PUSH	$s8, __SIZEOF_POINTER__*2($sp)
+#endif
+
+#ifndef	__riscv_misaligned_fast
+	andi	$shr,$inp,3
+	andi	$inp,$inp,-4		# align $inp
+	slli	$shr,$shr,3		# byte to bit offset
+#endif
+
+	lw	$h0,0($ctx)		# load hash value
+	lw	$h1,4($ctx)
+	lw	$h2,8($ctx)
+	lw	$h3,12($ctx)
+	lw	$h4,16($ctx)
+
+	lw	$r0,20($ctx)		# load key
+	lw	$r1,24($ctx)
+	lw	$r2,28($ctx)
+	lw	$r3,32($ctx)
+	lw	$rs1,36($ctx)
+	lw	$rs2,40($ctx)
+	lw	$rs3,44($ctx)
+
+	add	$len,$len,$inp		# end of buffer
+
+.Loop:
+	lw	$d0,0($inp)		# load input
+	lw	$d1,4($inp)
+	lw	$d2,8($inp)
+	lw	$d3,12($inp)
+#ifndef	__riscv_misaligned_fast
+	beqz	$shr,.Laligned_inp
+
+	lw	$t4,16($inp)
+	sub	$t5,$zero,$shr
+	srlw	$d0,$d0,$shr
+	sllw	$t3,$d1,$t5
+	srlw	$d1,$d1,$shr
+	or	$d0,$d0,$t3
+	sllw	$t3,$d2,$t5
+	srlw	$d2,$d2,$shr
+	or	$d1,$d1,$t3
+	sllw	$t3,$d3,$t5
+	srlw	$d3,$d3,$shr
+	or	$d2,$d2,$t3
+	sllw	$t4,$t4,$t5
+	or	$d3,$d3,$t4
+
+.Laligned_inp:
+#endif
+	srliw	$t3,$h4,2		# modulo-scheduled reduction
+	andi	$t4,$h4,-4
+	andi	$h4,$h4,3
+
+	addw	$d0,$d0,$h0		# accumulate input
+	 addw	$t4,$t4,$t3
+	sltu	$h0,$d0,$h0
+	addw	$d0,$d0,$t4		# ... and residue
+	sltu	$t4,$d0,$t4
+
+	addw	$d1,$d1,$h1
+	 addw	$h0,$h0,$t4		# carry
+	sltu	$h1,$d1,$h1
+	addw	$d1,$d1,$h0
+	sltu	$h0,$d1,$h0
+
+	addw	$d2,$d2,$h2
+	 addw	$h1,$h1,$h0		# carry
+	sltu	$h2,$d2,$h2
+	addw	$d2,$d2,$h1
+	sltu	$h1,$d2,$h1
+
+	addw	$d3,$d3,$h3
+	 addw	$h2,$h2,$h1		# carry
+	sltu	$h3,$d3,$h3
+	addw	$d3,$d3,$h2
+
+	MULX	($h1,$h0,$r0,$d0)	# d0*r0
+
+	 sltu	$h2,$d3,$h2
+	 addw	$h3,$h3,$h2		# carry
+
+	MULX	($t4,$t3,$rs3,$d1)	# d1*s3
+
+	 addw	$h4,$h4,$padbit
+	 caddi	$inp,$inp,16
+	 addw	$h4,$h4,$h3
+
+	MULX	($t6,$a3,$rs2,$d2)	# d2*s2
+	 addw	$h0,$h0,$t3
+	 addw	$h1,$h1,$t4
+	 sltu	$t3,$h0,$t3
+	 addw	$h1,$h1,$t3
+
+	MULX	($t4,$t3,$rs1,$d3)	# d3*s1
+	 addw	$h0,$h0,$a3
+	 addw	$h1,$h1,$t6
+	 sltu	$a3,$h0,$a3
+	 addw	$h1,$h1,$a3
+
+
+	MULX	($h2,$a3,$r1,$d0)	# d0*r1
+	 addw	$h0,$h0,$t3
+	 addw	$h1,$h1,$t4
+	 sltu	$t3,$h0,$t3
+	 addw	$h1,$h1,$t3
+
+	MULX	($t4,$t3,$r0,$d1)	# d1*r0
+	 addw	$h1,$h1,$a3
+	 sltu	$a3,$h1,$a3
+	 addw	$h2,$h2,$a3
+
+	MULX	($t6,$a3,$rs3,$d2)	# d2*s3
+	 addw	$h1,$h1,$t3
+	 addw	$h2,$h2,$t4
+	 sltu	$t3,$h1,$t3
+	 addw	$h2,$h2,$t3
+
+	MULX	($t4,$t3,$rs2,$d3)	# d3*s2
+	 addw	$h1,$h1,$a3
+	 addw	$h2,$h2,$t6
+	 sltu	$a3,$h1,$a3
+	 addw	$h2,$h2,$a3
+
+	mulw	$a3,$rs1,$h4		# h4*s1
+	 addw	$h1,$h1,$t3
+	 addw	$h2,$h2,$t4
+	 sltu	$t3,$h1,$t3
+	 addw	$h2,$h2,$t3
+
+
+	MULX	($h3,$t3,$r2,$d0)	# d0*r2
+	 addw	$h1,$h1,$a3
+	 sltu	$a3,$h1,$a3
+	 addw	$h2,$h2,$a3
+
+	MULX	($t6,$a3,$r1,$d1)	# d1*r1
+	 addw	$h2,$h2,$t3
+	 sltu	$t3,$h2,$t3
+	 addw	$h3,$h3,$t3
+
+	MULX	($t4,$t3,$r0,$d2)	# d2*r0
+	 addw	$h2,$h2,$a3
+	 addw	$h3,$h3,$t6
+	 sltu	$a3,$h2,$a3
+	 addw	$h3,$h3,$a3
+
+	MULX	($t6,$a3,$rs3,$d3)	# d3*s3
+	 addw	$h2,$h2,$t3
+	 addw	$h3,$h3,$t4
+	 sltu	$t3,$h2,$t3
+	 addw	$h3,$h3,$t3
+
+	mulw	$t3,$rs2,$h4		# h4*s2
+	 addw	$h2,$h2,$a3
+	 addw	$h3,$h3,$t6
+	 sltu	$a3,$h2,$a3
+	 addw	$h3,$h3,$a3
+
+
+	MULX	($t6,$a3,$r3,$d0)	# d0*r3
+	 addw	$h2,$h2,$t3
+	 sltu	$t3,$h2,$t3
+	 addw	$h3,$h3,$t3
+
+	MULX	($t4,$t3,$r2,$d1)	# d1*r2
+	 addw	$h3,$h3,$a3
+	 sltu	$a3,$h3,$a3
+	 addw	$t6,$t6,$a3
+
+	MULX	($a3,$d3,$r0,$d3)	# d3*r0
+	 addw	$h3,$h3,$t3
+	 addw	$t6,$t6,$t4
+	 sltu	$t3,$h3,$t3
+	 addw	$t6,$t6,$t3
+
+	MULX	($t4,$t3,$r1,$d2)	# d2*r1
+	 addw	$h3,$h3,$d3
+	 addw	$t6,$t6,$a3
+	 sltu	$d3,$h3,$d3
+	 addw	$t6,$t6,$d3
+
+	mulw	$a3,$rs3,$h4		# h4*s3
+	 addw	$h3,$h3,$t3
+	 addw	$t6,$t6,$t4
+	 sltu	$t3,$h3,$t3
+	 addw	$t6,$t6,$t3
+
+
+	mulw	$h4,$r0,$h4		# h4*r0
+	 addw	$h3,$h3,$a3
+	 sltu	$a3,$h3,$a3
+	 addw	$t6,$t6,$a3
+	addw	$h4,$t6,$h4
+
+	li	$padbit,1		# if we loop, padbit is 1
+
+	bne	$inp,$len,.Loop
+
+	sw	$h0,0($ctx)		# store hash value
+	sw	$h1,4($ctx)
+	sw	$h2,8($ctx)
+	sw	$h3,12($ctx)
+	sw	$h4,16($ctx)
+
+#ifdef	__riscv_zcmp
+	cm.popret	{ra,s0-s8}, 48
+#else
+	POP	$ra, __SIZEOF_POINTER__*11($sp)
+	POP	$s0, __SIZEOF_POINTER__*10($sp)
+	POP	$s1, __SIZEOF_POINTER__*9($sp)
+	POP	$s2, __SIZEOF_POINTER__*8($sp)
+	POP	$s3, __SIZEOF_POINTER__*7($sp)
+	POP	$s4, __SIZEOF_POINTER__*6($sp)
+	POP	$s5, __SIZEOF_POINTER__*5($sp)
+	POP	$s6, __SIZEOF_POINTER__*4($sp)
+	POP	$s7, __SIZEOF_POINTER__*3($sp)
+	POP	$s8, __SIZEOF_POINTER__*2($sp)
+	caddi	$sp,$sp,__SIZEOF_POINTER__*12
+#endif
+.Labort:
+	ret
+.size	poly1305_blocks,.-poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
+
+$code.=<<___;
+.globl	poly1305_emit
+.type	poly1305_emit,\@function
+poly1305_emit:
+#ifdef	__riscv_zicfilp
+	lpad	0
+#endif
+	lw	$tmp4,16($ctx)
+	lw	$tmp0,0($ctx)
+	lw	$tmp1,4($ctx)
+	lw	$tmp2,8($ctx)
+	lw	$tmp3,12($ctx)
+
+	srliw	$ctx,$tmp4,2		# final reduction
+	andi	$in0,$tmp4,-4
+	andi	$tmp4,$tmp4,3
+	addw	$ctx,$ctx,$in0
+
+	addw	$tmp0,$tmp0,$ctx
+	sltu	$ctx,$tmp0,$ctx
+	 addiw	$in0,$tmp0,5		# compare to modulus
+	addw	$tmp1,$tmp1,$ctx
+	 sltiu	$in1,$in0,5
+	sltu	$ctx,$tmp1,$ctx
+	 addw	$in1,$in1,$tmp1
+	addw	$tmp2,$tmp2,$ctx
+	 sltu	$in2,$in1,$tmp1
+	sltu	$ctx,$tmp2,$ctx
+	 addw	$in2,$in2,$tmp2
+	addw	$tmp3,$tmp3,$ctx
+	 sltu	$in3,$in2,$tmp2
+	sltu	$ctx,$tmp3,$ctx
+	 addw	$in3,$in3,$tmp3
+	addw	$tmp4,$tmp4,$ctx
+	 sltu	$ctx,$in3,$tmp3
+	 addw	$ctx,$ctx,$tmp4
+
+	srl	$ctx,$ctx,2		# see if it carried/borrowed
+	sub	$ctx,$zero,$ctx
+
+	xor	$in0,$in0,$tmp0
+	xor	$in1,$in1,$tmp1
+	xor	$in2,$in2,$tmp2
+	xor	$in3,$in3,$tmp3
+	and	$in0,$in0,$ctx
+	and	$in1,$in1,$ctx
+	and	$in2,$in2,$ctx
+	and	$in3,$in3,$ctx
+	xor	$in0,$in0,$tmp0
+	xor	$in1,$in1,$tmp1
+	xor	$in2,$in2,$tmp2
+	xor	$in3,$in3,$tmp3
+
+	lw	$tmp0,0($nonce)		# load nonce
+	lw	$tmp1,4($nonce)
+	lw	$tmp2,8($nonce)
+	lw	$tmp3,12($nonce)
+
+	addw	$in0,$in0,$tmp0		# accumulate nonce
+	sltu	$ctx,$in0,$tmp0
+
+	addw	$in1,$in1,$tmp1
+	sltu	$tmp1,$in1,$tmp1
+	addw	$in1,$in1,$ctx
+	sltu	$ctx,$in1,$ctx
+	addw	$ctx,$ctx,$tmp1
+
+	addw	$in2,$in2,$tmp2
+	sltu	$tmp2,$in2,$tmp2
+	addw	$in2,$in2,$ctx
+	sltu	$ctx,$in2,$ctx
+	addw	$ctx,$ctx,$tmp2
+
+	addw	$in3,$in3,$tmp3
+	addw	$in3,$in3,$ctx
+
+#ifdef	__riscv_misaligned_fast
+	sw	$in0,0($mac)		# write mac value
+	sw	$in1,4($mac)
+	sw	$in2,8($mac)
+	sw	$in3,12($mac)
+#else
+	srl	$tmp0,$in0,8		# write mac value
+	srl	$tmp1,$in0,16
+	srl	$tmp2,$in0,24
+	sb	$in0, 0($mac)
+	sb	$tmp0,1($mac)
+	srl	$tmp0,$in1,8
+	sb	$tmp1,2($mac)
+	srl	$tmp1,$in1,16
+	sb	$tmp2,3($mac)
+	srl	$tmp2,$in1,24
+	sb	$in1, 4($mac)
+	sb	$tmp0,5($mac)
+	srl	$tmp0,$in2,8
+	sb	$tmp1,6($mac)
+	srl	$tmp1,$in2,16
+	sb	$tmp2,7($mac)
+	srl	$tmp2,$in2,24
+	sb	$in2, 8($mac)
+	sb	$tmp0,9($mac)
+	srl	$tmp0,$in3,8
+	sb	$tmp1,10($mac)
+	srl	$tmp1,$in3,16
+	sb	$tmp2,11($mac)
+	srl	$tmp2,$in3,24
+	sb	$in3, 12($mac)
+	sb	$tmp0,13($mac)
+	sb	$tmp1,14($mac)
+	sb	$tmp2,15($mac)
+#endif
+
+	ret
+.size	poly1305_emit,.-poly1305_emit
+.string	"Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
+___
+}
+}}}
+
+foreach (split("\n", $code)) {
+    if ($flavour =~ /^cheri/) {
+	s/\(x([0-9]+)\)/(c$1)/ and s/\b([ls][bhwd]u?)\b/c$1/;
+	s/\b(PUSH|POP)(\s+)x([0-9]+)/$1$2c$3/ or
+	s/\b(ret|jal)\b/c$1/;
+	s/\bcaddi?\b/cincoffset/ and s/\bx([0-9]+,)/c$1/g or
+	m/\bcmove\b/ and s/\bx([0-9]+)/c$1/g;
+    } else {
+	s/\bcaddi?\b/add/ or
+	s/\bcmove\b/mv/;
+    }
+    print $_, "\n";
+}
+
+close STDOUT;
diff --git a/lib/crypto/riscv/poly1305.h b/lib/crypto/riscv/poly1305.h
new file mode 100644
index 000000000000..88f3df44e355
--- /dev/null
+++ b/lib/crypto/riscv/poly1305.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for riscv
+ *
+ * Copyright (C) 2025 Institute of Software, CAS.
+ */
+
+asmlinkage void poly1305_block_init(struct poly1305_block_state *state,
+				    const u8 raw_key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks(struct poly1305_block_state *state,
+				const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit(const struct poly1305_state *state,
+			      u8 digest[POLY1305_DIGEST_SIZE],
+			      const u32 nonce[4]);
diff --git a/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S b/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S
new file mode 100644
index 000000000000..1618d1220a6e
--- /dev/null
+++ b/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S
@@ -0,0 +1,225 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
+// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The generated code of this file depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128
+// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknha' or 'Zvknhb')
+// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
+
+#include <linux/linkage.h>
+
+.text
+.option arch, +zvknha, +zvkb
+
+#define STATEP		a0
+#define DATA		a1
+#define NUM_BLOCKS	a2
+
+#define STATEP_C	a3
+
+#define MASK		v0
+#define INDICES		v1
+#define W0		v2
+#define W1		v3
+#define W2		v4
+#define W3		v5
+#define VTMP		v6
+#define FEBA		v7
+#define HGDC		v8
+#define K0		v10
+#define K1		v11
+#define K2		v12
+#define K3		v13
+#define K4		v14
+#define K5		v15
+#define K6		v16
+#define K7		v17
+#define K8		v18
+#define K9		v19
+#define K10		v20
+#define K11		v21
+#define K12		v22
+#define K13		v23
+#define K14		v24
+#define K15		v25
+#define PREV_FEBA	v26
+#define PREV_HGDC	v27
+
+// Do 4 rounds of SHA-256.  w0 contains the current 4 message schedule words.
+//
+// If not all the message schedule words have been computed yet, then this also
+// computes 4 more message schedule words.  w1-w3 contain the next 3 groups of 4
+// message schedule words; this macro computes the group after w3 and writes it
+// to w0.  This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3,
+// w0), so the caller must cycle through the registers accordingly.
+.macro	sha256_4rounds	last, k, w0, w1, w2, w3
+	vadd.vv		VTMP, \k, \w0
+	vsha2cl.vv	HGDC, FEBA, VTMP
+	vsha2ch.vv	FEBA, HGDC, VTMP
+.if !\last
+	vmerge.vvm	VTMP, \w2, \w1, MASK
+	vsha2ms.vv	\w0, VTMP, \w3
+.endif
+.endm
+
+.macro	sha256_16rounds	last, k0, k1, k2, k3
+	sha256_4rounds	\last, \k0, W0, W1, W2, W3
+	sha256_4rounds	\last, \k1, W1, W2, W3, W0
+	sha256_4rounds	\last, \k2, W2, W3, W0, W1
+	sha256_4rounds	\last, \k3, W3, W0, W1, W2
+.endm
+
+// void sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state,
+//					       const u8 *data, size_t nblocks);
+SYM_FUNC_START(sha256_transform_zvknha_or_zvknhb_zvkb)
+
+	// Load the round constants into K0-K15.
+	vsetivli	zero, 4, e32, m1, ta, ma
+	la		t0, K256
+	vle32.v		K0, (t0)
+	addi		t0, t0, 16
+	vle32.v		K1, (t0)
+	addi		t0, t0, 16
+	vle32.v		K2, (t0)
+	addi		t0, t0, 16
+	vle32.v		K3, (t0)
+	addi		t0, t0, 16
+	vle32.v		K4, (t0)
+	addi		t0, t0, 16
+	vle32.v		K5, (t0)
+	addi		t0, t0, 16
+	vle32.v		K6, (t0)
+	addi		t0, t0, 16
+	vle32.v		K7, (t0)
+	addi		t0, t0, 16
+	vle32.v		K8, (t0)
+	addi		t0, t0, 16
+	vle32.v		K9, (t0)
+	addi		t0, t0, 16
+	vle32.v		K10, (t0)
+	addi		t0, t0, 16
+	vle32.v		K11, (t0)
+	addi		t0, t0, 16
+	vle32.v		K12, (t0)
+	addi		t0, t0, 16
+	vle32.v		K13, (t0)
+	addi		t0, t0, 16
+	vle32.v		K14, (t0)
+	addi		t0, t0, 16
+	vle32.v		K15, (t0)
+
+	// Setup mask for the vmerge to replace the first word (idx==0) in
+	// message scheduling.  There are 4 words, so an 8-bit mask suffices.
+	vsetivli	zero, 1, e8, m1, ta, ma
+	vmv.v.i		MASK, 0x01
+
+	// Load the state.  The state is stored as {a,b,c,d,e,f,g,h}, but we
+	// need {f,e,b,a},{h,g,d,c}.  The dst vtype is e32m1 and the index vtype
+	// is e8mf4.  We use index-load with the i8 indices {20, 16, 4, 0},
+	// loaded using the 32-bit little endian value 0x00041014.
+	li		t0, 0x00041014
+	vsetivli	zero, 1, e32, m1, ta, ma
+	vmv.v.x		INDICES, t0
+	addi		STATEP_C, STATEP, 8
+	vsetivli	zero, 4, e32, m1, ta, ma
+	vluxei8.v	FEBA, (STATEP), INDICES
+	vluxei8.v	HGDC, (STATEP_C), INDICES
+
+.Lnext_block:
+	addi		NUM_BLOCKS, NUM_BLOCKS, -1
+
+	// Save the previous state, as it's needed later.
+	vmv.v.v		PREV_FEBA, FEBA
+	vmv.v.v		PREV_HGDC, HGDC
+
+	// Load the next 512-bit message block and endian-swap each 32-bit word.
+	vle32.v		W0, (DATA)
+	vrev8.v		W0, W0
+	addi		DATA, DATA, 16
+	vle32.v		W1, (DATA)
+	vrev8.v		W1, W1
+	addi		DATA, DATA, 16
+	vle32.v		W2, (DATA)
+	vrev8.v		W2, W2
+	addi		DATA, DATA, 16
+	vle32.v		W3, (DATA)
+	vrev8.v		W3, W3
+	addi		DATA, DATA, 16
+
+	// Do the 64 rounds of SHA-256.
+	sha256_16rounds	0, K0, K1, K2, K3
+	sha256_16rounds	0, K4, K5, K6, K7
+	sha256_16rounds	0, K8, K9, K10, K11
+	sha256_16rounds	1, K12, K13, K14, K15
+
+	// Add the previous state.
+	vadd.vv		FEBA, FEBA, PREV_FEBA
+	vadd.vv		HGDC, HGDC, PREV_HGDC
+
+	// Repeat if more blocks remain.
+	bnez		NUM_BLOCKS, .Lnext_block
+
+	// Store the new state and return.
+	vsuxei8.v	FEBA, (STATEP), INDICES
+	vsuxei8.v	HGDC, (STATEP_C), INDICES
+	ret
+SYM_FUNC_END(sha256_transform_zvknha_or_zvknhb_zvkb)
+
+.section ".rodata"
+.p2align 2
+.type K256, @object
+K256:
+	.word		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+	.word		0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+	.word		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+	.word		0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+	.word		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+	.word		0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+	.word		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+	.word		0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+	.word		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+	.word		0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+	.word		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+	.word		0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+	.word		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+	.word		0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+	.word		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+.size K256, . - K256
diff --git a/lib/crypto/riscv/sha256.h b/lib/crypto/riscv/sha256.h
new file mode 100644
index 000000000000..1def18b0a4fb
--- /dev/null
+++ b/lib/crypto/riscv/sha256.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 (RISC-V accelerated)
+ *
+ * Copyright (C) 2022 VRULL GmbH
+ * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions);
+
+asmlinkage void
+sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state,
+				       const u8 *data, size_t nblocks);
+
+static void sha256_blocks(struct sha256_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_extensions) && likely(may_use_simd())) {
+		kernel_vector_begin();
+		sha256_transform_zvknha_or_zvknhb_zvkb(state, data, nblocks);
+		kernel_vector_end();
+	} else {
+		sha256_blocks_generic(state, data, nblocks);
+	}
+}
+
+#define sha256_mod_init_arch sha256_mod_init_arch
+static void sha256_mod_init_arch(void)
+{
+	/* Both zvknha and zvknhb provide the SHA-256 instructions. */
+	if ((riscv_isa_extension_available(NULL, ZVKNHA) ||
+	     riscv_isa_extension_available(NULL, ZVKNHB)) &&
+	    riscv_isa_extension_available(NULL, ZVKB) &&
+	    riscv_vector_vlen() >= 128)
+		static_branch_enable(&have_extensions);
+}
diff --git a/lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S b/lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S
new file mode 100644
index 000000000000..b41eebf60546
--- /dev/null
+++ b/lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S
@@ -0,0 +1,203 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
+// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The generated code of this file depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128
+// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknhb')
+// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
+
+#include <linux/linkage.h>
+
+.text
+.option arch, +zvknhb, +zvkb
+
+#define STATEP		a0
+#define DATA		a1
+#define NUM_BLOCKS	a2
+
+#define STATEP_C	a3
+#define K		a4
+
+#define MASK		v0
+#define INDICES		v1
+#define W0		v10	// LMUL=2
+#define W1		v12	// LMUL=2
+#define W2		v14	// LMUL=2
+#define W3		v16	// LMUL=2
+#define VTMP		v20	// LMUL=2
+#define FEBA		v22	// LMUL=2
+#define HGDC		v24	// LMUL=2
+#define PREV_FEBA	v26	// LMUL=2
+#define PREV_HGDC	v28	// LMUL=2
+
+// Do 4 rounds of SHA-512.  w0 contains the current 4 message schedule words.
+//
+// If not all the message schedule words have been computed yet, then this also
+// computes 4 more message schedule words.  w1-w3 contain the next 3 groups of 4
+// message schedule words; this macro computes the group after w3 and writes it
+// to w0.  This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3,
+// w0), so the caller must cycle through the registers accordingly.
+.macro	sha512_4rounds	last, w0, w1, w2, w3
+	vle64.v		VTMP, (K)
+	addi		K, K, 32
+	vadd.vv		VTMP, VTMP, \w0
+	vsha2cl.vv	HGDC, FEBA, VTMP
+	vsha2ch.vv	FEBA, HGDC, VTMP
+.if !\last
+	vmerge.vvm	VTMP, \w2, \w1, MASK
+	vsha2ms.vv	\w0, VTMP, \w3
+.endif
+.endm
+
+.macro	sha512_16rounds	last
+	sha512_4rounds	\last, W0, W1, W2, W3
+	sha512_4rounds	\last, W1, W2, W3, W0
+	sha512_4rounds	\last, W2, W3, W0, W1
+	sha512_4rounds	\last, W3, W0, W1, W2
+.endm
+
+// void sha512_transform_zvknhb_zvkb(struct sha512_block_state *state,
+//				     const u8 *data, size_t nblocks);
+SYM_FUNC_START(sha512_transform_zvknhb_zvkb)
+
+	// Setup mask for the vmerge to replace the first word (idx==0) in
+	// message scheduling.  There are 4 words, so an 8-bit mask suffices.
+	vsetivli	zero, 1, e8, m1, ta, ma
+	vmv.v.i		MASK, 0x01
+
+	// Load the state.  The state is stored as {a,b,c,d,e,f,g,h}, but we
+	// need {f,e,b,a},{h,g,d,c}.  The dst vtype is e64m2 and the index vtype
+	// is e8mf4.  We use index-load with the i8 indices {40, 32, 8, 0},
+	// loaded using the 32-bit little endian value 0x00082028.
+	li		t0, 0x00082028
+	vsetivli	zero, 1, e32, m1, ta, ma
+	vmv.v.x		INDICES, t0
+	addi		STATEP_C, STATEP, 16
+	vsetivli	zero, 4, e64, m2, ta, ma
+	vluxei8.v	FEBA, (STATEP), INDICES
+	vluxei8.v	HGDC, (STATEP_C), INDICES
+
+.Lnext_block:
+	la		K, K512
+	addi		NUM_BLOCKS, NUM_BLOCKS, -1
+
+	// Save the previous state, as it's needed later.
+	vmv.v.v		PREV_FEBA, FEBA
+	vmv.v.v		PREV_HGDC, HGDC
+
+	// Load the next 1024-bit message block and endian-swap each 64-bit word
+	vle64.v		W0, (DATA)
+	vrev8.v		W0, W0
+	addi		DATA, DATA, 32
+	vle64.v		W1, (DATA)
+	vrev8.v		W1, W1
+	addi		DATA, DATA, 32
+	vle64.v		W2, (DATA)
+	vrev8.v		W2, W2
+	addi		DATA, DATA, 32
+	vle64.v		W3, (DATA)
+	vrev8.v		W3, W3
+	addi		DATA, DATA, 32
+
+	// Do the 80 rounds of SHA-512.
+	sha512_16rounds 0
+	sha512_16rounds 0
+	sha512_16rounds 0
+	sha512_16rounds 0
+	sha512_16rounds 1
+
+	// Add the previous state.
+	vadd.vv		FEBA, FEBA, PREV_FEBA
+	vadd.vv		HGDC, HGDC, PREV_HGDC
+
+	// Repeat if more blocks remain.
+	bnez		NUM_BLOCKS, .Lnext_block
+
+	// Store the new state and return.
+	vsuxei8.v	FEBA, (STATEP), INDICES
+	vsuxei8.v	HGDC, (STATEP_C), INDICES
+	ret
+SYM_FUNC_END(sha512_transform_zvknhb_zvkb)
+
+.section ".rodata"
+.p2align 3
+.type K512, @object
+K512:
+	.dword		0x428a2f98d728ae22, 0x7137449123ef65cd
+	.dword		0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+	.dword		0x3956c25bf348b538, 0x59f111f1b605d019
+	.dword		0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+	.dword		0xd807aa98a3030242, 0x12835b0145706fbe
+	.dword		0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+	.dword		0x72be5d74f27b896f, 0x80deb1fe3b1696b1
+	.dword		0x9bdc06a725c71235, 0xc19bf174cf692694
+	.dword		0xe49b69c19ef14ad2, 0xefbe4786384f25e3
+	.dword		0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+	.dword		0x2de92c6f592b0275, 0x4a7484aa6ea6e483
+	.dword		0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+	.dword		0x983e5152ee66dfab, 0xa831c66d2db43210
+	.dword		0xb00327c898fb213f, 0xbf597fc7beef0ee4
+	.dword		0xc6e00bf33da88fc2, 0xd5a79147930aa725
+	.dword		0x06ca6351e003826f, 0x142929670a0e6e70
+	.dword		0x27b70a8546d22ffc, 0x2e1b21385c26c926
+	.dword		0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+	.dword		0x650a73548baf63de, 0x766a0abb3c77b2a8
+	.dword		0x81c2c92e47edaee6, 0x92722c851482353b
+	.dword		0xa2bfe8a14cf10364, 0xa81a664bbc423001
+	.dword		0xc24b8b70d0f89791, 0xc76c51a30654be30
+	.dword		0xd192e819d6ef5218, 0xd69906245565a910
+	.dword		0xf40e35855771202a, 0x106aa07032bbd1b8
+	.dword		0x19a4c116b8d2d0c8, 0x1e376c085141ab53
+	.dword		0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+	.dword		0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
+	.dword		0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+	.dword		0x748f82ee5defb2fc, 0x78a5636f43172f60
+	.dword		0x84c87814a1f0ab72, 0x8cc702081a6439ec
+	.dword		0x90befffa23631e28, 0xa4506cebde82bde9
+	.dword		0xbef9a3f7b2c67915, 0xc67178f2e372532b
+	.dword		0xca273eceea26619c, 0xd186b8c721c0c207
+	.dword		0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+	.dword		0x06f067aa72176fba, 0x0a637dc5a2c898a6
+	.dword		0x113f9804bef90dae, 0x1b710b35131c471b
+	.dword		0x28db77f523047d84, 0x32caab7b40c72493
+	.dword		0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+	.dword		0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
+	.dword		0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+.size K512, . - K512
diff --git a/lib/crypto/riscv/sha512.h b/lib/crypto/riscv/sha512.h
new file mode 100644
index 000000000000..145bdab1214e
--- /dev/null
+++ b/lib/crypto/riscv/sha512.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-512 and SHA-384 using the RISC-V vector crypto extensions
+ *
+ * Copyright (C) 2023 VRULL GmbH
+ * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions);
+
+asmlinkage void sha512_transform_zvknhb_zvkb(struct sha512_block_state *state,
+					     const u8 *data, size_t nblocks);
+
+static void sha512_blocks(struct sha512_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_extensions) && likely(may_use_simd())) {
+		kernel_vector_begin();
+		sha512_transform_zvknhb_zvkb(state, data, nblocks);
+		kernel_vector_end();
+	} else {
+		sha512_blocks_generic(state, data, nblocks);
+	}
+}
+
+#define sha512_mod_init_arch sha512_mod_init_arch
+static void sha512_mod_init_arch(void)
+{
+	if (riscv_isa_extension_available(NULL, ZVKNHB) &&
+	    riscv_isa_extension_available(NULL, ZVKB) &&
+	    riscv_vector_vlen() >= 128)
+		static_branch_enable(&have_extensions);
+}
diff --git a/lib/crypto/s390/chacha-s390.S b/lib/crypto/s390/chacha-s390.S
new file mode 100644
index 000000000000..63f3102678c0
--- /dev/null
+++ b/lib/crypto/s390/chacha-s390.S
@@ -0,0 +1,908 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Original implementation written by Andy Polyakov, @dot-asm.
+ * This is an adaptation of the original code for kernel use.
+ *
+ * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+#include <asm/nospec-insn.h>
+#include <asm/fpu-insn.h>
+
+#define SP	%r15
+#define FRAME	(16 * 8 + 4 * 8)
+
+	.data
+	.balign	32
+
+SYM_DATA_START_LOCAL(sigma)
+	.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	# endian-neutral
+	.long	1,0,0,0
+	.long	2,0,0,0
+	.long	3,0,0,0
+	.long	0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c	# byte swap
+
+	.long	0,1,2,3
+	.long	0x61707865,0x61707865,0x61707865,0x61707865	# smashed sigma
+	.long	0x3320646e,0x3320646e,0x3320646e,0x3320646e
+	.long	0x79622d32,0x79622d32,0x79622d32,0x79622d32
+	.long	0x6b206574,0x6b206574,0x6b206574,0x6b206574
+SYM_DATA_END(sigma)
+
+	.previous
+
+	GEN_BR_THUNK %r14
+
+	.text
+
+#############################################################################
+# void chacha20_vx_4x(u8 *out, counst u8 *inp, size_t len,
+#		      counst u32 *key, const u32 *counter)
+
+#define	OUT		%r2
+#define	INP		%r3
+#define	LEN		%r4
+#define	KEY		%r5
+#define	COUNTER		%r6
+
+#define BEPERM		%v31
+#define CTR		%v26
+
+#define K0		%v16
+#define K1		%v17
+#define K2		%v18
+#define K3		%v19
+
+#define XA0		%v0
+#define XA1		%v1
+#define XA2		%v2
+#define XA3		%v3
+
+#define XB0		%v4
+#define XB1		%v5
+#define XB2		%v6
+#define XB3		%v7
+
+#define XC0		%v8
+#define XC1		%v9
+#define XC2		%v10
+#define XC3		%v11
+
+#define XD0		%v12
+#define XD1		%v13
+#define XD2		%v14
+#define XD3		%v15
+
+#define XT0		%v27
+#define XT1		%v28
+#define XT2		%v29
+#define XT3		%v30
+
+SYM_FUNC_START(chacha20_vx_4x)
+	stmg	%r6,%r7,6*8(SP)
+
+	larl	%r7,sigma
+	lhi	%r0,10
+	lhi	%r1,0
+
+	VL	K0,0,,%r7		# load sigma
+	VL	K1,0,,KEY		# load key
+	VL	K2,16,,KEY
+	VL	K3,0,,COUNTER		# load counter
+
+	VL	BEPERM,0x40,,%r7
+	VL	CTR,0x50,,%r7
+
+	VLM	XA0,XA3,0x60,%r7,4	# load [smashed] sigma
+
+	VREPF	XB0,K1,0		# smash the key
+	VREPF	XB1,K1,1
+	VREPF	XB2,K1,2
+	VREPF	XB3,K1,3
+
+	VREPF	XD0,K3,0
+	VREPF	XD1,K3,1
+	VREPF	XD2,K3,2
+	VREPF	XD3,K3,3
+	VAF	XD0,XD0,CTR
+
+	VREPF	XC0,K2,0
+	VREPF	XC1,K2,1
+	VREPF	XC2,K2,2
+	VREPF	XC3,K2,3
+
+.Loop_4x:
+	VAF	XA0,XA0,XB0
+	VX	XD0,XD0,XA0
+	VERLLF	XD0,XD0,16
+
+	VAF	XA1,XA1,XB1
+	VX	XD1,XD1,XA1
+	VERLLF	XD1,XD1,16
+
+	VAF	XA2,XA2,XB2
+	VX	XD2,XD2,XA2
+	VERLLF	XD2,XD2,16
+
+	VAF	XA3,XA3,XB3
+	VX	XD3,XD3,XA3
+	VERLLF	XD3,XD3,16
+
+	VAF	XC0,XC0,XD0
+	VX	XB0,XB0,XC0
+	VERLLF	XB0,XB0,12
+
+	VAF	XC1,XC1,XD1
+	VX	XB1,XB1,XC1
+	VERLLF	XB1,XB1,12
+
+	VAF	XC2,XC2,XD2
+	VX	XB2,XB2,XC2
+	VERLLF	XB2,XB2,12
+
+	VAF	XC3,XC3,XD3
+	VX	XB3,XB3,XC3
+	VERLLF	XB3,XB3,12
+
+	VAF	XA0,XA0,XB0
+	VX	XD0,XD0,XA0
+	VERLLF	XD0,XD0,8
+
+	VAF	XA1,XA1,XB1
+	VX	XD1,XD1,XA1
+	VERLLF	XD1,XD1,8
+
+	VAF	XA2,XA2,XB2
+	VX	XD2,XD2,XA2
+	VERLLF	XD2,XD2,8
+
+	VAF	XA3,XA3,XB3
+	VX	XD3,XD3,XA3
+	VERLLF	XD3,XD3,8
+
+	VAF	XC0,XC0,XD0
+	VX	XB0,XB0,XC0
+	VERLLF	XB0,XB0,7
+
+	VAF	XC1,XC1,XD1
+	VX	XB1,XB1,XC1
+	VERLLF	XB1,XB1,7
+
+	VAF	XC2,XC2,XD2
+	VX	XB2,XB2,XC2
+	VERLLF	XB2,XB2,7
+
+	VAF	XC3,XC3,XD3
+	VX	XB3,XB3,XC3
+	VERLLF	XB3,XB3,7
+
+	VAF	XA0,XA0,XB1
+	VX	XD3,XD3,XA0
+	VERLLF	XD3,XD3,16
+
+	VAF	XA1,XA1,XB2
+	VX	XD0,XD0,XA1
+	VERLLF	XD0,XD0,16
+
+	VAF	XA2,XA2,XB3
+	VX	XD1,XD1,XA2
+	VERLLF	XD1,XD1,16
+
+	VAF	XA3,XA3,XB0
+	VX	XD2,XD2,XA3
+	VERLLF	XD2,XD2,16
+
+	VAF	XC2,XC2,XD3
+	VX	XB1,XB1,XC2
+	VERLLF	XB1,XB1,12
+
+	VAF	XC3,XC3,XD0
+	VX	XB2,XB2,XC3
+	VERLLF	XB2,XB2,12
+
+	VAF	XC0,XC0,XD1
+	VX	XB3,XB3,XC0
+	VERLLF	XB3,XB3,12
+
+	VAF	XC1,XC1,XD2
+	VX	XB0,XB0,XC1
+	VERLLF	XB0,XB0,12
+
+	VAF	XA0,XA0,XB1
+	VX	XD3,XD3,XA0
+	VERLLF	XD3,XD3,8
+
+	VAF	XA1,XA1,XB2
+	VX	XD0,XD0,XA1
+	VERLLF	XD0,XD0,8
+
+	VAF	XA2,XA2,XB3
+	VX	XD1,XD1,XA2
+	VERLLF	XD1,XD1,8
+
+	VAF	XA3,XA3,XB0
+	VX	XD2,XD2,XA3
+	VERLLF	XD2,XD2,8
+
+	VAF	XC2,XC2,XD3
+	VX	XB1,XB1,XC2
+	VERLLF	XB1,XB1,7
+
+	VAF	XC3,XC3,XD0
+	VX	XB2,XB2,XC3
+	VERLLF	XB2,XB2,7
+
+	VAF	XC0,XC0,XD1
+	VX	XB3,XB3,XC0
+	VERLLF	XB3,XB3,7
+
+	VAF	XC1,XC1,XD2
+	VX	XB0,XB0,XC1
+	VERLLF	XB0,XB0,7
+	brct	%r0,.Loop_4x
+
+	VAF	XD0,XD0,CTR
+
+	VMRHF	XT0,XA0,XA1		# transpose data
+	VMRHF	XT1,XA2,XA3
+	VMRLF	XT2,XA0,XA1
+	VMRLF	XT3,XA2,XA3
+	VPDI	XA0,XT0,XT1,0b0000
+	VPDI	XA1,XT0,XT1,0b0101
+	VPDI	XA2,XT2,XT3,0b0000
+	VPDI	XA3,XT2,XT3,0b0101
+
+	VMRHF	XT0,XB0,XB1
+	VMRHF	XT1,XB2,XB3
+	VMRLF	XT2,XB0,XB1
+	VMRLF	XT3,XB2,XB3
+	VPDI	XB0,XT0,XT1,0b0000
+	VPDI	XB1,XT0,XT1,0b0101
+	VPDI	XB2,XT2,XT3,0b0000
+	VPDI	XB3,XT2,XT3,0b0101
+
+	VMRHF	XT0,XC0,XC1
+	VMRHF	XT1,XC2,XC3
+	VMRLF	XT2,XC0,XC1
+	VMRLF	XT3,XC2,XC3
+	VPDI	XC0,XT0,XT1,0b0000
+	VPDI	XC1,XT0,XT1,0b0101
+	VPDI	XC2,XT2,XT3,0b0000
+	VPDI	XC3,XT2,XT3,0b0101
+
+	VMRHF	XT0,XD0,XD1
+	VMRHF	XT1,XD2,XD3
+	VMRLF	XT2,XD0,XD1
+	VMRLF	XT3,XD2,XD3
+	VPDI	XD0,XT0,XT1,0b0000
+	VPDI	XD1,XT0,XT1,0b0101
+	VPDI	XD2,XT2,XT3,0b0000
+	VPDI	XD3,XT2,XT3,0b0101
+
+	VAF	XA0,XA0,K0
+	VAF	XB0,XB0,K1
+	VAF	XC0,XC0,K2
+	VAF	XD0,XD0,K3
+
+	VPERM	XA0,XA0,XA0,BEPERM
+	VPERM	XB0,XB0,XB0,BEPERM
+	VPERM	XC0,XC0,XC0,BEPERM
+	VPERM	XD0,XD0,XD0,BEPERM
+
+	VLM	XT0,XT3,0,INP,0
+
+	VX	XT0,XT0,XA0
+	VX	XT1,XT1,XB0
+	VX	XT2,XT2,XC0
+	VX	XT3,XT3,XD0
+
+	VSTM	XT0,XT3,0,OUT,0
+
+	la	INP,0x40(INP)
+	la	OUT,0x40(OUT)
+	aghi	LEN,-0x40
+
+	VAF	XA0,XA1,K0
+	VAF	XB0,XB1,K1
+	VAF	XC0,XC1,K2
+	VAF	XD0,XD1,K3
+
+	VPERM	XA0,XA0,XA0,BEPERM
+	VPERM	XB0,XB0,XB0,BEPERM
+	VPERM	XC0,XC0,XC0,BEPERM
+	VPERM	XD0,XD0,XD0,BEPERM
+
+	clgfi	LEN,0x40
+	jl	.Ltail_4x
+
+	VLM	XT0,XT3,0,INP,0
+
+	VX	XT0,XT0,XA0
+	VX	XT1,XT1,XB0
+	VX	XT2,XT2,XC0
+	VX	XT3,XT3,XD0
+
+	VSTM	XT0,XT3,0,OUT,0
+
+	la	INP,0x40(INP)
+	la	OUT,0x40(OUT)
+	aghi	LEN,-0x40
+	je	.Ldone_4x
+
+	VAF	XA0,XA2,K0
+	VAF	XB0,XB2,K1
+	VAF	XC0,XC2,K2
+	VAF	XD0,XD2,K3
+
+	VPERM	XA0,XA0,XA0,BEPERM
+	VPERM	XB0,XB0,XB0,BEPERM
+	VPERM	XC0,XC0,XC0,BEPERM
+	VPERM	XD0,XD0,XD0,BEPERM
+
+	clgfi	LEN,0x40
+	jl	.Ltail_4x
+
+	VLM	XT0,XT3,0,INP,0
+
+	VX	XT0,XT0,XA0
+	VX	XT1,XT1,XB0
+	VX	XT2,XT2,XC0
+	VX	XT3,XT3,XD0
+
+	VSTM	XT0,XT3,0,OUT,0
+
+	la	INP,0x40(INP)
+	la	OUT,0x40(OUT)
+	aghi	LEN,-0x40
+	je	.Ldone_4x
+
+	VAF	XA0,XA3,K0
+	VAF	XB0,XB3,K1
+	VAF	XC0,XC3,K2
+	VAF	XD0,XD3,K3
+
+	VPERM	XA0,XA0,XA0,BEPERM
+	VPERM	XB0,XB0,XB0,BEPERM
+	VPERM	XC0,XC0,XC0,BEPERM
+	VPERM	XD0,XD0,XD0,BEPERM
+
+	clgfi	LEN,0x40
+	jl	.Ltail_4x
+
+	VLM	XT0,XT3,0,INP,0
+
+	VX	XT0,XT0,XA0
+	VX	XT1,XT1,XB0
+	VX	XT2,XT2,XC0
+	VX	XT3,XT3,XD0
+
+	VSTM	XT0,XT3,0,OUT,0
+
+.Ldone_4x:
+	lmg	%r6,%r7,6*8(SP)
+	BR_EX	%r14
+
+.Ltail_4x:
+	VLR	XT0,XC0
+	VLR	XT1,XD0
+
+	VST	XA0,8*8+0x00,,SP
+	VST	XB0,8*8+0x10,,SP
+	VST	XT0,8*8+0x20,,SP
+	VST	XT1,8*8+0x30,,SP
+
+	lghi	%r1,0
+
+.Loop_tail_4x:
+	llgc	%r5,0(%r1,INP)
+	llgc	%r6,8*8(%r1,SP)
+	xr	%r6,%r5
+	stc	%r6,0(%r1,OUT)
+	la	%r1,1(%r1)
+	brct	LEN,.Loop_tail_4x
+
+	lmg	%r6,%r7,6*8(SP)
+	BR_EX	%r14
+SYM_FUNC_END(chacha20_vx_4x)
+
+#undef	OUT
+#undef	INP
+#undef	LEN
+#undef	KEY
+#undef	COUNTER
+
+#undef BEPERM
+
+#undef K0
+#undef K1
+#undef K2
+#undef K3
+
+
+#############################################################################
+# void chacha20_vx(u8 *out, counst u8 *inp, size_t len,
+#		   counst u32 *key, const u32 *counter)
+
+#define	OUT		%r2
+#define	INP		%r3
+#define	LEN		%r4
+#define	KEY		%r5
+#define	COUNTER		%r6
+
+#define BEPERM		%v31
+
+#define K0		%v27
+#define K1		%v24
+#define K2		%v25
+#define K3		%v26
+
+#define A0		%v0
+#define B0		%v1
+#define C0		%v2
+#define D0		%v3
+
+#define A1		%v4
+#define B1		%v5
+#define C1		%v6
+#define D1		%v7
+
+#define A2		%v8
+#define B2		%v9
+#define C2		%v10
+#define D2		%v11
+
+#define A3		%v12
+#define B3		%v13
+#define C3		%v14
+#define D3		%v15
+
+#define A4		%v16
+#define B4		%v17
+#define C4		%v18
+#define D4		%v19
+
+#define A5		%v20
+#define B5		%v21
+#define C5		%v22
+#define D5		%v23
+
+#define T0		%v27
+#define T1		%v28
+#define T2		%v29
+#define T3		%v30
+
+SYM_FUNC_START(chacha20_vx)
+	clgfi	LEN,256
+	jle	chacha20_vx_4x
+	stmg	%r6,%r7,6*8(SP)
+
+	lghi	%r1,-FRAME
+	lgr	%r0,SP
+	la	SP,0(%r1,SP)
+	stg	%r0,0(SP)		# back-chain
+
+	larl	%r7,sigma
+	lhi	%r0,10
+
+	VLM	K1,K2,0,KEY,0		# load key
+	VL	K3,0,,COUNTER		# load counter
+
+	VLM	K0,BEPERM,0,%r7,4	# load sigma, increments, ...
+
+.Loop_outer_vx:
+	VLR	A0,K0
+	VLR	B0,K1
+	VLR	A1,K0
+	VLR	B1,K1
+	VLR	A2,K0
+	VLR	B2,K1
+	VLR	A3,K0
+	VLR	B3,K1
+	VLR	A4,K0
+	VLR	B4,K1
+	VLR	A5,K0
+	VLR	B5,K1
+
+	VLR	D0,K3
+	VAF	D1,K3,T1		# K[3]+1
+	VAF	D2,K3,T2		# K[3]+2
+	VAF	D3,K3,T3		# K[3]+3
+	VAF	D4,D2,T2		# K[3]+4
+	VAF	D5,D2,T3		# K[3]+5
+
+	VLR	C0,K2
+	VLR	C1,K2
+	VLR	C2,K2
+	VLR	C3,K2
+	VLR	C4,K2
+	VLR	C5,K2
+
+	VLR	T1,D1
+	VLR	T2,D2
+	VLR	T3,D3
+
+.Loop_vx:
+	VAF	A0,A0,B0
+	VAF	A1,A1,B1
+	VAF	A2,A2,B2
+	VAF	A3,A3,B3
+	VAF	A4,A4,B4
+	VAF	A5,A5,B5
+	VX	D0,D0,A0
+	VX	D1,D1,A1
+	VX	D2,D2,A2
+	VX	D3,D3,A3
+	VX	D4,D4,A4
+	VX	D5,D5,A5
+	VERLLF	D0,D0,16
+	VERLLF	D1,D1,16
+	VERLLF	D2,D2,16
+	VERLLF	D3,D3,16
+	VERLLF	D4,D4,16
+	VERLLF	D5,D5,16
+
+	VAF	C0,C0,D0
+	VAF	C1,C1,D1
+	VAF	C2,C2,D2
+	VAF	C3,C3,D3
+	VAF	C4,C4,D4
+	VAF	C5,C5,D5
+	VX	B0,B0,C0
+	VX	B1,B1,C1
+	VX	B2,B2,C2
+	VX	B3,B3,C3
+	VX	B4,B4,C4
+	VX	B5,B5,C5
+	VERLLF	B0,B0,12
+	VERLLF	B1,B1,12
+	VERLLF	B2,B2,12
+	VERLLF	B3,B3,12
+	VERLLF	B4,B4,12
+	VERLLF	B5,B5,12
+
+	VAF	A0,A0,B0
+	VAF	A1,A1,B1
+	VAF	A2,A2,B2
+	VAF	A3,A3,B3
+	VAF	A4,A4,B4
+	VAF	A5,A5,B5
+	VX	D0,D0,A0
+	VX	D1,D1,A1
+	VX	D2,D2,A2
+	VX	D3,D3,A3
+	VX	D4,D4,A4
+	VX	D5,D5,A5
+	VERLLF	D0,D0,8
+	VERLLF	D1,D1,8
+	VERLLF	D2,D2,8
+	VERLLF	D3,D3,8
+	VERLLF	D4,D4,8
+	VERLLF	D5,D5,8
+
+	VAF	C0,C0,D0
+	VAF	C1,C1,D1
+	VAF	C2,C2,D2
+	VAF	C3,C3,D3
+	VAF	C4,C4,D4
+	VAF	C5,C5,D5
+	VX	B0,B0,C0
+	VX	B1,B1,C1
+	VX	B2,B2,C2
+	VX	B3,B3,C3
+	VX	B4,B4,C4
+	VX	B5,B5,C5
+	VERLLF	B0,B0,7
+	VERLLF	B1,B1,7
+	VERLLF	B2,B2,7
+	VERLLF	B3,B3,7
+	VERLLF	B4,B4,7
+	VERLLF	B5,B5,7
+
+	VSLDB	C0,C0,C0,8
+	VSLDB	C1,C1,C1,8
+	VSLDB	C2,C2,C2,8
+	VSLDB	C3,C3,C3,8
+	VSLDB	C4,C4,C4,8
+	VSLDB	C5,C5,C5,8
+	VSLDB	B0,B0,B0,4
+	VSLDB	B1,B1,B1,4
+	VSLDB	B2,B2,B2,4
+	VSLDB	B3,B3,B3,4
+	VSLDB	B4,B4,B4,4
+	VSLDB	B5,B5,B5,4
+	VSLDB	D0,D0,D0,12
+	VSLDB	D1,D1,D1,12
+	VSLDB	D2,D2,D2,12
+	VSLDB	D3,D3,D3,12
+	VSLDB	D4,D4,D4,12
+	VSLDB	D5,D5,D5,12
+
+	VAF	A0,A0,B0
+	VAF	A1,A1,B1
+	VAF	A2,A2,B2
+	VAF	A3,A3,B3
+	VAF	A4,A4,B4
+	VAF	A5,A5,B5
+	VX	D0,D0,A0
+	VX	D1,D1,A1
+	VX	D2,D2,A2
+	VX	D3,D3,A3
+	VX	D4,D4,A4
+	VX	D5,D5,A5
+	VERLLF	D0,D0,16
+	VERLLF	D1,D1,16
+	VERLLF	D2,D2,16
+	VERLLF	D3,D3,16
+	VERLLF	D4,D4,16
+	VERLLF	D5,D5,16
+
+	VAF	C0,C0,D0
+	VAF	C1,C1,D1
+	VAF	C2,C2,D2
+	VAF	C3,C3,D3
+	VAF	C4,C4,D4
+	VAF	C5,C5,D5
+	VX	B0,B0,C0
+	VX	B1,B1,C1
+	VX	B2,B2,C2
+	VX	B3,B3,C3
+	VX	B4,B4,C4
+	VX	B5,B5,C5
+	VERLLF	B0,B0,12
+	VERLLF	B1,B1,12
+	VERLLF	B2,B2,12
+	VERLLF	B3,B3,12
+	VERLLF	B4,B4,12
+	VERLLF	B5,B5,12
+
+	VAF	A0,A0,B0
+	VAF	A1,A1,B1
+	VAF	A2,A2,B2
+	VAF	A3,A3,B3
+	VAF	A4,A4,B4
+	VAF	A5,A5,B5
+	VX	D0,D0,A0
+	VX	D1,D1,A1
+	VX	D2,D2,A2
+	VX	D3,D3,A3
+	VX	D4,D4,A4
+	VX	D5,D5,A5
+	VERLLF	D0,D0,8
+	VERLLF	D1,D1,8
+	VERLLF	D2,D2,8
+	VERLLF	D3,D3,8
+	VERLLF	D4,D4,8
+	VERLLF	D5,D5,8
+
+	VAF	C0,C0,D0
+	VAF	C1,C1,D1
+	VAF	C2,C2,D2
+	VAF	C3,C3,D3
+	VAF	C4,C4,D4
+	VAF	C5,C5,D5
+	VX	B0,B0,C0
+	VX	B1,B1,C1
+	VX	B2,B2,C2
+	VX	B3,B3,C3
+	VX	B4,B4,C4
+	VX	B5,B5,C5
+	VERLLF	B0,B0,7
+	VERLLF	B1,B1,7
+	VERLLF	B2,B2,7
+	VERLLF	B3,B3,7
+	VERLLF	B4,B4,7
+	VERLLF	B5,B5,7
+
+	VSLDB	C0,C0,C0,8
+	VSLDB	C1,C1,C1,8
+	VSLDB	C2,C2,C2,8
+	VSLDB	C3,C3,C3,8
+	VSLDB	C4,C4,C4,8
+	VSLDB	C5,C5,C5,8
+	VSLDB	B0,B0,B0,12
+	VSLDB	B1,B1,B1,12
+	VSLDB	B2,B2,B2,12
+	VSLDB	B3,B3,B3,12
+	VSLDB	B4,B4,B4,12
+	VSLDB	B5,B5,B5,12
+	VSLDB	D0,D0,D0,4
+	VSLDB	D1,D1,D1,4
+	VSLDB	D2,D2,D2,4
+	VSLDB	D3,D3,D3,4
+	VSLDB	D4,D4,D4,4
+	VSLDB	D5,D5,D5,4
+	brct	%r0,.Loop_vx
+
+	VAF	A0,A0,K0
+	VAF	B0,B0,K1
+	VAF	C0,C0,K2
+	VAF	D0,D0,K3
+	VAF	A1,A1,K0
+	VAF	D1,D1,T1		# +K[3]+1
+
+	VPERM	A0,A0,A0,BEPERM
+	VPERM	B0,B0,B0,BEPERM
+	VPERM	C0,C0,C0,BEPERM
+	VPERM	D0,D0,D0,BEPERM
+
+	clgfi	LEN,0x40
+	jl	.Ltail_vx
+
+	VAF	D2,D2,T2		# +K[3]+2
+	VAF	D3,D3,T3		# +K[3]+3
+	VLM	T0,T3,0,INP,0
+
+	VX	A0,A0,T0
+	VX	B0,B0,T1
+	VX	C0,C0,T2
+	VX	D0,D0,T3
+
+	VLM	K0,T3,0,%r7,4		# re-load sigma and increments
+
+	VSTM	A0,D0,0,OUT,0
+
+	la	INP,0x40(INP)
+	la	OUT,0x40(OUT)
+	aghi	LEN,-0x40
+	je	.Ldone_vx
+
+	VAF	B1,B1,K1
+	VAF	C1,C1,K2
+
+	VPERM	A0,A1,A1,BEPERM
+	VPERM	B0,B1,B1,BEPERM
+	VPERM	C0,C1,C1,BEPERM
+	VPERM	D0,D1,D1,BEPERM
+
+	clgfi	LEN,0x40
+	jl	.Ltail_vx
+
+	VLM	A1,D1,0,INP,0
+
+	VX	A0,A0,A1
+	VX	B0,B0,B1
+	VX	C0,C0,C1
+	VX	D0,D0,D1
+
+	VSTM	A0,D0,0,OUT,0
+
+	la	INP,0x40(INP)
+	la	OUT,0x40(OUT)
+	aghi	LEN,-0x40
+	je	.Ldone_vx
+
+	VAF	A2,A2,K0
+	VAF	B2,B2,K1
+	VAF	C2,C2,K2
+
+	VPERM	A0,A2,A2,BEPERM
+	VPERM	B0,B2,B2,BEPERM
+	VPERM	C0,C2,C2,BEPERM
+	VPERM	D0,D2,D2,BEPERM
+
+	clgfi	LEN,0x40
+	jl	.Ltail_vx
+
+	VLM	A1,D1,0,INP,0
+
+	VX	A0,A0,A1
+	VX	B0,B0,B1
+	VX	C0,C0,C1
+	VX	D0,D0,D1
+
+	VSTM	A0,D0,0,OUT,0
+
+	la	INP,0x40(INP)
+	la	OUT,0x40(OUT)
+	aghi	LEN,-0x40
+	je	.Ldone_vx
+
+	VAF	A3,A3,K0
+	VAF	B3,B3,K1
+	VAF	C3,C3,K2
+	VAF	D2,K3,T3		# K[3]+3
+
+	VPERM	A0,A3,A3,BEPERM
+	VPERM	B0,B3,B3,BEPERM
+	VPERM	C0,C3,C3,BEPERM
+	VPERM	D0,D3,D3,BEPERM
+
+	clgfi	LEN,0x40
+	jl	.Ltail_vx
+
+	VAF	D3,D2,T1		# K[3]+4
+	VLM	A1,D1,0,INP,0
+
+	VX	A0,A0,A1
+	VX	B0,B0,B1
+	VX	C0,C0,C1
+	VX	D0,D0,D1
+
+	VSTM	A0,D0,0,OUT,0
+
+	la	INP,0x40(INP)
+	la	OUT,0x40(OUT)
+	aghi	LEN,-0x40
+	je	.Ldone_vx
+
+	VAF	A4,A4,K0
+	VAF	B4,B4,K1
+	VAF	C4,C4,K2
+	VAF	D4,D4,D3		# +K[3]+4
+	VAF	D3,D3,T1		# K[3]+5
+	VAF	K3,D2,T3		# K[3]+=6
+
+	VPERM	A0,A4,A4,BEPERM
+	VPERM	B0,B4,B4,BEPERM
+	VPERM	C0,C4,C4,BEPERM
+	VPERM	D0,D4,D4,BEPERM
+
+	clgfi	LEN,0x40
+	jl	.Ltail_vx
+
+	VLM	A1,D1,0,INP,0
+
+	VX	A0,A0,A1
+	VX	B0,B0,B1
+	VX	C0,C0,C1
+	VX	D0,D0,D1
+
+	VSTM	A0,D0,0,OUT,0
+
+	la	INP,0x40(INP)
+	la	OUT,0x40(OUT)
+	aghi	LEN,-0x40
+	je	.Ldone_vx
+
+	VAF	A5,A5,K0
+	VAF	B5,B5,K1
+	VAF	C5,C5,K2
+	VAF	D5,D5,D3		# +K[3]+5
+
+	VPERM	A0,A5,A5,BEPERM
+	VPERM	B0,B5,B5,BEPERM
+	VPERM	C0,C5,C5,BEPERM
+	VPERM	D0,D5,D5,BEPERM
+
+	clgfi	LEN,0x40
+	jl	.Ltail_vx
+
+	VLM	A1,D1,0,INP,0
+
+	VX	A0,A0,A1
+	VX	B0,B0,B1
+	VX	C0,C0,C1
+	VX	D0,D0,D1
+
+	VSTM	A0,D0,0,OUT,0
+
+	la	INP,0x40(INP)
+	la	OUT,0x40(OUT)
+	lhi	%r0,10
+	aghi	LEN,-0x40
+	jne	.Loop_outer_vx
+
+.Ldone_vx:
+	lmg	%r6,%r7,FRAME+6*8(SP)
+	la	SP,FRAME(SP)
+	BR_EX	%r14
+
+.Ltail_vx:
+	VSTM	A0,D0,8*8,SP,3
+	lghi	%r1,0
+
+.Loop_tail_vx:
+	llgc	%r5,0(%r1,INP)
+	llgc	%r6,8*8(%r1,SP)
+	xr	%r6,%r5
+	stc	%r6,0(%r1,OUT)
+	la	%r1,1(%r1)
+	brct	LEN,.Loop_tail_vx
+
+	lmg	%r6,%r7,FRAME+6*8(SP)
+	la	SP,FRAME(SP)
+	BR_EX	%r14
+SYM_FUNC_END(chacha20_vx)
+
+.previous
diff --git a/lib/crypto/s390/chacha-s390.h b/lib/crypto/s390/chacha-s390.h
new file mode 100644
index 000000000000..733744ce30f5
--- /dev/null
+++ b/lib/crypto/s390/chacha-s390.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * s390 ChaCha stream cipher.
+ *
+ * Copyright IBM Corp. 2021
+ */
+
+#ifndef _CHACHA_S390_H
+#define _CHACHA_S390_H
+
+void chacha20_vx(u8 *out, const u8 *inp, size_t len, const u32 *key,
+		 const u32 *counter);
+
+#endif /* _CHACHA_S390_H */
diff --git a/lib/crypto/s390/chacha.h b/lib/crypto/s390/chacha.h
new file mode 100644
index 000000000000..fd9c4a422365
--- /dev/null
+++ b/lib/crypto/s390/chacha.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ChaCha stream cipher (s390 optimized)
+ *
+ * Copyright IBM Corp. 2021
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+#include <asm/fpu.h>
+#include "chacha-s390.h"
+
+#define hchacha_block_arch hchacha_block_generic /* not implemented yet */
+
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+			      const u8 *src, unsigned int bytes, int nrounds)
+{
+	/* s390 chacha20 implementation has 20 rounds hard-coded,
+	 * it cannot handle a block of data or less, but otherwise
+	 * it can handle data of arbitrary size
+	 */
+	if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !cpu_has_vx()) {
+		chacha_crypt_generic(state, dst, src, bytes, nrounds);
+	} else {
+		DECLARE_KERNEL_FPU_ONSTACK32(vxstate);
+
+		kernel_fpu_begin(&vxstate, KERNEL_VXR);
+		chacha20_vx(dst, src, bytes, &state->x[4], &state->x[12]);
+		kernel_fpu_end(&vxstate, KERNEL_VXR);
+
+		state->x[12] += round_up(bytes, CHACHA_BLOCK_SIZE) /
+				CHACHA_BLOCK_SIZE;
+	}
+}
diff --git a/lib/crypto/s390/sha1.h b/lib/crypto/s390/sha1.h
new file mode 100644
index 000000000000..73d94476a157
--- /dev/null
+++ b/lib/crypto/s390/sha1.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-1 optimized using the CP Assist for Cryptographic Functions (CPACF)
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/cpacf.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha1);
+
+static void sha1_blocks(struct sha1_block_state *state,
+			const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_cpacf_sha1))
+		cpacf_kimd(CPACF_KIMD_SHA_1, state, data,
+			   nblocks * SHA1_BLOCK_SIZE);
+	else
+		sha1_blocks_generic(state, data, nblocks);
+}
+
+#define sha1_mod_init_arch sha1_mod_init_arch
+static void sha1_mod_init_arch(void)
+{
+	if (cpu_have_feature(S390_CPU_FEATURE_MSA) &&
+	    cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_1))
+		static_branch_enable(&have_cpacf_sha1);
+}
diff --git a/lib/crypto/s390/sha256.h b/lib/crypto/s390/sha256.h
new file mode 100644
index 000000000000..acd483508789
--- /dev/null
+++ b/lib/crypto/s390/sha256.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 optimized using the CP Assist for Cryptographic Functions (CPACF)
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/cpacf.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha256);
+
+static void sha256_blocks(struct sha256_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_cpacf_sha256))
+		cpacf_kimd(CPACF_KIMD_SHA_256, state, data,
+			   nblocks * SHA256_BLOCK_SIZE);
+	else
+		sha256_blocks_generic(state, data, nblocks);
+}
+
+#define sha256_mod_init_arch sha256_mod_init_arch
+static void sha256_mod_init_arch(void)
+{
+	if (cpu_have_feature(S390_CPU_FEATURE_MSA) &&
+	    cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256))
+		static_branch_enable(&have_cpacf_sha256);
+}
diff --git a/lib/crypto/s390/sha3.h b/lib/crypto/s390/sha3.h
new file mode 100644
index 000000000000..85471404775a
--- /dev/null
+++ b/lib/crypto/s390/sha3.h
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-3 optimized using the CP Assist for Cryptographic Functions (CPACF)
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/cpacf.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3_init_optim);
+
+static void sha3_absorb_blocks(struct sha3_state *state, const u8 *data,
+			       size_t nblocks, size_t block_size)
+{
+	if (static_branch_likely(&have_sha3)) {
+		/*
+		 * Note that KIMD assumes little-endian order of the state
+		 * words.  sha3_state already uses that order, though, so
+		 * there's no need for a byteswap.
+		 */
+		switch (block_size) {
+		case SHA3_224_BLOCK_SIZE:
+			cpacf_kimd(CPACF_KIMD_SHA3_224, state,
+				   data, nblocks * block_size);
+			return;
+		case SHA3_256_BLOCK_SIZE:
+			/*
+			 * This case handles both SHA3-256 and SHAKE256, since
+			 * they have the same block size.
+			 */
+			cpacf_kimd(CPACF_KIMD_SHA3_256, state,
+				   data, nblocks * block_size);
+			return;
+		case SHA3_384_BLOCK_SIZE:
+			cpacf_kimd(CPACF_KIMD_SHA3_384, state,
+				   data, nblocks * block_size);
+			return;
+		case SHA3_512_BLOCK_SIZE:
+			cpacf_kimd(CPACF_KIMD_SHA3_512, state,
+				   data, nblocks * block_size);
+			return;
+		}
+	}
+	sha3_absorb_blocks_generic(state, data, nblocks, block_size);
+}
+
+static void sha3_keccakf(struct sha3_state *state)
+{
+	if (static_branch_likely(&have_sha3)) {
+		/*
+		 * Passing zeroes into any of CPACF_KIMD_SHA3_* gives the plain
+		 * Keccak-f permutation, which is what we want here.  Use
+		 * SHA3-512 since it has the smallest block size.
+		 */
+		static const u8 zeroes[SHA3_512_BLOCK_SIZE];
+
+		cpacf_kimd(CPACF_KIMD_SHA3_512, state, zeroes, sizeof(zeroes));
+	} else {
+		sha3_keccakf_generic(state);
+	}
+}
+
+static inline bool s390_sha3(int func, const u8 *in, size_t in_len,
+			     u8 *out, size_t out_len)
+{
+	struct sha3_state state;
+
+	if (!static_branch_likely(&have_sha3))
+		return false;
+
+	if (static_branch_likely(&have_sha3_init_optim))
+		func |= CPACF_KLMD_NIP | CPACF_KLMD_DUFOP;
+	else
+		memset(&state, 0, sizeof(state));
+
+	cpacf_klmd(func, &state, in, in_len);
+
+	if (static_branch_likely(&have_sha3_init_optim))
+		kmsan_unpoison_memory(&state, out_len);
+
+	memcpy(out, &state, out_len);
+	memzero_explicit(&state, sizeof(state));
+	return true;
+}
+
+#define sha3_224_arch sha3_224_arch
+static bool sha3_224_arch(const u8 *in, size_t in_len,
+			  u8 out[SHA3_224_DIGEST_SIZE])
+{
+	return s390_sha3(CPACF_KLMD_SHA3_224, in, in_len,
+			 out, SHA3_224_DIGEST_SIZE);
+}
+
+#define sha3_256_arch sha3_256_arch
+static bool sha3_256_arch(const u8 *in, size_t in_len,
+			  u8 out[SHA3_256_DIGEST_SIZE])
+{
+	return s390_sha3(CPACF_KLMD_SHA3_256, in, in_len,
+			 out, SHA3_256_DIGEST_SIZE);
+}
+
+#define sha3_384_arch sha3_384_arch
+static bool sha3_384_arch(const u8 *in, size_t in_len,
+			  u8 out[SHA3_384_DIGEST_SIZE])
+{
+	return s390_sha3(CPACF_KLMD_SHA3_384, in, in_len,
+			 out, SHA3_384_DIGEST_SIZE);
+}
+
+#define sha3_512_arch sha3_512_arch
+static bool sha3_512_arch(const u8 *in, size_t in_len,
+			  u8 out[SHA3_512_DIGEST_SIZE])
+{
+	return s390_sha3(CPACF_KLMD_SHA3_512, in, in_len,
+			 out, SHA3_512_DIGEST_SIZE);
+}
+
+#define sha3_mod_init_arch sha3_mod_init_arch
+static void sha3_mod_init_arch(void)
+{
+	int num_present = 0;
+	int num_possible = 0;
+
+	if (!cpu_have_feature(S390_CPU_FEATURE_MSA))
+		return;
+	/*
+	 * Since all the SHA-3 functions are in Message-Security-Assist
+	 * Extension 6, just treat them as all or nothing.  This way we need
+	 * only one static_key.
+	 */
+#define QUERY(opcode, func) \
+	({ num_present += !!cpacf_query_func(opcode, func); num_possible++; })
+	QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_224);
+	QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_256);
+	QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_384);
+	QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_512);
+	QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_224);
+	QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_256);
+	QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_384);
+	QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_512);
+#undef QUERY
+
+	if (num_present == num_possible) {
+		static_branch_enable(&have_sha3);
+		if (test_facility(86))
+			static_branch_enable(&have_sha3_init_optim);
+	} else if (num_present != 0) {
+		pr_warn("Unsupported combination of SHA-3 facilities\n");
+	}
+}
diff --git a/lib/crypto/s390/sha512.h b/lib/crypto/s390/sha512.h
new file mode 100644
index 000000000000..46699d43df7e
--- /dev/null
+++ b/lib/crypto/s390/sha512.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-512 optimized using the CP Assist for Cryptographic Functions (CPACF)
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/cpacf.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha512);
+
+static void sha512_blocks(struct sha512_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_cpacf_sha512))
+		cpacf_kimd(CPACF_KIMD_SHA_512, state, data,
+			   nblocks * SHA512_BLOCK_SIZE);
+	else
+		sha512_blocks_generic(state, data, nblocks);
+}
+
+#define sha512_mod_init_arch sha512_mod_init_arch
+static void sha512_mod_init_arch(void)
+{
+	if (cpu_have_feature(S390_CPU_FEATURE_MSA) &&
+	    cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_512))
+		static_branch_enable(&have_cpacf_sha512);
+}
diff --git a/lib/crypto/sha1.c b/lib/crypto/sha1.c
index 1aebe7be9401..52788278cd17 100644
--- a/lib/crypto/sha1.c
+++ b/lib/crypto/sha1.c
@@ -1,18 +1,22 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * SHA1 routine optimized to do word accesses rather than byte accesses,
- * and to avoid unnecessary copies into the context array.
- *
- * This was based on the git SHA1 implementation.
+ * SHA-1 and HMAC-SHA1 library functions
  */
 
-#include <linux/kernel.h>
+#include <crypto/hmac.h>
+#include <crypto/sha1.h>
+#include <linux/bitops.h>
 #include <linux/export.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/bitops.h>
 #include <linux/string.h>
-#include <crypto/sha1.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
+#include <linux/wordpart.h>
+#include "fips.h"
+
+static const struct sha1_block_state sha1_iv = {
+	.h = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
+};
 
 /*
  * If you have 32 registers or more, the compiler can (and should)
@@ -124,10 +128,10 @@ void sha1_transform(__u32 *digest, const char *data, __u32 *array)
 EXPORT_SYMBOL(sha1_transform);
 
 /**
- * sha1_init - initialize the vectors for a SHA1 digest
+ * sha1_init_raw - initialize the vectors for a SHA1 digest
  * @buf: vector to initialize
  */
-void sha1_init(__u32 *buf)
+void sha1_init_raw(__u32 *buf)
 {
 	buf[0] = 0x67452301;
 	buf[1] = 0xefcdab89;
@@ -135,6 +139,227 @@ void sha1_init(__u32 *buf)
 	buf[3] = 0x10325476;
 	buf[4] = 0xc3d2e1f0;
 }
-EXPORT_SYMBOL(sha1_init);
+EXPORT_SYMBOL(sha1_init_raw);
+
+static void __maybe_unused sha1_blocks_generic(struct sha1_block_state *state,
+					       const u8 *data, size_t nblocks)
+{
+	u32 workspace[SHA1_WORKSPACE_WORDS];
+
+	do {
+		sha1_transform(state->h, data, workspace);
+		data += SHA1_BLOCK_SIZE;
+	} while (--nblocks);
+
+	memzero_explicit(workspace, sizeof(workspace));
+}
+
+#ifdef CONFIG_CRYPTO_LIB_SHA1_ARCH
+#include "sha1.h" /* $(SRCARCH)/sha1.h */
+#else
+#define sha1_blocks sha1_blocks_generic
+#endif
+
+void sha1_init(struct sha1_ctx *ctx)
+{
+	ctx->state = sha1_iv;
+	ctx->bytecount = 0;
+}
+EXPORT_SYMBOL_GPL(sha1_init);
+
+void sha1_update(struct sha1_ctx *ctx, const u8 *data, size_t len)
+{
+	size_t partial = ctx->bytecount % SHA1_BLOCK_SIZE;
+
+	ctx->bytecount += len;
+
+	if (partial + len >= SHA1_BLOCK_SIZE) {
+		size_t nblocks;
+
+		if (partial) {
+			size_t l = SHA1_BLOCK_SIZE - partial;
+
+			memcpy(&ctx->buf[partial], data, l);
+			data += l;
+			len -= l;
+
+			sha1_blocks(&ctx->state, ctx->buf, 1);
+		}
+
+		nblocks = len / SHA1_BLOCK_SIZE;
+		len %= SHA1_BLOCK_SIZE;
+
+		if (nblocks) {
+			sha1_blocks(&ctx->state, data, nblocks);
+			data += nblocks * SHA1_BLOCK_SIZE;
+		}
+		partial = 0;
+	}
+	if (len)
+		memcpy(&ctx->buf[partial], data, len);
+}
+EXPORT_SYMBOL_GPL(sha1_update);
+
+static void __sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE])
+{
+	u64 bitcount = ctx->bytecount << 3;
+	size_t partial = ctx->bytecount % SHA1_BLOCK_SIZE;
+
+	ctx->buf[partial++] = 0x80;
+	if (partial > SHA1_BLOCK_SIZE - 8) {
+		memset(&ctx->buf[partial], 0, SHA1_BLOCK_SIZE - partial);
+		sha1_blocks(&ctx->state, ctx->buf, 1);
+		partial = 0;
+	}
+	memset(&ctx->buf[partial], 0, SHA1_BLOCK_SIZE - 8 - partial);
+	*(__be64 *)&ctx->buf[SHA1_BLOCK_SIZE - 8] = cpu_to_be64(bitcount);
+	sha1_blocks(&ctx->state, ctx->buf, 1);
+
+	for (size_t i = 0; i < SHA1_DIGEST_SIZE; i += 4)
+		put_unaligned_be32(ctx->state.h[i / 4], out + i);
+}
+
+void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE])
+{
+	__sha1_final(ctx, out);
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(sha1_final);
+
+void sha1(const u8 *data, size_t len, u8 out[SHA1_DIGEST_SIZE])
+{
+	struct sha1_ctx ctx;
+
+	sha1_init(&ctx);
+	sha1_update(&ctx, data, len);
+	sha1_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha1);
+
+static void __hmac_sha1_preparekey(struct sha1_block_state *istate,
+				   struct sha1_block_state *ostate,
+				   const u8 *raw_key, size_t raw_key_len)
+{
+	union {
+		u8 b[SHA1_BLOCK_SIZE];
+		unsigned long w[SHA1_BLOCK_SIZE / sizeof(unsigned long)];
+	} derived_key = { 0 };
+
+	if (unlikely(raw_key_len > SHA1_BLOCK_SIZE))
+		sha1(raw_key, raw_key_len, derived_key.b);
+	else
+		memcpy(derived_key.b, raw_key, raw_key_len);
+
+	for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+		derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
+	*istate = sha1_iv;
+	sha1_blocks(istate, derived_key.b, 1);
+
+	for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+		derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
+						HMAC_IPAD_VALUE);
+	*ostate = sha1_iv;
+	sha1_blocks(ostate, derived_key.b, 1);
+
+	memzero_explicit(&derived_key, sizeof(derived_key));
+}
+
+void hmac_sha1_preparekey(struct hmac_sha1_key *key,
+			  const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_sha1_preparekey(&key->istate, &key->ostate,
+			       raw_key, raw_key_len);
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_preparekey);
+
+void hmac_sha1_init(struct hmac_sha1_ctx *ctx, const struct hmac_sha1_key *key)
+{
+	ctx->sha_ctx.state = key->istate;
+	ctx->sha_ctx.bytecount = SHA1_BLOCK_SIZE;
+	ctx->ostate = key->ostate;
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_init);
+
+void hmac_sha1_init_usingrawkey(struct hmac_sha1_ctx *ctx,
+				const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_sha1_preparekey(&ctx->sha_ctx.state, &ctx->ostate,
+			       raw_key, raw_key_len);
+	ctx->sha_ctx.bytecount = SHA1_BLOCK_SIZE;
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_init_usingrawkey);
+
+void hmac_sha1_final(struct hmac_sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE])
+{
+	/* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */
+	__sha1_final(&ctx->sha_ctx, ctx->sha_ctx.buf);
+	memset(&ctx->sha_ctx.buf[SHA1_DIGEST_SIZE], 0,
+	       SHA1_BLOCK_SIZE - SHA1_DIGEST_SIZE);
+	ctx->sha_ctx.buf[SHA1_DIGEST_SIZE] = 0x80;
+	*(__be32 *)&ctx->sha_ctx.buf[SHA1_BLOCK_SIZE - 4] =
+		cpu_to_be32(8 * (SHA1_BLOCK_SIZE + SHA1_DIGEST_SIZE));
+
+	/* Compute the outer hash, which gives the HMAC value. */
+	sha1_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1);
+	for (size_t i = 0; i < SHA1_DIGEST_SIZE; i += 4)
+		put_unaligned_be32(ctx->ostate.h[i / 4], out + i);
+
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_final);
+
+void hmac_sha1(const struct hmac_sha1_key *key,
+	       const u8 *data, size_t data_len, u8 out[SHA1_DIGEST_SIZE])
+{
+	struct hmac_sha1_ctx ctx;
+
+	hmac_sha1_init(&ctx, key);
+	hmac_sha1_update(&ctx, data, data_len);
+	hmac_sha1_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha1);
+
+void hmac_sha1_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+			   const u8 *data, size_t data_len,
+			   u8 out[SHA1_DIGEST_SIZE])
+{
+	struct hmac_sha1_ctx ctx;
+
+	hmac_sha1_init_usingrawkey(&ctx, raw_key, raw_key_len);
+	hmac_sha1_update(&ctx, data, data_len);
+	hmac_sha1_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_usingrawkey);
+
+#if defined(sha1_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
+static int __init sha1_mod_init(void)
+{
+#ifdef sha1_mod_init_arch
+	sha1_mod_init_arch();
+#endif
+	if (fips_enabled) {
+		/*
+		 * FIPS cryptographic algorithm self-test.  As per the FIPS
+		 * Implementation Guidance, testing HMAC-SHA1 satisfies the test
+		 * requirement for SHA-1 too.
+		 */
+		u8 mac[SHA1_DIGEST_SIZE];
+
+		hmac_sha1_usingrawkey(fips_test_key, sizeof(fips_test_key),
+				      fips_test_data, sizeof(fips_test_data),
+				      mac);
+		if (memcmp(fips_test_hmac_sha1_value, mac, sizeof(mac)) != 0)
+			panic("sha1: FIPS self-test failed\n");
+	}
+	return 0;
+}
+subsys_initcall(sha1_mod_init);
+
+static void __exit sha1_mod_exit(void)
+{
+}
+module_exit(sha1_mod_exit);
+#endif
 
+MODULE_DESCRIPTION("SHA-1 and HMAC-SHA1 library functions");
 MODULE_LICENSE("GPL");
diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c
index 72a4b0b1df28..5d6b77e7e141 100644
--- a/lib/crypto/sha256.c
+++ b/lib/crypto/sha256.c
@@ -1,56 +1,65 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * SHA-256, as specified in
- * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
- *
- * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>.
+ * SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions
  *
  * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
  * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
  * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
  * Copyright (c) 2014 Red Hat Inc.
+ * Copyright 2025 Google LLC
  */
 
-#include <linux/bitops.h>
+#include <crypto/hmac.h>
+#include <crypto/sha2.h>
 #include <linux/export.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/string.h>
-#include <crypto/sha2.h>
-#include <asm/unaligned.h>
-
-static const u32 SHA256_K[] = {
-	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
-	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
-	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
-	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
-	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
-	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
-	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
-	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
-	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
-	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
-	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
-	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
-	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
-	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
-	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
-	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
+#include <linux/unaligned.h>
+#include <linux/wordpart.h>
+#include "fips.h"
+
+static const struct sha256_block_state sha224_iv = {
+	.h = {
+		SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3,
+		SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7,
+	},
 };
 
-static inline u32 Ch(u32 x, u32 y, u32 z)
-{
-	return z ^ (x & (y ^ z));
-}
+static const struct sha256_ctx initial_sha256_ctx = {
+	.ctx = {
+		.state = {
+			.h = {
+				SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
+				SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
+			},
+		},
+		.bytecount = 0,
+	},
+};
 
-static inline u32 Maj(u32 x, u32 y, u32 z)
-{
-	return (x & y) | (z & (x | y));
-}
+#define sha256_iv (initial_sha256_ctx.ctx.state)
+
+static const u32 sha256_K[64] = {
+	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
+	0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
+	0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
+	0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
+	0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
+	0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
+};
 
-#define e0(x)       (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22))
-#define e1(x)       (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25))
-#define s0(x)       (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3))
-#define s1(x)       (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10))
+#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define Maj(x, y, z) (((x) & (y)) | ((z) & ((x) | (y))))
+#define e0(x) (ror32((x), 2) ^ ror32((x), 13) ^ ror32((x), 22))
+#define e1(x) (ror32((x), 6) ^ ror32((x), 11) ^ ror32((x), 25))
+#define s0(x) (ror32((x), 7) ^ ror32((x), 18) ^ ((x) >> 3))
+#define s1(x) (ror32((x), 17) ^ ror32((x), 19) ^ ((x) >> 10))
 
 static inline void LOAD_OP(int I, u32 *W, const u8 *input)
 {
@@ -59,18 +68,20 @@ static inline void LOAD_OP(int I, u32 *W, const u8 *input)
 
 static inline void BLEND_OP(int I, u32 *W)
 {
-	W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16];
+	W[I] = s1(W[I - 2]) + W[I - 7] + s0(W[I - 15]) + W[I - 16];
 }
 
-#define SHA256_ROUND(i, a, b, c, d, e, f, g, h) do {		\
-	u32 t1, t2;						\
-	t1 = h + e1(e) + Ch(e, f, g) + SHA256_K[i] + W[i];	\
-	t2 = e0(a) + Maj(a, b, c);				\
-	d += t1;						\
-	h = t1 + t2;						\
-} while (0)
-
-static void sha256_transform(u32 *state, const u8 *input, u32 *W)
+#define SHA256_ROUND(i, a, b, c, d, e, f, g, h)                    \
+	do {                                                       \
+		u32 t1, t2;                                        \
+		t1 = h + e1(e) + Ch(e, f, g) + sha256_K[i] + W[i]; \
+		t2 = e0(a) + Maj(a, b, c);                         \
+		d += t1;                                           \
+		h = t1 + t2;                                       \
+	} while (0)
+
+static void sha256_block_generic(struct sha256_block_state *state,
+				 const u8 *input, u32 W[64])
 {
 	u32 a, b, c, d, e, f, g, h;
 	int i;
@@ -100,8 +111,14 @@ static void sha256_transform(u32 *state, const u8 *input, u32 *W)
 	}
 
 	/* load the state into our registers */
-	a = state[0];  b = state[1];  c = state[2];  d = state[3];
-	e = state[4];  f = state[5];  g = state[6];  h = state[7];
+	a = state->h[0];
+	b = state->h[1];
+	c = state->h[2];
+	d = state->h[3];
+	e = state->h[4];
+	f = state->h[5];
+	g = state->h[6];
+	h = state->h[7];
 
 	/* now iterate */
 	for (i = 0; i < 64; i += 8) {
@@ -115,95 +132,384 @@ static void sha256_transform(u32 *state, const u8 *input, u32 *W)
 		SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a);
 	}
 
-	state[0] += a; state[1] += b; state[2] += c; state[3] += d;
-	state[4] += e; state[5] += f; state[6] += g; state[7] += h;
+	state->h[0] += a;
+	state->h[1] += b;
+	state->h[2] += c;
+	state->h[3] += d;
+	state->h[4] += e;
+	state->h[5] += f;
+	state->h[6] += g;
+	state->h[7] += h;
 }
 
-void sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len)
+static void __maybe_unused
+sha256_blocks_generic(struct sha256_block_state *state,
+		      const u8 *data, size_t nblocks)
 {
-	unsigned int partial, done;
-	const u8 *src;
 	u32 W[64];
 
-	partial = sctx->count & 0x3f;
-	sctx->count += len;
-	done = 0;
-	src = data;
+	do {
+		sha256_block_generic(state, data, W);
+		data += SHA256_BLOCK_SIZE;
+	} while (--nblocks);
+
+	memzero_explicit(W, sizeof(W));
+}
+
+#if defined(CONFIG_CRYPTO_LIB_SHA256_ARCH) && !defined(__DISABLE_EXPORTS)
+#include "sha256.h" /* $(SRCARCH)/sha256.h */
+#else
+#define sha256_blocks sha256_blocks_generic
+#endif
+
+static void __sha256_init(struct __sha256_ctx *ctx,
+			  const struct sha256_block_state *iv,
+			  u64 initial_bytecount)
+{
+	ctx->state = *iv;
+	ctx->bytecount = initial_bytecount;
+}
+
+void sha224_init(struct sha224_ctx *ctx)
+{
+	__sha256_init(&ctx->ctx, &sha224_iv, 0);
+}
+EXPORT_SYMBOL_GPL(sha224_init);
+
+void sha256_init(struct sha256_ctx *ctx)
+{
+	__sha256_init(&ctx->ctx, &sha256_iv, 0);
+}
+EXPORT_SYMBOL_GPL(sha256_init);
+
+void __sha256_update(struct __sha256_ctx *ctx, const u8 *data, size_t len)
+{
+	size_t partial = ctx->bytecount % SHA256_BLOCK_SIZE;
+
+	ctx->bytecount += len;
+
+	if (partial + len >= SHA256_BLOCK_SIZE) {
+		size_t nblocks;
 
-	if ((partial + len) > 63) {
 		if (partial) {
-			done = -partial;
-			memcpy(sctx->buf + partial, data, done + 64);
-			src = sctx->buf;
+			size_t l = SHA256_BLOCK_SIZE - partial;
+
+			memcpy(&ctx->buf[partial], data, l);
+			data += l;
+			len -= l;
+
+			sha256_blocks(&ctx->state, ctx->buf, 1);
 		}
 
-		do {
-			sha256_transform(sctx->state, src, W);
-			done += 64;
-			src = data + done;
-		} while (done + 63 < len);
+		nblocks = len / SHA256_BLOCK_SIZE;
+		len %= SHA256_BLOCK_SIZE;
+
+		if (nblocks) {
+			sha256_blocks(&ctx->state, data, nblocks);
+			data += nblocks * SHA256_BLOCK_SIZE;
+		}
+		partial = 0;
+	}
+	if (len)
+		memcpy(&ctx->buf[partial], data, len);
+}
+EXPORT_SYMBOL(__sha256_update);
 
-		memzero_explicit(W, sizeof(W));
+static void __sha256_final(struct __sha256_ctx *ctx,
+			   u8 *out, size_t digest_size)
+{
+	u64 bitcount = ctx->bytecount << 3;
+	size_t partial = ctx->bytecount % SHA256_BLOCK_SIZE;
 
+	ctx->buf[partial++] = 0x80;
+	if (partial > SHA256_BLOCK_SIZE - 8) {
+		memset(&ctx->buf[partial], 0, SHA256_BLOCK_SIZE - partial);
+		sha256_blocks(&ctx->state, ctx->buf, 1);
 		partial = 0;
 	}
-	memcpy(sctx->buf + partial, src, len - done);
+	memset(&ctx->buf[partial], 0, SHA256_BLOCK_SIZE - 8 - partial);
+	*(__be64 *)&ctx->buf[SHA256_BLOCK_SIZE - 8] = cpu_to_be64(bitcount);
+	sha256_blocks(&ctx->state, ctx->buf, 1);
+
+	for (size_t i = 0; i < digest_size; i += 4)
+		put_unaligned_be32(ctx->state.h[i / 4], out + i);
 }
-EXPORT_SYMBOL(sha256_update);
 
-void sha224_update(struct sha256_state *sctx, const u8 *data, unsigned int len)
+void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE])
 {
-	sha256_update(sctx, data, len);
+	__sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE);
+	memzero_explicit(ctx, sizeof(*ctx));
 }
-EXPORT_SYMBOL(sha224_update);
+EXPORT_SYMBOL(sha224_final);
 
-static void __sha256_final(struct sha256_state *sctx, u8 *out, int digest_words)
+void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE])
 {
-	__be32 *dst = (__be32 *)out;
-	__be64 bits;
-	unsigned int index, pad_len;
-	int i;
-	static const u8 padding[64] = { 0x80, };
+	__sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE);
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL(sha256_final);
+
+void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE])
+{
+	struct sha224_ctx ctx;
 
-	/* Save number of bits */
-	bits = cpu_to_be64(sctx->count << 3);
+	sha224_init(&ctx);
+	sha224_update(&ctx, data, len);
+	sha224_final(&ctx, out);
+}
+EXPORT_SYMBOL(sha224);
 
-	/* Pad out to 56 mod 64. */
-	index = sctx->count & 0x3f;
-	pad_len = (index < 56) ? (56 - index) : ((64+56) - index);
-	sha256_update(sctx, padding, pad_len);
+void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE])
+{
+	struct sha256_ctx ctx;
 
-	/* Append length (before padding) */
-	sha256_update(sctx, (const u8 *)&bits, sizeof(bits));
+	sha256_init(&ctx);
+	sha256_update(&ctx, data, len);
+	sha256_final(&ctx, out);
+}
+EXPORT_SYMBOL(sha256);
 
-	/* Store state in digest */
-	for (i = 0; i < digest_words; i++)
-		put_unaligned_be32(sctx->state[i], &dst[i]);
+/*
+ * Pre-boot environments (as indicated by __DISABLE_EXPORTS being defined) just
+ * need the generic SHA-256 code.  Omit all other features from them.
+ */
+#ifndef __DISABLE_EXPORTS
 
-	/* Zeroize sensitive information. */
-	memzero_explicit(sctx, sizeof(*sctx));
+#ifndef sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+				 const u8 *data1, const u8 *data2, size_t len,
+				 u8 out1[SHA256_DIGEST_SIZE],
+				 u8 out2[SHA256_DIGEST_SIZE])
+{
+	return false;
+}
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+	return false;
 }
+#endif
 
-void sha256_final(struct sha256_state *sctx, u8 *out)
+/* Sequential fallback implementation of sha256_finup_2x() */
+static noinline_for_stack void sha256_finup_2x_sequential(
+	const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2,
+	size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE])
 {
-	__sha256_final(sctx, out, 8);
+	struct __sha256_ctx mut_ctx;
+
+	mut_ctx = *ctx;
+	__sha256_update(&mut_ctx, data1, len);
+	__sha256_final(&mut_ctx, out1, SHA256_DIGEST_SIZE);
+
+	mut_ctx = *ctx;
+	__sha256_update(&mut_ctx, data2, len);
+	__sha256_final(&mut_ctx, out2, SHA256_DIGEST_SIZE);
 }
-EXPORT_SYMBOL(sha256_final);
 
-void sha224_final(struct sha256_state *sctx, u8 *out)
+void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1,
+		     const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE],
+		     u8 out2[SHA256_DIGEST_SIZE])
+{
+	if (ctx == NULL)
+		ctx = &initial_sha256_ctx;
+
+	if (likely(sha256_finup_2x_arch(&ctx->ctx, data1, data2, len, out1,
+					out2)))
+		return;
+	sha256_finup_2x_sequential(&ctx->ctx, data1, data2, len, out1, out2);
+}
+EXPORT_SYMBOL_GPL(sha256_finup_2x);
+
+bool sha256_finup_2x_is_optimized(void)
 {
-	__sha256_final(sctx, out, 7);
+	return sha256_finup_2x_is_optimized_arch();
 }
-EXPORT_SYMBOL(sha224_final);
+EXPORT_SYMBOL_GPL(sha256_finup_2x_is_optimized);
+
+static void __hmac_sha256_preparekey(struct sha256_block_state *istate,
+				     struct sha256_block_state *ostate,
+				     const u8 *raw_key, size_t raw_key_len,
+				     const struct sha256_block_state *iv)
+{
+	union {
+		u8 b[SHA256_BLOCK_SIZE];
+		unsigned long w[SHA256_BLOCK_SIZE / sizeof(unsigned long)];
+	} derived_key = { 0 };
+
+	if (unlikely(raw_key_len > SHA256_BLOCK_SIZE)) {
+		if (iv == &sha224_iv)
+			sha224(raw_key, raw_key_len, derived_key.b);
+		else
+			sha256(raw_key, raw_key_len, derived_key.b);
+	} else {
+		memcpy(derived_key.b, raw_key, raw_key_len);
+	}
 
-void sha256(const u8 *data, unsigned int len, u8 *out)
+	for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+		derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
+	*istate = *iv;
+	sha256_blocks(istate, derived_key.b, 1);
+
+	for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+		derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
+						HMAC_IPAD_VALUE);
+	*ostate = *iv;
+	sha256_blocks(ostate, derived_key.b, 1);
+
+	memzero_explicit(&derived_key, sizeof(derived_key));
+}
+
+void hmac_sha224_preparekey(struct hmac_sha224_key *key,
+			    const u8 *raw_key, size_t raw_key_len)
 {
-	struct sha256_state sctx;
+	__hmac_sha256_preparekey(&key->key.istate, &key->key.ostate,
+				 raw_key, raw_key_len, &sha224_iv);
+}
+EXPORT_SYMBOL_GPL(hmac_sha224_preparekey);
 
-	sha256_init(&sctx);
-	sha256_update(&sctx, data, len);
-	sha256_final(&sctx, out);
+void hmac_sha256_preparekey(struct hmac_sha256_key *key,
+			    const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_sha256_preparekey(&key->key.istate, &key->key.ostate,
+				 raw_key, raw_key_len, &sha256_iv);
 }
-EXPORT_SYMBOL(sha256);
+EXPORT_SYMBOL_GPL(hmac_sha256_preparekey);
+
+void __hmac_sha256_init(struct __hmac_sha256_ctx *ctx,
+			const struct __hmac_sha256_key *key)
+{
+	__sha256_init(&ctx->sha_ctx, &key->istate, SHA256_BLOCK_SIZE);
+	ctx->ostate = key->ostate;
+}
+EXPORT_SYMBOL_GPL(__hmac_sha256_init);
+
+void hmac_sha224_init_usingrawkey(struct hmac_sha224_ctx *ctx,
+				  const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
+				 raw_key, raw_key_len, &sha224_iv);
+	ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE;
+}
+EXPORT_SYMBOL_GPL(hmac_sha224_init_usingrawkey);
+
+void hmac_sha256_init_usingrawkey(struct hmac_sha256_ctx *ctx,
+				  const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
+				 raw_key, raw_key_len, &sha256_iv);
+	ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE;
+}
+EXPORT_SYMBOL_GPL(hmac_sha256_init_usingrawkey);
+
+static void __hmac_sha256_final(struct __hmac_sha256_ctx *ctx,
+				u8 *out, size_t digest_size)
+{
+	/* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */
+	__sha256_final(&ctx->sha_ctx, ctx->sha_ctx.buf, digest_size);
+	memset(&ctx->sha_ctx.buf[digest_size], 0,
+	       SHA256_BLOCK_SIZE - digest_size);
+	ctx->sha_ctx.buf[digest_size] = 0x80;
+	*(__be32 *)&ctx->sha_ctx.buf[SHA256_BLOCK_SIZE - 4] =
+		cpu_to_be32(8 * (SHA256_BLOCK_SIZE + digest_size));
+
+	/* Compute the outer hash, which gives the HMAC value. */
+	sha256_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1);
+	for (size_t i = 0; i < digest_size; i += 4)
+		put_unaligned_be32(ctx->ostate.h[i / 4], out + i);
+
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+
+void hmac_sha224_final(struct hmac_sha224_ctx *ctx,
+		       u8 out[SHA224_DIGEST_SIZE])
+{
+	__hmac_sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE);
+}
+EXPORT_SYMBOL_GPL(hmac_sha224_final);
+
+void hmac_sha256_final(struct hmac_sha256_ctx *ctx,
+		       u8 out[SHA256_DIGEST_SIZE])
+{
+	__hmac_sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE);
+}
+EXPORT_SYMBOL_GPL(hmac_sha256_final);
+
+void hmac_sha224(const struct hmac_sha224_key *key,
+		 const u8 *data, size_t data_len, u8 out[SHA224_DIGEST_SIZE])
+{
+	struct hmac_sha224_ctx ctx;
+
+	hmac_sha224_init(&ctx, key);
+	hmac_sha224_update(&ctx, data, data_len);
+	hmac_sha224_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha224);
+
+void hmac_sha256(const struct hmac_sha256_key *key,
+		 const u8 *data, size_t data_len, u8 out[SHA256_DIGEST_SIZE])
+{
+	struct hmac_sha256_ctx ctx;
+
+	hmac_sha256_init(&ctx, key);
+	hmac_sha256_update(&ctx, data, data_len);
+	hmac_sha256_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha256);
+
+void hmac_sha224_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+			     const u8 *data, size_t data_len,
+			     u8 out[SHA224_DIGEST_SIZE])
+{
+	struct hmac_sha224_ctx ctx;
+
+	hmac_sha224_init_usingrawkey(&ctx, raw_key, raw_key_len);
+	hmac_sha224_update(&ctx, data, data_len);
+	hmac_sha224_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha224_usingrawkey);
+
+void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+			     const u8 *data, size_t data_len,
+			     u8 out[SHA256_DIGEST_SIZE])
+{
+	struct hmac_sha256_ctx ctx;
+
+	hmac_sha256_init_usingrawkey(&ctx, raw_key, raw_key_len);
+	hmac_sha256_update(&ctx, data, data_len);
+	hmac_sha256_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha256_usingrawkey);
+
+#if defined(sha256_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
+static int __init sha256_mod_init(void)
+{
+#ifdef sha256_mod_init_arch
+	sha256_mod_init_arch();
+#endif
+	if (fips_enabled) {
+		/*
+		 * FIPS cryptographic algorithm self-test.  As per the FIPS
+		 * Implementation Guidance, testing HMAC-SHA256 satisfies the
+		 * test requirement for SHA-224, SHA-256, and HMAC-SHA224 too.
+		 */
+		u8 mac[SHA256_DIGEST_SIZE];
+
+		hmac_sha256_usingrawkey(fips_test_key, sizeof(fips_test_key),
+					fips_test_data, sizeof(fips_test_data),
+					mac);
+		if (memcmp(fips_test_hmac_sha256_value, mac, sizeof(mac)) != 0)
+			panic("sha256: FIPS self-test failed\n");
+	}
+	return 0;
+}
+subsys_initcall(sha256_mod_init);
+
+static void __exit sha256_mod_exit(void)
+{
+}
+module_exit(sha256_mod_exit);
+#endif
+
+#endif /* !__DISABLE_EXPORTS */
 
+MODULE_DESCRIPTION("SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions");
 MODULE_LICENSE("GPL");
diff --git a/lib/crypto/sha3.c b/lib/crypto/sha3.c
new file mode 100644
index 000000000000..32b7074de792
--- /dev/null
+++ b/lib/crypto/sha3.c
@@ -0,0 +1,411 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-3, as specified in
+ * https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
+ *
+ * SHA-3 code by Jeff Garzik <jeff@garzik.org>
+ *               Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ *               David Howells <dhowells@redhat.com>
+ *
+ * See also Documentation/crypto/sha3.rst
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <crypto/sha3.h>
+#include <crypto/utils.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/unaligned.h>
+#include "fips.h"
+
+/*
+ * On some 32-bit architectures, such as h8300, GCC ends up using over 1 KB of
+ * stack if the round calculation gets inlined into the loop in
+ * sha3_keccakf_generic().  On the other hand, on 64-bit architectures with
+ * plenty of [64-bit wide] general purpose registers, not inlining it severely
+ * hurts performance.  So let's use 64-bitness as a heuristic to decide whether
+ * to inline or not.
+ */
+#ifdef CONFIG_64BIT
+#define SHA3_INLINE inline
+#else
+#define SHA3_INLINE noinline
+#endif
+
+#define SHA3_KECCAK_ROUNDS 24
+
+static const u64 sha3_keccakf_rndc[SHA3_KECCAK_ROUNDS] = {
+	0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
+	0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
+	0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
+	0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
+	0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
+	0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
+	0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
+	0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
+};
+
+/*
+ * Perform a single round of Keccak mixing.
+ */
+static SHA3_INLINE void sha3_keccakf_one_round_generic(u64 st[25], int round)
+{
+	u64 t[5], tt, bc[5];
+
+	/* Theta */
+	bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
+	bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
+	bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
+	bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
+	bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
+
+	t[0] = bc[4] ^ rol64(bc[1], 1);
+	t[1] = bc[0] ^ rol64(bc[2], 1);
+	t[2] = bc[1] ^ rol64(bc[3], 1);
+	t[3] = bc[2] ^ rol64(bc[4], 1);
+	t[4] = bc[3] ^ rol64(bc[0], 1);
+
+	st[0] ^= t[0];
+
+	/* Rho Pi */
+	tt = st[1];
+	st[ 1] = rol64(st[ 6] ^ t[1], 44);
+	st[ 6] = rol64(st[ 9] ^ t[4], 20);
+	st[ 9] = rol64(st[22] ^ t[2], 61);
+	st[22] = rol64(st[14] ^ t[4], 39);
+	st[14] = rol64(st[20] ^ t[0], 18);
+	st[20] = rol64(st[ 2] ^ t[2], 62);
+	st[ 2] = rol64(st[12] ^ t[2], 43);
+	st[12] = rol64(st[13] ^ t[3], 25);
+	st[13] = rol64(st[19] ^ t[4],  8);
+	st[19] = rol64(st[23] ^ t[3], 56);
+	st[23] = rol64(st[15] ^ t[0], 41);
+	st[15] = rol64(st[ 4] ^ t[4], 27);
+	st[ 4] = rol64(st[24] ^ t[4], 14);
+	st[24] = rol64(st[21] ^ t[1],  2);
+	st[21] = rol64(st[ 8] ^ t[3], 55);
+	st[ 8] = rol64(st[16] ^ t[1], 45);
+	st[16] = rol64(st[ 5] ^ t[0], 36);
+	st[ 5] = rol64(st[ 3] ^ t[3], 28);
+	st[ 3] = rol64(st[18] ^ t[3], 21);
+	st[18] = rol64(st[17] ^ t[2], 15);
+	st[17] = rol64(st[11] ^ t[1], 10);
+	st[11] = rol64(st[ 7] ^ t[2],  6);
+	st[ 7] = rol64(st[10] ^ t[0],  3);
+	st[10] = rol64(    tt ^ t[1],  1);
+
+	/* Chi */
+	bc[ 0] = ~st[ 1] & st[ 2];
+	bc[ 1] = ~st[ 2] & st[ 3];
+	bc[ 2] = ~st[ 3] & st[ 4];
+	bc[ 3] = ~st[ 4] & st[ 0];
+	bc[ 4] = ~st[ 0] & st[ 1];
+	st[ 0] ^= bc[ 0];
+	st[ 1] ^= bc[ 1];
+	st[ 2] ^= bc[ 2];
+	st[ 3] ^= bc[ 3];
+	st[ 4] ^= bc[ 4];
+
+	bc[ 0] = ~st[ 6] & st[ 7];
+	bc[ 1] = ~st[ 7] & st[ 8];
+	bc[ 2] = ~st[ 8] & st[ 9];
+	bc[ 3] = ~st[ 9] & st[ 5];
+	bc[ 4] = ~st[ 5] & st[ 6];
+	st[ 5] ^= bc[ 0];
+	st[ 6] ^= bc[ 1];
+	st[ 7] ^= bc[ 2];
+	st[ 8] ^= bc[ 3];
+	st[ 9] ^= bc[ 4];
+
+	bc[ 0] = ~st[11] & st[12];
+	bc[ 1] = ~st[12] & st[13];
+	bc[ 2] = ~st[13] & st[14];
+	bc[ 3] = ~st[14] & st[10];
+	bc[ 4] = ~st[10] & st[11];
+	st[10] ^= bc[ 0];
+	st[11] ^= bc[ 1];
+	st[12] ^= bc[ 2];
+	st[13] ^= bc[ 3];
+	st[14] ^= bc[ 4];
+
+	bc[ 0] = ~st[16] & st[17];
+	bc[ 1] = ~st[17] & st[18];
+	bc[ 2] = ~st[18] & st[19];
+	bc[ 3] = ~st[19] & st[15];
+	bc[ 4] = ~st[15] & st[16];
+	st[15] ^= bc[ 0];
+	st[16] ^= bc[ 1];
+	st[17] ^= bc[ 2];
+	st[18] ^= bc[ 3];
+	st[19] ^= bc[ 4];
+
+	bc[ 0] = ~st[21] & st[22];
+	bc[ 1] = ~st[22] & st[23];
+	bc[ 2] = ~st[23] & st[24];
+	bc[ 3] = ~st[24] & st[20];
+	bc[ 4] = ~st[20] & st[21];
+	st[20] ^= bc[ 0];
+	st[21] ^= bc[ 1];
+	st[22] ^= bc[ 2];
+	st[23] ^= bc[ 3];
+	st[24] ^= bc[ 4];
+
+	/* Iota */
+	st[0] ^= sha3_keccakf_rndc[round];
+}
+
+/* Generic implementation of the Keccak-f[1600] permutation */
+static void sha3_keccakf_generic(struct sha3_state *state)
+{
+	/*
+	 * Temporarily convert the state words from little-endian to native-
+	 * endian so that they can be operated on.  Note that on little-endian
+	 * machines this conversion is a no-op and is optimized out.
+	 */
+
+	for (int i = 0; i < ARRAY_SIZE(state->words); i++)
+		state->native_words[i] = le64_to_cpu(state->words[i]);
+
+	for (int round = 0; round < SHA3_KECCAK_ROUNDS; round++)
+		sha3_keccakf_one_round_generic(state->native_words, round);
+
+	for (int i = 0; i < ARRAY_SIZE(state->words); i++)
+		state->words[i] = cpu_to_le64(state->native_words[i]);
+}
+
+/*
+ * Generic implementation of absorbing the given nonzero number of full blocks
+ * into the sponge function Keccak[r=8*block_size, c=1600-8*block_size].
+ */
+static void __maybe_unused
+sha3_absorb_blocks_generic(struct sha3_state *state, const u8 *data,
+			   size_t nblocks, size_t block_size)
+{
+	do {
+		for (size_t i = 0; i < block_size; i += 8)
+			state->words[i / 8] ^= get_unaligned((__le64 *)&data[i]);
+		sha3_keccakf_generic(state);
+		data += block_size;
+	} while (--nblocks);
+}
+
+#ifdef CONFIG_CRYPTO_LIB_SHA3_ARCH
+#include "sha3.h" /* $(SRCARCH)/sha3.h */
+#else
+#define sha3_keccakf		sha3_keccakf_generic
+#define sha3_absorb_blocks	sha3_absorb_blocks_generic
+#endif
+
+void __sha3_update(struct __sha3_ctx *ctx, const u8 *in, size_t in_len)
+{
+	const size_t block_size = ctx->block_size;
+	size_t absorb_offset = ctx->absorb_offset;
+
+	/* Warn if squeezing has already begun. */
+	WARN_ON_ONCE(absorb_offset >= block_size);
+
+	if (absorb_offset && absorb_offset + in_len >= block_size) {
+		crypto_xor(&ctx->state.bytes[absorb_offset], in,
+			   block_size - absorb_offset);
+		in += block_size - absorb_offset;
+		in_len -= block_size - absorb_offset;
+		sha3_keccakf(&ctx->state);
+		absorb_offset = 0;
+	}
+
+	if (in_len >= block_size) {
+		size_t nblocks = in_len / block_size;
+
+		sha3_absorb_blocks(&ctx->state, in, nblocks, block_size);
+		in += nblocks * block_size;
+		in_len -= nblocks * block_size;
+	}
+
+	if (in_len) {
+		crypto_xor(&ctx->state.bytes[absorb_offset], in, in_len);
+		absorb_offset += in_len;
+	}
+	ctx->absorb_offset = absorb_offset;
+}
+EXPORT_SYMBOL_GPL(__sha3_update);
+
+void sha3_final(struct sha3_ctx *sha3_ctx, u8 *out)
+{
+	struct __sha3_ctx *ctx = &sha3_ctx->ctx;
+
+	ctx->state.bytes[ctx->absorb_offset] ^= 0x06;
+	ctx->state.bytes[ctx->block_size - 1] ^= 0x80;
+	sha3_keccakf(&ctx->state);
+	memcpy(out, ctx->state.bytes, ctx->digest_size);
+	sha3_zeroize_ctx(sha3_ctx);
+}
+EXPORT_SYMBOL_GPL(sha3_final);
+
+void shake_squeeze(struct shake_ctx *shake_ctx, u8 *out, size_t out_len)
+{
+	struct __sha3_ctx *ctx = &shake_ctx->ctx;
+	const size_t block_size = ctx->block_size;
+	size_t squeeze_offset = ctx->squeeze_offset;
+
+	if (ctx->absorb_offset < block_size) {
+		/* First squeeze: */
+
+		/* Add the domain separation suffix and padding. */
+		ctx->state.bytes[ctx->absorb_offset] ^= 0x1f;
+		ctx->state.bytes[block_size - 1] ^= 0x80;
+
+		/* Indicate that squeezing has begun. */
+		ctx->absorb_offset = block_size;
+
+		/*
+		 * Indicate that no output is pending yet, i.e. sha3_keccakf()
+		 * will need to be called before the first copy.
+		 */
+		squeeze_offset = block_size;
+	}
+	while (out_len) {
+		if (squeeze_offset == block_size) {
+			sha3_keccakf(&ctx->state);
+			squeeze_offset = 0;
+		}
+		size_t copy = min(out_len, block_size - squeeze_offset);
+
+		memcpy(out, &ctx->state.bytes[squeeze_offset], copy);
+		out += copy;
+		out_len -= copy;
+		squeeze_offset += copy;
+	}
+	ctx->squeeze_offset = squeeze_offset;
+}
+EXPORT_SYMBOL_GPL(shake_squeeze);
+
+#ifndef sha3_224_arch
+static inline bool sha3_224_arch(const u8 *in, size_t in_len,
+				 u8 out[SHA3_224_DIGEST_SIZE])
+{
+	return false;
+}
+#endif
+#ifndef sha3_256_arch
+static inline bool sha3_256_arch(const u8 *in, size_t in_len,
+				 u8 out[SHA3_256_DIGEST_SIZE])
+{
+	return false;
+}
+#endif
+#ifndef sha3_384_arch
+static inline bool sha3_384_arch(const u8 *in, size_t in_len,
+				 u8 out[SHA3_384_DIGEST_SIZE])
+{
+	return false;
+}
+#endif
+#ifndef sha3_512_arch
+static inline bool sha3_512_arch(const u8 *in, size_t in_len,
+				 u8 out[SHA3_512_DIGEST_SIZE])
+{
+	return false;
+}
+#endif
+
+void sha3_224(const u8 *in, size_t in_len, u8 out[SHA3_224_DIGEST_SIZE])
+{
+	struct sha3_ctx ctx;
+
+	if (sha3_224_arch(in, in_len, out))
+		return;
+	sha3_224_init(&ctx);
+	sha3_update(&ctx, in, in_len);
+	sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_224);
+
+void sha3_256(const u8 *in, size_t in_len, u8 out[SHA3_256_DIGEST_SIZE])
+{
+	struct sha3_ctx ctx;
+
+	if (sha3_256_arch(in, in_len, out))
+		return;
+	sha3_256_init(&ctx);
+	sha3_update(&ctx, in, in_len);
+	sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_256);
+
+void sha3_384(const u8 *in, size_t in_len, u8 out[SHA3_384_DIGEST_SIZE])
+{
+	struct sha3_ctx ctx;
+
+	if (sha3_384_arch(in, in_len, out))
+		return;
+	sha3_384_init(&ctx);
+	sha3_update(&ctx, in, in_len);
+	sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_384);
+
+void sha3_512(const u8 *in, size_t in_len, u8 out[SHA3_512_DIGEST_SIZE])
+{
+	struct sha3_ctx ctx;
+
+	if (sha3_512_arch(in, in_len, out))
+		return;
+	sha3_512_init(&ctx);
+	sha3_update(&ctx, in, in_len);
+	sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_512);
+
+void shake128(const u8 *in, size_t in_len, u8 *out, size_t out_len)
+{
+	struct shake_ctx ctx;
+
+	shake128_init(&ctx);
+	shake_update(&ctx, in, in_len);
+	shake_squeeze(&ctx, out, out_len);
+	shake_zeroize_ctx(&ctx);
+}
+EXPORT_SYMBOL_GPL(shake128);
+
+void shake256(const u8 *in, size_t in_len, u8 *out, size_t out_len)
+{
+	struct shake_ctx ctx;
+
+	shake256_init(&ctx);
+	shake_update(&ctx, in, in_len);
+	shake_squeeze(&ctx, out, out_len);
+	shake_zeroize_ctx(&ctx);
+}
+EXPORT_SYMBOL_GPL(shake256);
+
+#if defined(sha3_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
+static int __init sha3_mod_init(void)
+{
+#ifdef sha3_mod_init_arch
+	sha3_mod_init_arch();
+#endif
+	if (fips_enabled) {
+		/*
+		 * FIPS cryptographic algorithm self-test.  As per the FIPS
+		 * Implementation Guidance, testing any SHA-3 algorithm
+		 * satisfies the test requirement for all of them.
+		 */
+		u8 hash[SHA3_256_DIGEST_SIZE];
+
+		sha3_256(fips_test_data, sizeof(fips_test_data), hash);
+		if (memcmp(fips_test_sha3_256_value, hash, sizeof(hash)) != 0)
+			panic("sha3: FIPS self-test failed\n");
+	}
+	return 0;
+}
+subsys_initcall(sha3_mod_init);
+
+static void __exit sha3_mod_exit(void)
+{
+}
+module_exit(sha3_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("SHA-3 library functions");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/sha512.c b/lib/crypto/sha512.c
new file mode 100644
index 000000000000..605eab51aabd
--- /dev/null
+++ b/lib/crypto/sha512.c
@@ -0,0 +1,440 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2003 Kyle McMartin <kyle@debian.org>
+ * Copyright 2025 Google LLC
+ */
+
+#include <crypto/hmac.h>
+#include <crypto/sha2.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/overflow.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+#include <linux/wordpart.h>
+#include "fips.h"
+
+static const struct sha512_block_state sha384_iv = {
+	.h = {
+		SHA384_H0, SHA384_H1, SHA384_H2, SHA384_H3,
+		SHA384_H4, SHA384_H5, SHA384_H6, SHA384_H7,
+	},
+};
+
+static const struct sha512_block_state sha512_iv = {
+	.h = {
+		SHA512_H0, SHA512_H1, SHA512_H2, SHA512_H3,
+		SHA512_H4, SHA512_H5, SHA512_H6, SHA512_H7,
+	},
+};
+
+static const u64 sha512_K[80] = {
+	0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL,
+	0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+	0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL,
+	0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+	0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL,
+	0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+	0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL,
+	0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+	0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL,
+	0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+	0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL,
+	0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+	0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL,
+	0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+	0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL,
+	0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+	0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL,
+	0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+	0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL,
+	0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+	0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL,
+	0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+	0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL,
+	0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+	0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL,
+	0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+	0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
+};
+
+#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define Maj(x, y, z) (((x) & (y)) | ((z) & ((x) | (y))))
+#define e0(x) (ror64((x), 28) ^ ror64((x), 34) ^ ror64((x), 39))
+#define e1(x) (ror64((x), 14) ^ ror64((x), 18) ^ ror64((x), 41))
+#define s0(x) (ror64((x), 1) ^ ror64((x), 8) ^ ((x) >> 7))
+#define s1(x) (ror64((x), 19) ^ ror64((x), 61) ^ ((x) >> 6))
+
+static void sha512_block_generic(struct sha512_block_state *state,
+				 const u8 *data)
+{
+	u64 a = state->h[0];
+	u64 b = state->h[1];
+	u64 c = state->h[2];
+	u64 d = state->h[3];
+	u64 e = state->h[4];
+	u64 f = state->h[5];
+	u64 g = state->h[6];
+	u64 h = state->h[7];
+	u64 t1, t2;
+	u64 W[16];
+
+	for (int j = 0; j < 16; j++)
+		W[j] = get_unaligned_be64(data + j * sizeof(u64));
+
+	for (int i = 0; i < 80; i += 8) {
+		if ((i & 15) == 0 && i != 0) {
+			for (int j = 0; j < 16; j++) {
+				W[j & 15] += s1(W[(j - 2) & 15]) +
+					     W[(j - 7) & 15] +
+					     s0(W[(j - 15) & 15]);
+			}
+		}
+		t1 = h + e1(e) + Ch(e, f, g) + sha512_K[i]   + W[(i & 15)];
+		t2 = e0(a) + Maj(a, b, c);    d += t1;    h = t1 + t2;
+		t1 = g + e1(d) + Ch(d, e, f) + sha512_K[i+1] + W[(i & 15) + 1];
+		t2 = e0(h) + Maj(h, a, b);    c += t1;    g = t1 + t2;
+		t1 = f + e1(c) + Ch(c, d, e) + sha512_K[i+2] + W[(i & 15) + 2];
+		t2 = e0(g) + Maj(g, h, a);    b += t1;    f = t1 + t2;
+		t1 = e + e1(b) + Ch(b, c, d) + sha512_K[i+3] + W[(i & 15) + 3];
+		t2 = e0(f) + Maj(f, g, h);    a += t1;    e = t1 + t2;
+		t1 = d + e1(a) + Ch(a, b, c) + sha512_K[i+4] + W[(i & 15) + 4];
+		t2 = e0(e) + Maj(e, f, g);    h += t1;    d = t1 + t2;
+		t1 = c + e1(h) + Ch(h, a, b) + sha512_K[i+5] + W[(i & 15) + 5];
+		t2 = e0(d) + Maj(d, e, f);    g += t1;    c = t1 + t2;
+		t1 = b + e1(g) + Ch(g, h, a) + sha512_K[i+6] + W[(i & 15) + 6];
+		t2 = e0(c) + Maj(c, d, e);    f += t1;    b = t1 + t2;
+		t1 = a + e1(f) + Ch(f, g, h) + sha512_K[i+7] + W[(i & 15) + 7];
+		t2 = e0(b) + Maj(b, c, d);    e += t1;    a = t1 + t2;
+	}
+
+	state->h[0] += a;
+	state->h[1] += b;
+	state->h[2] += c;
+	state->h[3] += d;
+	state->h[4] += e;
+	state->h[5] += f;
+	state->h[6] += g;
+	state->h[7] += h;
+}
+
+static void __maybe_unused
+sha512_blocks_generic(struct sha512_block_state *state,
+		      const u8 *data, size_t nblocks)
+{
+	do {
+		sha512_block_generic(state, data);
+		data += SHA512_BLOCK_SIZE;
+	} while (--nblocks);
+}
+
+#ifdef CONFIG_CRYPTO_LIB_SHA512_ARCH
+#include "sha512.h" /* $(SRCARCH)/sha512.h */
+#else
+#define sha512_blocks sha512_blocks_generic
+#endif
+
+static void __sha512_init(struct __sha512_ctx *ctx,
+			  const struct sha512_block_state *iv,
+			  u64 initial_bytecount)
+{
+	ctx->state = *iv;
+	ctx->bytecount_lo = initial_bytecount;
+	ctx->bytecount_hi = 0;
+}
+
+void sha384_init(struct sha384_ctx *ctx)
+{
+	__sha512_init(&ctx->ctx, &sha384_iv, 0);
+}
+EXPORT_SYMBOL_GPL(sha384_init);
+
+void sha512_init(struct sha512_ctx *ctx)
+{
+	__sha512_init(&ctx->ctx, &sha512_iv, 0);
+}
+EXPORT_SYMBOL_GPL(sha512_init);
+
+void __sha512_update(struct __sha512_ctx *ctx, const u8 *data, size_t len)
+{
+	size_t partial = ctx->bytecount_lo % SHA512_BLOCK_SIZE;
+
+	if (check_add_overflow(ctx->bytecount_lo, len, &ctx->bytecount_lo))
+		ctx->bytecount_hi++;
+
+	if (partial + len >= SHA512_BLOCK_SIZE) {
+		size_t nblocks;
+
+		if (partial) {
+			size_t l = SHA512_BLOCK_SIZE - partial;
+
+			memcpy(&ctx->buf[partial], data, l);
+			data += l;
+			len -= l;
+
+			sha512_blocks(&ctx->state, ctx->buf, 1);
+		}
+
+		nblocks = len / SHA512_BLOCK_SIZE;
+		len %= SHA512_BLOCK_SIZE;
+
+		if (nblocks) {
+			sha512_blocks(&ctx->state, data, nblocks);
+			data += nblocks * SHA512_BLOCK_SIZE;
+		}
+		partial = 0;
+	}
+	if (len)
+		memcpy(&ctx->buf[partial], data, len);
+}
+EXPORT_SYMBOL_GPL(__sha512_update);
+
+static void __sha512_final(struct __sha512_ctx *ctx,
+			   u8 *out, size_t digest_size)
+{
+	u64 bitcount_hi = (ctx->bytecount_hi << 3) | (ctx->bytecount_lo >> 61);
+	u64 bitcount_lo = ctx->bytecount_lo << 3;
+	size_t partial = ctx->bytecount_lo % SHA512_BLOCK_SIZE;
+
+	ctx->buf[partial++] = 0x80;
+	if (partial > SHA512_BLOCK_SIZE - 16) {
+		memset(&ctx->buf[partial], 0, SHA512_BLOCK_SIZE - partial);
+		sha512_blocks(&ctx->state, ctx->buf, 1);
+		partial = 0;
+	}
+	memset(&ctx->buf[partial], 0, SHA512_BLOCK_SIZE - 16 - partial);
+	*(__be64 *)&ctx->buf[SHA512_BLOCK_SIZE - 16] = cpu_to_be64(bitcount_hi);
+	*(__be64 *)&ctx->buf[SHA512_BLOCK_SIZE - 8] = cpu_to_be64(bitcount_lo);
+	sha512_blocks(&ctx->state, ctx->buf, 1);
+
+	for (size_t i = 0; i < digest_size; i += 8)
+		put_unaligned_be64(ctx->state.h[i / 8], out + i);
+}
+
+void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE])
+{
+	__sha512_final(&ctx->ctx, out, SHA384_DIGEST_SIZE);
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(sha384_final);
+
+void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE])
+{
+	__sha512_final(&ctx->ctx, out, SHA512_DIGEST_SIZE);
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(sha512_final);
+
+void sha384(const u8 *data, size_t len, u8 out[SHA384_DIGEST_SIZE])
+{
+	struct sha384_ctx ctx;
+
+	sha384_init(&ctx);
+	sha384_update(&ctx, data, len);
+	sha384_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha384);
+
+void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE])
+{
+	struct sha512_ctx ctx;
+
+	sha512_init(&ctx);
+	sha512_update(&ctx, data, len);
+	sha512_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha512);
+
+static void __hmac_sha512_preparekey(struct sha512_block_state *istate,
+				     struct sha512_block_state *ostate,
+				     const u8 *raw_key, size_t raw_key_len,
+				     const struct sha512_block_state *iv)
+{
+	union {
+		u8 b[SHA512_BLOCK_SIZE];
+		unsigned long w[SHA512_BLOCK_SIZE / sizeof(unsigned long)];
+	} derived_key = { 0 };
+
+	if (unlikely(raw_key_len > SHA512_BLOCK_SIZE)) {
+		if (iv == &sha384_iv)
+			sha384(raw_key, raw_key_len, derived_key.b);
+		else
+			sha512(raw_key, raw_key_len, derived_key.b);
+	} else {
+		memcpy(derived_key.b, raw_key, raw_key_len);
+	}
+
+	for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+		derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
+	*istate = *iv;
+	sha512_blocks(istate, derived_key.b, 1);
+
+	for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+		derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
+						HMAC_IPAD_VALUE);
+	*ostate = *iv;
+	sha512_blocks(ostate, derived_key.b, 1);
+
+	memzero_explicit(&derived_key, sizeof(derived_key));
+}
+
+void hmac_sha384_preparekey(struct hmac_sha384_key *key,
+			    const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_sha512_preparekey(&key->key.istate, &key->key.ostate,
+				 raw_key, raw_key_len, &sha384_iv);
+}
+EXPORT_SYMBOL_GPL(hmac_sha384_preparekey);
+
+void hmac_sha512_preparekey(struct hmac_sha512_key *key,
+			    const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_sha512_preparekey(&key->key.istate, &key->key.ostate,
+				 raw_key, raw_key_len, &sha512_iv);
+}
+EXPORT_SYMBOL_GPL(hmac_sha512_preparekey);
+
+void __hmac_sha512_init(struct __hmac_sha512_ctx *ctx,
+			const struct __hmac_sha512_key *key)
+{
+	__sha512_init(&ctx->sha_ctx, &key->istate, SHA512_BLOCK_SIZE);
+	ctx->ostate = key->ostate;
+}
+EXPORT_SYMBOL_GPL(__hmac_sha512_init);
+
+void hmac_sha384_init_usingrawkey(struct hmac_sha384_ctx *ctx,
+				  const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_sha512_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
+				 raw_key, raw_key_len, &sha384_iv);
+	ctx->ctx.sha_ctx.bytecount_lo = SHA512_BLOCK_SIZE;
+	ctx->ctx.sha_ctx.bytecount_hi = 0;
+}
+EXPORT_SYMBOL_GPL(hmac_sha384_init_usingrawkey);
+
+void hmac_sha512_init_usingrawkey(struct hmac_sha512_ctx *ctx,
+				  const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_sha512_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
+				 raw_key, raw_key_len, &sha512_iv);
+	ctx->ctx.sha_ctx.bytecount_lo = SHA512_BLOCK_SIZE;
+	ctx->ctx.sha_ctx.bytecount_hi = 0;
+}
+EXPORT_SYMBOL_GPL(hmac_sha512_init_usingrawkey);
+
+static void __hmac_sha512_final(struct __hmac_sha512_ctx *ctx,
+				u8 *out, size_t digest_size)
+{
+	/* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */
+	__sha512_final(&ctx->sha_ctx, ctx->sha_ctx.buf, digest_size);
+	memset(&ctx->sha_ctx.buf[digest_size], 0,
+	       SHA512_BLOCK_SIZE - digest_size);
+	ctx->sha_ctx.buf[digest_size] = 0x80;
+	*(__be32 *)&ctx->sha_ctx.buf[SHA512_BLOCK_SIZE - 4] =
+		cpu_to_be32(8 * (SHA512_BLOCK_SIZE + digest_size));
+
+	/* Compute the outer hash, which gives the HMAC value. */
+	sha512_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1);
+	for (size_t i = 0; i < digest_size; i += 8)
+		put_unaligned_be64(ctx->ostate.h[i / 8], out + i);
+
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+
+void hmac_sha384_final(struct hmac_sha384_ctx *ctx,
+		       u8 out[SHA384_DIGEST_SIZE])
+{
+	__hmac_sha512_final(&ctx->ctx, out, SHA384_DIGEST_SIZE);
+}
+EXPORT_SYMBOL_GPL(hmac_sha384_final);
+
+void hmac_sha512_final(struct hmac_sha512_ctx *ctx,
+		       u8 out[SHA512_DIGEST_SIZE])
+{
+	__hmac_sha512_final(&ctx->ctx, out, SHA512_DIGEST_SIZE);
+}
+EXPORT_SYMBOL_GPL(hmac_sha512_final);
+
+void hmac_sha384(const struct hmac_sha384_key *key,
+		 const u8 *data, size_t data_len, u8 out[SHA384_DIGEST_SIZE])
+{
+	struct hmac_sha384_ctx ctx;
+
+	hmac_sha384_init(&ctx, key);
+	hmac_sha384_update(&ctx, data, data_len);
+	hmac_sha384_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha384);
+
+void hmac_sha512(const struct hmac_sha512_key *key,
+		 const u8 *data, size_t data_len, u8 out[SHA512_DIGEST_SIZE])
+{
+	struct hmac_sha512_ctx ctx;
+
+	hmac_sha512_init(&ctx, key);
+	hmac_sha512_update(&ctx, data, data_len);
+	hmac_sha512_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha512);
+
+void hmac_sha384_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+			     const u8 *data, size_t data_len,
+			     u8 out[SHA384_DIGEST_SIZE])
+{
+	struct hmac_sha384_ctx ctx;
+
+	hmac_sha384_init_usingrawkey(&ctx, raw_key, raw_key_len);
+	hmac_sha384_update(&ctx, data, data_len);
+	hmac_sha384_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha384_usingrawkey);
+
+void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+			     const u8 *data, size_t data_len,
+			     u8 out[SHA512_DIGEST_SIZE])
+{
+	struct hmac_sha512_ctx ctx;
+
+	hmac_sha512_init_usingrawkey(&ctx, raw_key, raw_key_len);
+	hmac_sha512_update(&ctx, data, data_len);
+	hmac_sha512_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha512_usingrawkey);
+
+#if defined(sha512_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
+static int __init sha512_mod_init(void)
+{
+#ifdef sha512_mod_init_arch
+	sha512_mod_init_arch();
+#endif
+	if (fips_enabled) {
+		/*
+		 * FIPS cryptographic algorithm self-test.  As per the FIPS
+		 * Implementation Guidance, testing HMAC-SHA512 satisfies the
+		 * test requirement for SHA-384, SHA-512, and HMAC-SHA384 too.
+		 */
+		u8 mac[SHA512_DIGEST_SIZE];
+
+		hmac_sha512_usingrawkey(fips_test_key, sizeof(fips_test_key),
+					fips_test_data, sizeof(fips_test_data),
+					mac);
+		if (memcmp(fips_test_hmac_sha512_value, mac, sizeof(mac)) != 0)
+			panic("sha512: FIPS self-test failed\n");
+	}
+	return 0;
+}
+subsys_initcall(sha512_mod_init);
+
+static void __exit sha512_mod_exit(void)
+{
+}
+module_exit(sha512_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/simd.c b/lib/crypto/simd.c
new file mode 100644
index 000000000000..9c36cb3bb49c
--- /dev/null
+++ b/lib/crypto/simd.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SIMD testing utility functions
+ *
+ * Copyright 2024 Google LLC
+ */
+
+#include <crypto/internal/simd.h>
+
+DEFINE_PER_CPU(bool, crypto_simd_disabled_for_test);
+EXPORT_PER_CPU_SYMBOL_GPL(crypto_simd_disabled_for_test);
diff --git a/lib/crypto/sm3.c b/lib/crypto/sm3.c
new file mode 100644
index 000000000000..c6b9ad8a3ac6
--- /dev/null
+++ b/lib/crypto/sm3.c
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SM3 secure hash, as specified by OSCCA GM/T 0004-2012 SM3 and described
+ * at https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02
+ *
+ * Copyright (C) 2017 ARM Limited or its affiliates.
+ * Copyright (C) 2017 Gilad Ben-Yossef <gilad@benyossef.com>
+ * Copyright (C) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <crypto/sm3.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+static const u32 ____cacheline_aligned K[64] = {
+	0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb,
+	0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc,
+	0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce,
+	0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6,
+	0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c,
+	0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce,
+	0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec,
+	0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5,
+	0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53,
+	0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d,
+	0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4,
+	0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43,
+	0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c,
+	0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce,
+	0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec,
+	0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
+};
+
+/*
+ * Transform the message X which consists of 16 32-bit-words. See
+ * GM/T 004-2012 for details.
+ */
+#define R(i, a, b, c, d, e, f, g, h, t, w1, w2)			\
+	do {							\
+		ss1 = rol32((rol32((a), 12) + (e) + (t)), 7);	\
+		ss2 = ss1 ^ rol32((a), 12);			\
+		d += FF ## i(a, b, c) + ss2 + ((w1) ^ (w2));	\
+		h += GG ## i(e, f, g) + ss1 + (w1);		\
+		b = rol32((b), 9);				\
+		f = rol32((f), 19);				\
+		h = P0((h));					\
+	} while (0)
+
+#define R1(a, b, c, d, e, f, g, h, t, w1, w2) \
+	R(1, a, b, c, d, e, f, g, h, t, w1, w2)
+#define R2(a, b, c, d, e, f, g, h, t, w1, w2) \
+	R(2, a, b, c, d, e, f, g, h, t, w1, w2)
+
+#define FF1(x, y, z)  (x ^ y ^ z)
+#define FF2(x, y, z)  ((x & y) | (x & z) | (y & z))
+
+#define GG1(x, y, z)  FF1(x, y, z)
+#define GG2(x, y, z)  ((x & y) | (~x & z))
+
+/* Message expansion */
+#define P0(x) ((x) ^ rol32((x), 9) ^ rol32((x), 17))
+#define P1(x) ((x) ^ rol32((x), 15) ^ rol32((x), 23))
+#define I(i)  (W[i] = get_unaligned_be32(data + i * 4))
+#define W1(i) (W[i & 0x0f])
+#define W2(i) (W[i & 0x0f] =				\
+		P1(W[i & 0x0f]				\
+			^ W[(i-9) & 0x0f]		\
+			^ rol32(W[(i-3) & 0x0f], 15))	\
+		^ rol32(W[(i-13) & 0x0f], 7)		\
+		^ W[(i-6) & 0x0f])
+
+static void sm3_transform(struct sm3_state *sctx, u8 const *data, u32 W[16])
+{
+	u32 a, b, c, d, e, f, g, h, ss1, ss2;
+
+	a = sctx->state[0];
+	b = sctx->state[1];
+	c = sctx->state[2];
+	d = sctx->state[3];
+	e = sctx->state[4];
+	f = sctx->state[5];
+	g = sctx->state[6];
+	h = sctx->state[7];
+
+	R1(a, b, c, d, e, f, g, h, K[0], I(0), I(4));
+	R1(d, a, b, c, h, e, f, g, K[1], I(1), I(5));
+	R1(c, d, a, b, g, h, e, f, K[2], I(2), I(6));
+	R1(b, c, d, a, f, g, h, e, K[3], I(3), I(7));
+	R1(a, b, c, d, e, f, g, h, K[4], W1(4), I(8));
+	R1(d, a, b, c, h, e, f, g, K[5], W1(5), I(9));
+	R1(c, d, a, b, g, h, e, f, K[6], W1(6), I(10));
+	R1(b, c, d, a, f, g, h, e, K[7], W1(7), I(11));
+	R1(a, b, c, d, e, f, g, h, K[8], W1(8), I(12));
+	R1(d, a, b, c, h, e, f, g, K[9], W1(9), I(13));
+	R1(c, d, a, b, g, h, e, f, K[10], W1(10), I(14));
+	R1(b, c, d, a, f, g, h, e, K[11], W1(11), I(15));
+	R1(a, b, c, d, e, f, g, h, K[12], W1(12), W2(16));
+	R1(d, a, b, c, h, e, f, g, K[13], W1(13), W2(17));
+	R1(c, d, a, b, g, h, e, f, K[14], W1(14), W2(18));
+	R1(b, c, d, a, f, g, h, e, K[15], W1(15), W2(19));
+
+	R2(a, b, c, d, e, f, g, h, K[16], W1(16), W2(20));
+	R2(d, a, b, c, h, e, f, g, K[17], W1(17), W2(21));
+	R2(c, d, a, b, g, h, e, f, K[18], W1(18), W2(22));
+	R2(b, c, d, a, f, g, h, e, K[19], W1(19), W2(23));
+	R2(a, b, c, d, e, f, g, h, K[20], W1(20), W2(24));
+	R2(d, a, b, c, h, e, f, g, K[21], W1(21), W2(25));
+	R2(c, d, a, b, g, h, e, f, K[22], W1(22), W2(26));
+	R2(b, c, d, a, f, g, h, e, K[23], W1(23), W2(27));
+	R2(a, b, c, d, e, f, g, h, K[24], W1(24), W2(28));
+	R2(d, a, b, c, h, e, f, g, K[25], W1(25), W2(29));
+	R2(c, d, a, b, g, h, e, f, K[26], W1(26), W2(30));
+	R2(b, c, d, a, f, g, h, e, K[27], W1(27), W2(31));
+	R2(a, b, c, d, e, f, g, h, K[28], W1(28), W2(32));
+	R2(d, a, b, c, h, e, f, g, K[29], W1(29), W2(33));
+	R2(c, d, a, b, g, h, e, f, K[30], W1(30), W2(34));
+	R2(b, c, d, a, f, g, h, e, K[31], W1(31), W2(35));
+
+	R2(a, b, c, d, e, f, g, h, K[32], W1(32), W2(36));
+	R2(d, a, b, c, h, e, f, g, K[33], W1(33), W2(37));
+	R2(c, d, a, b, g, h, e, f, K[34], W1(34), W2(38));
+	R2(b, c, d, a, f, g, h, e, K[35], W1(35), W2(39));
+	R2(a, b, c, d, e, f, g, h, K[36], W1(36), W2(40));
+	R2(d, a, b, c, h, e, f, g, K[37], W1(37), W2(41));
+	R2(c, d, a, b, g, h, e, f, K[38], W1(38), W2(42));
+	R2(b, c, d, a, f, g, h, e, K[39], W1(39), W2(43));
+	R2(a, b, c, d, e, f, g, h, K[40], W1(40), W2(44));
+	R2(d, a, b, c, h, e, f, g, K[41], W1(41), W2(45));
+	R2(c, d, a, b, g, h, e, f, K[42], W1(42), W2(46));
+	R2(b, c, d, a, f, g, h, e, K[43], W1(43), W2(47));
+	R2(a, b, c, d, e, f, g, h, K[44], W1(44), W2(48));
+	R2(d, a, b, c, h, e, f, g, K[45], W1(45), W2(49));
+	R2(c, d, a, b, g, h, e, f, K[46], W1(46), W2(50));
+	R2(b, c, d, a, f, g, h, e, K[47], W1(47), W2(51));
+
+	R2(a, b, c, d, e, f, g, h, K[48], W1(48), W2(52));
+	R2(d, a, b, c, h, e, f, g, K[49], W1(49), W2(53));
+	R2(c, d, a, b, g, h, e, f, K[50], W1(50), W2(54));
+	R2(b, c, d, a, f, g, h, e, K[51], W1(51), W2(55));
+	R2(a, b, c, d, e, f, g, h, K[52], W1(52), W2(56));
+	R2(d, a, b, c, h, e, f, g, K[53], W1(53), W2(57));
+	R2(c, d, a, b, g, h, e, f, K[54], W1(54), W2(58));
+	R2(b, c, d, a, f, g, h, e, K[55], W1(55), W2(59));
+	R2(a, b, c, d, e, f, g, h, K[56], W1(56), W2(60));
+	R2(d, a, b, c, h, e, f, g, K[57], W1(57), W2(61));
+	R2(c, d, a, b, g, h, e, f, K[58], W1(58), W2(62));
+	R2(b, c, d, a, f, g, h, e, K[59], W1(59), W2(63));
+	R2(a, b, c, d, e, f, g, h, K[60], W1(60), W2(64));
+	R2(d, a, b, c, h, e, f, g, K[61], W1(61), W2(65));
+	R2(c, d, a, b, g, h, e, f, K[62], W1(62), W2(66));
+	R2(b, c, d, a, f, g, h, e, K[63], W1(63), W2(67));
+
+	sctx->state[0] ^= a;
+	sctx->state[1] ^= b;
+	sctx->state[2] ^= c;
+	sctx->state[3] ^= d;
+	sctx->state[4] ^= e;
+	sctx->state[5] ^= f;
+	sctx->state[6] ^= g;
+	sctx->state[7] ^= h;
+}
+#undef R
+#undef R1
+#undef R2
+#undef I
+#undef W1
+#undef W2
+
+void sm3_block_generic(struct sm3_state *sctx, u8 const *data, int blocks)
+{
+	u32 W[16];
+
+	do {
+		sm3_transform(sctx, data, W);
+		data += SM3_BLOCK_SIZE;
+	} while (--blocks);
+
+	memzero_explicit(W, sizeof(W));
+}
+EXPORT_SYMBOL_GPL(sm3_block_generic);
+
+MODULE_DESCRIPTION("Generic SM3 library");
+MODULE_LICENSE("GPL v2");
diff --git a/lib/crypto/sparc/md5.h b/lib/crypto/sparc/md5.h
new file mode 100644
index 000000000000..3995f3e075eb
--- /dev/null
+++ b/lib/crypto/sparc/md5.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * MD5 accelerated using the sparc64 crypto opcodes
+ *
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ * Copyright (c) Mathias Krause <minipli@googlemail.com>
+ * Copyright (c) Cryptoapi developers.
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ */
+
+#include <asm/elf.h>
+#include <asm/opcodes.h>
+#include <asm/pstate.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_md5_opcodes);
+
+asmlinkage void md5_sparc64_transform(struct md5_block_state *state,
+				      const u8 *data, size_t nblocks);
+
+static void md5_blocks(struct md5_block_state *state,
+		       const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_md5_opcodes)) {
+		cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
+		md5_sparc64_transform(state, data, nblocks);
+		le32_to_cpu_array(state->h, ARRAY_SIZE(state->h));
+	} else {
+		md5_blocks_generic(state, data, nblocks);
+	}
+}
+
+#define md5_mod_init_arch md5_mod_init_arch
+static void md5_mod_init_arch(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_MD5))
+		return;
+
+	static_branch_enable(&have_md5_opcodes);
+	pr_info("Using sparc64 md5 opcode optimized MD5 implementation\n");
+}
diff --git a/lib/crypto/sparc/md5_asm.S b/lib/crypto/sparc/md5_asm.S
new file mode 100644
index 000000000000..60b544e4d205
--- /dev/null
+++ b/lib/crypto/sparc/md5_asm.S
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/opcodes.h>
+#include <asm/visasm.h>
+
+ENTRY(md5_sparc64_transform)
+	/* %o0 = digest, %o1 = data, %o2 = rounds */
+	VISEntryHalf
+	ld	[%o0 + 0x00], %f0
+	ld	[%o0 + 0x04], %f1
+	andcc	%o1, 0x7, %g0
+	ld	[%o0 + 0x08], %f2
+	bne,pn	%xcc, 10f
+	 ld	[%o0 + 0x0c], %f3
+
+1:
+	ldd	[%o1 + 0x00], %f8
+	ldd	[%o1 + 0x08], %f10
+	ldd	[%o1 + 0x10], %f12
+	ldd	[%o1 + 0x18], %f14
+	ldd	[%o1 + 0x20], %f16
+	ldd	[%o1 + 0x28], %f18
+	ldd	[%o1 + 0x30], %f20
+	ldd	[%o1 + 0x38], %f22
+
+	MD5
+
+	subcc	%o2, 1, %o2
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x40, %o1
+
+5:
+	st	%f0, [%o0 + 0x00]
+	st	%f1, [%o0 + 0x04]
+	st	%f2, [%o0 + 0x08]
+	st	%f3, [%o0 + 0x0c]
+	retl
+	 VISExitHalf
+10:
+	alignaddr %o1, %g0, %o1
+
+	ldd	[%o1 + 0x00], %f10
+1:
+	ldd	[%o1 + 0x08], %f12
+	ldd	[%o1 + 0x10], %f14
+	ldd	[%o1 + 0x18], %f16
+	ldd	[%o1 + 0x20], %f18
+	ldd	[%o1 + 0x28], %f20
+	ldd	[%o1 + 0x30], %f22
+	ldd	[%o1 + 0x38], %f24
+	ldd	[%o1 + 0x40], %f26
+
+	faligndata %f10, %f12, %f8
+	faligndata %f12, %f14, %f10
+	faligndata %f14, %f16, %f12
+	faligndata %f16, %f18, %f14
+	faligndata %f18, %f20, %f16
+	faligndata %f20, %f22, %f18
+	faligndata %f22, %f24, %f20
+	faligndata %f24, %f26, %f22
+
+	MD5
+
+	subcc	%o2, 1, %o2
+	fsrc2	%f26, %f10
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x40, %o1
+
+	ba,a,pt	%xcc, 5b
+ENDPROC(md5_sparc64_transform)
diff --git a/lib/crypto/sparc/sha1.h b/lib/crypto/sparc/sha1.h
new file mode 100644
index 000000000000..bdf771fcc1f7
--- /dev/null
+++ b/lib/crypto/sparc/sha1.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SHA-1 accelerated using the sparc64 crypto opcodes
+ *
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ * Copyright (c) Mathias Krause <minipli@googlemail.com>
+ */
+
+#include <asm/elf.h>
+#include <asm/opcodes.h>
+#include <asm/pstate.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha1_opcodes);
+
+asmlinkage void sha1_sparc64_transform(struct sha1_block_state *state,
+				       const u8 *data, size_t nblocks);
+
+static void sha1_blocks(struct sha1_block_state *state,
+			const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_sha1_opcodes))
+		sha1_sparc64_transform(state, data, nblocks);
+	else
+		sha1_blocks_generic(state, data, nblocks);
+}
+
+#define sha1_mod_init_arch sha1_mod_init_arch
+static void sha1_mod_init_arch(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_SHA1))
+		return;
+
+	static_branch_enable(&have_sha1_opcodes);
+	pr_info("Using sparc64 sha1 opcode optimized SHA-1 implementation\n");
+}
diff --git a/lib/crypto/sparc/sha1_asm.S b/lib/crypto/sparc/sha1_asm.S
new file mode 100644
index 000000000000..00b46bac1b08
--- /dev/null
+++ b/lib/crypto/sparc/sha1_asm.S
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/opcodes.h>
+#include <asm/visasm.h>
+
+ENTRY(sha1_sparc64_transform)
+	/* %o0 = digest, %o1 = data, %o2 = rounds */
+	VISEntryHalf
+	ld	[%o0 + 0x00], %f0
+	ld	[%o0 + 0x04], %f1
+	ld	[%o0 + 0x08], %f2
+	andcc	%o1, 0x7, %g0
+	ld	[%o0 + 0x0c], %f3
+	bne,pn	%xcc, 10f
+	 ld	[%o0 + 0x10], %f4
+
+1:
+	ldd	[%o1 + 0x00], %f8
+	ldd	[%o1 + 0x08], %f10
+	ldd	[%o1 + 0x10], %f12
+	ldd	[%o1 + 0x18], %f14
+	ldd	[%o1 + 0x20], %f16
+	ldd	[%o1 + 0x28], %f18
+	ldd	[%o1 + 0x30], %f20
+	ldd	[%o1 + 0x38], %f22
+
+	SHA1
+
+	subcc	%o2, 1, %o2
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x40, %o1
+
+5:
+	st	%f0, [%o0 + 0x00]
+	st	%f1, [%o0 + 0x04]
+	st	%f2, [%o0 + 0x08]
+	st	%f3, [%o0 + 0x0c]
+	st	%f4, [%o0 + 0x10]
+	retl
+	 VISExitHalf
+10:
+	alignaddr %o1, %g0, %o1
+
+	ldd	[%o1 + 0x00], %f10
+1:
+	ldd	[%o1 + 0x08], %f12
+	ldd	[%o1 + 0x10], %f14
+	ldd	[%o1 + 0x18], %f16
+	ldd	[%o1 + 0x20], %f18
+	ldd	[%o1 + 0x28], %f20
+	ldd	[%o1 + 0x30], %f22
+	ldd	[%o1 + 0x38], %f24
+	ldd	[%o1 + 0x40], %f26
+
+	faligndata %f10, %f12, %f8
+	faligndata %f12, %f14, %f10
+	faligndata %f14, %f16, %f12
+	faligndata %f16, %f18, %f14
+	faligndata %f18, %f20, %f16
+	faligndata %f20, %f22, %f18
+	faligndata %f22, %f24, %f20
+	faligndata %f24, %f26, %f22
+
+	SHA1
+
+	subcc	%o2, 1, %o2
+	fsrc2	%f26, %f10
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x40, %o1
+
+	ba,a,pt	%xcc, 5b
+ENDPROC(sha1_sparc64_transform)
diff --git a/lib/crypto/sparc/sha256.h b/lib/crypto/sparc/sha256.h
new file mode 100644
index 000000000000..b2f4419ec778
--- /dev/null
+++ b/lib/crypto/sparc/sha256.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SHA-256 accelerated using the sparc64 sha256 opcodes
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
+ */
+
+#include <asm/elf.h>
+#include <asm/opcodes.h>
+#include <asm/pstate.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_opcodes);
+
+asmlinkage void sha256_sparc64_transform(struct sha256_block_state *state,
+					 const u8 *data, size_t nblocks);
+
+static void sha256_blocks(struct sha256_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_sha256_opcodes))
+		sha256_sparc64_transform(state, data, nblocks);
+	else
+		sha256_blocks_generic(state, data, nblocks);
+}
+
+#define sha256_mod_init_arch sha256_mod_init_arch
+static void sha256_mod_init_arch(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_SHA256))
+		return;
+
+	static_branch_enable(&have_sha256_opcodes);
+	pr_info("Using sparc64 sha256 opcode optimized SHA-256/SHA-224 implementation\n");
+}
diff --git a/lib/crypto/sparc/sha256_asm.S b/lib/crypto/sparc/sha256_asm.S
new file mode 100644
index 000000000000..ddcdd3daf31e
--- /dev/null
+++ b/lib/crypto/sparc/sha256_asm.S
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/opcodes.h>
+#include <asm/visasm.h>
+
+ENTRY(sha256_sparc64_transform)
+	/* %o0 = state, %o1 = data, %o2 = nblocks */
+	VISEntryHalf
+	ld	[%o0 + 0x00], %f0
+	ld	[%o0 + 0x04], %f1
+	ld	[%o0 + 0x08], %f2
+	ld	[%o0 + 0x0c], %f3
+	ld	[%o0 + 0x10], %f4
+	ld	[%o0 + 0x14], %f5
+	andcc	%o1, 0x7, %g0
+	ld	[%o0 + 0x18], %f6
+	bne,pn	%xcc, 10f
+	 ld	[%o0 + 0x1c], %f7
+
+1:
+	ldd	[%o1 + 0x00], %f8
+	ldd	[%o1 + 0x08], %f10
+	ldd	[%o1 + 0x10], %f12
+	ldd	[%o1 + 0x18], %f14
+	ldd	[%o1 + 0x20], %f16
+	ldd	[%o1 + 0x28], %f18
+	ldd	[%o1 + 0x30], %f20
+	ldd	[%o1 + 0x38], %f22
+
+	SHA256
+
+	subcc	%o2, 1, %o2
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x40, %o1
+
+5:
+	st	%f0, [%o0 + 0x00]
+	st	%f1, [%o0 + 0x04]
+	st	%f2, [%o0 + 0x08]
+	st	%f3, [%o0 + 0x0c]
+	st	%f4, [%o0 + 0x10]
+	st	%f5, [%o0 + 0x14]
+	st	%f6, [%o0 + 0x18]
+	st	%f7, [%o0 + 0x1c]
+	retl
+	 VISExitHalf
+10:
+	alignaddr %o1, %g0, %o1
+
+	ldd	[%o1 + 0x00], %f10
+1:
+	ldd	[%o1 + 0x08], %f12
+	ldd	[%o1 + 0x10], %f14
+	ldd	[%o1 + 0x18], %f16
+	ldd	[%o1 + 0x20], %f18
+	ldd	[%o1 + 0x28], %f20
+	ldd	[%o1 + 0x30], %f22
+	ldd	[%o1 + 0x38], %f24
+	ldd	[%o1 + 0x40], %f26
+
+	faligndata %f10, %f12, %f8
+	faligndata %f12, %f14, %f10
+	faligndata %f14, %f16, %f12
+	faligndata %f16, %f18, %f14
+	faligndata %f18, %f20, %f16
+	faligndata %f20, %f22, %f18
+	faligndata %f22, %f24, %f20
+	faligndata %f24, %f26, %f22
+
+	SHA256
+
+	subcc	%o2, 1, %o2
+	fsrc2	%f26, %f10
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x40, %o1
+
+	ba,a,pt	%xcc, 5b
+ENDPROC(sha256_sparc64_transform)
diff --git a/lib/crypto/sparc/sha512.h b/lib/crypto/sparc/sha512.h
new file mode 100644
index 000000000000..a8c37a7d4c39
--- /dev/null
+++ b/lib/crypto/sparc/sha512.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SHA-512 accelerated using the sparc64 sha512 opcodes
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2003 Kyle McMartin <kyle@debian.org>
+ */
+
+#include <asm/elf.h>
+#include <asm/opcodes.h>
+#include <asm/pstate.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha512_opcodes);
+
+asmlinkage void sha512_sparc64_transform(struct sha512_block_state *state,
+					 const u8 *data, size_t nblocks);
+
+static void sha512_blocks(struct sha512_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_sha512_opcodes))
+		sha512_sparc64_transform(state, data, nblocks);
+	else
+		sha512_blocks_generic(state, data, nblocks);
+}
+
+#define sha512_mod_init_arch sha512_mod_init_arch
+static void sha512_mod_init_arch(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_SHA512))
+		return;
+
+	static_branch_enable(&have_sha512_opcodes);
+	pr_info("Using sparc64 sha512 opcode optimized SHA-512/SHA-384 implementation\n");
+}
diff --git a/lib/crypto/sparc/sha512_asm.S b/lib/crypto/sparc/sha512_asm.S
new file mode 100644
index 000000000000..9932b4fe1b59
--- /dev/null
+++ b/lib/crypto/sparc/sha512_asm.S
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/opcodes.h>
+#include <asm/visasm.h>
+
+ENTRY(sha512_sparc64_transform)
+	/* %o0 = digest, %o1 = data, %o2 = rounds */
+	VISEntry
+	ldd	[%o0 + 0x00], %f0
+	ldd	[%o0 + 0x08], %f2
+	ldd	[%o0 + 0x10], %f4
+	ldd	[%o0 + 0x18], %f6
+	ldd	[%o0 + 0x20], %f8
+	ldd	[%o0 + 0x28], %f10
+	andcc	%o1, 0x7, %g0
+	ldd	[%o0 + 0x30], %f12
+	bne,pn	%xcc, 10f
+	 ldd	[%o0 + 0x38], %f14
+
+1:
+	ldd	[%o1 + 0x00], %f16
+	ldd	[%o1 + 0x08], %f18
+	ldd	[%o1 + 0x10], %f20
+	ldd	[%o1 + 0x18], %f22
+	ldd	[%o1 + 0x20], %f24
+	ldd	[%o1 + 0x28], %f26
+	ldd	[%o1 + 0x30], %f28
+	ldd	[%o1 + 0x38], %f30
+	ldd	[%o1 + 0x40], %f32
+	ldd	[%o1 + 0x48], %f34
+	ldd	[%o1 + 0x50], %f36
+	ldd	[%o1 + 0x58], %f38
+	ldd	[%o1 + 0x60], %f40
+	ldd	[%o1 + 0x68], %f42
+	ldd	[%o1 + 0x70], %f44
+	ldd	[%o1 + 0x78], %f46
+
+	SHA512
+
+	subcc	%o2, 1, %o2
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x80, %o1
+
+5:
+	std	%f0, [%o0 + 0x00]
+	std	%f2, [%o0 + 0x08]
+	std	%f4, [%o0 + 0x10]
+	std	%f6, [%o0 + 0x18]
+	std	%f8, [%o0 + 0x20]
+	std	%f10, [%o0 + 0x28]
+	std	%f12, [%o0 + 0x30]
+	std	%f14, [%o0 + 0x38]
+	retl
+	 VISExit
+10:
+	alignaddr %o1, %g0, %o1
+
+	ldd	[%o1 + 0x00], %f18
+1:
+	ldd	[%o1 + 0x08], %f20
+	ldd	[%o1 + 0x10], %f22
+	ldd	[%o1 + 0x18], %f24
+	ldd	[%o1 + 0x20], %f26
+	ldd	[%o1 + 0x28], %f28
+	ldd	[%o1 + 0x30], %f30
+	ldd	[%o1 + 0x38], %f32
+	ldd	[%o1 + 0x40], %f34
+	ldd	[%o1 + 0x48], %f36
+	ldd	[%o1 + 0x50], %f38
+	ldd	[%o1 + 0x58], %f40
+	ldd	[%o1 + 0x60], %f42
+	ldd	[%o1 + 0x68], %f44
+	ldd	[%o1 + 0x70], %f46
+	ldd	[%o1 + 0x78], %f48
+	ldd	[%o1 + 0x80], %f50
+
+	faligndata %f18, %f20, %f16
+	faligndata %f20, %f22, %f18
+	faligndata %f22, %f24, %f20
+	faligndata %f24, %f26, %f22
+	faligndata %f26, %f28, %f24
+	faligndata %f28, %f30, %f26
+	faligndata %f30, %f32, %f28
+	faligndata %f32, %f34, %f30
+	faligndata %f34, %f36, %f32
+	faligndata %f36, %f38, %f34
+	faligndata %f38, %f40, %f36
+	faligndata %f40, %f42, %f38
+	faligndata %f42, %f44, %f40
+	faligndata %f44, %f46, %f42
+	faligndata %f46, %f48, %f44
+	faligndata %f48, %f50, %f46
+
+	SHA512
+
+	subcc	%o2, 1, %o2
+	fsrc2	%f50, %f18
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x80, %o1
+
+	ba,a,pt	%xcc, 5b
+ENDPROC(sha512_sparc64_transform)
diff --git a/lib/crypto/tests/Kconfig b/lib/crypto/tests/Kconfig
new file mode 100644
index 000000000000..61d435c450bb
--- /dev/null
+++ b/lib/crypto/tests/Kconfig
@@ -0,0 +1,118 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+config CRYPTO_LIB_BLAKE2B_KUNIT_TEST
+	tristate "KUnit tests for BLAKE2b" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+	select CRYPTO_LIB_BENCHMARK_VISIBLE
+	select CRYPTO_LIB_BLAKE2B
+	help
+	  KUnit tests for the BLAKE2b cryptographic hash function.
+
+config CRYPTO_LIB_BLAKE2S_KUNIT_TEST
+	tristate "KUnit tests for BLAKE2s" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+	select CRYPTO_LIB_BENCHMARK_VISIBLE
+	# No need to select CRYPTO_LIB_BLAKE2S here, as that option doesn't
+	# exist; the BLAKE2s code is always built-in for the /dev/random driver.
+	help
+	  KUnit tests for the BLAKE2s cryptographic hash function.
+
+config CRYPTO_LIB_CURVE25519_KUNIT_TEST
+	tristate "KUnit tests for Curve25519" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+	select CRYPTO_LIB_BENCHMARK_VISIBLE
+	select CRYPTO_LIB_CURVE25519
+	help
+	  KUnit tests for the Curve25519 Diffie-Hellman function.
+
+config CRYPTO_LIB_MD5_KUNIT_TEST
+	tristate "KUnit tests for MD5" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+	select CRYPTO_LIB_BENCHMARK_VISIBLE
+	select CRYPTO_LIB_MD5
+	help
+	  KUnit tests for the MD5 cryptographic hash function and its
+	  corresponding HMAC.
+
+config CRYPTO_LIB_POLY1305_KUNIT_TEST
+	tristate "KUnit tests for Poly1305" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+	select CRYPTO_LIB_BENCHMARK_VISIBLE
+	select CRYPTO_LIB_POLY1305
+	help
+	  KUnit tests for the Poly1305 library functions.
+
+config CRYPTO_LIB_POLYVAL_KUNIT_TEST
+	tristate "KUnit tests for POLYVAL" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+	select CRYPTO_LIB_BENCHMARK_VISIBLE
+	select CRYPTO_LIB_POLYVAL
+	help
+	  KUnit tests for the POLYVAL library functions.
+
+config CRYPTO_LIB_SHA1_KUNIT_TEST
+	tristate "KUnit tests for SHA-1" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+	select CRYPTO_LIB_BENCHMARK_VISIBLE
+	select CRYPTO_LIB_SHA1
+	help
+	  KUnit tests for the SHA-1 cryptographic hash function and its
+	  corresponding HMAC.
+
+# Option is named *_SHA256_KUNIT_TEST, though both SHA-224 and SHA-256 tests are
+# included, for consistency with the naming used elsewhere (e.g. CRYPTO_SHA256).
+config CRYPTO_LIB_SHA256_KUNIT_TEST
+	tristate "KUnit tests for SHA-224 and SHA-256" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+	select CRYPTO_LIB_BENCHMARK_VISIBLE
+	select CRYPTO_LIB_SHA256
+	help
+	  KUnit tests for the SHA-224 and SHA-256 cryptographic hash functions
+	  and their corresponding HMACs.
+
+# Option is named *_SHA512_KUNIT_TEST, though both SHA-384 and SHA-512 tests are
+# included, for consistency with the naming used elsewhere (e.g. CRYPTO_SHA512).
+config CRYPTO_LIB_SHA512_KUNIT_TEST
+	tristate "KUnit tests for SHA-384 and SHA-512" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+	select CRYPTO_LIB_BENCHMARK_VISIBLE
+	select CRYPTO_LIB_SHA512
+	help
+	  KUnit tests for the SHA-384 and SHA-512 cryptographic hash functions
+	  and their corresponding HMACs.
+
+config CRYPTO_LIB_SHA3_KUNIT_TEST
+	tristate "KUnit tests for SHA-3" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+	select CRYPTO_LIB_BENCHMARK_VISIBLE
+	select CRYPTO_LIB_SHA3
+	help
+	  KUnit tests for the SHA3 cryptographic hash and XOF functions,
+	  including SHA3-224, SHA3-256, SHA3-384, SHA3-512, SHAKE128 and
+	  SHAKE256.
+
+config CRYPTO_LIB_BENCHMARK_VISIBLE
+	bool
+
+config CRYPTO_LIB_BENCHMARK
+	bool "Include benchmarks in KUnit tests for cryptographic functions"
+	depends on CRYPTO_LIB_BENCHMARK_VISIBLE
+	help
+	  Include benchmarks in the KUnit tests for cryptographic functions.
+	  The benchmark results are printed to the kernel log when the
+	  corresponding KUnit test suite runs.
+
+	  This is useful for evaluating the performance of the cryptographic
+	  functions.  However, it will increase the runtime of the KUnit tests.
+
+	  If you're only interested in correctness testing, leave this disabled.
diff --git a/lib/crypto/tests/Makefile b/lib/crypto/tests/Makefile
new file mode 100644
index 000000000000..5109a0651925
--- /dev/null
+++ b/lib/crypto/tests/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+obj-$(CONFIG_CRYPTO_LIB_BLAKE2B_KUNIT_TEST) += blake2b_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_KUNIT_TEST) += blake2s_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_CURVE25519_KUNIT_TEST) += curve25519_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_MD5_KUNIT_TEST) += md5_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_POLY1305_KUNIT_TEST) += poly1305_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_POLYVAL_KUNIT_TEST) += polyval_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_SHA1_KUNIT_TEST) += sha1_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_SHA256_KUNIT_TEST) += sha224_kunit.o sha256_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_SHA512_KUNIT_TEST) += sha384_kunit.o sha512_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_SHA3_KUNIT_TEST) += sha3_kunit.o
diff --git a/lib/crypto/tests/blake2b-testvecs.h b/lib/crypto/tests/blake2b-testvecs.h
new file mode 100644
index 000000000000..9e407dbc219c
--- /dev/null
+++ b/lib/crypto/tests/blake2b-testvecs.h
@@ -0,0 +1,342 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py blake2b */
+
+static const struct {
+	size_t data_len;
+	u8 digest[BLAKE2B_HASH_SIZE];
+} hash_testvecs[] = {
+	{
+		.data_len = 0,
+		.digest = {
+			0x78, 0x6a, 0x02, 0xf7, 0x42, 0x01, 0x59, 0x03,
+			0xc6, 0xc6, 0xfd, 0x85, 0x25, 0x52, 0xd2, 0x72,
+			0x91, 0x2f, 0x47, 0x40, 0xe1, 0x58, 0x47, 0x61,
+			0x8a, 0x86, 0xe2, 0x17, 0xf7, 0x1f, 0x54, 0x19,
+			0xd2, 0x5e, 0x10, 0x31, 0xaf, 0xee, 0x58, 0x53,
+			0x13, 0x89, 0x64, 0x44, 0x93, 0x4e, 0xb0, 0x4b,
+			0x90, 0x3a, 0x68, 0x5b, 0x14, 0x48, 0xb7, 0x55,
+			0xd5, 0x6f, 0x70, 0x1a, 0xfe, 0x9b, 0xe2, 0xce,
+		},
+	},
+	{
+		.data_len = 1,
+		.digest = {
+			0x6f, 0x2e, 0xcc, 0x83, 0x53, 0xa3, 0x20, 0x16,
+			0x5b, 0xda, 0xd0, 0x04, 0xd3, 0xcb, 0xe4, 0x37,
+			0x5b, 0xf0, 0x84, 0x36, 0xe1, 0xad, 0x45, 0xcc,
+			0x4d, 0x7f, 0x09, 0x68, 0xb2, 0x62, 0x93, 0x7f,
+			0x72, 0x32, 0xe8, 0xa7, 0x2f, 0x1f, 0x6f, 0xc6,
+			0x14, 0xd6, 0x70, 0xae, 0x0c, 0xf0, 0xf3, 0xce,
+			0x64, 0x4d, 0x22, 0xdf, 0xc7, 0xa7, 0xf8, 0xa8,
+			0x18, 0x23, 0xd8, 0x6c, 0xaf, 0x65, 0xa2, 0x54,
+		},
+	},
+	{
+		.data_len = 2,
+		.digest = {
+			0x04, 0x13, 0xe2, 0x10, 0xbe, 0x65, 0xde, 0xce,
+			0x61, 0xa8, 0xe0, 0xd6, 0x35, 0xb1, 0xb8, 0x88,
+			0xd2, 0xea, 0x45, 0x3a, 0xe1, 0x8d, 0x94, 0xb5,
+			0x66, 0x06, 0x98, 0x96, 0x39, 0xf8, 0x0e, 0xcb,
+			0x34, 0xa6, 0xa8, 0x17, 0xfe, 0x56, 0xbc, 0xa9,
+			0x5e, 0x1b, 0xb1, 0xde, 0x3c, 0xc7, 0x78, 0x4f,
+			0x39, 0xc6, 0xfc, 0xa8, 0xb3, 0x27, 0x66, 0x3e,
+			0x4e, 0xb5, 0x5d, 0x08, 0x89, 0xee, 0xd1, 0xe0,
+		},
+	},
+	{
+		.data_len = 3,
+		.digest = {
+			0x2b, 0x4a, 0xa3, 0x4e, 0x2b, 0x7a, 0x47, 0x20,
+			0x30, 0x5b, 0x09, 0x17, 0x3a, 0xf4, 0xcc, 0xf0,
+			0xf7, 0x7b, 0x97, 0x68, 0x98, 0x9f, 0x4f, 0x09,
+			0x46, 0x25, 0xe7, 0xd6, 0x53, 0x6b, 0xf9, 0x68,
+			0x48, 0x12, 0x44, 0x8c, 0x9a, 0xc8, 0xd4, 0x42,
+			0xeb, 0x2c, 0x5f, 0x41, 0xba, 0x17, 0xd0, 0xc3,
+			0xad, 0xfd, 0xfb, 0x42, 0x33, 0xcb, 0x08, 0x5d,
+			0xd2, 0x5c, 0x3d, 0xde, 0x87, 0x4d, 0xd6, 0xe4,
+		},
+	},
+	{
+		.data_len = 16,
+		.digest = {
+			0xbf, 0x40, 0xf2, 0x38, 0x44, 0x8e, 0x24, 0x5e,
+			0xbc, 0x67, 0xbb, 0xf0, 0x10, 0x9a, 0x79, 0xbb,
+			0x36, 0x55, 0xce, 0xd2, 0xba, 0x04, 0x0d, 0xe8,
+			0x30, 0x29, 0x5c, 0x2a, 0xa6, 0x3a, 0x4f, 0x37,
+			0xac, 0x5f, 0xd4, 0x13, 0xa2, 0xf4, 0xfe, 0x80,
+			0x61, 0xd7, 0x58, 0x66, 0x0c, 0x7f, 0xa2, 0x56,
+			0x6b, 0x52, 0x7c, 0x22, 0x73, 0x7f, 0x17, 0xaa,
+			0x91, 0x5a, 0x22, 0x06, 0xd9, 0x00, 0x48, 0x12,
+		},
+	},
+	{
+		.data_len = 32,
+		.digest = {
+			0x41, 0x04, 0x65, 0x93, 0x81, 0x9a, 0x20, 0x0a,
+			0x00, 0x60, 0x00, 0x64, 0x4c, 0x04, 0x3d, 0xe0,
+			0x6b, 0x17, 0x0c, 0xe1, 0x0e, 0x28, 0x8b, 0xa0,
+			0x76, 0xd2, 0x79, 0xb0, 0x33, 0x60, 0x61, 0x27,
+			0xf2, 0x64, 0xf1, 0x8a, 0xe5, 0x3e, 0xaa, 0x37,
+			0x60, 0xad, 0x2d, 0x75, 0x13, 0xae, 0xd8, 0x9e,
+			0xec, 0xe0, 0xe4, 0x40, 0x2f, 0x59, 0x44, 0xb0,
+			0x66, 0x7a, 0x68, 0x38, 0xce, 0x21, 0x99, 0x2a,
+		},
+	},
+	{
+		.data_len = 48,
+		.digest = {
+			0x19, 0x6f, 0x9d, 0xc7, 0x87, 0x12, 0x5c, 0xa3,
+			0xe2, 0xd3, 0xf1, 0x82, 0xec, 0xf3, 0x55, 0x9c,
+			0x86, 0xd1, 0x6d, 0xde, 0xcf, 0x5b, 0xec, 0x4c,
+			0x43, 0x25, 0x85, 0x90, 0xef, 0xe8, 0xe3, 0x5f,
+			0x2c, 0x3a, 0x84, 0x07, 0xb8, 0x55, 0xfd, 0x5e,
+			0xa4, 0x45, 0xf2, 0xac, 0xe4, 0xbd, 0xc7, 0x96,
+			0x80, 0x59, 0x3e, 0xc9, 0xb1, 0x60, 0xb1, 0x2b,
+			0x17, 0x49, 0x7d, 0x3e, 0x7d, 0x4d, 0x70, 0x24,
+		},
+	},
+	{
+		.data_len = 49,
+		.digest = {
+			0x73, 0x72, 0xd5, 0x0a, 0x97, 0xb4, 0x7d, 0xdb,
+			0x05, 0x14, 0x8e, 0x40, 0xc2, 0x9a, 0x8a, 0x74,
+			0x4b, 0xda, 0x7e, 0xfc, 0x97, 0x57, 0x23, 0x39,
+			0xdc, 0x57, 0x09, 0x13, 0x24, 0xfc, 0xf3, 0x23,
+			0x55, 0x48, 0xdd, 0xe5, 0x07, 0x9a, 0x6f, 0x7b,
+			0x62, 0xea, 0x4d, 0x79, 0xb4, 0xb9, 0xc5, 0x86,
+			0xc0, 0x34, 0xd6, 0xd2, 0x6c, 0xc3, 0x94, 0xfb,
+			0x34, 0xd6, 0x62, 0xae, 0xb8, 0x99, 0xf1, 0x38,
+		},
+	},
+	{
+		.data_len = 63,
+		.digest = {
+			0x42, 0x3a, 0xe3, 0xa2, 0xae, 0x5a, 0x28, 0xce,
+			0xf1, 0x3c, 0x97, 0xc2, 0x34, 0xf6, 0xb5, 0x1e,
+			0xfc, 0x31, 0xb4, 0x04, 0x61, 0xb7, 0x54, 0x0b,
+			0x0d, 0x1a, 0x22, 0x9c, 0x04, 0x67, 0x5c, 0x4c,
+			0x75, 0x1b, 0x10, 0x0b, 0x99, 0xe2, 0xb1, 0x5e,
+			0x5d, 0x4b, 0x7a, 0xe6, 0xf6, 0xb5, 0x62, 0xee,
+			0x2d, 0x44, 0x57, 0xb2, 0x96, 0x73, 0x5e, 0xb9,
+			0x6a, 0xb2, 0xb3, 0x16, 0xa3, 0xd9, 0x6a, 0x60,
+		},
+	},
+	{
+		.data_len = 64,
+		.digest = {
+			0x50, 0xb9, 0xbe, 0xb2, 0x69, 0x07, 0x45, 0x5b,
+			0x59, 0xde, 0x8d, 0xbf, 0x08, 0xdc, 0x2e, 0x7f,
+			0x93, 0x29, 0xc1, 0x91, 0xe8, 0x74, 0x03, 0x89,
+			0x20, 0xfb, 0xb2, 0x4b, 0xe8, 0x68, 0x6f, 0xe1,
+			0xb4, 0x30, 0xbe, 0x11, 0x3c, 0x43, 0x19, 0x66,
+			0x72, 0x78, 0xb7, 0xf4, 0xe9, 0x09, 0x18, 0x4e,
+			0xae, 0x4a, 0x24, 0xe0, 0x6f, 0x44, 0x02, 0xe3,
+			0xfd, 0xda, 0xb3, 0x3e, 0x3c, 0x6d, 0x54, 0x2e,
+		},
+	},
+	{
+		.data_len = 65,
+		.digest = {
+			0xd6, 0xf2, 0xa9, 0x61, 0x3f, 0xce, 0x2a, 0x68,
+			0x19, 0x86, 0xff, 0xd1, 0xee, 0x89, 0x3b, 0xa4,
+			0x10, 0x9a, 0x91, 0x50, 0x35, 0x48, 0x9e, 0xf5,
+			0x9c, 0x95, 0xe0, 0xfb, 0x92, 0x0f, 0xa8, 0xf7,
+			0x6c, 0x43, 0x85, 0xf1, 0x6e, 0x11, 0x4e, 0x67,
+			0x78, 0xd7, 0x53, 0x25, 0x0c, 0xf8, 0xce, 0x38,
+			0x74, 0x08, 0xb0, 0x3c, 0x53, 0x20, 0x4d, 0xc4,
+			0x9a, 0xf5, 0x78, 0xe8, 0x41, 0x8f, 0xed, 0x1f,
+		},
+	},
+	{
+		.data_len = 127,
+		.digest = {
+			0xe8, 0xb2, 0xc5, 0xa7, 0xf5, 0xfa, 0xee, 0xa0,
+			0x57, 0xba, 0x58, 0xf9, 0x0a, 0xf2, 0x64, 0x16,
+			0xa8, 0xa6, 0x03, 0x85, 0x3b, 0xb8, 0x6f, 0xca,
+			0x76, 0xc3, 0xa1, 0x2b, 0xec, 0xef, 0xc4, 0x66,
+			0x11, 0xdf, 0x03, 0x85, 0x9d, 0x0c, 0x37, 0x7b,
+			0xa9, 0x7b, 0x44, 0xfb, 0x11, 0x8f, 0x3f, 0x71,
+			0xcd, 0x81, 0x43, 0x2e, 0x71, 0x5c, 0x54, 0x9f,
+			0xca, 0x0f, 0x01, 0x91, 0xca, 0xaa, 0x93, 0xe9,
+		},
+	},
+	{
+		.data_len = 128,
+		.digest = {
+			0x05, 0x8e, 0x9d, 0xdc, 0xe9, 0x36, 0x3e, 0x73,
+			0x63, 0x59, 0x69, 0x81, 0x0b, 0x8c, 0xc7, 0x9e,
+			0xcc, 0xe7, 0x9c, 0x19, 0x54, 0xa7, 0x2f, 0x86,
+			0xb5, 0xea, 0xae, 0x6d, 0xfe, 0x4e, 0x6e, 0x83,
+			0x8d, 0x1a, 0x1c, 0x70, 0x3f, 0x34, 0xa1, 0x04,
+			0x59, 0xd1, 0xbb, 0xaa, 0x58, 0xf7, 0xce, 0xfb,
+			0x86, 0x66, 0x22, 0xfc, 0x78, 0x74, 0x6e, 0x85,
+			0xf1, 0x59, 0x7d, 0x9e, 0x1c, 0x3b, 0xc6, 0x65,
+		},
+	},
+	{
+		.data_len = 129,
+		.digest = {
+			0x6b, 0x1f, 0x7c, 0x9a, 0x65, 0x7f, 0x09, 0x61,
+			0xe5, 0x04, 0x9a, 0xf1, 0x4b, 0x36, 0x8e, 0x41,
+			0x86, 0xcf, 0x86, 0x19, 0xd8, 0xc9, 0x34, 0x70,
+			0x67, 0xd1, 0x03, 0x72, 0x12, 0xf7, 0x27, 0x92,
+			0x2e, 0x3d, 0x2b, 0x54, 0x9a, 0x48, 0xa4, 0xc2,
+			0x61, 0xea, 0x6a, 0xe8, 0xdd, 0x07, 0x41, 0x85,
+			0x58, 0x6d, 0xcd, 0x12, 0x0d, 0xbc, 0xb1, 0x23,
+			0xb2, 0xdb, 0x24, 0x1f, 0xc4, 0xa7, 0xae, 0xda,
+		},
+	},
+	{
+		.data_len = 256,
+		.digest = {
+			0x50, 0xd8, 0xdc, 0xb2, 0x50, 0x24, 0x7a, 0x49,
+			0xb1, 0x00, 0x73, 0x16, 0x1f, 0xce, 0xf9, 0xe8,
+			0x77, 0x0a, 0x27, 0x74, 0xc7, 0xeb, 0xf0, 0x62,
+			0xb9, 0xf3, 0x24, 0xa6, 0x03, 0x18, 0x40, 0xde,
+			0x9b, 0x1d, 0xa8, 0xd0, 0xbf, 0x66, 0xa3, 0xc1,
+			0x31, 0x04, 0x95, 0xc7, 0xc3, 0xb7, 0x11, 0xe2,
+			0x1e, 0x31, 0x49, 0x98, 0x06, 0xab, 0xf0, 0xe6,
+			0x5c, 0xac, 0x88, 0x28, 0x0b, 0x3d, 0xb2, 0xc2,
+		},
+	},
+	{
+		.data_len = 511,
+		.digest = {
+			0xd4, 0x2b, 0x6b, 0x9e, 0xfc, 0x44, 0xc0, 0x90,
+			0x64, 0x77, 0x5d, 0xf3, 0x44, 0xb6, 0x92, 0x8f,
+			0x80, 0xe2, 0xe4, 0x9b, 0xaf, 0x49, 0x04, 0xea,
+			0x29, 0xf7, 0x4a, 0x33, 0x3f, 0xc7, 0x3b, 0xab,
+			0xa1, 0x71, 0x7f, 0xa2, 0x8e, 0x03, 0xa0, 0xd6,
+			0xa7, 0xcd, 0xe0, 0xf8, 0xd7, 0x3b, 0xa4, 0x0d,
+			0x84, 0x79, 0x12, 0x72, 0x3f, 0x8e, 0x48, 0x35,
+			0x76, 0x4f, 0x56, 0xe9, 0x21, 0x40, 0x19, 0xbe,
+		},
+	},
+	{
+		.data_len = 513,
+		.digest = {
+			0x84, 0xd4, 0xd8, 0x6c, 0x60, 0x3d, 0x6e, 0xfd,
+			0x84, 0xb7, 0xdf, 0xba, 0x13, 0x5e, 0x07, 0x94,
+			0x5b, 0x6b, 0x62, 0x1d, 0x82, 0x02, 0xa7, 0xb3,
+			0x21, 0xdf, 0x42, 0x20, 0x85, 0xa8, 0x6f, 0x30,
+			0xf7, 0x03, 0xba, 0x66, 0x0e, 0xa6, 0x42, 0x21,
+			0x37, 0xe8, 0xed, 0x5b, 0x22, 0xf5, 0x4e, 0xa5,
+			0xe5, 0x80, 0x1b, 0x47, 0xf0, 0x49, 0xb3, 0xe5,
+			0x6e, 0xd9, 0xd9, 0x95, 0x3d, 0x2e, 0x42, 0x13,
+		},
+	},
+	{
+		.data_len = 1000,
+		.digest = {
+			0x71, 0x17, 0xab, 0x93, 0xfe, 0x3b, 0xa4, 0xe6,
+			0xcb, 0xb0, 0xea, 0x95, 0xe7, 0x1a, 0x01, 0xc0,
+			0x12, 0x33, 0xfe, 0xcc, 0x79, 0x15, 0xae, 0x56,
+			0xd2, 0x70, 0x44, 0x60, 0x54, 0x42, 0xa8, 0x69,
+			0x7e, 0xc3, 0x90, 0xa0, 0x0c, 0x63, 0x39, 0xff,
+			0x55, 0x53, 0xb8, 0x46, 0xef, 0x06, 0xcb, 0xba,
+			0x73, 0xf4, 0x76, 0x22, 0xf1, 0x60, 0x98, 0xbc,
+			0xbf, 0x76, 0x95, 0x85, 0x13, 0x1d, 0x11, 0x3b,
+		},
+	},
+	{
+		.data_len = 3333,
+		.digest = {
+			0x3a, 0xaa, 0x85, 0xa0, 0x8c, 0x8e, 0xe1, 0x9c,
+			0x9b, 0x43, 0x72, 0x7f, 0x40, 0x88, 0x3b, 0xd1,
+			0xc4, 0xd8, 0x2b, 0x69, 0xa6, 0x74, 0x47, 0x69,
+			0x5f, 0x7d, 0xab, 0x75, 0xa9, 0xf9, 0x88, 0x54,
+			0xce, 0x57, 0xcc, 0x9d, 0xac, 0x13, 0x91, 0xdb,
+			0x6d, 0x5c, 0xd8, 0xf4, 0x35, 0xc9, 0x30, 0xf0,
+			0x4b, 0x91, 0x25, 0xab, 0x92, 0xa8, 0xc8, 0x6f,
+			0xa0, 0xeb, 0x71, 0x56, 0x95, 0xab, 0xfd, 0xd7,
+		},
+	},
+	{
+		.data_len = 4096,
+		.digest = {
+			0xe1, 0xe9, 0xbe, 0x6c, 0x96, 0xe2, 0xe8, 0xa6,
+			0x53, 0xcd, 0x79, 0x77, 0x57, 0x51, 0x2f, 0xb2,
+			0x9f, 0xfc, 0x09, 0xaa, 0x2c, 0xbc, 0x6c, 0x5f,
+			0xb0, 0xf2, 0x12, 0x39, 0x54, 0xd7, 0x27, 0xf8,
+			0x33, 0x5d, 0xd4, 0x8a, 0xca, 0xd8, 0x2e, 0xbb,
+			0x02, 0x82, 0xca, 0x1b, 0x54, 0xfa, 0xd6, 0xf4,
+			0x49, 0x63, 0xfc, 0xc8, 0x73, 0xd4, 0x26, 0x8d,
+			0x4f, 0x1c, 0x56, 0xa7, 0xf4, 0x58, 0x6f, 0x51,
+		},
+	},
+	{
+		.data_len = 4128,
+		.digest = {
+			0xf2, 0xf6, 0xe1, 0x16, 0x98, 0x69, 0x74, 0x5f,
+			0x6c, 0xc4, 0x9d, 0x34, 0xa2, 0x84, 0x5d, 0x47,
+			0xac, 0x39, 0xe0, 0x14, 0x2d, 0x78, 0xfa, 0x27,
+			0xd5, 0x18, 0xaf, 0x26, 0x89, 0xa4, 0x69, 0xd3,
+			0x56, 0xde, 0xfe, 0x4b, 0x9f, 0x0c, 0x9d, 0x5a,
+			0x9a, 0x73, 0x3e, 0x3c, 0x76, 0x4b, 0x96, 0xca,
+			0x49, 0xda, 0x05, 0x8c, 0x53, 0xbb, 0x85, 0x89,
+			0x60, 0xc7, 0xe0, 0xb3, 0x51, 0x18, 0xd2, 0xd2,
+		},
+	},
+	{
+		.data_len = 4160,
+		.digest = {
+			0xfc, 0x5c, 0xcf, 0xbf, 0x29, 0xe3, 0x01, 0xef,
+			0x4b, 0x40, 0x70, 0x01, 0xca, 0x4d, 0x46, 0xce,
+			0xa9, 0x95, 0x5d, 0xb4, 0xf1, 0x79, 0x29, 0xdb,
+			0xac, 0x32, 0x3d, 0xd9, 0x60, 0x9e, 0x6b, 0xb8,
+			0x28, 0x62, 0xb7, 0x4a, 0xbb, 0x33, 0xb9, 0xd0,
+			0x83, 0xe0, 0xd7, 0x5a, 0x2d, 0x01, 0x4c, 0x61,
+			0x9e, 0x7d, 0x2d, 0x2d, 0x60, 0x29, 0x5e, 0x60,
+			0x10, 0xb7, 0x41, 0x00, 0x3f, 0xe5, 0xf7, 0x52,
+		},
+	},
+	{
+		.data_len = 4224,
+		.digest = {
+			0xf8, 0xe5, 0x4b, 0xe5, 0x89, 0xf9, 0x1b, 0x43,
+			0xbb, 0x65, 0x3d, 0xa0, 0xb4, 0xdc, 0x04, 0x26,
+			0x68, 0x15, 0xae, 0x4d, 0xd6, 0x03, 0xb7, 0x27,
+			0x06, 0x8c, 0x2a, 0x82, 0x51, 0x96, 0xbf, 0x83,
+			0x38, 0x96, 0x21, 0x8a, 0xd9, 0xf9, 0x4e, 0x38,
+			0xc6, 0xb3, 0xbd, 0xfe, 0xd3, 0x49, 0x90, 0xbc,
+			0xa1, 0x77, 0xd0, 0xa0, 0x3c, 0x2b, 0x4e, 0x10,
+			0x34, 0xc3, 0x17, 0x85, 0x3d, 0xec, 0xa8, 0x05,
+		},
+	},
+	{
+		.data_len = 16384,
+		.digest = {
+			0x38, 0x56, 0xaf, 0x83, 0x68, 0x9c, 0xba, 0xe3,
+			0xec, 0x51, 0xf5, 0xf4, 0x93, 0x48, 0x1d, 0xe6,
+			0xad, 0xa8, 0x8c, 0x70, 0x2a, 0xd9, 0xaa, 0x43,
+			0x04, 0x40, 0x95, 0xc1, 0xe6, 0x8a, 0xf5, 0x01,
+			0x6b, 0x79, 0xd9, 0xb4, 0xd0, 0x1d, 0x93, 0x26,
+			0xfe, 0xf5, 0x07, 0x57, 0xda, 0x08, 0x0a, 0x82,
+			0xc9, 0x17, 0x13, 0x5b, 0x9e, 0x11, 0x96, 0xa5,
+			0xd0, 0x92, 0xcd, 0xf1, 0xa3, 0x5b, 0x43, 0x21,
+		},
+	},
+};
+
+static const u8 hash_testvec_consolidated[BLAKE2B_HASH_SIZE] = {
+	0xa4, 0xf8, 0xf6, 0xa1, 0x36, 0x89, 0xc0, 0x2a,
+	0xc3, 0x42, 0x32, 0x71, 0xe5, 0xea, 0x14, 0x77,
+	0xf3, 0x99, 0x91, 0x87, 0x49, 0xc2, 0x8d, 0xa5,
+	0x2f, 0xed, 0x01, 0x35, 0x39, 0x64, 0x09, 0x25,
+	0xe3, 0xa8, 0x50, 0x97, 0x35, 0x8b, 0xf5, 0x19,
+	0x1e, 0xd5, 0x9f, 0x03, 0x0b, 0x65, 0x55, 0x0e,
+	0xa0, 0xb7, 0xda, 0x18, 0x7b, 0x7f, 0x88, 0x55,
+	0x1f, 0xdb, 0x82, 0x6b, 0x98, 0x90, 0x1c, 0xdd,
+};
+
+static const u8 blake2b_keyed_testvec_consolidated[BLAKE2B_HASH_SIZE] = {
+	0x2b, 0x89, 0x36, 0x3a, 0x36, 0xe4, 0x18, 0x38,
+	0xc4, 0x5b, 0x5c, 0xa5, 0x9a, 0xed, 0xf2, 0xee,
+	0x5a, 0xb6, 0x82, 0x6c, 0x63, 0xf2, 0x29, 0x57,
+	0xc7, 0xd5, 0x32, 0x27, 0xba, 0x88, 0xb1, 0xab,
+	0xf2, 0x2a, 0xc1, 0xea, 0xf3, 0x91, 0x89, 0x66,
+	0x47, 0x1e, 0x5b, 0xc6, 0x98, 0x12, 0xe9, 0x25,
+	0xbf, 0x72, 0xd2, 0x3f, 0x88, 0x97, 0x17, 0x51,
+	0xed, 0x96, 0xfb, 0xe9, 0xca, 0x52, 0x42, 0xc9,
+};
diff --git a/lib/crypto/tests/blake2b_kunit.c b/lib/crypto/tests/blake2b_kunit.c
new file mode 100644
index 000000000000..bc0be7da1e76
--- /dev/null
+++ b/lib/crypto/tests/blake2b_kunit.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/blake2b.h>
+#include "blake2b-testvecs.h"
+
+/*
+ * The following are compatibility functions that present BLAKE2b as an unkeyed
+ * hash function that produces hashes of fixed length BLAKE2B_HASH_SIZE, so that
+ * hash-test-template.h can be reused to test it.
+ */
+
+static void blake2b_default(const u8 *data, size_t len,
+			    u8 out[BLAKE2B_HASH_SIZE])
+{
+	blake2b(NULL, 0, data, len, out, BLAKE2B_HASH_SIZE);
+}
+
+static void blake2b_init_default(struct blake2b_ctx *ctx)
+{
+	blake2b_init(ctx, BLAKE2B_HASH_SIZE);
+}
+
+/*
+ * Generate the HASH_KUNIT_CASES using hash-test-template.h.  These test BLAKE2b
+ * with a key length of 0 and a hash length of BLAKE2B_HASH_SIZE.
+ */
+#define HASH blake2b_default
+#define HASH_CTX blake2b_ctx
+#define HASH_SIZE BLAKE2B_HASH_SIZE
+#define HASH_INIT blake2b_init_default
+#define HASH_UPDATE blake2b_update
+#define HASH_FINAL blake2b_final
+#include "hash-test-template.h"
+
+/*
+ * BLAKE2b specific test case which tests all possible combinations of key
+ * length and hash length.
+ */
+static void test_blake2b_all_key_and_hash_lens(struct kunit *test)
+{
+	const size_t data_len = 100;
+	u8 *data = &test_buf[0];
+	u8 *key = data + data_len;
+	u8 *hash = key + BLAKE2B_KEY_SIZE;
+	struct blake2b_ctx main_ctx;
+	u8 main_hash[BLAKE2B_HASH_SIZE];
+
+	rand_bytes_seeded_from_len(data, data_len);
+	blake2b_init(&main_ctx, BLAKE2B_HASH_SIZE);
+	for (int key_len = 0; key_len <= BLAKE2B_KEY_SIZE; key_len++) {
+		rand_bytes_seeded_from_len(key, key_len);
+		for (int out_len = 1; out_len <= BLAKE2B_HASH_SIZE; out_len++) {
+			blake2b(key, key_len, data, data_len, hash, out_len);
+			blake2b_update(&main_ctx, hash, out_len);
+		}
+	}
+	blake2b_final(&main_ctx, main_hash);
+	KUNIT_ASSERT_MEMEQ(test, main_hash, blake2b_keyed_testvec_consolidated,
+			   BLAKE2B_HASH_SIZE);
+}
+
+/*
+ * BLAKE2b specific test case which tests using a guarded buffer for all allowed
+ * key lengths.  Also tests both blake2b() and blake2b_init_key().
+ */
+static void test_blake2b_with_guarded_key_buf(struct kunit *test)
+{
+	const size_t data_len = 100;
+
+	rand_bytes(test_buf, data_len);
+	for (int key_len = 0; key_len <= BLAKE2B_KEY_SIZE; key_len++) {
+		u8 key[BLAKE2B_KEY_SIZE];
+		u8 *guarded_key = &test_buf[TEST_BUF_LEN - key_len];
+		u8 hash1[BLAKE2B_HASH_SIZE];
+		u8 hash2[BLAKE2B_HASH_SIZE];
+		struct blake2b_ctx ctx;
+
+		rand_bytes(key, key_len);
+		memcpy(guarded_key, key, key_len);
+
+		blake2b(key, key_len, test_buf, data_len,
+			hash1, BLAKE2B_HASH_SIZE);
+		blake2b(guarded_key, key_len, test_buf, data_len,
+			hash2, BLAKE2B_HASH_SIZE);
+		KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2B_HASH_SIZE);
+
+		blake2b_init_key(&ctx, BLAKE2B_HASH_SIZE, guarded_key, key_len);
+		blake2b_update(&ctx, test_buf, data_len);
+		blake2b_final(&ctx, hash2);
+		KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2B_HASH_SIZE);
+	}
+}
+
+/*
+ * BLAKE2b specific test case which tests using a guarded output buffer for all
+ * allowed output lengths.
+ */
+static void test_blake2b_with_guarded_out_buf(struct kunit *test)
+{
+	const size_t data_len = 100;
+
+	rand_bytes(test_buf, data_len);
+	for (int out_len = 1; out_len <= BLAKE2B_HASH_SIZE; out_len++) {
+		u8 hash[BLAKE2B_HASH_SIZE];
+		u8 *guarded_hash = &test_buf[TEST_BUF_LEN - out_len];
+
+		blake2b(NULL, 0, test_buf, data_len, hash, out_len);
+		blake2b(NULL, 0, test_buf, data_len, guarded_hash, out_len);
+		KUNIT_ASSERT_MEMEQ(test, hash, guarded_hash, out_len);
+	}
+}
+
+static struct kunit_case blake2b_test_cases[] = {
+	HASH_KUNIT_CASES,
+	KUNIT_CASE(test_blake2b_all_key_and_hash_lens),
+	KUNIT_CASE(test_blake2b_with_guarded_key_buf),
+	KUNIT_CASE(test_blake2b_with_guarded_out_buf),
+	KUNIT_CASE(benchmark_hash),
+	{},
+};
+
+static struct kunit_suite blake2b_test_suite = {
+	.name = "blake2b",
+	.test_cases = blake2b_test_cases,
+	.suite_init = hash_suite_init,
+	.suite_exit = hash_suite_exit,
+};
+kunit_test_suite(blake2b_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for BLAKE2b");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/blake2s-testvecs.h b/lib/crypto/tests/blake2s-testvecs.h
new file mode 100644
index 000000000000..6f978b79a59b
--- /dev/null
+++ b/lib/crypto/tests/blake2s-testvecs.h
@@ -0,0 +1,238 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py blake2s */
+
+static const struct {
+	size_t data_len;
+	u8 digest[BLAKE2S_HASH_SIZE];
+} hash_testvecs[] = {
+	{
+		.data_len = 0,
+		.digest = {
+			0x69, 0x21, 0x7a, 0x30, 0x79, 0x90, 0x80, 0x94,
+			0xe1, 0x11, 0x21, 0xd0, 0x42, 0x35, 0x4a, 0x7c,
+			0x1f, 0x55, 0xb6, 0x48, 0x2c, 0xa1, 0xa5, 0x1e,
+			0x1b, 0x25, 0x0d, 0xfd, 0x1e, 0xd0, 0xee, 0xf9,
+		},
+	},
+	{
+		.data_len = 1,
+		.digest = {
+			0x7c, 0xab, 0x53, 0xe2, 0x48, 0x87, 0xdf, 0x64,
+			0x98, 0x6a, 0xc1, 0x7e, 0xf0, 0x01, 0x4d, 0xc9,
+			0x07, 0x4f, 0xb8, 0x2f, 0x46, 0xd7, 0xee, 0xa9,
+			0xad, 0xe5, 0xf8, 0x21, 0xac, 0xfe, 0x17, 0x58,
+		},
+	},
+	{
+		.data_len = 2,
+		.digest = {
+			0x5e, 0x63, 0x2c, 0xd0, 0xf8, 0x7b, 0xf5, 0xae,
+			0x61, 0x97, 0x94, 0x57, 0xc8, 0x76, 0x22, 0xd9,
+			0x8b, 0x04, 0x5e, 0xf1, 0x5d, 0xd0, 0xfc, 0xd9,
+			0x0c, 0x19, 0x2e, 0xe2, 0xc5, 0xd9, 0x73, 0x51,
+		},
+	},
+	{
+		.data_len = 3,
+		.digest = {
+			0x33, 0x65, 0xa6, 0x37, 0xbf, 0xf8, 0x4f, 0x15,
+			0x4c, 0xac, 0x9e, 0xa4, 0x3b, 0x02, 0x07, 0x0c,
+			0x80, 0x86, 0x0d, 0x6c, 0xe4, 0xaf, 0x1c, 0xbc,
+			0x0b, 0x9c, 0x0a, 0x98, 0xc2, 0x99, 0x71, 0xcd,
+		},
+	},
+	{
+		.data_len = 16,
+		.digest = {
+			0x59, 0xd2, 0x10, 0xd3, 0x75, 0xac, 0x48, 0x32,
+			0xb1, 0xea, 0xee, 0xcf, 0x0a, 0xd2, 0x8b, 0x15,
+			0x5d, 0x72, 0x71, 0x4c, 0xa7, 0x29, 0xb0, 0x7a,
+			0x44, 0x48, 0x8a, 0x54, 0x54, 0x54, 0x41, 0xf5,
+		},
+	},
+	{
+		.data_len = 32,
+		.digest = {
+			0xdc, 0xfc, 0x46, 0x81, 0xc6, 0x1b, 0x2b, 0x47,
+			0x8b, 0xed, 0xe0, 0x73, 0x34, 0x38, 0x53, 0x92,
+			0x97, 0x2f, 0xfb, 0x51, 0xab, 0x4f, 0x2d, 0x9d,
+			0x69, 0x04, 0xa9, 0x5d, 0x33, 0xef, 0xcb, 0x1c,
+		},
+	},
+	{
+		.data_len = 48,
+		.digest = {
+			0xd6, 0x2a, 0x7f, 0x96, 0x04, 0x4d, 0x16, 0xc8,
+			0x49, 0xe0, 0x37, 0x33, 0xe3, 0x7b, 0x34, 0x56,
+			0x99, 0xc5, 0x78, 0x57, 0x06, 0x02, 0xb4, 0xea,
+			0x80, 0xc4, 0xf8, 0x8f, 0x8d, 0x2b, 0xe4, 0x05,
+		},
+	},
+	{
+		.data_len = 49,
+		.digest = {
+			0x8b, 0x58, 0x62, 0xb5, 0x85, 0xf6, 0x83, 0x36,
+			0xf5, 0x34, 0xb8, 0xd4, 0xbc, 0x5c, 0x8b, 0x38,
+			0xfd, 0x15, 0xcd, 0x44, 0x83, 0x25, 0x71, 0xe1,
+			0xd5, 0xe8, 0xa1, 0xa4, 0x36, 0x98, 0x7e, 0x68,
+		},
+	},
+	{
+		.data_len = 63,
+		.digest = {
+			0x7e, 0xeb, 0x06, 0x87, 0xdf, 0x1a, 0xdc, 0xe5,
+			0xfb, 0x64, 0xd4, 0xd1, 0x5d, 0x9e, 0x75, 0xc0,
+			0xb9, 0xad, 0x55, 0x6c, 0xe6, 0xba, 0x4d, 0x98,
+			0x2f, 0xbf, 0x72, 0xad, 0x61, 0x37, 0xf6, 0x11,
+		},
+	},
+	{
+		.data_len = 64,
+		.digest = {
+			0x72, 0xdb, 0x43, 0x16, 0x57, 0x8e, 0x3a, 0x96,
+			0xf3, 0x98, 0x19, 0x24, 0x17, 0x3b, 0xe8, 0xad,
+			0xa1, 0x9b, 0xa4, 0x1b, 0x74, 0x85, 0x2e, 0x24,
+			0x70, 0xea, 0x31, 0x5a, 0x1c, 0xbe, 0x43, 0xb5,
+		},
+	},
+	{
+		.data_len = 65,
+		.digest = {
+			0x32, 0x48, 0xb0, 0xf0, 0x3f, 0xbb, 0xd2, 0xa3,
+			0xfd, 0xf6, 0x28, 0x4a, 0x2a, 0xc5, 0xbe, 0x4b,
+			0x73, 0x50, 0x63, 0xd6, 0x16, 0x00, 0xef, 0xed,
+			0xfe, 0x97, 0x41, 0x29, 0xb2, 0x84, 0xc4, 0xa3,
+		},
+	},
+	{
+		.data_len = 127,
+		.digest = {
+			0x17, 0xda, 0x6b, 0x96, 0x6a, 0xa6, 0xa4, 0xa6,
+			0xa6, 0xf3, 0x9d, 0x18, 0x19, 0x8d, 0x98, 0x7c,
+			0x66, 0x38, 0xe8, 0x99, 0xe7, 0x0a, 0x50, 0x92,
+			0xaf, 0x11, 0x80, 0x05, 0x66, 0xed, 0xab, 0x74,
+		},
+	},
+	{
+		.data_len = 128,
+		.digest = {
+			0x13, 0xd5, 0x8b, 0x22, 0xae, 0x90, 0x7b, 0x67,
+			0x87, 0x4e, 0x3c, 0x35, 0x4e, 0x01, 0xf0, 0xb1,
+			0xd3, 0xd1, 0x67, 0xbb, 0x43, 0xdb, 0x7c, 0x75,
+			0xa4, 0xc7, 0x64, 0x83, 0x1e, 0x9b, 0x98, 0xad,
+		},
+	},
+	{
+		.data_len = 129,
+		.digest = {
+			0x6f, 0xe0, 0x5d, 0x9d, 0xd5, 0x78, 0x29, 0xfb,
+			0xd0, 0x77, 0xd1, 0x8a, 0xf0, 0x80, 0xcb, 0x81,
+			0x71, 0x9e, 0x4d, 0x49, 0xde, 0x74, 0x2a, 0x37,
+			0xc0, 0xd5, 0xf0, 0xfa, 0x50, 0xe6, 0x23, 0xfe,
+		},
+	},
+	{
+		.data_len = 256,
+		.digest = {
+			0x89, 0xac, 0xf6, 0xe7, 0x5e, 0xba, 0x53, 0xf4,
+			0x92, 0x32, 0xd5, 0x64, 0xfb, 0xc4, 0x08, 0xac,
+			0x2c, 0x19, 0x6e, 0x63, 0x13, 0x75, 0xd0, 0x60,
+			0x54, 0x35, 0x82, 0xc4, 0x6d, 0x03, 0x1a, 0x05,
+		},
+	},
+	{
+		.data_len = 511,
+		.digest = {
+			0x1c, 0xaf, 0x94, 0x7d, 0x9c, 0xce, 0x57, 0x64,
+			0xf8, 0xa8, 0x25, 0x45, 0x32, 0x86, 0x2b, 0x04,
+			0xb3, 0x2e, 0x67, 0xca, 0x73, 0x04, 0x2f, 0xab,
+			0xcc, 0xda, 0x9e, 0x42, 0xa1, 0xaf, 0x83, 0x5a,
+		},
+	},
+	{
+		.data_len = 513,
+		.digest = {
+			0x21, 0xdf, 0xdc, 0x29, 0xd9, 0xfc, 0x7b, 0xe7,
+			0x3a, 0xc4, 0xe1, 0x61, 0xc5, 0xb5, 0xe1, 0xee,
+			0x7a, 0x9d, 0x0c, 0x66, 0x36, 0x63, 0xe4, 0x12,
+			0x62, 0xe2, 0xf5, 0x68, 0x72, 0xfc, 0x1e, 0x18,
+		},
+	},
+	{
+		.data_len = 1000,
+		.digest = {
+			0x6e, 0xc7, 0x2e, 0xac, 0xd0, 0xbb, 0x22, 0xe0,
+			0xc2, 0x40, 0xb2, 0xfe, 0x8c, 0xaf, 0x9e, 0xcf,
+			0x32, 0x06, 0xc6, 0x45, 0x29, 0xbd, 0xe0, 0x7f,
+			0x53, 0x32, 0xc3, 0x2b, 0x2f, 0x68, 0x12, 0xcd,
+		},
+	},
+	{
+		.data_len = 3333,
+		.digest = {
+			0x76, 0xba, 0x52, 0xb5, 0x09, 0xf5, 0x19, 0x09,
+			0x70, 0x1c, 0x09, 0x28, 0xb4, 0xaa, 0x98, 0x6a,
+			0x79, 0xe7, 0x5e, 0xcd, 0xe8, 0xa4, 0x73, 0x69,
+			0x1f, 0xf8, 0x05, 0x0a, 0xb4, 0xfe, 0xf9, 0x63,
+		},
+	},
+	{
+		.data_len = 4096,
+		.digest = {
+			0xf7, 0xad, 0xf9, 0xc8, 0x0e, 0x04, 0x2f, 0xdf,
+			0xbe, 0x39, 0x79, 0x07, 0x0d, 0xd8, 0x1b, 0x06,
+			0x42, 0x3a, 0x43, 0x93, 0xf6, 0x7c, 0xc4, 0xe5,
+			0xc2, 0xd5, 0xd0, 0xa6, 0x35, 0x6c, 0xbd, 0x17,
+		},
+	},
+	{
+		.data_len = 4128,
+		.digest = {
+			0x38, 0xd7, 0xab, 0x7e, 0x08, 0xdc, 0x1e, 0xab,
+			0x55, 0xbb, 0x3b, 0x7b, 0x6a, 0x17, 0xcc, 0x79,
+			0xa7, 0x02, 0x62, 0x66, 0x9b, 0xca, 0xee, 0xc0,
+			0x3d, 0x75, 0x34, 0x2e, 0x55, 0x82, 0x26, 0x3c,
+		},
+	},
+	{
+		.data_len = 4160,
+		.digest = {
+			0xf7, 0xeb, 0x2f, 0x24, 0x98, 0x54, 0x04, 0x5a,
+			0x19, 0xe4, 0x12, 0x9d, 0x97, 0xbc, 0x87, 0xa5,
+			0x0b, 0x85, 0x29, 0xa1, 0x36, 0x89, 0xc9, 0xba,
+			0xa0, 0xe0, 0xac, 0x99, 0x7d, 0xa4, 0x51, 0x9f,
+		},
+	},
+	{
+		.data_len = 4224,
+		.digest = {
+			0x8f, 0xe8, 0xa7, 0x79, 0x02, 0xbb, 0x4a, 0x56,
+			0x66, 0x91, 0xef, 0x22, 0xd1, 0x09, 0x26, 0x6c,
+			0xa9, 0x13, 0xd7, 0x44, 0xc7, 0x19, 0x9c, 0x0b,
+			0xfb, 0x4f, 0xca, 0x72, 0x8f, 0x34, 0xf7, 0x82,
+		},
+	},
+	{
+		.data_len = 16384,
+		.digest = {
+			0xaa, 0x21, 0xbb, 0x25, 0x4b, 0x66, 0x6e, 0x29,
+			0x71, 0xc1, 0x44, 0x67, 0x19, 0xed, 0xe6, 0xe6,
+			0x61, 0x13, 0xf4, 0xb7, 0x02, 0x94, 0x81, 0x0f,
+			0xa7, 0x4d, 0xbb, 0x2c, 0xb8, 0xeb, 0x41, 0x0e,
+		},
+	},
+};
+
+static const u8 hash_testvec_consolidated[BLAKE2S_HASH_SIZE] = {
+	0x84, 0x21, 0xbb, 0x73, 0x64, 0x47, 0x45, 0xe0,
+	0xc1, 0x83, 0x78, 0xf1, 0xea, 0xe5, 0xfd, 0xdb,
+	0x01, 0xda, 0xb7, 0x86, 0x70, 0x3b, 0x83, 0xb3,
+	0xbc, 0xd9, 0xfd, 0x96, 0xbd, 0x50, 0x06, 0x67,
+};
+
+static const u8 blake2s_keyed_testvec_consolidated[BLAKE2S_HASH_SIZE] = {
+	0xa6, 0xad, 0xcd, 0xb8, 0xd9, 0xdd, 0xc7, 0x70,
+	0x07, 0x09, 0x7f, 0x9f, 0x41, 0xa9, 0x70, 0xa4,
+	0x1c, 0xca, 0x61, 0xbb, 0x58, 0xb5, 0xb2, 0x1d,
+	0xd1, 0x71, 0x16, 0xb0, 0x49, 0x4f, 0x9e, 0x1b,
+};
diff --git a/lib/crypto/tests/blake2s_kunit.c b/lib/crypto/tests/blake2s_kunit.c
new file mode 100644
index 000000000000..6832d9aa7b82
--- /dev/null
+++ b/lib/crypto/tests/blake2s_kunit.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/blake2s.h>
+#include "blake2s-testvecs.h"
+
+/*
+ * The following are compatibility functions that present BLAKE2s as an unkeyed
+ * hash function that produces hashes of fixed length BLAKE2S_HASH_SIZE, so that
+ * hash-test-template.h can be reused to test it.
+ */
+
+static void blake2s_default(const u8 *data, size_t len,
+			    u8 out[BLAKE2S_HASH_SIZE])
+{
+	blake2s(NULL, 0, data, len, out, BLAKE2S_HASH_SIZE);
+}
+
+static void blake2s_init_default(struct blake2s_ctx *ctx)
+{
+	blake2s_init(ctx, BLAKE2S_HASH_SIZE);
+}
+
+/*
+ * Generate the HASH_KUNIT_CASES using hash-test-template.h.  These test BLAKE2s
+ * with a key length of 0 and a hash length of BLAKE2S_HASH_SIZE.
+ */
+#define HASH blake2s_default
+#define HASH_CTX blake2s_ctx
+#define HASH_SIZE BLAKE2S_HASH_SIZE
+#define HASH_INIT blake2s_init_default
+#define HASH_UPDATE blake2s_update
+#define HASH_FINAL blake2s_final
+#include "hash-test-template.h"
+
+/*
+ * BLAKE2s specific test case which tests all possible combinations of key
+ * length and hash length.
+ */
+static void test_blake2s_all_key_and_hash_lens(struct kunit *test)
+{
+	const size_t data_len = 100;
+	u8 *data = &test_buf[0];
+	u8 *key = data + data_len;
+	u8 *hash = key + BLAKE2S_KEY_SIZE;
+	struct blake2s_ctx main_ctx;
+	u8 main_hash[BLAKE2S_HASH_SIZE];
+
+	rand_bytes_seeded_from_len(data, data_len);
+	blake2s_init(&main_ctx, BLAKE2S_HASH_SIZE);
+	for (int key_len = 0; key_len <= BLAKE2S_KEY_SIZE; key_len++) {
+		rand_bytes_seeded_from_len(key, key_len);
+		for (int out_len = 1; out_len <= BLAKE2S_HASH_SIZE; out_len++) {
+			blake2s(key, key_len, data, data_len, hash, out_len);
+			blake2s_update(&main_ctx, hash, out_len);
+		}
+	}
+	blake2s_final(&main_ctx, main_hash);
+	KUNIT_ASSERT_MEMEQ(test, main_hash, blake2s_keyed_testvec_consolidated,
+			   BLAKE2S_HASH_SIZE);
+}
+
+/*
+ * BLAKE2s specific test case which tests using a guarded buffer for all allowed
+ * key lengths.  Also tests both blake2s() and blake2s_init_key().
+ */
+static void test_blake2s_with_guarded_key_buf(struct kunit *test)
+{
+	const size_t data_len = 100;
+
+	rand_bytes(test_buf, data_len);
+	for (int key_len = 0; key_len <= BLAKE2S_KEY_SIZE; key_len++) {
+		u8 key[BLAKE2S_KEY_SIZE];
+		u8 *guarded_key = &test_buf[TEST_BUF_LEN - key_len];
+		u8 hash1[BLAKE2S_HASH_SIZE];
+		u8 hash2[BLAKE2S_HASH_SIZE];
+		struct blake2s_ctx ctx;
+
+		rand_bytes(key, key_len);
+		memcpy(guarded_key, key, key_len);
+
+		blake2s(key, key_len, test_buf, data_len,
+			hash1, BLAKE2S_HASH_SIZE);
+		blake2s(guarded_key, key_len, test_buf, data_len,
+			hash2, BLAKE2S_HASH_SIZE);
+		KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE);
+
+		blake2s_init_key(&ctx, BLAKE2S_HASH_SIZE, guarded_key, key_len);
+		blake2s_update(&ctx, test_buf, data_len);
+		blake2s_final(&ctx, hash2);
+		KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE);
+	}
+}
+
+/*
+ * BLAKE2s specific test case which tests using a guarded output buffer for all
+ * allowed output lengths.
+ */
+static void test_blake2s_with_guarded_out_buf(struct kunit *test)
+{
+	const size_t data_len = 100;
+
+	rand_bytes(test_buf, data_len);
+	for (int out_len = 1; out_len <= BLAKE2S_HASH_SIZE; out_len++) {
+		u8 hash[BLAKE2S_HASH_SIZE];
+		u8 *guarded_hash = &test_buf[TEST_BUF_LEN - out_len];
+
+		blake2s(NULL, 0, test_buf, data_len, hash, out_len);
+		blake2s(NULL, 0, test_buf, data_len, guarded_hash, out_len);
+		KUNIT_ASSERT_MEMEQ(test, hash, guarded_hash, out_len);
+	}
+}
+
+static struct kunit_case blake2s_test_cases[] = {
+	HASH_KUNIT_CASES,
+	KUNIT_CASE(test_blake2s_all_key_and_hash_lens),
+	KUNIT_CASE(test_blake2s_with_guarded_key_buf),
+	KUNIT_CASE(test_blake2s_with_guarded_out_buf),
+	KUNIT_CASE(benchmark_hash),
+	{},
+};
+
+static struct kunit_suite blake2s_test_suite = {
+	.name = "blake2s",
+	.test_cases = blake2s_test_cases,
+	.suite_init = hash_suite_init,
+	.suite_exit = hash_suite_exit,
+};
+kunit_test_suite(blake2s_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for BLAKE2s");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/curve25519-selftest.c b/lib/crypto/tests/curve25519_kunit.c
index c85e85381e78..248d05f66b35 100644
--- a/lib/crypto/curve25519-selftest.c
+++ b/lib/crypto/tests/curve25519_kunit.c
@@ -4,6 +4,8 @@
  */
 
 #include <crypto/curve25519.h>
+#include <kunit/test.h>
+#include <linux/timekeeping.h>
 
 struct curve25519_test_vector {
 	u8 private[CURVE25519_KEY_SIZE];
@@ -11,7 +13,7 @@ struct curve25519_test_vector {
 	u8 result[CURVE25519_KEY_SIZE];
 	bool valid;
 };
-static const struct curve25519_test_vector curve25519_test_vectors[] __initconst = {
+static const struct curve25519_test_vector curve25519_test_vectors[] = {
 	{
 		.private = { 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d,
 			     0x3c, 0x16, 0xc1, 0x72, 0x51, 0xb2, 0x66, 0x45,
@@ -1280,42 +1282,82 @@ static const struct curve25519_test_vector curve25519_test_vectors[] __initconst
 	}
 };
 
-bool __init curve25519_selftest(void)
+static void test_curve25519(struct kunit *test)
 {
-	bool success = true, ret, ret2;
-	size_t i = 0, j;
-	u8 in[CURVE25519_KEY_SIZE];
-	u8 out[CURVE25519_KEY_SIZE], out2[CURVE25519_KEY_SIZE],
-	   out3[CURVE25519_KEY_SIZE];
+	for (size_t i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) {
+		const struct curve25519_test_vector *vec =
+			&curve25519_test_vectors[i];
+		u8 out[CURVE25519_KEY_SIZE] = {};
+		bool ret;
 
-	for (i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) {
-		memset(out, 0, CURVE25519_KEY_SIZE);
-		ret = curve25519(out, curve25519_test_vectors[i].private,
-				 curve25519_test_vectors[i].public);
-		if (ret != curve25519_test_vectors[i].valid ||
-		    memcmp(out, curve25519_test_vectors[i].result,
-			   CURVE25519_KEY_SIZE)) {
-			pr_err("curve25519 self-test %zu: FAIL\n", i + 1);
-			success = false;
-		}
+		ret = curve25519(out, vec->private, vec->public);
+		KUNIT_EXPECT_EQ_MSG(test, ret, vec->valid,
+				    "Wrong return value with test vector %zu",
+				    i);
+		KUNIT_EXPECT_MEMEQ_MSG(test, out, vec->result, sizeof(out),
+				       "Wrong output with test vector %zu", i);
 	}
+}
+
+static void test_curve25519_basepoint(struct kunit *test)
+{
+	for (size_t i = 0; i < 5; ++i) {
+		u8 in[CURVE25519_KEY_SIZE];
+		u8 out[CURVE25519_KEY_SIZE];
+		u8 out2[CURVE25519_KEY_SIZE];
+		bool ret, ret2;
 
-	for (i = 0; i < 5; ++i) {
 		get_random_bytes(in, sizeof(in));
 		ret = curve25519_generate_public(out, in);
 		ret2 = curve25519(out2, in, (u8[CURVE25519_KEY_SIZE]){ 9 });
-		curve25519_generic(out3, in, (u8[CURVE25519_KEY_SIZE]){ 9 });
-		if (ret != ret2 ||
-		    memcmp(out, out2, CURVE25519_KEY_SIZE) ||
-		    memcmp(out, out3, CURVE25519_KEY_SIZE)) {
-			pr_err("curve25519 basepoint self-test %zu: FAIL: input - 0x",
-			       i + 1);
-			for (j = CURVE25519_KEY_SIZE; j-- > 0;)
-				printk(KERN_CONT "%02x", in[j]);
-			printk(KERN_CONT "\n");
-			success = false;
-		}
+		KUNIT_EXPECT_EQ_MSG(test, ret, ret2,
+				    "in=%*phN", CURVE25519_KEY_SIZE, in);
+		KUNIT_EXPECT_MEMEQ_MSG(test, out, out2, CURVE25519_KEY_SIZE,
+				       "in=%*phN", CURVE25519_KEY_SIZE, in);
 	}
+}
+
+static void benchmark_curve25519(struct kunit *test)
+{
+	const u8 *private = curve25519_test_vectors[0].private;
+	const u8 *public = curve25519_test_vectors[0].public;
+	const size_t warmup_niter = 5000;
+	const size_t benchmark_niter = 1024;
+	u8 out[CURVE25519_KEY_SIZE];
+	bool ok = true;
+	u64 t;
+
+	if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK))
+		kunit_skip(test, "not enabled");
 
-	return success;
+	/* Warm-up */
+	for (size_t i = 0; i < warmup_niter; i++)
+		ok &= curve25519(out, private, public);
+
+	/* Benchmark */
+	preempt_disable();
+	t = ktime_get_ns();
+	for (size_t i = 0; i < benchmark_niter; i++)
+		ok &= curve25519(out, private, public);
+	t = ktime_get_ns() - t;
+	preempt_enable();
+	KUNIT_EXPECT_TRUE(test, ok);
+	kunit_info(test, "%llu ops/s",
+		   div64_u64((u64)benchmark_niter * NSEC_PER_SEC, t ?: 1));
 }
+
+static struct kunit_case curve25519_test_cases[] = {
+	KUNIT_CASE(test_curve25519),
+	KUNIT_CASE(test_curve25519_basepoint),
+	KUNIT_CASE(benchmark_curve25519),
+	{},
+};
+
+static struct kunit_suite curve25519_test_suite = {
+	.name = "curve25519",
+	.test_cases = curve25519_test_cases,
+};
+kunit_test_suite(curve25519_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for Curve25519");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/hash-test-template.h b/lib/crypto/tests/hash-test-template.h
new file mode 100644
index 000000000000..61b43e62779f
--- /dev/null
+++ b/lib/crypto/tests/hash-test-template.h
@@ -0,0 +1,568 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Test cases for hash functions, including a benchmark.  This is included by
+ * KUnit test suites that want to use it.  See sha512_kunit.c for an example.
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <kunit/run-in-irq-context.h>
+#include <kunit/test.h>
+#include <linux/vmalloc.h>
+
+/* test_buf is a guarded buffer, i.e. &test_buf[TEST_BUF_LEN] is not mapped. */
+#define TEST_BUF_LEN 16384
+static u8 *test_buf;
+
+static u8 *orig_test_buf;
+
+static u64 random_seed;
+
+/*
+ * This is a simple linear congruential generator.  It is used only for testing,
+ * which does not require cryptographically secure random numbers.  A hard-coded
+ * algorithm is used instead of <linux/prandom.h> so that it matches the
+ * algorithm used by the test vector generation script.  This allows the input
+ * data in random test vectors to be concisely stored as just the seed.
+ */
+static u32 rand32(void)
+{
+	random_seed = (random_seed * 25214903917 + 11) & ((1ULL << 48) - 1);
+	return random_seed >> 16;
+}
+
+static void rand_bytes(u8 *out, size_t len)
+{
+	for (size_t i = 0; i < len; i++)
+		out[i] = rand32();
+}
+
+static void rand_bytes_seeded_from_len(u8 *out, size_t len)
+{
+	random_seed = len;
+	rand_bytes(out, len);
+}
+
+static bool rand_bool(void)
+{
+	return rand32() % 2;
+}
+
+/* Generate a random length, preferring small lengths. */
+static size_t rand_length(size_t max_len)
+{
+	size_t len;
+
+	switch (rand32() % 3) {
+	case 0:
+		len = rand32() % 128;
+		break;
+	case 1:
+		len = rand32() % 3072;
+		break;
+	default:
+		len = rand32();
+		break;
+	}
+	return len % (max_len + 1);
+}
+
+static size_t rand_offset(size_t max_offset)
+{
+	return min(rand32() % 128, max_offset);
+}
+
+static int hash_suite_init(struct kunit_suite *suite)
+{
+	/*
+	 * Allocate the test buffer using vmalloc() with a page-aligned length
+	 * so that it is immediately followed by a guard page.  This allows
+	 * buffer overreads to be detected, even in assembly code.
+	 */
+	size_t alloc_len = round_up(TEST_BUF_LEN, PAGE_SIZE);
+
+	orig_test_buf = vmalloc(alloc_len);
+	if (!orig_test_buf)
+		return -ENOMEM;
+
+	test_buf = orig_test_buf + alloc_len - TEST_BUF_LEN;
+	return 0;
+}
+
+static void hash_suite_exit(struct kunit_suite *suite)
+{
+	vfree(orig_test_buf);
+	orig_test_buf = NULL;
+	test_buf = NULL;
+}
+
+/*
+ * Test the hash function against a list of test vectors.
+ *
+ * Note that it's only necessary to run each test vector in one way (e.g.,
+ * one-shot instead of incremental), since consistency between different ways of
+ * using the APIs is verified by other test cases.
+ */
+static void test_hash_test_vectors(struct kunit *test)
+{
+	for (size_t i = 0; i < ARRAY_SIZE(hash_testvecs); i++) {
+		size_t data_len = hash_testvecs[i].data_len;
+		u8 actual_hash[HASH_SIZE];
+
+		KUNIT_ASSERT_LE(test, data_len, TEST_BUF_LEN);
+		rand_bytes_seeded_from_len(test_buf, data_len);
+
+		HASH(test_buf, data_len, actual_hash);
+		KUNIT_ASSERT_MEMEQ_MSG(
+			test, actual_hash, hash_testvecs[i].digest, HASH_SIZE,
+			"Wrong result with test vector %zu; data_len=%zu", i,
+			data_len);
+	}
+}
+
+/*
+ * Test that the hash function produces correct results for *every* length up to
+ * 4096 bytes.  To do this, generate seeded random data, then calculate a hash
+ * value for each length 0..4096, then hash the hash values.  Verify just the
+ * final hash value, which should match only when all hash values were correct.
+ */
+static void test_hash_all_lens_up_to_4096(struct kunit *test)
+{
+	struct HASH_CTX ctx;
+	u8 hash[HASH_SIZE];
+
+	static_assert(TEST_BUF_LEN >= 4096);
+	rand_bytes_seeded_from_len(test_buf, 4096);
+	HASH_INIT(&ctx);
+	for (size_t len = 0; len <= 4096; len++) {
+		HASH(test_buf, len, hash);
+		HASH_UPDATE(&ctx, hash, HASH_SIZE);
+	}
+	HASH_FINAL(&ctx, hash);
+	KUNIT_ASSERT_MEMEQ(test, hash, hash_testvec_consolidated, HASH_SIZE);
+}
+
+/*
+ * Test that the hash function produces the same result with a one-shot
+ * computation as it does with an incremental computation.
+ */
+static void test_hash_incremental_updates(struct kunit *test)
+{
+	for (int i = 0; i < 1000; i++) {
+		size_t total_len, offset;
+		struct HASH_CTX ctx;
+		u8 hash1[HASH_SIZE];
+		u8 hash2[HASH_SIZE];
+		size_t num_parts = 0;
+		size_t remaining_len, cur_offset;
+
+		total_len = rand_length(TEST_BUF_LEN);
+		offset = rand_offset(TEST_BUF_LEN - total_len);
+		rand_bytes(&test_buf[offset], total_len);
+
+		/* Compute the hash value in one shot. */
+		HASH(&test_buf[offset], total_len, hash1);
+
+		/*
+		 * Compute the hash value incrementally, using a randomly
+		 * selected sequence of update lengths that sum to total_len.
+		 */
+		HASH_INIT(&ctx);
+		remaining_len = total_len;
+		cur_offset = offset;
+		while (rand_bool()) {
+			size_t part_len = rand_length(remaining_len);
+
+			HASH_UPDATE(&ctx, &test_buf[cur_offset], part_len);
+			num_parts++;
+			cur_offset += part_len;
+			remaining_len -= part_len;
+		}
+		if (remaining_len != 0 || rand_bool()) {
+			HASH_UPDATE(&ctx, &test_buf[cur_offset], remaining_len);
+			num_parts++;
+		}
+		HASH_FINAL(&ctx, hash2);
+
+		/* Verify that the two hash values are the same. */
+		KUNIT_ASSERT_MEMEQ_MSG(
+			test, hash1, hash2, HASH_SIZE,
+			"Incremental test failed with total_len=%zu num_parts=%zu offset=%zu",
+			total_len, num_parts, offset);
+	}
+}
+
+/*
+ * Test that the hash function does not overrun any buffers.  Uses a guard page
+ * to catch buffer overruns even if they occur in assembly code.
+ */
+static void test_hash_buffer_overruns(struct kunit *test)
+{
+	const size_t max_tested_len = TEST_BUF_LEN - sizeof(struct HASH_CTX);
+	void *const buf_end = &test_buf[TEST_BUF_LEN];
+	struct HASH_CTX *guarded_ctx = buf_end - sizeof(*guarded_ctx);
+
+	rand_bytes(test_buf, TEST_BUF_LEN);
+
+	for (int i = 0; i < 100; i++) {
+		size_t len = rand_length(max_tested_len);
+		struct HASH_CTX ctx;
+		u8 hash[HASH_SIZE];
+
+		/* Check for overruns of the data buffer. */
+		HASH(buf_end - len, len, hash);
+		HASH_INIT(&ctx);
+		HASH_UPDATE(&ctx, buf_end - len, len);
+		HASH_FINAL(&ctx, hash);
+
+		/* Check for overruns of the hash value buffer. */
+		HASH(test_buf, len, buf_end - HASH_SIZE);
+		HASH_INIT(&ctx);
+		HASH_UPDATE(&ctx, test_buf, len);
+		HASH_FINAL(&ctx, buf_end - HASH_SIZE);
+
+		/* Check for overuns of the hash context. */
+		HASH_INIT(guarded_ctx);
+		HASH_UPDATE(guarded_ctx, test_buf, len);
+		HASH_FINAL(guarded_ctx, hash);
+	}
+}
+
+/*
+ * Test that the caller is permitted to alias the output digest and source data
+ * buffer, and also modify the source data buffer after it has been used.
+ */
+static void test_hash_overlaps(struct kunit *test)
+{
+	const size_t max_tested_len = TEST_BUF_LEN - HASH_SIZE;
+	struct HASH_CTX ctx;
+	u8 hash[HASH_SIZE];
+
+	rand_bytes(test_buf, TEST_BUF_LEN);
+
+	for (int i = 0; i < 100; i++) {
+		size_t len = rand_length(max_tested_len);
+		size_t offset = HASH_SIZE + rand_offset(max_tested_len - len);
+		bool left_end = rand_bool();
+		u8 *ovl_hash = left_end ? &test_buf[offset] :
+					  &test_buf[offset + len - HASH_SIZE];
+
+		HASH(&test_buf[offset], len, hash);
+		HASH(&test_buf[offset], len, ovl_hash);
+		KUNIT_ASSERT_MEMEQ_MSG(
+			test, hash, ovl_hash, HASH_SIZE,
+			"Overlap test 1 failed with len=%zu offset=%zu left_end=%d",
+			len, offset, left_end);
+
+		/* Repeat the above test, but this time use init+update+final */
+		HASH(&test_buf[offset], len, hash);
+		HASH_INIT(&ctx);
+		HASH_UPDATE(&ctx, &test_buf[offset], len);
+		HASH_FINAL(&ctx, ovl_hash);
+		KUNIT_ASSERT_MEMEQ_MSG(
+			test, hash, ovl_hash, HASH_SIZE,
+			"Overlap test 2 failed with len=%zu offset=%zu left_end=%d",
+			len, offset, left_end);
+
+		/* Test modifying the source data after it was used. */
+		HASH(&test_buf[offset], len, hash);
+		HASH_INIT(&ctx);
+		HASH_UPDATE(&ctx, &test_buf[offset], len);
+		rand_bytes(&test_buf[offset], len);
+		HASH_FINAL(&ctx, ovl_hash);
+		KUNIT_ASSERT_MEMEQ_MSG(
+			test, hash, ovl_hash, HASH_SIZE,
+			"Overlap test 3 failed with len=%zu offset=%zu left_end=%d",
+			len, offset, left_end);
+	}
+}
+
+/*
+ * Test that if the same data is hashed at different alignments in memory, the
+ * results are the same.
+ */
+static void test_hash_alignment_consistency(struct kunit *test)
+{
+	u8 hash1[128 + HASH_SIZE];
+	u8 hash2[128 + HASH_SIZE];
+
+	for (int i = 0; i < 100; i++) {
+		size_t len = rand_length(TEST_BUF_LEN);
+		size_t data_offs1 = rand_offset(TEST_BUF_LEN - len);
+		size_t data_offs2 = rand_offset(TEST_BUF_LEN - len);
+		size_t hash_offs1 = rand_offset(128);
+		size_t hash_offs2 = rand_offset(128);
+
+		rand_bytes(&test_buf[data_offs1], len);
+		HASH(&test_buf[data_offs1], len, &hash1[hash_offs1]);
+		memmove(&test_buf[data_offs2], &test_buf[data_offs1], len);
+		HASH(&test_buf[data_offs2], len, &hash2[hash_offs2]);
+		KUNIT_ASSERT_MEMEQ_MSG(
+			test, &hash1[hash_offs1], &hash2[hash_offs2], HASH_SIZE,
+			"Alignment consistency test failed with len=%zu data_offs=(%zu,%zu) hash_offs=(%zu,%zu)",
+			len, data_offs1, data_offs2, hash_offs1, hash_offs2);
+	}
+}
+
+/* Test that HASH_FINAL zeroizes the context. */
+static void test_hash_ctx_zeroization(struct kunit *test)
+{
+	static const u8 zeroes[sizeof(struct HASH_CTX)];
+	struct HASH_CTX ctx;
+
+	rand_bytes(test_buf, 128);
+	HASH_INIT(&ctx);
+	HASH_UPDATE(&ctx, test_buf, 128);
+	HASH_FINAL(&ctx, test_buf);
+	KUNIT_ASSERT_MEMEQ_MSG(test, &ctx, zeroes, sizeof(ctx),
+			       "Hash context was not zeroized by finalization");
+}
+
+#define IRQ_TEST_DATA_LEN 256
+#define IRQ_TEST_NUM_BUFFERS 3 /* matches max concurrency level */
+
+struct hash_irq_test1_state {
+	u8 expected_hashes[IRQ_TEST_NUM_BUFFERS][HASH_SIZE];
+	atomic_t seqno;
+};
+
+/*
+ * Compute the hash of one of the test messages and verify that it matches the
+ * expected hash from @state->expected_hashes.  To increase the chance of
+ * detecting problems, cycle through multiple messages.
+ */
+static bool hash_irq_test1_func(void *state_)
+{
+	struct hash_irq_test1_state *state = state_;
+	u32 i = (u32)atomic_inc_return(&state->seqno) % IRQ_TEST_NUM_BUFFERS;
+	u8 actual_hash[HASH_SIZE];
+
+	HASH(&test_buf[i * IRQ_TEST_DATA_LEN], IRQ_TEST_DATA_LEN, actual_hash);
+	return memcmp(actual_hash, state->expected_hashes[i], HASH_SIZE) == 0;
+}
+
+/*
+ * Test that if hashes are computed in task, softirq, and hardirq context
+ * concurrently, then all results are as expected.
+ */
+static void test_hash_interrupt_context_1(struct kunit *test)
+{
+	struct hash_irq_test1_state state = {};
+
+	/* Prepare some test messages and compute the expected hash of each. */
+	rand_bytes(test_buf, IRQ_TEST_NUM_BUFFERS * IRQ_TEST_DATA_LEN);
+	for (int i = 0; i < IRQ_TEST_NUM_BUFFERS; i++)
+		HASH(&test_buf[i * IRQ_TEST_DATA_LEN], IRQ_TEST_DATA_LEN,
+		     state.expected_hashes[i]);
+
+	kunit_run_irq_test(test, hash_irq_test1_func, 100000, &state);
+}
+
+struct hash_irq_test2_hash_ctx {
+	struct HASH_CTX hash_ctx;
+	atomic_t in_use;
+	int offset;
+	int step;
+};
+
+struct hash_irq_test2_state {
+	struct hash_irq_test2_hash_ctx ctxs[IRQ_TEST_NUM_BUFFERS];
+	u8 expected_hash[HASH_SIZE];
+	u16 update_lens[32];
+	int num_steps;
+};
+
+static bool hash_irq_test2_func(void *state_)
+{
+	struct hash_irq_test2_state *state = state_;
+	struct hash_irq_test2_hash_ctx *ctx;
+	bool ret = true;
+
+	for (ctx = &state->ctxs[0]; ctx < &state->ctxs[ARRAY_SIZE(state->ctxs)];
+	     ctx++) {
+		if (atomic_cmpxchg(&ctx->in_use, 0, 1) == 0)
+			break;
+	}
+	if (WARN_ON_ONCE(ctx == &state->ctxs[ARRAY_SIZE(state->ctxs)])) {
+		/*
+		 * This should never happen, as the number of contexts is equal
+		 * to the maximum concurrency level of kunit_run_irq_test().
+		 */
+		return false;
+	}
+
+	if (ctx->step == 0) {
+		/* Init step */
+		HASH_INIT(&ctx->hash_ctx);
+		ctx->offset = 0;
+		ctx->step++;
+	} else if (ctx->step < state->num_steps - 1) {
+		/* Update step */
+		HASH_UPDATE(&ctx->hash_ctx, &test_buf[ctx->offset],
+			    state->update_lens[ctx->step - 1]);
+		ctx->offset += state->update_lens[ctx->step - 1];
+		ctx->step++;
+	} else {
+		/* Final step */
+		u8 actual_hash[HASH_SIZE];
+
+		if (WARN_ON_ONCE(ctx->offset != TEST_BUF_LEN))
+			ret = false;
+		HASH_FINAL(&ctx->hash_ctx, actual_hash);
+		if (memcmp(actual_hash, state->expected_hash, HASH_SIZE) != 0)
+			ret = false;
+		ctx->step = 0;
+	}
+	atomic_set_release(&ctx->in_use, 0);
+	return ret;
+}
+
+/*
+ * Test that if hashes are computed in task, softirq, and hardirq context
+ * concurrently, *including doing different parts of the same incremental
+ * computation in different contexts*, then all results are as expected.
+ * Besides detecting bugs similar to those that test_hash_interrupt_context_1
+ * can detect, this test case can also detect bugs where hash function
+ * implementations don't correctly handle these mixed incremental computations.
+ */
+static void test_hash_interrupt_context_2(struct kunit *test)
+{
+	struct hash_irq_test2_state *state;
+	int remaining = TEST_BUF_LEN;
+
+	state = kunit_kzalloc(test, sizeof(*state), GFP_KERNEL);
+	KUNIT_ASSERT_NOT_NULL(test, state);
+
+	rand_bytes(test_buf, TEST_BUF_LEN);
+	HASH(test_buf, TEST_BUF_LEN, state->expected_hash);
+
+	/*
+	 * Generate a list of update lengths to use.  Ensure that it contains
+	 * multiple entries but is limited to a maximum length.
+	 */
+	static_assert(TEST_BUF_LEN / 4096 > 1);
+	for (state->num_steps = 0;
+	     state->num_steps < ARRAY_SIZE(state->update_lens) - 1 && remaining;
+	     state->num_steps++) {
+		state->update_lens[state->num_steps] =
+			rand_length(min(remaining, 4096));
+		remaining -= state->update_lens[state->num_steps];
+	}
+	if (remaining)
+		state->update_lens[state->num_steps++] = remaining;
+	state->num_steps += 2; /* for init and final */
+
+	kunit_run_irq_test(test, hash_irq_test2_func, 250000, state);
+}
+
+#define UNKEYED_HASH_KUNIT_CASES                     \
+	KUNIT_CASE(test_hash_test_vectors),          \
+	KUNIT_CASE(test_hash_all_lens_up_to_4096),   \
+	KUNIT_CASE(test_hash_incremental_updates),   \
+	KUNIT_CASE(test_hash_buffer_overruns),       \
+	KUNIT_CASE(test_hash_overlaps),              \
+	KUNIT_CASE(test_hash_alignment_consistency), \
+	KUNIT_CASE(test_hash_ctx_zeroization),       \
+	KUNIT_CASE(test_hash_interrupt_context_1),   \
+	KUNIT_CASE(test_hash_interrupt_context_2)
+/* benchmark_hash is omitted so that the suites can put it last. */
+
+#ifdef HMAC
+/*
+ * Test the corresponding HMAC variant.
+ *
+ * This test case is fairly short, since HMAC is just a simple C wrapper around
+ * the underlying unkeyed hash function, which is already well-tested by the
+ * other test cases.  It's not useful to test things like data alignment or
+ * interrupt context again for HMAC, nor to have a long list of test vectors.
+ *
+ * Thus, just do a single consolidated test, which covers all data lengths up to
+ * 4096 bytes and all key lengths up to 292 bytes.  For each data length, select
+ * a key length, generate the inputs from a seed, and compute the HMAC value.
+ * Concatenate all these HMAC values together, and compute the HMAC of that.
+ * Verify that value.  If this fails, then the HMAC implementation is wrong.
+ * This won't show which specific input failed, but that should be fine.  Any
+ * failure would likely be non-input-specific or also show in the unkeyed tests.
+ */
+static void test_hmac(struct kunit *test)
+{
+	static const u8 zeroes[sizeof(struct HMAC_CTX)];
+	u8 *raw_key;
+	struct HMAC_KEY key;
+	struct HMAC_CTX ctx;
+	u8 mac[HASH_SIZE];
+	u8 mac2[HASH_SIZE];
+
+	static_assert(TEST_BUF_LEN >= 4096 + 293);
+	rand_bytes_seeded_from_len(test_buf, 4096);
+	raw_key = &test_buf[4096];
+
+	rand_bytes_seeded_from_len(raw_key, 32);
+	HMAC_PREPAREKEY(&key, raw_key, 32);
+	HMAC_INIT(&ctx, &key);
+	for (size_t data_len = 0; data_len <= 4096; data_len++) {
+		/*
+		 * Cycle through key lengths as well.  Somewhat arbitrarily go
+		 * up to 293, which is somewhat larger than the largest hash
+		 * block size (which is the size at which the key starts being
+		 * hashed down to one block); going higher would not be useful.
+		 * To reduce correlation with data_len, use a prime number here.
+		 */
+		size_t key_len = data_len % 293;
+
+		HMAC_UPDATE(&ctx, test_buf, data_len);
+
+		rand_bytes_seeded_from_len(raw_key, key_len);
+		HMAC_USINGRAWKEY(raw_key, key_len, test_buf, data_len, mac);
+		HMAC_UPDATE(&ctx, mac, HASH_SIZE);
+
+		/* Verify that HMAC() is consistent with HMAC_USINGRAWKEY(). */
+		HMAC_PREPAREKEY(&key, raw_key, key_len);
+		HMAC(&key, test_buf, data_len, mac2);
+		KUNIT_ASSERT_MEMEQ_MSG(
+			test, mac, mac2, HASH_SIZE,
+			"HMAC gave different results with raw and prepared keys");
+	}
+	HMAC_FINAL(&ctx, mac);
+	KUNIT_EXPECT_MEMEQ_MSG(test, mac, hmac_testvec_consolidated, HASH_SIZE,
+			       "HMAC gave wrong result");
+	KUNIT_EXPECT_MEMEQ_MSG(test, &ctx, zeroes, sizeof(ctx),
+			       "HMAC context was not zeroized by finalization");
+}
+#define HASH_KUNIT_CASES UNKEYED_HASH_KUNIT_CASES, KUNIT_CASE(test_hmac)
+#else
+#define HASH_KUNIT_CASES UNKEYED_HASH_KUNIT_CASES
+#endif
+
+/* Benchmark the hash function on various data lengths. */
+static void benchmark_hash(struct kunit *test)
+{
+	static const size_t lens_to_test[] = {
+		1,   16,  64,	127,  128,  200,   256,
+		511, 512, 1024, 3173, 4096, 16384,
+	};
+	u8 hash[HASH_SIZE];
+
+	if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK))
+		kunit_skip(test, "not enabled");
+
+	/* Warm-up */
+	for (size_t i = 0; i < 10000000; i += TEST_BUF_LEN)
+		HASH(test_buf, TEST_BUF_LEN, hash);
+
+	for (size_t i = 0; i < ARRAY_SIZE(lens_to_test); i++) {
+		size_t len = lens_to_test[i];
+		/* The '+ 128' tries to account for per-message overhead. */
+		size_t num_iters = 10000000 / (len + 128);
+		u64 t;
+
+		KUNIT_ASSERT_LE(test, len, TEST_BUF_LEN);
+		preempt_disable();
+		t = ktime_get_ns();
+		for (size_t j = 0; j < num_iters; j++)
+			HASH(test_buf, len, hash);
+		t = ktime_get_ns() - t;
+		preempt_enable();
+		kunit_info(test, "len=%zu: %llu MB/s", len,
+			   div64_u64((u64)len * num_iters * 1000, t ?: 1));
+	}
+}
diff --git a/lib/crypto/tests/md5-testvecs.h b/lib/crypto/tests/md5-testvecs.h
new file mode 100644
index 000000000000..be6727feb296
--- /dev/null
+++ b/lib/crypto/tests/md5-testvecs.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py md5 */
+
+static const struct {
+	size_t data_len;
+	u8 digest[MD5_DIGEST_SIZE];
+} hash_testvecs[] = {
+	{
+		.data_len = 0,
+		.digest = {
+			0xd4, 0x1d, 0x8c, 0xd9, 0x8f, 0x00, 0xb2, 0x04,
+			0xe9, 0x80, 0x09, 0x98, 0xec, 0xf8, 0x42, 0x7e,
+		},
+	},
+	{
+		.data_len = 1,
+		.digest = {
+			0x16, 0x7b, 0x86, 0xf2, 0x1d, 0xf3, 0x76, 0xc9,
+			0x6f, 0x10, 0xa0, 0x61, 0x5b, 0x14, 0x20, 0x0b,
+		},
+	},
+	{
+		.data_len = 2,
+		.digest = {
+			0x2d, 0x30, 0x96, 0xc7, 0x43, 0x40, 0xed, 0xb2,
+			0xfb, 0x84, 0x63, 0x9a, 0xec, 0xc7, 0x3c, 0x3c,
+		},
+	},
+	{
+		.data_len = 3,
+		.digest = {
+			0xe5, 0x0f, 0xce, 0xe0, 0xc8, 0xff, 0x4e, 0x08,
+			0x5e, 0x19, 0xe5, 0xf2, 0x08, 0x11, 0x19, 0x16,
+		},
+	},
+	{
+		.data_len = 16,
+		.digest = {
+			0xe8, 0xca, 0x29, 0x05, 0x2f, 0xd1, 0xf3, 0x99,
+			0x40, 0x71, 0xf5, 0xc2, 0xf7, 0xf8, 0x17, 0x3e,
+		},
+	},
+	{
+		.data_len = 32,
+		.digest = {
+			0xe3, 0x20, 0xc1, 0xd8, 0x21, 0x14, 0x44, 0x59,
+			0x1a, 0xf5, 0x91, 0xaf, 0x69, 0xbe, 0x93, 0x9d,
+		},
+	},
+	{
+		.data_len = 48,
+		.digest = {
+			0xfb, 0x06, 0xb0, 0xf0, 0x00, 0x10, 0x4b, 0x68,
+			0x3d, 0x75, 0xf9, 0x70, 0xde, 0xbb, 0x32, 0x16,
+		},
+	},
+	{
+		.data_len = 49,
+		.digest = {
+			0x52, 0x86, 0x48, 0x8b, 0xae, 0x91, 0x7c, 0x4e,
+			0xc2, 0x2a, 0x69, 0x07, 0x35, 0xcc, 0xb2, 0x88,
+		},
+	},
+	{
+		.data_len = 63,
+		.digest = {
+			0xfa, 0xd3, 0xf6, 0xe6, 0x7b, 0x1a, 0xc6, 0x05,
+			0x73, 0x35, 0x02, 0xab, 0xc7, 0xb3, 0x47, 0xcb,
+		},
+	},
+	{
+		.data_len = 64,
+		.digest = {
+			0xc5, 0x59, 0x29, 0xe9, 0x0a, 0x4a, 0x86, 0x43,
+			0x7c, 0xaf, 0xdf, 0x83, 0xd3, 0xb8, 0x33, 0x5f,
+		},
+	},
+	{
+		.data_len = 65,
+		.digest = {
+			0x80, 0x05, 0x75, 0x39, 0xec, 0x44, 0x8a, 0x81,
+			0xe7, 0x6e, 0x8d, 0xd1, 0xc6, 0xeb, 0xc2, 0xf0,
+		},
+	},
+	{
+		.data_len = 127,
+		.digest = {
+			0x3f, 0x02, 0xe8, 0xc6, 0xb8, 0x6a, 0x39, 0xc3,
+			0xa4, 0x1c, 0xd9, 0x8f, 0x4a, 0x71, 0x40, 0x30,
+		},
+	},
+	{
+		.data_len = 128,
+		.digest = {
+			0x89, 0x4f, 0x79, 0x3e, 0xff, 0x0c, 0x22, 0x60,
+			0xa2, 0xdc, 0x10, 0x5f, 0x23, 0x0a, 0xe7, 0xc6,
+		},
+	},
+	{
+		.data_len = 129,
+		.digest = {
+			0x06, 0x56, 0x61, 0xb8, 0x8a, 0x82, 0x77, 0x1b,
+			0x2c, 0x35, 0xb8, 0x9f, 0xd6, 0xf7, 0xbd, 0x5a,
+		},
+	},
+	{
+		.data_len = 256,
+		.digest = {
+			0x5d, 0xdf, 0x7d, 0xc8, 0x43, 0x96, 0x3b, 0xdb,
+			0xc7, 0x0e, 0x44, 0x42, 0x23, 0xf7, 0xed, 0xdf,
+		},
+	},
+	{
+		.data_len = 511,
+		.digest = {
+			0xf6, 0x5f, 0x26, 0x51, 0x8a, 0x5a, 0x46, 0x8f,
+			0x48, 0x72, 0x90, 0x74, 0x9d, 0x87, 0xbd, 0xdf,
+		},
+	},
+	{
+		.data_len = 513,
+		.digest = {
+			0xd8, 0x2c, 0xc9, 0x76, 0xfa, 0x67, 0x2e, 0xa6,
+			0xc8, 0x12, 0x4a, 0x64, 0xaa, 0x0b, 0x3d, 0xbd,
+		},
+	},
+	{
+		.data_len = 1000,
+		.digest = {
+			0xe2, 0x7e, 0xb4, 0x5f, 0xe1, 0x74, 0x51, 0xfc,
+			0xe0, 0xc8, 0xd5, 0xe6, 0x8b, 0x40, 0xd2, 0x0e,
+		},
+	},
+	{
+		.data_len = 3333,
+		.digest = {
+			0xcd, 0x7d, 0x56, 0xa9, 0x4c, 0x47, 0xea, 0xc2,
+			0x34, 0x0b, 0x84, 0x05, 0xf9, 0xad, 0xbb, 0x46,
+		},
+	},
+	{
+		.data_len = 4096,
+		.digest = {
+			0x63, 0x6e, 0x58, 0xb3, 0x94, 0x6b, 0x83, 0x5f,
+			0x1f, 0x0e, 0xd3, 0x66, 0x78, 0x71, 0x98, 0x42,
+		},
+	},
+	{
+		.data_len = 4128,
+		.digest = {
+			0x9d, 0x68, 0xfc, 0x26, 0x8b, 0x4c, 0xa8, 0xe7,
+			0x30, 0x0b, 0x19, 0x52, 0x6e, 0xa5, 0x65, 0x1c,
+		},
+	},
+	{
+		.data_len = 4160,
+		.digest = {
+			0x1c, 0xaa, 0x7d, 0xee, 0x91, 0x01, 0xe2, 0x5a,
+			0xec, 0xe9, 0xde, 0x57, 0x0a, 0xb6, 0x4c, 0x2f,
+		},
+	},
+	{
+		.data_len = 4224,
+		.digest = {
+			0x1b, 0x31, 0xe3, 0x14, 0x07, 0x16, 0x17, 0xc6,
+			0x98, 0x79, 0x88, 0x23, 0xb6, 0x3b, 0x25, 0xc4,
+		},
+	},
+	{
+		.data_len = 16384,
+		.digest = {
+			0xc6, 0x3d, 0x56, 0x90, 0xf0, 0xf6, 0xe6, 0x50,
+			0xf4, 0x76, 0x78, 0x67, 0xa3, 0xdd, 0x62, 0x7b,
+		},
+	},
+};
+
+static const u8 hash_testvec_consolidated[MD5_DIGEST_SIZE] = {
+	0x70, 0x86, 0x9e, 0x6c, 0xa4, 0xc6, 0x71, 0x43,
+	0x26, 0x02, 0x1b, 0x3f, 0xfd, 0x56, 0x9f, 0xa6,
+};
+
+static const u8 hmac_testvec_consolidated[MD5_DIGEST_SIZE] = {
+	0x10, 0x02, 0x74, 0xf6, 0x4d, 0xb3, 0x3c, 0xc7,
+	0xa1, 0xf7, 0xe6, 0xd4, 0x32, 0x64, 0xfa, 0x6d,
+};
diff --git a/lib/crypto/tests/md5_kunit.c b/lib/crypto/tests/md5_kunit.c
new file mode 100644
index 000000000000..38bd52c25ae3
--- /dev/null
+++ b/lib/crypto/tests/md5_kunit.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/md5.h>
+#include "md5-testvecs.h"
+
+#define HASH md5
+#define HASH_CTX md5_ctx
+#define HASH_SIZE MD5_DIGEST_SIZE
+#define HASH_INIT md5_init
+#define HASH_UPDATE md5_update
+#define HASH_FINAL md5_final
+#define HMAC_KEY hmac_md5_key
+#define HMAC_CTX hmac_md5_ctx
+#define HMAC_PREPAREKEY hmac_md5_preparekey
+#define HMAC_INIT hmac_md5_init
+#define HMAC_UPDATE hmac_md5_update
+#define HMAC_FINAL hmac_md5_final
+#define HMAC hmac_md5
+#define HMAC_USINGRAWKEY hmac_md5_usingrawkey
+#include "hash-test-template.h"
+
+static struct kunit_case hash_test_cases[] = {
+	HASH_KUNIT_CASES,
+	KUNIT_CASE(benchmark_hash),
+	{},
+};
+
+static struct kunit_suite hash_test_suite = {
+	.name = "md5",
+	.test_cases = hash_test_cases,
+	.suite_init = hash_suite_init,
+	.suite_exit = hash_suite_exit,
+};
+kunit_test_suite(hash_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for MD5 and HMAC-MD5");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/poly1305-testvecs.h b/lib/crypto/tests/poly1305-testvecs.h
new file mode 100644
index 000000000000..ecf8662225c8
--- /dev/null
+++ b/lib/crypto/tests/poly1305-testvecs.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py poly1305 */
+
+static const struct {
+	size_t data_len;
+	u8 digest[POLY1305_DIGEST_SIZE];
+} hash_testvecs[] = {
+	{
+		.data_len = 0,
+		.digest = {
+			0xe8, 0x2d, 0x67, 0x2c, 0x01, 0x48, 0xf9, 0xb7,
+			0x87, 0x85, 0x3f, 0xcf, 0x18, 0x66, 0x8c, 0xd3,
+		},
+	},
+	{
+		.data_len = 1,
+		.digest = {
+			0xb8, 0xad, 0xca, 0x6b, 0x32, 0xba, 0x34, 0x42,
+			0x54, 0x10, 0x28, 0xf5, 0x0f, 0x7e, 0x8e, 0xe3,
+		},
+	},
+	{
+		.data_len = 2,
+		.digest = {
+			0xb8, 0xf7, 0xf4, 0xc2, 0x85, 0x33, 0x80, 0x63,
+			0xd1, 0x45, 0xda, 0xf8, 0x7c, 0x79, 0x42, 0xd1,
+		},
+	},
+	{
+		.data_len = 3,
+		.digest = {
+			0xf3, 0x73, 0x7b, 0x60, 0x24, 0xcc, 0x5d, 0x3e,
+			0xd1, 0x95, 0x86, 0xce, 0x89, 0x0a, 0x33, 0xba,
+		},
+	},
+	{
+		.data_len = 16,
+		.digest = {
+			0x0a, 0x1a, 0x2d, 0x39, 0xea, 0x49, 0x8f, 0xb7,
+			0x90, 0xb6, 0x74, 0x3b, 0x41, 0x3b, 0x96, 0x11,
+		},
+	},
+	{
+		.data_len = 32,
+		.digest = {
+			0x99, 0x05, 0xe3, 0xa7, 0x9e, 0x2a, 0xd2, 0x42,
+			0xb9, 0x45, 0x0c, 0x08, 0xe7, 0x10, 0xe4, 0xe1,
+		},
+	},
+	{
+		.data_len = 48,
+		.digest = {
+			0xe1, 0xb2, 0x15, 0xee, 0xa2, 0xf3, 0x04, 0xac,
+			0xdd, 0x27, 0x57, 0x95, 0x2f, 0x45, 0xa8, 0xd3,
+		},
+	},
+	{
+		.data_len = 49,
+		.digest = {
+			0x1c, 0xf3, 0xab, 0x39, 0xc0, 0x69, 0x49, 0x69,
+			0x89, 0x6f, 0x1f, 0x03, 0x16, 0xe7, 0xc0, 0xf0,
+		},
+	},
+	{
+		.data_len = 63,
+		.digest = {
+			0x30, 0xb0, 0x32, 0x87, 0x51, 0x55, 0x9c, 0x39,
+			0x38, 0x42, 0x06, 0xe9, 0x2a, 0x3e, 0x2c, 0x92,
+		},
+	},
+	{
+		.data_len = 64,
+		.digest = {
+			0x2c, 0x04, 0x16, 0x36, 0x55, 0x25, 0x2d, 0xc6,
+			0x3d, 0x70, 0x5b, 0x88, 0x46, 0xb6, 0x71, 0x77,
+		},
+	},
+	{
+		.data_len = 65,
+		.digest = {
+			0x03, 0x87, 0xdd, 0xbe, 0xe8, 0x30, 0xf2, 0x15,
+			0x40, 0x44, 0x29, 0x7b, 0xb1, 0xe9, 0x9d, 0xe7,
+		},
+	},
+	{
+		.data_len = 127,
+		.digest = {
+			0x29, 0x83, 0x4f, 0xcb, 0x5a, 0x93, 0x25, 0xad,
+			0x05, 0xa4, 0xb3, 0x24, 0x77, 0x62, 0x2d, 0x3d,
+		},
+	},
+	{
+		.data_len = 128,
+		.digest = {
+			0x20, 0x0e, 0x2c, 0x05, 0xe2, 0x0b, 0x85, 0xa0,
+			0x24, 0x73, 0x7f, 0x65, 0x70, 0x6c, 0x3e, 0xb0,
+		},
+	},
+	{
+		.data_len = 129,
+		.digest = {
+			0xef, 0x2f, 0x98, 0x42, 0xc2, 0x90, 0x55, 0xea,
+			0xba, 0x28, 0x76, 0xfd, 0x9e, 0x3e, 0x4d, 0x53,
+		},
+	},
+	{
+		.data_len = 256,
+		.digest = {
+			0x9e, 0x75, 0x4b, 0xc7, 0x69, 0x68, 0x51, 0x90,
+			0xdc, 0x29, 0xc8, 0xfa, 0x86, 0xf1, 0xc9, 0xb3,
+		},
+	},
+	{
+		.data_len = 511,
+		.digest = {
+			0x9d, 0x13, 0xf5, 0x54, 0xe6, 0xe3, 0x45, 0x38,
+			0x8b, 0x6d, 0x5c, 0xc4, 0x50, 0xeb, 0x90, 0xcb,
+		},
+	},
+	{
+		.data_len = 513,
+		.digest = {
+			0xaa, 0xb2, 0x3e, 0x3c, 0x2a, 0xfc, 0x62, 0x0e,
+			0xd4, 0xe6, 0xe5, 0x5c, 0x6b, 0x9f, 0x3d, 0xc7,
+		},
+	},
+	{
+		.data_len = 1000,
+		.digest = {
+			0xd6, 0x8c, 0xea, 0x8a, 0x0f, 0x68, 0xa9, 0xa8,
+			0x67, 0x86, 0xf9, 0xc1, 0x4c, 0x26, 0x10, 0x6d,
+		},
+	},
+	{
+		.data_len = 3333,
+		.digest = {
+			0xdc, 0xc1, 0x54, 0xe7, 0x38, 0xc6, 0xdb, 0x24,
+			0xa7, 0x0b, 0xff, 0xd3, 0x1b, 0x93, 0x01, 0xa6,
+		},
+	},
+	{
+		.data_len = 4096,
+		.digest = {
+			0x8f, 0x88, 0x3e, 0x9c, 0x7b, 0x2e, 0x82, 0x5a,
+			0x1d, 0x31, 0x82, 0xcc, 0x69, 0xb4, 0x16, 0x26,
+		},
+	},
+	{
+		.data_len = 4128,
+		.digest = {
+			0x23, 0x45, 0x94, 0xa8, 0x11, 0x54, 0x9d, 0xf2,
+			0xa1, 0x9a, 0xca, 0xf9, 0x3e, 0x65, 0x52, 0xfd,
+		},
+	},
+	{
+		.data_len = 4160,
+		.digest = {
+			0x7b, 0xfc, 0xa9, 0x1e, 0x03, 0xad, 0xef, 0x03,
+			0xe2, 0x20, 0x92, 0xc7, 0x54, 0x83, 0xfa, 0x37,
+		},
+	},
+	{
+		.data_len = 4224,
+		.digest = {
+			0x46, 0xab, 0x8c, 0x75, 0xb3, 0x10, 0xa6, 0x3f,
+			0x74, 0x55, 0x42, 0x6d, 0x69, 0x35, 0xd2, 0xf5,
+		},
+	},
+	{
+		.data_len = 16384,
+		.digest = {
+			0xd0, 0xfe, 0x26, 0xc2, 0xca, 0x94, 0x2d, 0x52,
+			0x2d, 0xe1, 0x11, 0xdd, 0x42, 0x28, 0x83, 0xa6,
+		},
+	},
+};
+
+static const u8 hash_testvec_consolidated[POLY1305_DIGEST_SIZE] = {
+	0x9d, 0x07, 0x5d, 0xc9, 0x6c, 0xeb, 0x62, 0x5d,
+	0x02, 0x5f, 0xe1, 0xe3, 0xd1, 0x71, 0x69, 0x34,
+};
+
+static const u8 poly1305_allones_macofmacs[POLY1305_DIGEST_SIZE] = {
+	0x0c, 0x26, 0x6b, 0x45, 0x87, 0x06, 0xcf, 0xc4,
+	0x3f, 0x70, 0x7d, 0xb3, 0x50, 0xdd, 0x81, 0x25,
+};
diff --git a/lib/crypto/tests/poly1305_kunit.c b/lib/crypto/tests/poly1305_kunit.c
new file mode 100644
index 000000000000..7ac191bd96b6
--- /dev/null
+++ b/lib/crypto/tests/poly1305_kunit.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/poly1305.h>
+#include "poly1305-testvecs.h"
+
+/*
+ * A fixed key used when presenting Poly1305 as an unkeyed hash function in
+ * order to reuse hash-test-template.h.  At the beginning of the test suite,
+ * this is initialized to bytes generated from a fixed seed.
+ */
+static u8 test_key[POLY1305_KEY_SIZE];
+
+/* This probably should be in the actual API, but just define it here for now */
+static void poly1305(const u8 key[POLY1305_KEY_SIZE], const u8 *data,
+		     size_t len, u8 out[POLY1305_DIGEST_SIZE])
+{
+	struct poly1305_desc_ctx ctx;
+
+	poly1305_init(&ctx, key);
+	poly1305_update(&ctx, data, len);
+	poly1305_final(&ctx, out);
+}
+
+static void poly1305_init_withtestkey(struct poly1305_desc_ctx *ctx)
+{
+	poly1305_init(ctx, test_key);
+}
+
+static void poly1305_withtestkey(const u8 *data, size_t len,
+				 u8 out[POLY1305_DIGEST_SIZE])
+{
+	poly1305(test_key, data, len, out);
+}
+
+/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */
+#define HASH poly1305_withtestkey
+#define HASH_CTX poly1305_desc_ctx
+#define HASH_SIZE POLY1305_DIGEST_SIZE
+#define HASH_INIT poly1305_init_withtestkey
+#define HASH_UPDATE poly1305_update
+#define HASH_FINAL poly1305_final
+#include "hash-test-template.h"
+
+static int poly1305_suite_init(struct kunit_suite *suite)
+{
+	rand_bytes_seeded_from_len(test_key, POLY1305_KEY_SIZE);
+	return hash_suite_init(suite);
+}
+
+static void poly1305_suite_exit(struct kunit_suite *suite)
+{
+	hash_suite_exit(suite);
+}
+
+/*
+ * Poly1305 test case which uses a key and message consisting only of one bits:
+ *
+ * - Using an all-one-bits r_key tests the key clamping.
+ * - Using an all-one-bits s_key tests carries in implementations of the
+ *   addition mod 2**128 during finalization.
+ * - Using all-one-bits message, and to a lesser extent r_key, tends to maximize
+ *   any intermediate accumulator values.  This increases the chance of
+ *   detecting bugs that occur only in rare cases where the accumulator values
+ *   get very large, for example the bug fixed by commit 678cce4019d746da
+ *   ("crypto: x86/poly1305 - fix overflow during partial reduction").
+ *
+ * Accumulator overflow bugs may be specific to particular update lengths (in
+ * blocks) and/or particular values of the previous acculumator.  Note that the
+ * accumulator starts at 0 which gives the lowest chance of an overflow.  Thus,
+ * a single all-one-bits test vector may be insufficient.
+ *
+ * Considering that, do the following test: continuously update a single
+ * Poly1305 context with all-one-bits data of varying lengths (0, 16, 32, ...,
+ * 4096 bytes).  After each update, generate the MAC from the current context,
+ * and feed that MAC into a separate Poly1305 context.  Repeat that entire
+ * sequence of updates 32 times without re-initializing either context,
+ * resulting in a total of 8224 MAC computations from a long-running, cumulative
+ * context.  Finally, generate and verify the MAC of all the MACs.
+ */
+static void test_poly1305_allones_keys_and_message(struct kunit *test)
+{
+	struct poly1305_desc_ctx mac_ctx, macofmacs_ctx;
+	u8 mac[POLY1305_DIGEST_SIZE];
+
+	static_assert(TEST_BUF_LEN >= 4096);
+	memset(test_buf, 0xff, 4096);
+
+	poly1305_init(&mac_ctx, test_buf);
+	poly1305_init(&macofmacs_ctx, test_buf);
+	for (int i = 0; i < 32; i++) {
+		for (size_t len = 0; len <= 4096; len += 16) {
+			struct poly1305_desc_ctx tmp_ctx;
+
+			poly1305_update(&mac_ctx, test_buf, len);
+			tmp_ctx = mac_ctx;
+			poly1305_final(&tmp_ctx, mac);
+			poly1305_update(&macofmacs_ctx, mac,
+					POLY1305_DIGEST_SIZE);
+		}
+	}
+	poly1305_final(&macofmacs_ctx, mac);
+	KUNIT_ASSERT_MEMEQ(test, mac, poly1305_allones_macofmacs,
+			   POLY1305_DIGEST_SIZE);
+}
+
+/*
+ * Poly1305 test case which uses r_key=1, s_key=0, and a 48-byte message
+ * consisting of three blocks with integer values [2**128 - i, 0, 0].  In this
+ * case, the result of the polynomial evaluation is 2**130 - i.  For small
+ * values of i, this is very close to the modulus 2**130 - 5, which helps catch
+ * edge case bugs in the modular reduction logic.
+ */
+static void test_poly1305_reduction_edge_cases(struct kunit *test)
+{
+	static const u8 key[POLY1305_KEY_SIZE] = { 1 }; /* r_key=1, s_key=0 */
+	u8 data[3 * POLY1305_BLOCK_SIZE] = {};
+	u8 expected_mac[POLY1305_DIGEST_SIZE];
+	u8 actual_mac[POLY1305_DIGEST_SIZE];
+
+	for (int i = 1; i <= 10; i++) {
+		/* Set the first data block to 2**128 - i. */
+		data[0] = -i;
+		memset(&data[1], 0xff, POLY1305_BLOCK_SIZE - 1);
+
+		/*
+		 * Assuming s_key=0, the expected MAC as an integer is
+		 * (2**130 - i mod 2**130 - 5) + 0 mod 2**128.  If 1 <= i <= 5,
+		 * that's 5 - i.  If 6 <= i <= 10, that's 2**128 - i.
+		 */
+		if (i <= 5) {
+			expected_mac[0] = 5 - i;
+			memset(&expected_mac[1], 0, POLY1305_DIGEST_SIZE - 1);
+		} else {
+			expected_mac[0] = -i;
+			memset(&expected_mac[1], 0xff,
+			       POLY1305_DIGEST_SIZE - 1);
+		}
+
+		/* Compute and verify the MAC. */
+		poly1305(key, data, sizeof(data), actual_mac);
+		KUNIT_ASSERT_MEMEQ(test, actual_mac, expected_mac,
+				   POLY1305_DIGEST_SIZE);
+	}
+}
+
+static struct kunit_case poly1305_test_cases[] = {
+	HASH_KUNIT_CASES,
+	KUNIT_CASE(test_poly1305_allones_keys_and_message),
+	KUNIT_CASE(test_poly1305_reduction_edge_cases),
+	KUNIT_CASE(benchmark_hash),
+	{},
+};
+
+static struct kunit_suite poly1305_test_suite = {
+	.name = "poly1305",
+	.test_cases = poly1305_test_cases,
+	.suite_init = poly1305_suite_init,
+	.suite_exit = poly1305_suite_exit,
+};
+kunit_test_suite(poly1305_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for Poly1305");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/polyval-testvecs.h b/lib/crypto/tests/polyval-testvecs.h
new file mode 100644
index 000000000000..3d33f60d58bb
--- /dev/null
+++ b/lib/crypto/tests/polyval-testvecs.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py polyval */
+
+static const struct {
+	size_t data_len;
+	u8 digest[POLYVAL_DIGEST_SIZE];
+} hash_testvecs[] = {
+	{
+		.data_len = 0,
+		.digest = {
+			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		},
+	},
+	{
+		.data_len = 1,
+		.digest = {
+			0xb5, 0x51, 0x69, 0x89, 0xd4, 0x3c, 0x59, 0xca,
+			0x6a, 0x1c, 0x2a, 0xe9, 0xa1, 0x9c, 0x6c, 0x83,
+		},
+	},
+	{
+		.data_len = 2,
+		.digest = {
+			0xf4, 0x50, 0xaf, 0x07, 0xda, 0x42, 0xa7, 0x41,
+			0x4d, 0x24, 0x88, 0x87, 0xe3, 0x40, 0x73, 0x7c,
+		},
+	},
+	{
+		.data_len = 3,
+		.digest = {
+			0x9e, 0x88, 0x78, 0x71, 0x4c, 0x55, 0x87, 0xe8,
+			0xb4, 0x96, 0x3d, 0x56, 0xc8, 0xb2, 0xe1, 0x68,
+		},
+	},
+	{
+		.data_len = 16,
+		.digest = {
+			0x9e, 0x81, 0x37, 0x8f, 0x49, 0xf7, 0xa2, 0xe4,
+			0x04, 0x45, 0x12, 0x78, 0x45, 0x42, 0x27, 0xad,
+		},
+	},
+	{
+		.data_len = 32,
+		.digest = {
+			0x60, 0x19, 0xd0, 0xa4, 0xf0, 0xde, 0x9e, 0xe7,
+			0x6a, 0x89, 0x1a, 0xea, 0x80, 0x14, 0xa9, 0xa3,
+		},
+	},
+	{
+		.data_len = 48,
+		.digest = {
+			0x0c, 0xa2, 0x70, 0x4d, 0x7c, 0x89, 0xac, 0x41,
+			0xc2, 0x9e, 0x0d, 0x07, 0x07, 0x6a, 0x7f, 0xd5,
+		},
+	},
+	{
+		.data_len = 49,
+		.digest = {
+			0x91, 0xd3, 0xa9, 0x5c, 0x79, 0x3d, 0x6b, 0x84,
+			0x99, 0x54, 0xa7, 0xb4, 0x06, 0x66, 0xfd, 0x1c,
+		},
+	},
+	{
+		.data_len = 63,
+		.digest = {
+			0x29, 0x37, 0xb8, 0xe5, 0xd8, 0x27, 0x4d, 0xfb,
+			0x83, 0x4f, 0x67, 0xf7, 0xf9, 0xc1, 0x0a, 0x9d,
+		},
+	},
+	{
+		.data_len = 64,
+		.digest = {
+			0x17, 0xa9, 0x06, 0x2c, 0xf3, 0xe8, 0x2e, 0xa6,
+			0x6b, 0xb2, 0x1f, 0x5d, 0x94, 0x3c, 0x02, 0xa2,
+		},
+	},
+	{
+		.data_len = 65,
+		.digest = {
+			0x7c, 0x80, 0x74, 0xd7, 0xa1, 0x37, 0x30, 0x64,
+			0x3b, 0xa4, 0xa3, 0x98, 0xde, 0x47, 0x10, 0x23,
+		},
+	},
+	{
+		.data_len = 127,
+		.digest = {
+			0x27, 0x3a, 0xcf, 0xf5, 0xaf, 0x9f, 0xd8, 0xd8,
+			0x2d, 0x6a, 0x91, 0xfb, 0xb8, 0xfa, 0xbe, 0x0c,
+		},
+	},
+	{
+		.data_len = 128,
+		.digest = {
+			0x97, 0x6e, 0xc4, 0xbe, 0x6b, 0x15, 0xa6, 0x7c,
+			0xc4, 0xa2, 0xb8, 0x0a, 0x0e, 0x9c, 0xc7, 0x3a,
+		},
+	},
+	{
+		.data_len = 129,
+		.digest = {
+			0x2b, 0xc3, 0x98, 0xba, 0x6e, 0x42, 0xf8, 0x18,
+			0x85, 0x69, 0x15, 0x37, 0x10, 0x60, 0xe6, 0xac,
+		},
+	},
+	{
+		.data_len = 256,
+		.digest = {
+			0x88, 0x21, 0x77, 0x89, 0xd7, 0x93, 0x90, 0xfc,
+			0xf3, 0xb0, 0xe3, 0xfb, 0x14, 0xe2, 0xcf, 0x74,
+		},
+	},
+	{
+		.data_len = 511,
+		.digest = {
+			0x66, 0x3d, 0x3e, 0x08, 0xa0, 0x49, 0x81, 0x68,
+			0x3e, 0x3b, 0xc8, 0x80, 0x55, 0xd4, 0x15, 0xe9,
+		},
+	},
+	{
+		.data_len = 513,
+		.digest = {
+			0x05, 0xf5, 0x06, 0x66, 0xe7, 0x11, 0x08, 0x84,
+			0xff, 0x94, 0x50, 0x85, 0x65, 0x95, 0x2a, 0x20,
+		},
+	},
+	{
+		.data_len = 1000,
+		.digest = {
+			0xd3, 0xa0, 0x51, 0x69, 0xb5, 0x38, 0xae, 0x1b,
+			0xe1, 0xa2, 0x89, 0xc6, 0x8d, 0x2b, 0x62, 0x37,
+		},
+	},
+	{
+		.data_len = 3333,
+		.digest = {
+			0x37, 0x6d, 0x6a, 0x14, 0xdc, 0xa5, 0x37, 0xfc,
+			0xfe, 0x67, 0x76, 0xb2, 0x64, 0x68, 0x64, 0x05,
+		},
+	},
+	{
+		.data_len = 4096,
+		.digest = {
+			0xe3, 0x12, 0x0c, 0x58, 0x46, 0x45, 0x27, 0x7a,
+			0x0e, 0xa2, 0xfa, 0x2c, 0x35, 0x73, 0x6c, 0x94,
+		},
+	},
+	{
+		.data_len = 4128,
+		.digest = {
+			0x63, 0x0d, 0xa1, 0xbc, 0x6e, 0x3e, 0xd3, 0x1d,
+			0x28, 0x52, 0xd2, 0xf4, 0x30, 0x2d, 0xff, 0xc4,
+		},
+	},
+	{
+		.data_len = 4160,
+		.digest = {
+			0xb2, 0x91, 0x49, 0xe2, 0x02, 0x98, 0x00, 0x79,
+			0x71, 0xb9, 0xd7, 0xd4, 0xb5, 0x94, 0x6d, 0x7d,
+		},
+	},
+	{
+		.data_len = 4224,
+		.digest = {
+			0x58, 0x96, 0x48, 0x69, 0x05, 0x17, 0xe1, 0x6d,
+			0xbc, 0xf2, 0x3d, 0x10, 0x96, 0x00, 0x74, 0x58,
+		},
+	},
+	{
+		.data_len = 16384,
+		.digest = {
+			0x99, 0x3c, 0xcb, 0x4d, 0x64, 0xc9, 0xa9, 0x41,
+			0x52, 0x93, 0xfd, 0x65, 0xc4, 0xcc, 0xa5, 0xe5,
+		},
+	},
+};
+
+static const u8 hash_testvec_consolidated[POLYVAL_DIGEST_SIZE] = {
+	0xdf, 0x68, 0x52, 0x99, 0x92, 0xc3, 0xe8, 0x88,
+	0x29, 0x13, 0xc8, 0x35, 0x67, 0xa3, 0xd3, 0xad,
+};
+
+static const u8 polyval_allones_hashofhashes[POLYVAL_DIGEST_SIZE] = {
+	0xd5, 0xf7, 0xfd, 0xb2, 0xa6, 0xef, 0x0b, 0x85,
+	0x0d, 0x0a, 0x06, 0x10, 0xbc, 0x64, 0x94, 0x73,
+};
diff --git a/lib/crypto/tests/polyval_kunit.c b/lib/crypto/tests/polyval_kunit.c
new file mode 100644
index 000000000000..e59f598c1572
--- /dev/null
+++ b/lib/crypto/tests/polyval_kunit.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/polyval.h>
+#include "polyval-testvecs.h"
+
+/*
+ * A fixed key used when presenting POLYVAL as an unkeyed hash function in order
+ * to reuse hash-test-template.h.  At the beginning of the test suite, this is
+ * initialized to a key prepared from bytes generated from a fixed seed.
+ */
+static struct polyval_key test_key;
+
+static void polyval_init_withtestkey(struct polyval_ctx *ctx)
+{
+	polyval_init(ctx, &test_key);
+}
+
+static void polyval_withtestkey(const u8 *data, size_t len,
+				u8 out[POLYVAL_BLOCK_SIZE])
+{
+	polyval(&test_key, data, len, out);
+}
+
+/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */
+#define HASH polyval_withtestkey
+#define HASH_CTX polyval_ctx
+#define HASH_SIZE POLYVAL_BLOCK_SIZE
+#define HASH_INIT polyval_init_withtestkey
+#define HASH_UPDATE polyval_update
+#define HASH_FINAL polyval_final
+#include "hash-test-template.h"
+
+/*
+ * Test an example from RFC8452 ("AES-GCM-SIV: Nonce Misuse-Resistant
+ * Authenticated Encryption") to ensure compatibility with that.
+ */
+static void test_polyval_rfc8452_testvec(struct kunit *test)
+{
+	static const u8 raw_key[POLYVAL_BLOCK_SIZE] =
+		"\x31\x07\x28\xd9\x91\x1f\x1f\x38"
+		"\x37\xb2\x43\x16\xc3\xfa\xb9\xa0";
+	static const u8 data[48] =
+		"\x65\x78\x61\x6d\x70\x6c\x65\x00"
+		"\x00\x00\x00\x00\x00\x00\x00\x00"
+		"\x48\x65\x6c\x6c\x6f\x20\x77\x6f"
+		"\x72\x6c\x64\x00\x00\x00\x00\x00"
+		"\x38\x00\x00\x00\x00\x00\x00\x00"
+		"\x58\x00\x00\x00\x00\x00\x00\x00";
+	static const u8 expected_hash[POLYVAL_BLOCK_SIZE] =
+		"\xad\x7f\xcf\x0b\x51\x69\x85\x16"
+		"\x62\x67\x2f\x3c\x5f\x95\x13\x8f";
+	u8 hash[POLYVAL_BLOCK_SIZE];
+	struct polyval_key key;
+
+	polyval_preparekey(&key, raw_key);
+	polyval(&key, data, sizeof(data), hash);
+	KUNIT_ASSERT_MEMEQ(test, hash, expected_hash, sizeof(hash));
+}
+
+/*
+ * Test a key and messages containing all one bits.  This is useful to detect
+ * overflow bugs in implementations that emulate carryless multiplication using
+ * a series of standard multiplications with the bits spread out.
+ */
+static void test_polyval_allones_key_and_message(struct kunit *test)
+{
+	struct polyval_key key;
+	struct polyval_ctx hashofhashes_ctx;
+	u8 hash[POLYVAL_BLOCK_SIZE];
+
+	static_assert(TEST_BUF_LEN >= 4096);
+	memset(test_buf, 0xff, 4096);
+
+	polyval_preparekey(&key, test_buf);
+	polyval_init(&hashofhashes_ctx, &key);
+	for (size_t len = 0; len <= 4096; len += 16) {
+		polyval(&key, test_buf, len, hash);
+		polyval_update(&hashofhashes_ctx, hash, sizeof(hash));
+	}
+	polyval_final(&hashofhashes_ctx, hash);
+	KUNIT_ASSERT_MEMEQ(test, hash, polyval_allones_hashofhashes,
+			   sizeof(hash));
+}
+
+#define MAX_LEN_FOR_KEY_CHECK 1024
+
+/*
+ * Given two prepared keys which should be identical (but may differ in
+ * alignment and/or whether they are followed by a guard page or not), verify
+ * that they produce consistent results on various data lengths.
+ */
+static void check_key_consistency(struct kunit *test,
+				  const struct polyval_key *key1,
+				  const struct polyval_key *key2)
+{
+	u8 *data = test_buf;
+	u8 hash1[POLYVAL_BLOCK_SIZE];
+	u8 hash2[POLYVAL_BLOCK_SIZE];
+
+	rand_bytes(data, MAX_LEN_FOR_KEY_CHECK);
+	KUNIT_ASSERT_MEMEQ(test, key1, key2, sizeof(*key1));
+
+	for (int i = 0; i < 100; i++) {
+		size_t len = rand_length(MAX_LEN_FOR_KEY_CHECK);
+
+		polyval(key1, data, len, hash1);
+		polyval(key2, data, len, hash2);
+		KUNIT_ASSERT_MEMEQ(test, hash1, hash2, sizeof(hash1));
+	}
+}
+
+/* Test that no buffer overreads occur on either raw_key or polyval_key. */
+static void test_polyval_with_guarded_key(struct kunit *test)
+{
+	u8 raw_key[POLYVAL_BLOCK_SIZE];
+	u8 *guarded_raw_key = &test_buf[TEST_BUF_LEN - sizeof(raw_key)];
+	struct polyval_key key1, key2;
+	struct polyval_key *guarded_key =
+		(struct polyval_key *)&test_buf[TEST_BUF_LEN - sizeof(key1)];
+
+	/* Prepare with regular buffers. */
+	rand_bytes(raw_key, sizeof(raw_key));
+	polyval_preparekey(&key1, raw_key);
+
+	/* Prepare with guarded raw_key, then check that it works. */
+	memcpy(guarded_raw_key, raw_key, sizeof(raw_key));
+	polyval_preparekey(&key2, guarded_raw_key);
+	check_key_consistency(test, &key1, &key2);
+
+	/* Prepare guarded polyval_key, then check that it works. */
+	polyval_preparekey(guarded_key, raw_key);
+	check_key_consistency(test, &key1, guarded_key);
+}
+
+/*
+ * Test that polyval_key only needs to be aligned to
+ * __alignof__(struct polyval_key), i.e. 8 bytes.  The assembly code may prefer
+ * 16-byte or higher alignment, but it musn't require it.
+ */
+static void test_polyval_with_minimally_aligned_key(struct kunit *test)
+{
+	u8 raw_key[POLYVAL_BLOCK_SIZE];
+	struct polyval_key key;
+	struct polyval_key *minaligned_key =
+		(struct polyval_key *)&test_buf[MAX_LEN_FOR_KEY_CHECK +
+						__alignof__(struct polyval_key)];
+
+	KUNIT_ASSERT_TRUE(test, IS_ALIGNED((uintptr_t)minaligned_key,
+					   __alignof__(struct polyval_key)));
+	KUNIT_ASSERT_TRUE(test,
+			  !IS_ALIGNED((uintptr_t)minaligned_key,
+				      2 * __alignof__(struct polyval_key)));
+
+	rand_bytes(raw_key, sizeof(raw_key));
+	polyval_preparekey(&key, raw_key);
+	polyval_preparekey(minaligned_key, raw_key);
+	check_key_consistency(test, &key, minaligned_key);
+}
+
+struct polyval_irq_test_state {
+	struct polyval_key expected_key;
+	u8 raw_key[POLYVAL_BLOCK_SIZE];
+};
+
+static bool polyval_irq_test_func(void *state_)
+{
+	struct polyval_irq_test_state *state = state_;
+	struct polyval_key key;
+
+	polyval_preparekey(&key, state->raw_key);
+	return memcmp(&key, &state->expected_key, sizeof(key)) == 0;
+}
+
+/*
+ * Test that polyval_preparekey() produces the same output regardless of whether
+ * FPU or vector registers are usable when it is called.
+ */
+static void test_polyval_preparekey_in_irqs(struct kunit *test)
+{
+	struct polyval_irq_test_state state;
+
+	rand_bytes(state.raw_key, sizeof(state.raw_key));
+	polyval_preparekey(&state.expected_key, state.raw_key);
+	kunit_run_irq_test(test, polyval_irq_test_func, 20000, &state);
+}
+
+static int polyval_suite_init(struct kunit_suite *suite)
+{
+	u8 raw_key[POLYVAL_BLOCK_SIZE];
+
+	rand_bytes_seeded_from_len(raw_key, sizeof(raw_key));
+	polyval_preparekey(&test_key, raw_key);
+	return hash_suite_init(suite);
+}
+
+static void polyval_suite_exit(struct kunit_suite *suite)
+{
+	hash_suite_exit(suite);
+}
+
+static struct kunit_case polyval_test_cases[] = {
+	HASH_KUNIT_CASES,
+	KUNIT_CASE(test_polyval_rfc8452_testvec),
+	KUNIT_CASE(test_polyval_allones_key_and_message),
+	KUNIT_CASE(test_polyval_with_guarded_key),
+	KUNIT_CASE(test_polyval_with_minimally_aligned_key),
+	KUNIT_CASE(test_polyval_preparekey_in_irqs),
+	KUNIT_CASE(benchmark_hash),
+	{},
+};
+
+static struct kunit_suite polyval_test_suite = {
+	.name = "polyval",
+	.test_cases = polyval_test_cases,
+	.suite_init = polyval_suite_init,
+	.suite_exit = polyval_suite_exit,
+};
+kunit_test_suite(polyval_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for POLYVAL");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/sha1-testvecs.h b/lib/crypto/tests/sha1-testvecs.h
new file mode 100644
index 000000000000..f5d050122e84
--- /dev/null
+++ b/lib/crypto/tests/sha1-testvecs.h
@@ -0,0 +1,212 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha1 */
+
+static const struct {
+	size_t data_len;
+	u8 digest[SHA1_DIGEST_SIZE];
+} hash_testvecs[] = {
+	{
+		.data_len = 0,
+		.digest = {
+			0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d,
+			0x32, 0x55, 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90,
+			0xaf, 0xd8, 0x07, 0x09,
+		},
+	},
+	{
+		.data_len = 1,
+		.digest = {
+			0x0a, 0xd0, 0x52, 0xdd, 0x9f, 0x32, 0x40, 0x55,
+			0x21, 0xe4, 0x3c, 0x6e, 0xbd, 0xc5, 0x2f, 0x5a,
+			0x02, 0x54, 0x93, 0xb2,
+		},
+	},
+	{
+		.data_len = 2,
+		.digest = {
+			0x13, 0x83, 0x82, 0x03, 0x23, 0xff, 0x46, 0xd6,
+			0x12, 0x7f, 0xad, 0x05, 0x2b, 0xc3, 0x4a, 0x42,
+			0x49, 0x6a, 0xf8, 0x84,
+		},
+	},
+	{
+		.data_len = 3,
+		.digest = {
+			0xe4, 0xdf, 0x7b, 0xdc, 0xe8, 0x6e, 0x81, 0x97,
+			0x1e, 0x0f, 0xe8, 0x8b, 0x76, 0xa8, 0x59, 0x04,
+			0xae, 0x92, 0x1a, 0x7c,
+		},
+	},
+	{
+		.data_len = 16,
+		.digest = {
+			0x8c, 0x1c, 0x30, 0xd8, 0xbc, 0xc4, 0xc3, 0xf5,
+			0xf8, 0x83, 0x0d, 0x1e, 0x04, 0x5d, 0x29, 0xb5,
+			0x68, 0x89, 0xc1, 0xe9,
+		},
+	},
+	{
+		.data_len = 32,
+		.digest = {
+			0x6c, 0x1d, 0x72, 0x31, 0xa5, 0x03, 0x4f, 0xdc,
+			0xff, 0x2d, 0x06, 0x3e, 0x24, 0x26, 0x34, 0x8d,
+			0x60, 0xa4, 0x67, 0x16,
+		},
+	},
+	{
+		.data_len = 48,
+		.digest = {
+			0x37, 0x53, 0x33, 0xfa, 0xd0, 0x21, 0xad, 0xe7,
+			0xa5, 0x43, 0xf1, 0x94, 0x64, 0x11, 0x47, 0x9c,
+			0x72, 0xb5, 0x78, 0xb4,
+		},
+	},
+	{
+		.data_len = 49,
+		.digest = {
+			0x51, 0x5c, 0xd8, 0x5a, 0xa9, 0xde, 0x7b, 0x2a,
+			0xa2, 0xff, 0x70, 0x09, 0x56, 0x88, 0x40, 0x2b,
+			0x50, 0x93, 0x82, 0x47,
+		},
+	},
+	{
+		.data_len = 63,
+		.digest = {
+			0xbc, 0x9c, 0xab, 0x93, 0x06, 0xd5, 0xdb, 0xac,
+			0x2c, 0x33, 0x15, 0x83, 0x56, 0xf6, 0x91, 0x20,
+			0x09, 0xc7, 0xb2, 0x6b,
+		},
+	},
+	{
+		.data_len = 64,
+		.digest = {
+			0x26, 0x90, 0x3b, 0x47, 0xe1, 0x92, 0x42, 0xd0,
+			0x85, 0x63, 0x2e, 0x6b, 0x68, 0xa4, 0xc4, 0x4c,
+			0xe6, 0xf4, 0xb0, 0x52,
+		},
+	},
+	{
+		.data_len = 65,
+		.digest = {
+			0x55, 0x6f, 0x87, 0xdc, 0x34, 0x3d, 0xe2, 0x4f,
+			0xc3, 0x81, 0xa4, 0x82, 0x79, 0x84, 0x64, 0x01,
+			0x55, 0xa0, 0x1e, 0x36,
+		},
+	},
+	{
+		.data_len = 127,
+		.digest = {
+			0xb7, 0xd5, 0x5f, 0xa4, 0xef, 0xbf, 0x4f, 0x96,
+			0x01, 0xc1, 0x06, 0xe3, 0x75, 0xa8, 0x90, 0x92,
+			0x4c, 0x5f, 0xf1, 0x21,
+		},
+	},
+	{
+		.data_len = 128,
+		.digest = {
+			0x70, 0x0c, 0xea, 0xa4, 0x93, 0xd0, 0x56, 0xf0,
+			0x6f, 0xbb, 0x53, 0x42, 0x5b, 0xe3, 0xf2, 0xb0,
+			0x30, 0x66, 0x8e, 0x75,
+		},
+	},
+	{
+		.data_len = 129,
+		.digest = {
+			0x15, 0x01, 0xbc, 0xb0, 0xee, 0xd8, 0xeb, 0xa8,
+			0x7d, 0xd9, 0x4d, 0x50, 0x2e, 0x41, 0x30, 0xba,
+			0x41, 0xaa, 0x7b, 0x02,
+		},
+	},
+	{
+		.data_len = 256,
+		.digest = {
+			0x98, 0x05, 0x52, 0xf5, 0x0f, 0xf0, 0xd3, 0x97,
+			0x15, 0x8c, 0xa3, 0x9a, 0x2b, 0x4d, 0x67, 0x57,
+			0x29, 0xa0, 0xac, 0x61,
+		},
+	},
+	{
+		.data_len = 511,
+		.digest = {
+			0x1f, 0x47, 0xf0, 0xcc, 0xd7, 0xda, 0xa5, 0x3b,
+			0x39, 0xb4, 0x5b, 0xa8, 0x33, 0xd4, 0xca, 0x2f,
+			0xdd, 0xf2, 0x39, 0x89,
+		},
+	},
+	{
+		.data_len = 513,
+		.digest = {
+			0xb9, 0x75, 0xe6, 0x57, 0x42, 0x7f, 0x8b, 0x0a,
+			0xcc, 0x53, 0x10, 0x69, 0x45, 0xac, 0xfd, 0x11,
+			0xf7, 0x1f, 0x4e, 0x6f,
+		},
+	},
+	{
+		.data_len = 1000,
+		.digest = {
+			0x63, 0x66, 0xcb, 0x44, 0xc1, 0x2c, 0xa2, 0x06,
+			0x5d, 0xb9, 0x8e, 0x31, 0xcb, 0x4f, 0x4e, 0x49,
+			0xe0, 0xfb, 0x3c, 0x4e,
+		},
+	},
+	{
+		.data_len = 3333,
+		.digest = {
+			0x35, 0xbc, 0x74, 0xfb, 0x31, 0x9c, 0xd4, 0xdd,
+			0xe8, 0x87, 0xa7, 0x56, 0x3b, 0x08, 0xe5, 0x49,
+			0xe1, 0xe9, 0xc9, 0xa8,
+		},
+	},
+	{
+		.data_len = 4096,
+		.digest = {
+			0x43, 0x00, 0xea, 0xcd, 0x4e, 0x7c, 0xe9, 0xe4,
+			0x32, 0xce, 0x25, 0xa8, 0xcd, 0x20, 0xa8, 0xaa,
+			0x7b, 0x63, 0x2c, 0x3c,
+		},
+	},
+	{
+		.data_len = 4128,
+		.digest = {
+			0xd0, 0x67, 0x26, 0x0e, 0x22, 0x72, 0xaa, 0x63,
+			0xfc, 0x34, 0x55, 0x07, 0xab, 0xc8, 0x64, 0xb6,
+			0xc4, 0xea, 0xd5, 0x7c,
+		},
+	},
+	{
+		.data_len = 4160,
+		.digest = {
+			0x6b, 0xc9, 0x5e, 0xb9, 0x41, 0x19, 0x50, 0x35,
+			0xf1, 0x39, 0xfe, 0xd9, 0x72, 0x6d, 0xd0, 0x55,
+			0xb8, 0x1f, 0x1a, 0x95,
+		},
+	},
+	{
+		.data_len = 4224,
+		.digest = {
+			0x70, 0x5d, 0x10, 0x2e, 0x4e, 0x44, 0xc9, 0x80,
+			0x8f, 0xba, 0x13, 0xbc, 0xd0, 0x77, 0x78, 0xc7,
+			0x84, 0xe3, 0x24, 0x43,
+		},
+	},
+	{
+		.data_len = 16384,
+		.digest = {
+			0xa8, 0x82, 0xca, 0x08, 0xb4, 0x84, 0x09, 0x13,
+			0xc0, 0x9c, 0x26, 0x18, 0xcf, 0x0f, 0xf3, 0x08,
+			0xff, 0xa1, 0xe4, 0x5d,
+		},
+	},
+};
+
+static const u8 hash_testvec_consolidated[SHA1_DIGEST_SIZE] = {
+	0xe1, 0x72, 0xa5, 0x3c, 0xda, 0xf2, 0xe5, 0x56,
+	0xb8, 0xb5, 0x35, 0x6e, 0xce, 0xc8, 0x37, 0x57,
+	0x31, 0xb4, 0x05, 0xdd,
+};
+
+static const u8 hmac_testvec_consolidated[SHA1_DIGEST_SIZE] = {
+	0x9d, 0xe5, 0xb1, 0x43, 0x97, 0x95, 0x16, 0x52,
+	0xa0, 0x7a, 0xc0, 0xe2, 0xc1, 0x60, 0x64, 0x7c,
+	0x24, 0xf9, 0x34, 0xd7,
+};
diff --git a/lib/crypto/tests/sha1_kunit.c b/lib/crypto/tests/sha1_kunit.c
new file mode 100644
index 000000000000..24ba8d5669c8
--- /dev/null
+++ b/lib/crypto/tests/sha1_kunit.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/sha1.h>
+#include "sha1-testvecs.h"
+
+#define HASH sha1
+#define HASH_CTX sha1_ctx
+#define HASH_SIZE SHA1_DIGEST_SIZE
+#define HASH_INIT sha1_init
+#define HASH_UPDATE sha1_update
+#define HASH_FINAL sha1_final
+#define HMAC_KEY hmac_sha1_key
+#define HMAC_CTX hmac_sha1_ctx
+#define HMAC_PREPAREKEY hmac_sha1_preparekey
+#define HMAC_INIT hmac_sha1_init
+#define HMAC_UPDATE hmac_sha1_update
+#define HMAC_FINAL hmac_sha1_final
+#define HMAC hmac_sha1
+#define HMAC_USINGRAWKEY hmac_sha1_usingrawkey
+#include "hash-test-template.h"
+
+static struct kunit_case hash_test_cases[] = {
+	HASH_KUNIT_CASES,
+	KUNIT_CASE(benchmark_hash),
+	{},
+};
+
+static struct kunit_suite hash_test_suite = {
+	.name = "sha1",
+	.test_cases = hash_test_cases,
+	.suite_init = hash_suite_init,
+	.suite_exit = hash_suite_exit,
+};
+kunit_test_suite(hash_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-1 and HMAC-SHA1");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/sha224-testvecs.h b/lib/crypto/tests/sha224-testvecs.h
new file mode 100644
index 000000000000..523adc178e1a
--- /dev/null
+++ b/lib/crypto/tests/sha224-testvecs.h
@@ -0,0 +1,238 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha224 */
+
+static const struct {
+	size_t data_len;
+	u8 digest[SHA224_DIGEST_SIZE];
+} hash_testvecs[] = {
+	{
+		.data_len = 0,
+		.digest = {
+			0xd1, 0x4a, 0x02, 0x8c, 0x2a, 0x3a, 0x2b, 0xc9,
+			0x47, 0x61, 0x02, 0xbb, 0x28, 0x82, 0x34, 0xc4,
+			0x15, 0xa2, 0xb0, 0x1f, 0x82, 0x8e, 0xa6, 0x2a,
+			0xc5, 0xb3, 0xe4, 0x2f,
+		},
+	},
+	{
+		.data_len = 1,
+		.digest = {
+			0xe3, 0x4d, 0x79, 0x17, 0x75, 0x35, 0xdc, 0xd2,
+			0x27, 0xc9, 0x9d, 0x0b, 0x90, 0x0f, 0x21, 0x5d,
+			0x95, 0xfb, 0x9c, 0x6d, 0xa8, 0xec, 0x19, 0x15,
+			0x12, 0xef, 0xf5, 0x0f,
+		},
+	},
+	{
+		.data_len = 2,
+		.digest = {
+			0x81, 0xc7, 0x60, 0x0d, 0x6d, 0x13, 0x75, 0x70,
+			0x4b, 0xc0, 0xab, 0xea, 0x04, 0xe3, 0x78, 0x7e,
+			0x73, 0xb9, 0x0f, 0xb6, 0xae, 0x90, 0xf3, 0x94,
+			0xb2, 0x56, 0xda, 0xc8,
+		},
+	},
+	{
+		.data_len = 3,
+		.digest = {
+			0x24, 0xf0, 0x8c, 0x6e, 0x9d, 0xd6, 0x06, 0x80,
+			0x0a, 0x03, 0xee, 0x9b, 0x33, 0xec, 0x83, 0x42,
+			0x2c, 0x8b, 0xe7, 0xc7, 0xc6, 0x04, 0xfb, 0xc6,
+			0xa3, 0x3a, 0x4d, 0xc9,
+		},
+	},
+	{
+		.data_len = 16,
+		.digest = {
+			0x1c, 0x08, 0xa8, 0x55, 0x8f, 0xc6, 0x0a, 0xea,
+			0x2f, 0x1b, 0x54, 0xff, 0x8d, 0xd2, 0xa3, 0xc7,
+			0x42, 0xc2, 0x93, 0x3d, 0x73, 0x18, 0x84, 0xba,
+			0x75, 0x49, 0x34, 0xfd,
+		},
+	},
+	{
+		.data_len = 32,
+		.digest = {
+			0x45, 0xdd, 0xb5, 0xf0, 0x3c, 0xda, 0xe6, 0xd4,
+			0x6c, 0x86, 0x91, 0x29, 0x11, 0x2f, 0x88, 0x7d,
+			0xd8, 0x3c, 0xa3, 0xd6, 0xdd, 0x1e, 0xac, 0x98,
+			0xff, 0xf0, 0x14, 0x69,
+		},
+	},
+	{
+		.data_len = 48,
+		.digest = {
+			0x0b, 0xfb, 0x71, 0x4c, 0x06, 0x7a, 0xd5, 0x89,
+			0x76, 0x0a, 0x43, 0x8b, 0x2b, 0x47, 0x12, 0x56,
+			0xa7, 0x64, 0x33, 0x1d, 0xd3, 0x44, 0x17, 0x95,
+			0x23, 0xe7, 0x53, 0x01,
+		},
+	},
+	{
+		.data_len = 49,
+		.digest = {
+			0xc4, 0xae, 0x9c, 0x33, 0xd5, 0x1d, 0xf4, 0xa7,
+			0xfd, 0xb7, 0xd4, 0x6b, 0xc3, 0xeb, 0xa8, 0xbf,
+			0xfb, 0x07, 0x89, 0x4b, 0x07, 0x15, 0x22, 0xec,
+			0xe1, 0x45, 0x84, 0xba,
+		},
+	},
+	{
+		.data_len = 63,
+		.digest = {
+			0xad, 0x01, 0x34, 0x2a, 0xe2, 0x3b, 0x58, 0x06,
+			0x9f, 0x20, 0xc8, 0xfb, 0xf3, 0x20, 0x82, 0xa6,
+			0x9f, 0xee, 0x7a, 0xbe, 0xdf, 0xf3, 0x5d, 0x57,
+			0x9b, 0xce, 0x79, 0x96,
+		},
+	},
+	{
+		.data_len = 64,
+		.digest = {
+			0xa7, 0xa6, 0x47, 0xf7, 0xed, 0x2a, 0xe5, 0xe3,
+			0xc0, 0x1e, 0x7b, 0x40, 0xe4, 0xf7, 0x40, 0x65,
+			0x42, 0xc1, 0x6f, 0x7d, 0x8d, 0x0d, 0x17, 0x4f,
+			0xd3, 0xbc, 0x0d, 0x85,
+		},
+	},
+	{
+		.data_len = 65,
+		.digest = {
+			0xc4, 0x9c, 0xb5, 0x6a, 0x01, 0x2d, 0x10, 0xa9,
+			0x5f, 0xa4, 0x5a, 0xe1, 0xba, 0x40, 0x12, 0x09,
+			0x7b, 0xea, 0xdb, 0xa6, 0x7b, 0xcb, 0x56, 0xf0,
+			0xfd, 0x5b, 0xe2, 0xe7,
+		},
+	},
+	{
+		.data_len = 127,
+		.digest = {
+			0x14, 0xda, 0x0e, 0x01, 0xca, 0x78, 0x7d, 0x2d,
+			0x85, 0xa3, 0xca, 0x0e, 0x80, 0xf9, 0x95, 0x10,
+			0xa1, 0x7b, 0xa5, 0xaa, 0xfc, 0x95, 0x05, 0x08,
+			0x53, 0xda, 0x52, 0xee,
+		},
+	},
+	{
+		.data_len = 128,
+		.digest = {
+			0xa5, 0x24, 0xc4, 0x54, 0xe1, 0x50, 0xab, 0xee,
+			0x22, 0xc1, 0xa7, 0x27, 0x15, 0x2c, 0x6f, 0xf7,
+			0x4c, 0x31, 0xe5, 0x15, 0x25, 0x4e, 0x71, 0xc6,
+			0x7e, 0xa0, 0x11, 0x5d,
+		},
+	},
+	{
+		.data_len = 129,
+		.digest = {
+			0x73, 0xd0, 0x8c, 0xce, 0xed, 0xed, 0x9f, 0xaa,
+			0x21, 0xaf, 0xa2, 0x08, 0x80, 0x16, 0x15, 0x59,
+			0x3f, 0x1d, 0x7f, 0x0a, 0x79, 0x3d, 0x7b, 0x58,
+			0xf8, 0xc8, 0x5c, 0x27,
+		},
+	},
+	{
+		.data_len = 256,
+		.digest = {
+			0x31, 0xa7, 0xa1, 0xca, 0x49, 0x72, 0x75, 0xcc,
+			0x6e, 0x02, 0x9e, 0xad, 0xea, 0x86, 0x5c, 0x91,
+			0x02, 0xe4, 0xc9, 0xf9, 0xd3, 0x9e, 0x74, 0x50,
+			0xd8, 0x43, 0x6b, 0x85,
+		},
+	},
+	{
+		.data_len = 511,
+		.digest = {
+			0x40, 0x60, 0x8b, 0xb0, 0x03, 0xa9, 0x75, 0xab,
+			0x2d, 0x5b, 0x20, 0x9a, 0x05, 0x72, 0xb7, 0xa8,
+			0xce, 0xf2, 0x4f, 0x66, 0x62, 0xe3, 0x7e, 0x24,
+			0xd6, 0xe2, 0xea, 0xfa,
+		},
+	},
+	{
+		.data_len = 513,
+		.digest = {
+			0x4f, 0x5f, 0x9f, 0x1e, 0xb3, 0x66, 0x81, 0xdb,
+			0x41, 0x5d, 0x65, 0x97, 0x00, 0x8d, 0xdc, 0x62,
+			0x03, 0xb0, 0x4d, 0x6b, 0x5c, 0x7f, 0x1e, 0xa0,
+			0xfe, 0xfc, 0x0e, 0xb8,
+		},
+	},
+	{
+		.data_len = 1000,
+		.digest = {
+			0x08, 0xa8, 0xa1, 0xc0, 0xd8, 0xf9, 0xb4, 0xaa,
+			0x53, 0x22, 0xa1, 0x73, 0x0b, 0x45, 0xa0, 0x20,
+			0x72, 0xf3, 0xa9, 0xbc, 0x51, 0xd0, 0x20, 0x79,
+			0x69, 0x97, 0xf7, 0xe3,
+		},
+	},
+	{
+		.data_len = 3333,
+		.digest = {
+			0xe8, 0x60, 0x5f, 0xb9, 0x12, 0xe1, 0x6b, 0x24,
+			0xc5, 0xe8, 0x43, 0xa9, 0x5c, 0x3f, 0x65, 0xed,
+			0xbe, 0xfd, 0x77, 0xf5, 0x47, 0xf2, 0x75, 0x21,
+			0xc2, 0x8f, 0x54, 0x8f,
+		},
+	},
+	{
+		.data_len = 4096,
+		.digest = {
+			0xc7, 0xdf, 0x50, 0x16, 0x10, 0x01, 0xb7, 0xdf,
+			0x34, 0x1d, 0x18, 0xa2, 0xd5, 0xad, 0x1f, 0x50,
+			0xf7, 0xa8, 0x9a, 0x72, 0xfb, 0xfd, 0xd9, 0x1c,
+			0x57, 0xac, 0x08, 0x97,
+		},
+	},
+	{
+		.data_len = 4128,
+		.digest = {
+			0xdf, 0x16, 0x76, 0x7f, 0xc0, 0x16, 0x84, 0x63,
+			0xac, 0xcf, 0xd0, 0x78, 0x1e, 0x96, 0x67, 0xc5,
+			0x3c, 0x06, 0xe9, 0xdb, 0x6e, 0x7d, 0xd0, 0x07,
+			0xaa, 0xb1, 0x56, 0xc9,
+		},
+	},
+	{
+		.data_len = 4160,
+		.digest = {
+			0x49, 0xec, 0x5c, 0x18, 0xd7, 0x5b, 0xda, 0xed,
+			0x5b, 0x59, 0xde, 0x09, 0x34, 0xb2, 0x49, 0x43,
+			0x62, 0x6a, 0x0a, 0x63, 0x6a, 0x51, 0x08, 0x37,
+			0x8c, 0xb6, 0x29, 0x84,
+		},
+	},
+	{
+		.data_len = 4224,
+		.digest = {
+			0x3d, 0xc2, 0xc8, 0x43, 0xcf, 0xb7, 0x33, 0x14,
+			0x04, 0x93, 0xed, 0xe2, 0xcd, 0x8a, 0x69, 0x5c,
+			0x5a, 0xd5, 0x9b, 0x52, 0xdf, 0x48, 0xa7, 0xaa,
+			0x28, 0x2b, 0x5d, 0x27,
+		},
+	},
+	{
+		.data_len = 16384,
+		.digest = {
+			0xa7, 0xaf, 0xda, 0x92, 0xe2, 0xe7, 0x61, 0xdc,
+			0xa1, 0x32, 0x53, 0x2a, 0x3f, 0x41, 0x5c, 0x7e,
+			0xc9, 0x89, 0xda, 0x1c, 0xf7, 0x8d, 0x00, 0xbd,
+			0x21, 0x73, 0xb1, 0x69,
+		},
+	},
+};
+
+static const u8 hash_testvec_consolidated[SHA224_DIGEST_SIZE] = {
+	0x9e, 0xb8, 0x82, 0xab, 0x83, 0x37, 0xe4, 0x63,
+	0x84, 0xee, 0x21, 0x15, 0xc2, 0xbb, 0xa3, 0x17,
+	0x8f, 0xc4, 0x99, 0x33, 0xa0, 0x2c, 0x9f, 0xec,
+	0xca, 0xd0, 0xf3, 0x73,
+};
+
+static const u8 hmac_testvec_consolidated[SHA224_DIGEST_SIZE] = {
+	0x66, 0x34, 0x79, 0x92, 0x47, 0x0e, 0xcd, 0x70,
+	0xb0, 0x8b, 0x91, 0xcb, 0x94, 0x2f, 0x67, 0x65,
+	0x2f, 0xc9, 0xd2, 0x91, 0x32, 0xaf, 0xf7, 0x5f,
+	0xb6, 0x01, 0x5b, 0xf2,
+};
diff --git a/lib/crypto/tests/sha224_kunit.c b/lib/crypto/tests/sha224_kunit.c
new file mode 100644
index 000000000000..962ad46b9c99
--- /dev/null
+++ b/lib/crypto/tests/sha224_kunit.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/sha2.h>
+#include "sha224-testvecs.h"
+
+#define HASH sha224
+#define HASH_CTX sha224_ctx
+#define HASH_SIZE SHA224_DIGEST_SIZE
+#define HASH_INIT sha224_init
+#define HASH_UPDATE sha224_update
+#define HASH_FINAL sha224_final
+#define HMAC_KEY hmac_sha224_key
+#define HMAC_CTX hmac_sha224_ctx
+#define HMAC_PREPAREKEY hmac_sha224_preparekey
+#define HMAC_INIT hmac_sha224_init
+#define HMAC_UPDATE hmac_sha224_update
+#define HMAC_FINAL hmac_sha224_final
+#define HMAC hmac_sha224
+#define HMAC_USINGRAWKEY hmac_sha224_usingrawkey
+#include "hash-test-template.h"
+
+static struct kunit_case hash_test_cases[] = {
+	HASH_KUNIT_CASES,
+	KUNIT_CASE(benchmark_hash),
+	{},
+};
+
+static struct kunit_suite hash_test_suite = {
+	.name = "sha224",
+	.test_cases = hash_test_cases,
+	.suite_init = hash_suite_init,
+	.suite_exit = hash_suite_exit,
+};
+kunit_test_suite(hash_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-224 and HMAC-SHA224");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/sha256-testvecs.h b/lib/crypto/tests/sha256-testvecs.h
new file mode 100644
index 000000000000..2db7b2042034
--- /dev/null
+++ b/lib/crypto/tests/sha256-testvecs.h
@@ -0,0 +1,238 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha256 */
+
+static const struct {
+	size_t data_len;
+	u8 digest[SHA256_DIGEST_SIZE];
+} hash_testvecs[] = {
+	{
+		.data_len = 0,
+		.digest = {
+			0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14,
+			0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f, 0xb9, 0x24,
+			0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c,
+			0xa4, 0x95, 0x99, 0x1b, 0x78, 0x52, 0xb8, 0x55,
+		},
+	},
+	{
+		.data_len = 1,
+		.digest = {
+			0x45, 0xf8, 0x3d, 0x17, 0xe1, 0x0b, 0x34, 0xfc,
+			0xa0, 0x1e, 0xb8, 0xf4, 0x45, 0x4d, 0xac, 0x34,
+			0xa7, 0x77, 0xd9, 0x40, 0x4a, 0x46, 0x4e, 0x73,
+			0x2c, 0xf4, 0xab, 0xf2, 0xc0, 0xda, 0x94, 0xc4,
+		},
+	},
+	{
+		.data_len = 2,
+		.digest = {
+			0xf9, 0xd3, 0x52, 0x2f, 0xd5, 0xe0, 0x99, 0x15,
+			0x1c, 0xd6, 0xa9, 0x24, 0x4f, 0x40, 0xba, 0x25,
+			0x33, 0x43, 0x3e, 0xe1, 0x78, 0x6a, 0xfe, 0x7d,
+			0x07, 0xe2, 0x29, 0x7b, 0x6d, 0xc5, 0x73, 0xf5,
+		},
+	},
+	{
+		.data_len = 3,
+		.digest = {
+			0x71, 0xf7, 0xa1, 0xef, 0x69, 0x86, 0x0e, 0xe4,
+			0x87, 0x25, 0x58, 0x4c, 0x07, 0x2c, 0xfc, 0x60,
+			0xc5, 0xf6, 0xe2, 0x44, 0xaa, 0xfb, 0x41, 0xc7,
+			0x2b, 0xc5, 0x01, 0x8c, 0x39, 0x98, 0x30, 0x37,
+		},
+	},
+	{
+		.data_len = 16,
+		.digest = {
+			0x09, 0x95, 0x9a, 0xfa, 0x25, 0x18, 0x86, 0x06,
+			0xfe, 0x65, 0xc9, 0x2f, 0x91, 0x15, 0x74, 0x06,
+			0x6c, 0xbf, 0xef, 0x7b, 0x0b, 0xc7, 0x2c, 0x05,
+			0xdd, 0x17, 0x5d, 0x6f, 0x8a, 0xa5, 0xde, 0x3c,
+		},
+	},
+	{
+		.data_len = 32,
+		.digest = {
+			0xe5, 0x52, 0x3c, 0x85, 0xea, 0x1b, 0xe1, 0x6c,
+			0xe0, 0xdb, 0xc3, 0xef, 0xf0, 0xca, 0xc2, 0xe1,
+			0xb9, 0x36, 0xa1, 0x28, 0xb6, 0x9e, 0xf5, 0x6e,
+			0x70, 0xf7, 0xf9, 0xa7, 0x1c, 0xd3, 0x22, 0xd0,
+		},
+	},
+	{
+		.data_len = 48,
+		.digest = {
+			0x5f, 0x84, 0xd4, 0xd7, 0x2e, 0x80, 0x09, 0xef,
+			0x1c, 0x77, 0x7c, 0x25, 0x59, 0x63, 0x88, 0x64,
+			0xfd, 0x56, 0xea, 0x23, 0xf4, 0x4f, 0x2e, 0x49,
+			0xcd, 0xb4, 0xaa, 0xc7, 0x5c, 0x8b, 0x75, 0x84,
+		},
+	},
+	{
+		.data_len = 49,
+		.digest = {
+			0x22, 0x6e, 0xca, 0xda, 0x00, 0x2d, 0x90, 0x96,
+			0x24, 0xf8, 0x55, 0x17, 0x11, 0xda, 0x42, 0x1c,
+			0x78, 0x4e, 0xbf, 0xd9, 0xc5, 0xcf, 0xf3, 0xe3,
+			0xaf, 0xd3, 0x60, 0xcd, 0xaa, 0xe2, 0xc7, 0x22,
+		},
+	},
+	{
+		.data_len = 63,
+		.digest = {
+			0x97, 0xe2, 0x74, 0xdc, 0x6b, 0xa4, 0xaf, 0x32,
+			0x3b, 0x50, 0x6d, 0x80, 0xb5, 0xd3, 0x0c, 0x36,
+			0xea, 0x3f, 0x5d, 0x36, 0xa7, 0x49, 0x51, 0xf3,
+			0xbd, 0x69, 0x68, 0x60, 0x9b, 0xde, 0x73, 0xf5,
+		},
+	},
+	{
+		.data_len = 64,
+		.digest = {
+			0x13, 0x74, 0xb1, 0x72, 0xd6, 0x53, 0x48, 0x28,
+			0x42, 0xd8, 0xba, 0x64, 0x20, 0x60, 0xb6, 0x4c,
+			0xc3, 0xac, 0x5d, 0x93, 0x8c, 0xb9, 0xd4, 0xcc,
+			0xb4, 0x9f, 0x31, 0x1f, 0xeb, 0x68, 0x35, 0x58,
+		},
+	},
+	{
+		.data_len = 65,
+		.digest = {
+			0xda, 0xbe, 0xd7, 0xbc, 0x6e, 0xe6, 0x5a, 0x57,
+			0xeb, 0x9a, 0x93, 0xaa, 0x66, 0xd0, 0xe0, 0xc4,
+			0x29, 0x7f, 0xe9, 0x3b, 0x8e, 0xdf, 0x81, 0x82,
+			0x8d, 0x15, 0x11, 0x59, 0x4e, 0x13, 0xa5, 0x58,
+		},
+	},
+	{
+		.data_len = 127,
+		.digest = {
+			0x8c, 0x1a, 0xba, 0x40, 0x66, 0x94, 0x19, 0xf4,
+			0x2e, 0xa2, 0xae, 0x94, 0x53, 0x18, 0xb6, 0xfd,
+			0xa0, 0x12, 0xc5, 0xef, 0xd5, 0xd6, 0x1b, 0xa1,
+			0x37, 0xea, 0x19, 0x44, 0x35, 0x54, 0x85, 0x74,
+		},
+	},
+	{
+		.data_len = 128,
+		.digest = {
+			0xfd, 0x07, 0xd8, 0x77, 0x7d, 0x8b, 0x4f, 0xee,
+			0x60, 0x60, 0x26, 0xef, 0x2a, 0x86, 0xfb, 0x67,
+			0xeb, 0x31, 0x27, 0x03, 0x99, 0x3c, 0xde, 0xe5,
+			0x84, 0x72, 0x71, 0x4c, 0x33, 0x7b, 0x87, 0x13,
+		},
+	},
+	{
+		.data_len = 129,
+		.digest = {
+			0x97, 0xc5, 0x58, 0x38, 0x20, 0xc7, 0xde, 0xfa,
+			0xdd, 0x9b, 0x10, 0xc6, 0xc2, 0x2f, 0x94, 0xb5,
+			0xc0, 0x33, 0xc0, 0x20, 0x1c, 0x2f, 0xb4, 0x28,
+			0x5e, 0x36, 0xfa, 0x8c, 0x24, 0x1c, 0x18, 0x27,
+		},
+	},
+	{
+		.data_len = 256,
+		.digest = {
+			0x62, 0x17, 0x84, 0x26, 0x98, 0x30, 0x57, 0xca,
+			0x4f, 0x32, 0xd9, 0x09, 0x09, 0x34, 0xe2, 0xcb,
+			0x92, 0x45, 0xd5, 0xeb, 0x8b, 0x9b, 0x3c, 0xd8,
+			0xaa, 0xc7, 0xd2, 0x2b, 0x04, 0xab, 0xb3, 0x35,
+		},
+	},
+	{
+		.data_len = 511,
+		.digest = {
+			0x7f, 0xe1, 0x09, 0x78, 0x5d, 0x61, 0xfa, 0x5e,
+			0x9b, 0x8c, 0xb1, 0xa9, 0x09, 0x69, 0xb4, 0x24,
+			0x54, 0xf2, 0x1c, 0xc9, 0x5f, 0xfb, 0x59, 0x9d,
+			0x36, 0x1b, 0x37, 0x44, 0xfc, 0x64, 0x79, 0xb6,
+		},
+	},
+	{
+		.data_len = 513,
+		.digest = {
+			0xd2, 0x3b, 0x3a, 0xe7, 0x13, 0x4f, 0xbd, 0x29,
+			0x6b, 0xd2, 0x79, 0x26, 0x6c, 0xd2, 0x22, 0x43,
+			0x25, 0x34, 0x9b, 0x9b, 0x22, 0xb0, 0x9f, 0x61,
+			0x1d, 0xf4, 0xe2, 0x65, 0x68, 0x95, 0x02, 0x6c,
+		},
+	},
+	{
+		.data_len = 1000,
+		.digest = {
+			0x0c, 0x34, 0x53, 0x3f, 0x0f, 0x8a, 0x39, 0x8d,
+			0x63, 0xe4, 0x83, 0x6e, 0x11, 0x7d, 0x14, 0x8e,
+			0x5b, 0xf0, 0x4d, 0xca, 0x23, 0x24, 0xb5, 0xd2,
+			0x13, 0x3f, 0xd9, 0xde, 0x84, 0x74, 0x26, 0x59,
+		},
+	},
+	{
+		.data_len = 3333,
+		.digest = {
+			0xa8, 0xb8, 0x83, 0x01, 0x1b, 0x38, 0x7a, 0xca,
+			0x59, 0xe9, 0x5b, 0x37, 0x6a, 0xab, 0xb4, 0x85,
+			0x94, 0x73, 0x26, 0x04, 0xef, 0xed, 0xf4, 0x0d,
+			0xd6, 0x09, 0x21, 0x09, 0x96, 0x78, 0xe3, 0xcf,
+		},
+	},
+	{
+		.data_len = 4096,
+		.digest = {
+			0x0b, 0x12, 0x66, 0x96, 0x78, 0x4f, 0x2c, 0x35,
+			0xa4, 0xed, 0xbc, 0xb8, 0x30, 0xa6, 0x37, 0x9b,
+			0x94, 0x13, 0xae, 0x86, 0xf0, 0x20, 0xfb, 0x49,
+			0x8f, 0x5d, 0x20, 0x70, 0x60, 0x2b, 0x02, 0x70,
+		},
+	},
+	{
+		.data_len = 4128,
+		.digest = {
+			0xe4, 0xbd, 0xe4, 0x3b, 0x85, 0xf4, 0x6f, 0x11,
+			0xad, 0xc4, 0x79, 0xcc, 0x8e, 0x6d, 0x8b, 0x15,
+			0xbb, 0xf9, 0xd3, 0x65, 0xe1, 0xf8, 0x8d, 0x22,
+			0x65, 0x66, 0x66, 0xb3, 0xf5, 0xd0, 0x9c, 0xaf,
+		},
+	},
+	{
+		.data_len = 4160,
+		.digest = {
+			0x90, 0x5f, 0xe0, 0xfc, 0xb1, 0xdc, 0x38, 0x1b,
+			0xe5, 0x37, 0x3f, 0xd2, 0xcc, 0x48, 0xc4, 0xbc,
+			0xb4, 0xfd, 0xf7, 0x71, 0x5f, 0x6b, 0xf4, 0xc4,
+			0xa6, 0x08, 0x7e, 0xfc, 0x4e, 0x96, 0xf7, 0xc2,
+		},
+	},
+	{
+		.data_len = 4224,
+		.digest = {
+			0x1f, 0x34, 0x0a, 0x3b, 0xdb, 0xf7, 0x7a, 0xdb,
+			0x3d, 0x89, 0x85, 0x0c, 0xd2, 0xf0, 0x0c, 0xbd,
+			0x25, 0x39, 0x14, 0x06, 0x28, 0x0f, 0x6b, 0x5f,
+			0xe3, 0x1f, 0x2a, 0xb6, 0xca, 0x56, 0x41, 0xa1,
+		},
+	},
+	{
+		.data_len = 16384,
+		.digest = {
+			0x7b, 0x01, 0x2d, 0x84, 0x70, 0xee, 0xe0, 0x77,
+			0x3c, 0x17, 0x63, 0xfe, 0x40, 0xd7, 0xfd, 0xa1,
+			0x75, 0x90, 0xb8, 0x3e, 0x50, 0xcd, 0x06, 0xb7,
+			0xb9, 0xb9, 0x2b, 0x91, 0x4f, 0xba, 0xe4, 0x4c,
+		},
+	},
+};
+
+static const u8 hash_testvec_consolidated[SHA256_DIGEST_SIZE] = {
+	0x78, 0x1c, 0xb1, 0x9f, 0xe5, 0xe1, 0xcb, 0x41,
+	0x8b, 0x34, 0x00, 0x33, 0x57, 0xc3, 0x1c, 0x8f,
+	0x5c, 0x84, 0xc7, 0x8b, 0x87, 0x6a, 0x13, 0x29,
+	0x2f, 0xc4, 0x1a, 0x70, 0x5e, 0x40, 0xf2, 0xfe,
+};
+
+static const u8 hmac_testvec_consolidated[SHA256_DIGEST_SIZE] = {
+	0x56, 0x96, 0x2e, 0x23, 0x3f, 0x94, 0x89, 0x0d,
+	0x0f, 0x24, 0x36, 0x2e, 0x19, 0x3d, 0xb5, 0xac,
+	0xb8, 0xcd, 0xf1, 0xc9, 0xca, 0xac, 0xee, 0x9d,
+	0x62, 0xe6, 0x81, 0xe5, 0x96, 0xf9, 0x38, 0xf5,
+};
diff --git a/lib/crypto/tests/sha256_kunit.c b/lib/crypto/tests/sha256_kunit.c
new file mode 100644
index 000000000000..5dccdee79693
--- /dev/null
+++ b/lib/crypto/tests/sha256_kunit.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/sha2.h>
+#include "sha256-testvecs.h"
+
+/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */
+#define HASH sha256
+#define HASH_CTX sha256_ctx
+#define HASH_SIZE SHA256_DIGEST_SIZE
+#define HASH_INIT sha256_init
+#define HASH_UPDATE sha256_update
+#define HASH_FINAL sha256_final
+#define HMAC_KEY hmac_sha256_key
+#define HMAC_CTX hmac_sha256_ctx
+#define HMAC_PREPAREKEY hmac_sha256_preparekey
+#define HMAC_INIT hmac_sha256_init
+#define HMAC_UPDATE hmac_sha256_update
+#define HMAC_FINAL hmac_sha256_final
+#define HMAC hmac_sha256
+#define HMAC_USINGRAWKEY hmac_sha256_usingrawkey
+#include "hash-test-template.h"
+
+static void free_guarded_buf(void *buf)
+{
+	vfree(buf);
+}
+
+/*
+ * Allocate a KUnit-managed buffer that has length @len bytes immediately
+ * followed by an unmapped page, and assert that the allocation succeeds.
+ */
+static void *alloc_guarded_buf(struct kunit *test, size_t len)
+{
+	size_t full_len = round_up(len, PAGE_SIZE);
+	void *buf = vmalloc(full_len);
+
+	KUNIT_ASSERT_NOT_NULL(test, buf);
+	KUNIT_ASSERT_EQ(test, 0,
+			kunit_add_action_or_reset(test, free_guarded_buf, buf));
+	return buf + full_len - len;
+}
+
+/*
+ * Test for sha256_finup_2x().  Specifically, choose various data lengths and
+ * salt lengths, and for each one, verify that sha256_finup_2x() produces the
+ * same results as sha256_update() and sha256_final().
+ *
+ * Use guarded buffers for all inputs and outputs to reliably detect any
+ * out-of-bounds reads or writes, even if they occur in assembly code.
+ */
+static void test_sha256_finup_2x(struct kunit *test)
+{
+	const size_t max_data_len = 16384;
+	u8 *data1_buf, *data2_buf, *hash1, *hash2;
+	u8 expected_hash1[SHA256_DIGEST_SIZE];
+	u8 expected_hash2[SHA256_DIGEST_SIZE];
+	u8 salt[SHA256_BLOCK_SIZE];
+	struct sha256_ctx *ctx;
+
+	data1_buf = alloc_guarded_buf(test, max_data_len);
+	data2_buf = alloc_guarded_buf(test, max_data_len);
+	hash1 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE);
+	hash2 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE);
+	ctx = alloc_guarded_buf(test, sizeof(*ctx));
+
+	rand_bytes(data1_buf, max_data_len);
+	rand_bytes(data2_buf, max_data_len);
+	rand_bytes(salt, sizeof(salt));
+	memset(ctx, 0, sizeof(*ctx));
+
+	for (size_t i = 0; i < 500; i++) {
+		size_t salt_len = rand_length(sizeof(salt));
+		size_t data_len = rand_length(max_data_len);
+		const u8 *data1 = data1_buf + max_data_len - data_len;
+		const u8 *data2 = data2_buf + max_data_len - data_len;
+		struct sha256_ctx orig_ctx;
+
+		sha256_init(ctx);
+		sha256_update(ctx, salt, salt_len);
+		orig_ctx = *ctx;
+
+		sha256_finup_2x(ctx, data1, data2, data_len, hash1, hash2);
+		KUNIT_ASSERT_MEMEQ_MSG(
+			test, ctx, &orig_ctx, sizeof(*ctx),
+			"sha256_finup_2x() modified its ctx argument");
+
+		sha256_update(ctx, data1, data_len);
+		sha256_final(ctx, expected_hash1);
+		sha256_update(&orig_ctx, data2, data_len);
+		sha256_final(&orig_ctx, expected_hash2);
+		KUNIT_ASSERT_MEMEQ_MSG(
+			test, hash1, expected_hash1, SHA256_DIGEST_SIZE,
+			"Wrong hash1 with salt_len=%zu data_len=%zu", salt_len,
+			data_len);
+		KUNIT_ASSERT_MEMEQ_MSG(
+			test, hash2, expected_hash2, SHA256_DIGEST_SIZE,
+			"Wrong hash2 with salt_len=%zu data_len=%zu", salt_len,
+			data_len);
+	}
+}
+
+/* Test sha256_finup_2x() with ctx == NULL */
+static void test_sha256_finup_2x_defaultctx(struct kunit *test)
+{
+	const size_t data_len = 128;
+	struct sha256_ctx ctx;
+	u8 hash1_a[SHA256_DIGEST_SIZE];
+	u8 hash2_a[SHA256_DIGEST_SIZE];
+	u8 hash1_b[SHA256_DIGEST_SIZE];
+	u8 hash2_b[SHA256_DIGEST_SIZE];
+
+	rand_bytes(test_buf, 2 * data_len);
+
+	sha256_init(&ctx);
+	sha256_finup_2x(&ctx, test_buf, &test_buf[data_len], data_len, hash1_a,
+			hash2_a);
+
+	sha256_finup_2x(NULL, test_buf, &test_buf[data_len], data_len, hash1_b,
+			hash2_b);
+
+	KUNIT_ASSERT_MEMEQ(test, hash1_a, hash1_b, SHA256_DIGEST_SIZE);
+	KUNIT_ASSERT_MEMEQ(test, hash2_a, hash2_b, SHA256_DIGEST_SIZE);
+}
+
+/*
+ * Test that sha256_finup_2x() and sha256_update/final() produce consistent
+ * results with total message lengths that require more than 32 bits.
+ */
+static void test_sha256_finup_2x_hugelen(struct kunit *test)
+{
+	const size_t data_len = 4 * SHA256_BLOCK_SIZE;
+	struct sha256_ctx ctx = {};
+	u8 expected_hash[SHA256_DIGEST_SIZE];
+	u8 hash[SHA256_DIGEST_SIZE];
+
+	rand_bytes(test_buf, data_len);
+	for (size_t align = 0; align < SHA256_BLOCK_SIZE; align++) {
+		sha256_init(&ctx);
+		ctx.ctx.bytecount = 0x123456789abcd00 + align;
+
+		sha256_finup_2x(&ctx, test_buf, test_buf, data_len, hash, hash);
+
+		sha256_update(&ctx, test_buf, data_len);
+		sha256_final(&ctx, expected_hash);
+
+		KUNIT_ASSERT_MEMEQ(test, hash, expected_hash,
+				   SHA256_DIGEST_SIZE);
+	}
+}
+
+/* Benchmark for sha256_finup_2x() */
+static void benchmark_sha256_finup_2x(struct kunit *test)
+{
+	/*
+	 * Try a few different salt lengths, since sha256_finup_2x() performance
+	 * may vary slightly for the same data_len depending on how many bytes
+	 * were already processed in the initial context.
+	 */
+	static const size_t salt_lens_to_test[] = { 0, 32, 64 };
+	const size_t data_len = 4096;
+	const size_t num_iters = 4096;
+	struct sha256_ctx ctx;
+	u8 hash1[SHA256_DIGEST_SIZE];
+	u8 hash2[SHA256_DIGEST_SIZE];
+
+	if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK))
+		kunit_skip(test, "not enabled");
+	if (!sha256_finup_2x_is_optimized())
+		kunit_skip(test, "not relevant");
+
+	rand_bytes(test_buf, data_len * 2);
+
+	/* Warm-up */
+	for (size_t i = 0; i < num_iters; i++)
+		sha256_finup_2x(NULL, &test_buf[0], &test_buf[data_len],
+				data_len, hash1, hash2);
+
+	for (size_t i = 0; i < ARRAY_SIZE(salt_lens_to_test); i++) {
+		size_t salt_len = salt_lens_to_test[i];
+		u64 t0, t1;
+
+		/*
+		 * Prepare the initial context.  The time to process the salt is
+		 * not measured; we're just interested in sha256_finup_2x().
+		 */
+		sha256_init(&ctx);
+		sha256_update(&ctx, test_buf, salt_len);
+
+		preempt_disable();
+		t0 = ktime_get_ns();
+		for (size_t j = 0; j < num_iters; j++)
+			sha256_finup_2x(&ctx, &test_buf[0], &test_buf[data_len],
+					data_len, hash1, hash2);
+		t1 = ktime_get_ns();
+		preempt_enable();
+		kunit_info(test, "data_len=%zu salt_len=%zu: %llu MB/s",
+			   data_len, salt_len,
+			   div64_u64((u64)data_len * 2 * num_iters * 1000,
+				     t1 - t0 ?: 1));
+	}
+}
+
+static struct kunit_case hash_test_cases[] = {
+	HASH_KUNIT_CASES,
+	KUNIT_CASE(test_sha256_finup_2x),
+	KUNIT_CASE(test_sha256_finup_2x_defaultctx),
+	KUNIT_CASE(test_sha256_finup_2x_hugelen),
+	KUNIT_CASE(benchmark_hash),
+	KUNIT_CASE(benchmark_sha256_finup_2x),
+	{},
+};
+
+static struct kunit_suite hash_test_suite = {
+	.name = "sha256",
+	.test_cases = hash_test_cases,
+	.suite_init = hash_suite_init,
+	.suite_exit = hash_suite_exit,
+};
+kunit_test_suite(hash_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-256 and HMAC-SHA256");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/sha3-testvecs.h b/lib/crypto/tests/sha3-testvecs.h
new file mode 100644
index 000000000000..8d614a5fa0c3
--- /dev/null
+++ b/lib/crypto/tests/sha3-testvecs.h
@@ -0,0 +1,249 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha3 */
+
+/* SHA3-256 test vectors */
+
+static const struct {
+	size_t data_len;
+	u8 digest[SHA3_256_DIGEST_SIZE];
+} hash_testvecs[] = {
+	{
+		.data_len = 0,
+		.digest = {
+			0xa7, 0xff, 0xc6, 0xf8, 0xbf, 0x1e, 0xd7, 0x66,
+			0x51, 0xc1, 0x47, 0x56, 0xa0, 0x61, 0xd6, 0x62,
+			0xf5, 0x80, 0xff, 0x4d, 0xe4, 0x3b, 0x49, 0xfa,
+			0x82, 0xd8, 0x0a, 0x4b, 0x80, 0xf8, 0x43, 0x4a,
+		},
+	},
+	{
+		.data_len = 1,
+		.digest = {
+			0x11, 0x03, 0xe7, 0x84, 0x51, 0x50, 0x86, 0x35,
+			0x71, 0x8a, 0x70, 0xe3, 0xc4, 0x26, 0x7b, 0x21,
+			0x02, 0x13, 0xa0, 0x81, 0xe8, 0xe6, 0x14, 0x25,
+			0x07, 0x34, 0xe5, 0xc5, 0x40, 0x06, 0xf2, 0x8b,
+		},
+	},
+	{
+		.data_len = 2,
+		.digest = {
+			0x2f, 0x6f, 0x6d, 0x47, 0x48, 0x52, 0x11, 0xb9,
+			0xe4, 0x3d, 0xc8, 0x71, 0xcf, 0xb2, 0xee, 0xae,
+			0x5b, 0xf4, 0x12, 0x84, 0x5b, 0x1c, 0xec, 0x6c,
+			0xc1, 0x66, 0x88, 0xaa, 0xc3, 0x40, 0xbd, 0x7e,
+		},
+	},
+	{
+		.data_len = 3,
+		.digest = {
+			0xec, 0x02, 0xe8, 0x81, 0x4f, 0x84, 0x41, 0x69,
+			0x06, 0xd8, 0xdc, 0x1d, 0x01, 0x78, 0xd7, 0xcb,
+			0x39, 0xdf, 0xd3, 0x12, 0x1c, 0x99, 0xfd, 0xf3,
+			0x5c, 0x83, 0xc9, 0xc2, 0x7a, 0x7b, 0x6a, 0x05,
+		},
+	},
+	{
+		.data_len = 16,
+		.digest = {
+			0xff, 0x6f, 0xc3, 0x41, 0xc3, 0x5f, 0x34, 0x6d,
+			0xa7, 0xdf, 0x3e, 0xc2, 0x8b, 0x29, 0xb6, 0xf1,
+			0xf8, 0x67, 0xfd, 0xcd, 0xb1, 0x9f, 0x38, 0x08,
+			0x1d, 0x8d, 0xd9, 0xc2, 0x43, 0x66, 0x18, 0x6c,
+		},
+	},
+	{
+		.data_len = 32,
+		.digest = {
+			0xe4, 0xb1, 0x06, 0x17, 0xf8, 0x8b, 0x91, 0x95,
+			0xe7, 0x57, 0x66, 0xac, 0x08, 0xb2, 0x03, 0x3e,
+			0xf7, 0x84, 0x1f, 0xe3, 0x25, 0xa3, 0x11, 0xd2,
+			0x11, 0xa4, 0x78, 0x74, 0x2a, 0x43, 0x20, 0xa5,
+		},
+	},
+	{
+		.data_len = 48,
+		.digest = {
+			0xeb, 0x57, 0x5f, 0x20, 0xa3, 0x6b, 0xc7, 0xb4,
+			0x66, 0x2a, 0xa0, 0x30, 0x3b, 0x52, 0x00, 0xc9,
+			0xce, 0x6a, 0xd8, 0x1e, 0xbe, 0xed, 0xa1, 0xd1,
+			0xbe, 0x63, 0xc7, 0xe1, 0xe2, 0x66, 0x67, 0x0c,
+		},
+	},
+	{
+		.data_len = 49,
+		.digest = {
+			0xf0, 0x67, 0xad, 0x66, 0xbe, 0xec, 0x5a, 0xfd,
+			0x29, 0xd2, 0x4f, 0x1d, 0xb2, 0x24, 0xb8, 0x90,
+			0x05, 0x28, 0x0e, 0x66, 0x67, 0x74, 0x2d, 0xee,
+			0x66, 0x25, 0x11, 0xd1, 0x76, 0xa2, 0xfc, 0x3a,
+		},
+	},
+	{
+		.data_len = 63,
+		.digest = {
+			0x57, 0x56, 0x21, 0xb3, 0x2d, 0x2d, 0xe1, 0x9d,
+			0xbf, 0x2c, 0x82, 0xa8, 0xad, 0x7e, 0x6c, 0x46,
+			0xfb, 0x30, 0xeb, 0xce, 0xcf, 0xed, 0x2d, 0x65,
+			0xe7, 0xe4, 0x96, 0x69, 0xe0, 0x48, 0xd2, 0xb6,
+		},
+	},
+	{
+		.data_len = 64,
+		.digest = {
+			0x7b, 0xba, 0x67, 0x15, 0xe5, 0x21, 0xc4, 0x69,
+			0xd3, 0xef, 0x5c, 0x97, 0x9f, 0x5b, 0xba, 0x9c,
+			0xfa, 0x55, 0x64, 0xec, 0xb5, 0x37, 0x53, 0x1b,
+			0x3f, 0x4c, 0x0a, 0xed, 0x51, 0x98, 0x2b, 0x52,
+		},
+	},
+	{
+		.data_len = 65,
+		.digest = {
+			0x44, 0xb6, 0x6b, 0x83, 0x09, 0x83, 0x55, 0x83,
+			0xde, 0x1f, 0xcc, 0x33, 0xef, 0xdc, 0x05, 0xbb,
+			0x3b, 0x63, 0x76, 0x45, 0xe4, 0x8e, 0x14, 0x7a,
+			0x2d, 0xae, 0x90, 0xce, 0x68, 0xc3, 0xa4, 0xf2,
+		},
+	},
+	{
+		.data_len = 127,
+		.digest = {
+			0x50, 0x3e, 0x99, 0x4e, 0x28, 0x2b, 0xc9, 0xf4,
+			0xf5, 0xeb, 0x2b, 0x16, 0x04, 0x2d, 0xf5, 0xbe,
+			0xc0, 0x91, 0x41, 0x2a, 0x8e, 0x69, 0x5e, 0x39,
+			0x53, 0x2c, 0xc1, 0x18, 0xa5, 0xeb, 0xd8, 0xda,
+		},
+	},
+	{
+		.data_len = 128,
+		.digest = {
+			0x90, 0x0b, 0xa6, 0x92, 0x84, 0x30, 0xaf, 0xee,
+			0x38, 0x59, 0x83, 0x83, 0xe9, 0xfe, 0xab, 0x86,
+			0x79, 0x1b, 0xcd, 0xe7, 0x0a, 0x0f, 0x58, 0x53,
+			0x36, 0xab, 0x12, 0xe1, 0x5c, 0x97, 0xc1, 0xfb,
+		},
+	},
+	{
+		.data_len = 129,
+		.digest = {
+			0x2b, 0x52, 0x1e, 0x54, 0xbe, 0x38, 0x4c, 0x3e,
+			0x73, 0x37, 0x18, 0xf5, 0x25, 0x2c, 0xc8, 0xc7,
+			0xda, 0x7e, 0xb6, 0x47, 0x9d, 0xf4, 0x46, 0xce,
+			0xfa, 0x80, 0x20, 0x6b, 0xbd, 0xfd, 0x2a, 0xd8,
+		},
+	},
+	{
+		.data_len = 256,
+		.digest = {
+			0x45, 0xf0, 0xf5, 0x9b, 0xd9, 0x91, 0x26, 0xd5,
+			0x91, 0x3b, 0xf8, 0x87, 0x8b, 0x34, 0x02, 0x31,
+			0x64, 0xab, 0xf4, 0x1c, 0x6e, 0x34, 0x72, 0xdf,
+			0x32, 0x6d, 0xe5, 0xd2, 0x67, 0x5e, 0x86, 0x93,
+		},
+	},
+	{
+		.data_len = 511,
+		.digest = {
+			0xb3, 0xaf, 0x71, 0x64, 0xfa, 0xd4, 0xf1, 0x07,
+			0x38, 0xef, 0x04, 0x8e, 0x89, 0xf4, 0x02, 0xd2,
+			0xa5, 0xaf, 0x3b, 0xf5, 0x67, 0x56, 0xcf, 0xa9,
+			0x8e, 0x43, 0xf5, 0xb5, 0xe3, 0x91, 0x8e, 0xe7,
+		},
+	},
+	{
+		.data_len = 513,
+		.digest = {
+			0x51, 0xac, 0x0a, 0x65, 0xb7, 0x96, 0x20, 0xcf,
+			0x88, 0xf6, 0x97, 0x35, 0x89, 0x0d, 0x31, 0x0f,
+			0xbe, 0x17, 0xbe, 0x62, 0x03, 0x67, 0xc0, 0xee,
+			0x4f, 0xc1, 0xe3, 0x7f, 0x6f, 0xab, 0xac, 0xb4,
+		},
+	},
+	{
+		.data_len = 1000,
+		.digest = {
+			0x7e, 0xea, 0xa8, 0xd7, 0xde, 0x20, 0x1b, 0x58,
+			0x24, 0xd8, 0x26, 0x40, 0x36, 0x5f, 0x3f, 0xaa,
+			0xe5, 0x5a, 0xea, 0x98, 0x58, 0xd4, 0xd6, 0xfc,
+			0x20, 0x4c, 0x5c, 0x4f, 0xaf, 0x56, 0xc7, 0xc3,
+		},
+	},
+	{
+		.data_len = 3333,
+		.digest = {
+			0x61, 0xb1, 0xb1, 0x3e, 0x0e, 0x7e, 0x90, 0x3d,
+			0x31, 0x54, 0xbd, 0xc9, 0x0d, 0x53, 0x62, 0xf1,
+			0xcd, 0x18, 0x80, 0xf9, 0x91, 0x75, 0x41, 0xb3,
+			0x51, 0x39, 0x57, 0xa7, 0xa8, 0x1e, 0xfb, 0xc9,
+		},
+	},
+	{
+		.data_len = 4096,
+		.digest = {
+			0xab, 0x29, 0xda, 0x10, 0xc4, 0x11, 0x2d, 0x5c,
+			0xd1, 0xce, 0x1c, 0x95, 0xfa, 0xc6, 0xc7, 0xb0,
+			0x1b, 0xd1, 0xdc, 0x6f, 0xa0, 0x9d, 0x1b, 0x23,
+			0xfb, 0x6e, 0x90, 0x97, 0xd0, 0x75, 0x44, 0x7a,
+		},
+	},
+	{
+		.data_len = 4128,
+		.digest = {
+			0x02, 0x45, 0x95, 0xf4, 0x19, 0xb5, 0x93, 0x29,
+			0x90, 0xf2, 0x63, 0x3f, 0x89, 0xe8, 0xa5, 0x31,
+			0x76, 0xf2, 0x89, 0x79, 0x66, 0xd3, 0x96, 0xdf,
+			0x33, 0xd1, 0xa6, 0x17, 0x73, 0xb1, 0xd0, 0x45,
+		},
+	},
+	{
+		.data_len = 4160,
+		.digest = {
+			0xd1, 0x8e, 0x22, 0xea, 0x44, 0x87, 0x6e, 0x9d,
+			0xfb, 0x36, 0x02, 0x20, 0x63, 0xb7, 0x69, 0x45,
+			0x25, 0x41, 0x69, 0xe0, 0x9b, 0x87, 0xcf, 0xa3,
+			0x51, 0xbb, 0xfc, 0x8d, 0xf7, 0x29, 0xa7, 0xea,
+		},
+	},
+	{
+		.data_len = 4224,
+		.digest = {
+			0x11, 0x86, 0x7d, 0x84, 0xf9, 0x8c, 0x6e, 0xc4,
+			0x64, 0x36, 0xc6, 0xf3, 0x42, 0x92, 0x31, 0x2b,
+			0x1e, 0x12, 0xe6, 0x4d, 0xbe, 0xfa, 0x77, 0x3f,
+			0x89, 0x41, 0x33, 0x58, 0x1c, 0x98, 0x16, 0x0a,
+		},
+	},
+	{
+		.data_len = 16384,
+		.digest = {
+			0xb2, 0xba, 0x0c, 0x8c, 0x9d, 0xbb, 0x1e, 0xb0,
+			0x03, 0xb5, 0xdf, 0x4f, 0xf5, 0x35, 0xdb, 0xec,
+			0x60, 0xf2, 0x5b, 0xb6, 0xd0, 0x49, 0xd3, 0xed,
+			0x55, 0xc0, 0x7a, 0xd7, 0xaf, 0xa1, 0xea, 0x53,
+		},
+	},
+};
+
+static const u8 hash_testvec_consolidated[SHA3_256_DIGEST_SIZE] = {
+	0x3b, 0x33, 0x67, 0xf8, 0xea, 0x92, 0x78, 0x62,
+	0xdd, 0xbe, 0x72, 0x15, 0xbd, 0x6f, 0xfa, 0xe5,
+	0x5e, 0xab, 0x9f, 0xb1, 0xe4, 0x23, 0x7c, 0x2c,
+	0x80, 0xcf, 0x09, 0x75, 0xf8, 0xe2, 0xfa, 0x30,
+};
+
+/* SHAKE test vectors */
+
+static const u8 shake128_testvec_consolidated[SHA3_256_DIGEST_SIZE] = {
+	0x89, 0x88, 0x3a, 0x44, 0xec, 0xfe, 0x3c, 0xeb,
+	0x2f, 0x1c, 0x1d, 0xda, 0x9e, 0x36, 0x64, 0xf0,
+	0x85, 0x4c, 0x49, 0x12, 0x76, 0x5a, 0x4d, 0xe7,
+	0xa8, 0xfd, 0xcd, 0xbe, 0x45, 0xb4, 0x6f, 0xb0,
+};
+
+static const u8 shake256_testvec_consolidated[SHA3_256_DIGEST_SIZE] = {
+	0x5a, 0xfd, 0x66, 0x62, 0x5c, 0x37, 0x2b, 0x41,
+	0x77, 0x1c, 0x01, 0x5d, 0x64, 0x7c, 0x63, 0x7a,
+	0x7c, 0x76, 0x9e, 0xa8, 0xd1, 0xb0, 0x8e, 0x02,
+	0x16, 0x9b, 0xfe, 0x0e, 0xb5, 0xd8, 0x6a, 0xb5,
+};
diff --git a/lib/crypto/tests/sha384-testvecs.h b/lib/crypto/tests/sha384-testvecs.h
new file mode 100644
index 000000000000..6e2f572dd792
--- /dev/null
+++ b/lib/crypto/tests/sha384-testvecs.h
@@ -0,0 +1,290 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha384 */
+
+static const struct {
+	size_t data_len;
+	u8 digest[SHA384_DIGEST_SIZE];
+} hash_testvecs[] = {
+	{
+		.data_len = 0,
+		.digest = {
+			0x38, 0xb0, 0x60, 0xa7, 0x51, 0xac, 0x96, 0x38,
+			0x4c, 0xd9, 0x32, 0x7e, 0xb1, 0xb1, 0xe3, 0x6a,
+			0x21, 0xfd, 0xb7, 0x11, 0x14, 0xbe, 0x07, 0x43,
+			0x4c, 0x0c, 0xc7, 0xbf, 0x63, 0xf6, 0xe1, 0xda,
+			0x27, 0x4e, 0xde, 0xbf, 0xe7, 0x6f, 0x65, 0xfb,
+			0xd5, 0x1a, 0xd2, 0xf1, 0x48, 0x98, 0xb9, 0x5b,
+		},
+	},
+	{
+		.data_len = 1,
+		.digest = {
+			0x07, 0x34, 0x9d, 0x74, 0x48, 0x76, 0xa5, 0x72,
+			0x78, 0x02, 0xb8, 0x6e, 0x21, 0x59, 0xb0, 0x75,
+			0x09, 0x68, 0x11, 0x39, 0x53, 0x61, 0xee, 0x8d,
+			0xf2, 0x01, 0xf3, 0x90, 0x53, 0x7c, 0xd3, 0xde,
+			0x13, 0x9f, 0xd2, 0x74, 0x28, 0xfe, 0xe1, 0xc8,
+			0x2e, 0x95, 0xc6, 0x7d, 0x69, 0x4d, 0x04, 0xc6,
+		},
+	},
+	{
+		.data_len = 2,
+		.digest = {
+			0xc4, 0xef, 0x6e, 0x8c, 0x19, 0x1c, 0xaa, 0x0e,
+			0x86, 0xf2, 0x68, 0xa1, 0xa0, 0x2d, 0x2e, 0xb2,
+			0x84, 0xbc, 0x5d, 0x53, 0x31, 0xf8, 0x03, 0x75,
+			0x56, 0xf4, 0x8b, 0x23, 0x1a, 0x68, 0x15, 0x9a,
+			0x60, 0xb2, 0xec, 0x05, 0xe1, 0xd4, 0x5e, 0x9e,
+			0xe8, 0x7c, 0x9d, 0xe4, 0x0f, 0x9c, 0x3a, 0xdd,
+		},
+	},
+	{
+		.data_len = 3,
+		.digest = {
+			0x29, 0xd2, 0x02, 0xa2, 0x77, 0x24, 0xc7, 0xa7,
+			0x23, 0x0c, 0x3e, 0x30, 0x56, 0x47, 0xdb, 0x75,
+			0xd4, 0x41, 0xf8, 0xb3, 0x8e, 0x26, 0xf6, 0x92,
+			0xbc, 0x20, 0x2e, 0x96, 0xcc, 0x81, 0x5f, 0x32,
+			0x82, 0x60, 0xe2, 0xcf, 0x23, 0xd7, 0x3c, 0x90,
+			0xb2, 0x56, 0x8f, 0xb6, 0x0f, 0xf0, 0x6b, 0x80,
+		},
+	},
+	{
+		.data_len = 16,
+		.digest = {
+			0x21, 0x4c, 0xac, 0xfe, 0xbd, 0x40, 0x74, 0x1f,
+			0xa2, 0x2d, 0x2f, 0x35, 0x91, 0xfd, 0xc9, 0x97,
+			0x88, 0x12, 0x6c, 0x0c, 0x6e, 0xd8, 0x50, 0x0b,
+			0x4b, 0x2c, 0x89, 0xa6, 0xa6, 0x4a, 0xad, 0xd7,
+			0x72, 0x62, 0x2c, 0x62, 0x81, 0xcd, 0x24, 0x74,
+			0xf5, 0x44, 0x05, 0xa0, 0x97, 0xea, 0xf1, 0x78,
+		},
+	},
+	{
+		.data_len = 32,
+		.digest = {
+			0x06, 0x8b, 0x92, 0x9f, 0x8b, 0x64, 0xb2, 0x80,
+			0xde, 0xcc, 0xde, 0xc3, 0x2f, 0x22, 0x27, 0xe8,
+			0x3b, 0x6e, 0x16, 0x21, 0x14, 0x81, 0xbe, 0x5b,
+			0xa7, 0xa7, 0x14, 0x8a, 0x00, 0x8f, 0x0d, 0x38,
+			0x11, 0x63, 0xe8, 0x3e, 0xb9, 0xf1, 0xcf, 0x87,
+			0xb1, 0x28, 0xe5, 0xa1, 0x89, 0xa8, 0x7a, 0xde,
+		},
+	},
+	{
+		.data_len = 48,
+		.digest = {
+			0x9e, 0x37, 0x76, 0x62, 0x98, 0x39, 0xbe, 0xfd,
+			0x2b, 0x91, 0x20, 0x54, 0x8f, 0x21, 0xe7, 0x30,
+			0x0a, 0x01, 0x7a, 0x65, 0x0b, 0xc9, 0xb3, 0x89,
+			0x3c, 0xb6, 0xd3, 0xa8, 0xff, 0xc9, 0x1b, 0x5c,
+			0xd4, 0xac, 0xb4, 0x7e, 0xba, 0x94, 0xc3, 0x8a,
+			0x26, 0x41, 0xf6, 0xd5, 0xed, 0x6f, 0x27, 0xa7,
+		},
+	},
+	{
+		.data_len = 49,
+		.digest = {
+			0x03, 0x1f, 0xef, 0x5a, 0x16, 0x28, 0x78, 0x10,
+			0x29, 0xe8, 0xe2, 0xe4, 0x84, 0x36, 0x19, 0x10,
+			0xaa, 0xea, 0xde, 0x06, 0x39, 0x5f, 0xb2, 0x36,
+			0xca, 0x24, 0x4f, 0x7b, 0x66, 0xf7, 0xe7, 0x31,
+			0xf3, 0x9b, 0x74, 0x1e, 0x17, 0x20, 0x88, 0x62,
+			0x50, 0xeb, 0x5f, 0x9a, 0xa7, 0x2c, 0xf4, 0xc9,
+		},
+	},
+	{
+		.data_len = 63,
+		.digest = {
+			0x10, 0xce, 0xed, 0x26, 0xb8, 0xac, 0xc1, 0x1b,
+			0xe6, 0xb9, 0xeb, 0x7c, 0xae, 0xcd, 0x55, 0x5a,
+			0x20, 0x2a, 0x7b, 0x43, 0xe6, 0x3e, 0xf0, 0x3f,
+			0xd9, 0x2f, 0x8c, 0x52, 0xe2, 0xf0, 0xb6, 0x24,
+			0x2e, 0xa4, 0xac, 0x24, 0x3a, 0x54, 0x99, 0x71,
+			0x65, 0xab, 0x97, 0x2d, 0xb6, 0xe6, 0x94, 0x20,
+		},
+	},
+	{
+		.data_len = 64,
+		.digest = {
+			0x24, 0x6d, 0x9f, 0x59, 0x42, 0x36, 0xca, 0x34,
+			0x36, 0x41, 0xa2, 0xcd, 0x69, 0xdf, 0x3d, 0xcb,
+			0x64, 0x94, 0x54, 0xb2, 0xed, 0xc1, 0x1c, 0x31,
+			0xe3, 0x26, 0xcb, 0x71, 0xe6, 0x98, 0xb2, 0x56,
+			0x74, 0x30, 0xa9, 0x15, 0x98, 0x9d, 0xb3, 0x07,
+			0xcc, 0xa8, 0xcc, 0x6f, 0x42, 0xb0, 0x9d, 0x2b,
+		},
+	},
+	{
+		.data_len = 65,
+		.digest = {
+			0x85, 0x1f, 0xbc, 0x5e, 0x2a, 0x00, 0x7d, 0xc2,
+			0x21, 0x4c, 0x28, 0x14, 0xc5, 0xd8, 0x0c, 0xe8,
+			0x55, 0xa5, 0xa0, 0x77, 0xda, 0x8f, 0xce, 0xd4,
+			0xf0, 0xcb, 0x30, 0xb8, 0x9c, 0x47, 0xe1, 0x33,
+			0x92, 0x18, 0xc5, 0x1f, 0xf2, 0xef, 0xb5, 0xe5,
+			0xbc, 0x63, 0xa6, 0xe5, 0x9a, 0xc9, 0xcc, 0xf1,
+		},
+	},
+	{
+		.data_len = 127,
+		.digest = {
+			0x26, 0xd2, 0x4c, 0xb6, 0xce, 0xd8, 0x22, 0x2b,
+			0x44, 0x10, 0x6f, 0x59, 0xf7, 0x0d, 0xb9, 0x3f,
+			0x7d, 0x29, 0x75, 0xf1, 0x71, 0xb2, 0x71, 0x23,
+			0xef, 0x68, 0xb7, 0x25, 0xae, 0xb8, 0x45, 0xf8,
+			0xa3, 0xb2, 0x2d, 0x7a, 0x83, 0x0a, 0x05, 0x61,
+			0xbc, 0x73, 0xf1, 0xf9, 0xba, 0xfb, 0x3d, 0xc2,
+		},
+	},
+	{
+		.data_len = 128,
+		.digest = {
+			0x7c, 0xe5, 0x7f, 0x5e, 0xea, 0xd9, 0x7e, 0x54,
+			0x14, 0x30, 0x6f, 0x37, 0x02, 0x71, 0x0f, 0xf1,
+			0x14, 0x16, 0xfa, 0xeb, 0x6e, 0x1e, 0xf0, 0xbe,
+			0x10, 0xed, 0x01, 0xbf, 0xa0, 0x9d, 0xcb, 0x07,
+			0x5f, 0x8b, 0x7f, 0x44, 0xe1, 0xd9, 0x13, 0xf0,
+			0x29, 0xa2, 0x54, 0x32, 0xd9, 0xb0, 0x69, 0x69,
+		},
+	},
+	{
+		.data_len = 129,
+		.digest = {
+			0xc5, 0x54, 0x1f, 0xcb, 0x9d, 0x8f, 0xdf, 0xbf,
+			0xab, 0x55, 0x92, 0x1d, 0x3b, 0x93, 0x79, 0x26,
+			0xdf, 0xba, 0x9a, 0x28, 0xff, 0xa0, 0x6c, 0xae,
+			0x7b, 0x53, 0x8d, 0xfa, 0xef, 0x35, 0x88, 0x19,
+			0x16, 0xb8, 0x72, 0x86, 0x76, 0x2a, 0xf5, 0xe6,
+			0xec, 0xb2, 0xd7, 0xd4, 0xbe, 0x1a, 0xe4, 0x9f,
+		},
+	},
+	{
+		.data_len = 256,
+		.digest = {
+			0x74, 0x9d, 0x77, 0xfb, 0xe8, 0x0f, 0x0c, 0x2d,
+			0x86, 0x0d, 0x49, 0xea, 0x2b, 0xd0, 0x13, 0xd1,
+			0xe8, 0xb8, 0xe1, 0xa3, 0x7b, 0x48, 0xab, 0x6a,
+			0x21, 0x2b, 0x4c, 0x48, 0x32, 0xb5, 0xdc, 0x31,
+			0x7f, 0xd0, 0x32, 0x67, 0x9a, 0xc0, 0x85, 0x53,
+			0xef, 0xe9, 0xfb, 0xe1, 0x8b, 0xd8, 0xcc, 0xc2,
+		},
+	},
+	{
+		.data_len = 511,
+		.digest = {
+			0x7b, 0xa9, 0xde, 0xa3, 0x07, 0x5c, 0x4c, 0xaa,
+			0x31, 0xc6, 0x9e, 0x55, 0xd4, 0x3f, 0x52, 0xdd,
+			0xde, 0x36, 0x70, 0x96, 0x59, 0x6e, 0x90, 0x78,
+			0x4c, 0x6a, 0x27, 0xde, 0x83, 0x84, 0xc3, 0x35,
+			0x53, 0x76, 0x1d, 0xbf, 0x83, 0x64, 0xcf, 0xf2,
+			0xb0, 0x3e, 0x07, 0x27, 0xe4, 0x25, 0x6c, 0x56,
+		},
+	},
+	{
+		.data_len = 513,
+		.digest = {
+			0x53, 0x50, 0xf7, 0x3b, 0x86, 0x1d, 0x7a, 0xe2,
+			0x5d, 0x9b, 0x71, 0xfa, 0x25, 0x23, 0x5a, 0xfe,
+			0x8c, 0xb9, 0xac, 0x8a, 0x9d, 0x6c, 0x99, 0xbc,
+			0x01, 0x9e, 0xa0, 0xd6, 0x3c, 0x03, 0x46, 0x21,
+			0xb6, 0xd0, 0xb0, 0xb3, 0x23, 0x23, 0x58, 0xf1,
+			0xea, 0x4e, 0xf2, 0x1a, 0x2f, 0x14, 0x2b, 0x5a,
+		},
+	},
+	{
+		.data_len = 1000,
+		.digest = {
+			0x06, 0x03, 0xb3, 0xba, 0x14, 0xe0, 0x28, 0x07,
+			0xd5, 0x15, 0x97, 0x1f, 0x87, 0xef, 0x80, 0xba,
+			0x48, 0x03, 0xb6, 0xc5, 0x47, 0xca, 0x8c, 0x95,
+			0xed, 0x95, 0xfd, 0x27, 0xb6, 0x83, 0xda, 0x6d,
+			0xa7, 0xb2, 0x1a, 0xd2, 0xb5, 0x89, 0xbb, 0xb4,
+			0x00, 0xbc, 0x86, 0x54, 0x7d, 0x5a, 0x91, 0x63,
+		},
+	},
+	{
+		.data_len = 3333,
+		.digest = {
+			0xd3, 0xe0, 0x6e, 0x7d, 0x80, 0x08, 0x53, 0x07,
+			0x8c, 0x0f, 0xc2, 0xce, 0x9f, 0x09, 0x86, 0x31,
+			0x28, 0x24, 0x3c, 0x3e, 0x2d, 0x36, 0xb4, 0x28,
+			0xc7, 0x1b, 0x70, 0xf9, 0x35, 0x9b, 0x10, 0xfa,
+			0xc8, 0x5e, 0x2b, 0x32, 0x7f, 0x65, 0xd2, 0x68,
+			0xb2, 0x84, 0x90, 0xf6, 0xc8, 0x6e, 0xb8, 0xdb,
+		},
+	},
+	{
+		.data_len = 4096,
+		.digest = {
+			0x39, 0xeb, 0xc4, 0xb3, 0x08, 0xe2, 0xdd, 0xf3,
+			0x9f, 0x5e, 0x44, 0x93, 0x63, 0x8b, 0x39, 0x57,
+			0xd7, 0xe8, 0x7e, 0x3d, 0x74, 0xf8, 0xf6, 0xab,
+			0xfe, 0x74, 0x51, 0xe4, 0x1b, 0x4a, 0x23, 0xbc,
+			0x69, 0xfc, 0xbb, 0xa7, 0x71, 0xa7, 0x86, 0x24,
+			0xcc, 0x85, 0x70, 0xf2, 0x31, 0x0d, 0x47, 0xc0,
+		},
+	},
+	{
+		.data_len = 4128,
+		.digest = {
+			0x23, 0xc3, 0x97, 0x06, 0x79, 0xbe, 0x8a, 0xe9,
+			0x1f, 0x1a, 0x43, 0xad, 0xe6, 0x76, 0x23, 0x13,
+			0x64, 0xae, 0xda, 0xe7, 0x8b, 0x88, 0x96, 0xb6,
+			0xa9, 0x1a, 0xb7, 0x80, 0x8e, 0x1c, 0x94, 0x98,
+			0x09, 0x08, 0xdb, 0x8e, 0x4d, 0x0a, 0x09, 0x65,
+			0xe5, 0x21, 0x1c, 0xd9, 0xab, 0x64, 0xbb, 0xea,
+		},
+	},
+	{
+		.data_len = 4160,
+		.digest = {
+			0x4f, 0x4a, 0x88, 0x9f, 0x40, 0x89, 0xfe, 0xb6,
+			0xda, 0x9d, 0xcd, 0xa5, 0x27, 0xd2, 0x29, 0x71,
+			0x58, 0x60, 0xd4, 0x55, 0xfe, 0x92, 0xcd, 0x51,
+			0x8b, 0xec, 0x3b, 0xd3, 0xd1, 0x3e, 0x8d, 0x36,
+			0x7b, 0xb1, 0x41, 0xef, 0xec, 0x9d, 0xdf, 0xcd,
+			0x4e, 0xde, 0x5a, 0xe5, 0xe5, 0x16, 0x14, 0x54,
+		},
+	},
+	{
+		.data_len = 4224,
+		.digest = {
+			0xb5, 0xa5, 0x3e, 0x86, 0x39, 0x20, 0x49, 0x4c,
+			0xcd, 0xb6, 0xdd, 0x03, 0xfe, 0x36, 0x6e, 0xa6,
+			0xfc, 0xff, 0x19, 0x33, 0x0c, 0x52, 0xea, 0x37,
+			0x94, 0xda, 0x5b, 0x27, 0xd1, 0x99, 0x5a, 0x89,
+			0x40, 0x78, 0xfa, 0x96, 0xb9, 0x2f, 0xa0, 0x48,
+			0xc9, 0xf8, 0x5c, 0xf0, 0x95, 0xf4, 0xea, 0x61,
+		},
+	},
+	{
+		.data_len = 16384,
+		.digest = {
+			0x6f, 0x48, 0x6f, 0x21, 0xb9, 0xc1, 0xcc, 0x92,
+			0x4e, 0xed, 0x6b, 0xef, 0x51, 0x88, 0xdf, 0xfd,
+			0xcb, 0x3d, 0x44, 0x9c, 0x37, 0x85, 0xb4, 0xc5,
+			0xeb, 0x60, 0x55, 0x58, 0x01, 0x47, 0xbf, 0x75,
+			0x9b, 0xa8, 0x82, 0x8c, 0xec, 0xe8, 0x0e, 0x58,
+			0xc1, 0x26, 0xa2, 0x45, 0x87, 0x3e, 0xfb, 0x8d,
+		},
+	},
+};
+
+static const u8 hash_testvec_consolidated[SHA384_DIGEST_SIZE] = {
+	0xfc, 0xcb, 0xe6, 0x42, 0xf0, 0x9e, 0x2b, 0x77,
+	0x7b, 0x62, 0xe8, 0x70, 0x86, 0xbf, 0xaf, 0x3c,
+	0xbb, 0x02, 0xd9, 0x86, 0xdc, 0xba, 0xd3, 0xa4,
+	0x0d, 0x8d, 0xb3, 0x2d, 0x0b, 0xa3, 0x84, 0x04,
+	0x7c, 0x16, 0x37, 0xaf, 0xba, 0x1e, 0xd4, 0x2f,
+	0x4c, 0x57, 0x55, 0x86, 0x52, 0x47, 0x9a, 0xec,
+};
+
+static const u8 hmac_testvec_consolidated[SHA384_DIGEST_SIZE] = {
+	0x82, 0xcf, 0x7d, 0x80, 0x71, 0xdb, 0x91, 0x09,
+	0x67, 0xe8, 0x44, 0x4a, 0x0d, 0x03, 0xb1, 0xf9,
+	0x62, 0xde, 0x4e, 0xbb, 0x1f, 0x41, 0xcd, 0x62,
+	0x39, 0x6b, 0x2d, 0x44, 0x0e, 0xde, 0x98, 0x73,
+	0xdd, 0xeb, 0x9d, 0x53, 0xfb, 0xee, 0xd1, 0xc3,
+	0x96, 0xdb, 0xfc, 0x2a, 0x38, 0x90, 0x02, 0x53,
+};
diff --git a/lib/crypto/tests/sha384_kunit.c b/lib/crypto/tests/sha384_kunit.c
new file mode 100644
index 000000000000..e1ef5c995bb6
--- /dev/null
+++ b/lib/crypto/tests/sha384_kunit.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/sha2.h>
+#include "sha384-testvecs.h"
+
+#define HASH sha384
+#define HASH_CTX sha384_ctx
+#define HASH_SIZE SHA384_DIGEST_SIZE
+#define HASH_INIT sha384_init
+#define HASH_UPDATE sha384_update
+#define HASH_FINAL sha384_final
+#define HMAC_KEY hmac_sha384_key
+#define HMAC_CTX hmac_sha384_ctx
+#define HMAC_PREPAREKEY hmac_sha384_preparekey
+#define HMAC_INIT hmac_sha384_init
+#define HMAC_UPDATE hmac_sha384_update
+#define HMAC_FINAL hmac_sha384_final
+#define HMAC hmac_sha384
+#define HMAC_USINGRAWKEY hmac_sha384_usingrawkey
+#include "hash-test-template.h"
+
+static struct kunit_case hash_test_cases[] = {
+	HASH_KUNIT_CASES,
+	KUNIT_CASE(benchmark_hash),
+	{},
+};
+
+static struct kunit_suite hash_test_suite = {
+	.name = "sha384",
+	.test_cases = hash_test_cases,
+	.suite_init = hash_suite_init,
+	.suite_exit = hash_suite_exit,
+};
+kunit_test_suite(hash_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-384 and HMAC-SHA384");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/sha3_kunit.c b/lib/crypto/tests/sha3_kunit.c
new file mode 100644
index 000000000000..ed5fbe80337f
--- /dev/null
+++ b/lib/crypto/tests/sha3_kunit.c
@@ -0,0 +1,422 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+#include <crypto/sha3.h>
+#include "sha3-testvecs.h"
+
+#define HASH		sha3_256
+#define HASH_CTX	sha3_ctx
+#define HASH_SIZE	SHA3_256_DIGEST_SIZE
+#define HASH_INIT	sha3_256_init
+#define HASH_UPDATE	sha3_update
+#define HASH_FINAL	sha3_final
+#include "hash-test-template.h"
+
+/*
+ * Sample message and the output generated for various algorithms by passing it
+ * into "openssl sha3-224" etc..
+ */
+static const u8 test_sha3_sample[] =
+	"The quick red fox jumped over the lazy brown dog!\n"
+	"The quick red fox jumped over the lazy brown dog!\n"
+	"The quick red fox jumped over the lazy brown dog!\n"
+	"The quick red fox jumped over the lazy brown dog!\n";
+
+static const u8 test_sha3_224[8 + SHA3_224_DIGEST_SIZE + 8] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+	0xd6, 0xe8, 0xd8, 0x80, 0xfa, 0x42, 0x80, 0x70,
+	0x7e, 0x7f, 0xd7, 0xd2, 0xd7, 0x7a, 0x35, 0x65,
+	0xf0, 0x0b, 0x4f, 0x9f, 0x2a, 0x33, 0xca, 0x0a,
+	0xef, 0xa6, 0x4c, 0xb8,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_sha3_256[8 + SHA3_256_DIGEST_SIZE + 8] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+	0xdb, 0x3b, 0xb0, 0xb8, 0x8d, 0x15, 0x78, 0xe5,
+	0x78, 0x76, 0x8e, 0x39, 0x7e, 0x89, 0x86, 0xb9,
+	0x14, 0x3a, 0x1e, 0xe7, 0x96, 0x7c, 0xf3, 0x25,
+	0x70, 0xbd, 0xc3, 0xa9, 0xae, 0x63, 0x71, 0x1d,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_sha3_384[8 + SHA3_384_DIGEST_SIZE + 8] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+	0x2d, 0x4b, 0x29, 0x85, 0x19, 0x94, 0xaa, 0x31,
+	0x9b, 0x04, 0x9d, 0x6e, 0x79, 0x66, 0xc7, 0x56,
+	0x8a, 0x2e, 0x99, 0x84, 0x06, 0xcf, 0x10, 0x2d,
+	0xec, 0xf0, 0x03, 0x04, 0x1f, 0xd5, 0x99, 0x63,
+	0x2f, 0xc3, 0x2b, 0x0d, 0xd9, 0x45, 0xf7, 0xbb,
+	0x0a, 0xc3, 0x46, 0xab, 0xfe, 0x4d, 0x94, 0xc2,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_sha3_512[8 + SHA3_512_DIGEST_SIZE + 8] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+	0xdd, 0x71, 0x3b, 0x44, 0xb6, 0x6c, 0xd7, 0x78,
+	0xe7, 0x93, 0xa1, 0x4c, 0xd7, 0x24, 0x16, 0xf1,
+	0xfd, 0xa2, 0x82, 0x4e, 0xed, 0x59, 0xe9, 0x83,
+	0x15, 0x38, 0x89, 0x7d, 0x39, 0x17, 0x0c, 0xb2,
+	0xcf, 0x12, 0x80, 0x78, 0xa1, 0x78, 0x41, 0xeb,
+	0xed, 0x21, 0x4c, 0xa4, 0x4a, 0x5f, 0x30, 0x1a,
+	0x70, 0x98, 0x4f, 0x14, 0xa2, 0xd1, 0x64, 0x1b,
+	0xc2, 0x0a, 0xff, 0x3b, 0xe8, 0x26, 0x41, 0x8f,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_shake128[8 + SHAKE128_DEFAULT_SIZE + 8] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+	0x41, 0xd6, 0xb8, 0x9c, 0xf8, 0xe8, 0x54, 0xf2,
+	0x5c, 0xde, 0x51, 0x12, 0xaf, 0x9e, 0x0d, 0x91,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_shake256[8 + SHAKE256_DEFAULT_SIZE + 8] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+	0xab, 0x06, 0xd4, 0xf9, 0x8b, 0xfd, 0xb2, 0xc4,
+	0xfe, 0xf1, 0xcc, 0xe2, 0x40, 0x45, 0xdd, 0x15,
+	0xcb, 0xdd, 0x02, 0x8d, 0xb7, 0x9f, 0x1e, 0x67,
+	0xd6, 0x7f, 0x98, 0x5e, 0x1b, 0x19, 0xf8, 0x01,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static void test_sha3_224_basic(struct kunit *test)
+{
+	u8 out[8 + SHA3_224_DIGEST_SIZE + 8];
+
+	BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_224));
+
+	memset(out, 0, sizeof(out));
+	sha3_224(test_sha3_sample, sizeof(test_sha3_sample) - 1, out + 8);
+
+	KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_224, sizeof(test_sha3_224),
+			       "SHA3-224 gives wrong output");
+}
+
+static void test_sha3_256_basic(struct kunit *test)
+{
+	u8 out[8 + SHA3_256_DIGEST_SIZE + 8];
+
+	BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_256));
+
+	memset(out, 0, sizeof(out));
+	sha3_256(test_sha3_sample, sizeof(test_sha3_sample) - 1, out + 8);
+
+	KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_256, sizeof(test_sha3_256),
+			       "SHA3-256 gives wrong output");
+}
+
+static void test_sha3_384_basic(struct kunit *test)
+{
+	u8 out[8 + SHA3_384_DIGEST_SIZE + 8];
+
+	BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_384));
+
+	memset(out, 0, sizeof(out));
+	sha3_384(test_sha3_sample, sizeof(test_sha3_sample) - 1, out + 8);
+
+	KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_384, sizeof(test_sha3_384),
+			       "SHA3-384 gives wrong output");
+}
+
+static void test_sha3_512_basic(struct kunit *test)
+{
+	u8 out[8 + SHA3_512_DIGEST_SIZE + 8];
+
+	BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_512));
+
+	memset(out, 0, sizeof(out));
+	sha3_512(test_sha3_sample, sizeof(test_sha3_sample) - 1, out + 8);
+
+	KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_512, sizeof(test_sha3_512),
+			       "SHA3-512 gives wrong output");
+}
+
+static void test_shake128_basic(struct kunit *test)
+{
+	u8 out[8 + SHAKE128_DEFAULT_SIZE + 8];
+
+	BUILD_BUG_ON(sizeof(out) != sizeof(test_shake128));
+
+	memset(out, 0, sizeof(out));
+	shake128(test_sha3_sample, sizeof(test_sha3_sample) - 1,
+		 out + 8, sizeof(out) - 16);
+
+	KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake128, sizeof(test_shake128),
+			       "SHAKE128 gives wrong output");
+}
+
+static void test_shake256_basic(struct kunit *test)
+{
+	u8 out[8 + SHAKE256_DEFAULT_SIZE + 8];
+
+	BUILD_BUG_ON(sizeof(out) != sizeof(test_shake256));
+
+	memset(out, 0, sizeof(out));
+	shake256(test_sha3_sample, sizeof(test_sha3_sample) - 1,
+		 out + 8, sizeof(out) - 16);
+
+	KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake256, sizeof(test_shake256),
+			       "SHAKE256 gives wrong output");
+}
+
+/*
+ * Usable NIST tests.
+ *
+ * From: https://csrc.nist.gov/projects/cryptographic-standards-and-guidelines/example-values
+ */
+static const u8 test_nist_1600_sample[] = {
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+	0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3
+};
+
+static const u8 test_shake128_nist_0[] = {
+	0x7f, 0x9c, 0x2b, 0xa4, 0xe8, 0x8f, 0x82, 0x7d,
+	0x61, 0x60, 0x45, 0x50, 0x76, 0x05, 0x85, 0x3e
+};
+
+static const u8 test_shake128_nist_1600[] = {
+	0x13, 0x1a, 0xb8, 0xd2, 0xb5, 0x94, 0x94, 0x6b,
+	0x9c, 0x81, 0x33, 0x3f, 0x9b, 0xb6, 0xe0, 0xce,
+};
+
+static const u8 test_shake256_nist_0[] = {
+	0x46, 0xb9, 0xdd, 0x2b, 0x0b, 0xa8, 0x8d, 0x13,
+	0x23, 0x3b, 0x3f, 0xeb, 0x74, 0x3e, 0xeb, 0x24,
+	0x3f, 0xcd, 0x52, 0xea, 0x62, 0xb8, 0x1b, 0x82,
+	0xb5, 0x0c, 0x27, 0x64, 0x6e, 0xd5, 0x76, 0x2f
+};
+
+static const u8 test_shake256_nist_1600[] = {
+	0xcd, 0x8a, 0x92, 0x0e, 0xd1, 0x41, 0xaa, 0x04,
+	0x07, 0xa2, 0x2d, 0x59, 0x28, 0x86, 0x52, 0xe9,
+	0xd9, 0xf1, 0xa7, 0xee, 0x0c, 0x1e, 0x7c, 0x1c,
+	0xa6, 0x99, 0x42, 0x4d, 0xa8, 0x4a, 0x90, 0x4d,
+};
+
+static void test_shake128_nist(struct kunit *test)
+{
+	u8 out[SHAKE128_DEFAULT_SIZE];
+
+	shake128("", 0, out, sizeof(out));
+	KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake128_nist_0, sizeof(out),
+			       "SHAKE128 gives wrong output for NIST.0");
+
+	shake128(test_nist_1600_sample, sizeof(test_nist_1600_sample),
+		 out, sizeof(out));
+	KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake128_nist_1600, sizeof(out),
+			       "SHAKE128 gives wrong output for NIST.1600");
+}
+
+static void test_shake256_nist(struct kunit *test)
+{
+	u8 out[SHAKE256_DEFAULT_SIZE];
+
+	shake256("", 0, out, sizeof(out));
+	KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake256_nist_0, sizeof(out),
+			       "SHAKE256 gives wrong output for NIST.0");
+
+	shake256(test_nist_1600_sample, sizeof(test_nist_1600_sample),
+		 out, sizeof(out));
+	KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake256_nist_1600, sizeof(out),
+			       "SHAKE256 gives wrong output for NIST.1600");
+}
+
+static void shake(int alg, const u8 *in, size_t in_len, u8 *out, size_t out_len)
+{
+	if (alg == 0)
+		shake128(in, in_len, out, out_len);
+	else
+		shake256(in, in_len, out, out_len);
+}
+
+static void shake_init(struct shake_ctx *ctx, int alg)
+{
+	if (alg == 0)
+		shake128_init(ctx);
+	else
+		shake256_init(ctx);
+}
+
+/*
+ * Test each of SHAKE128 and SHAKE256 with all input lengths 0 through 4096, for
+ * both input and output.  The input and output lengths cycle through the values
+ * together, so we do 4096 tests total.  To verify all the SHAKE outputs,
+ * compute and verify the SHA3-256 digest of all of them concatenated together.
+ */
+static void test_shake_all_lens_up_to_4096(struct kunit *test)
+{
+	struct sha3_ctx main_ctx;
+	const size_t max_len = 4096;
+	u8 *const in = test_buf;
+	u8 *const out = &test_buf[TEST_BUF_LEN - max_len];
+	u8 main_hash[SHA3_256_DIGEST_SIZE];
+
+	KUNIT_ASSERT_LE(test, 2 * max_len, TEST_BUF_LEN);
+
+	rand_bytes_seeded_from_len(in, max_len);
+	for (int alg = 0; alg < 2; alg++) {
+		sha3_256_init(&main_ctx);
+		for (size_t in_len = 0; in_len <= max_len; in_len++) {
+			size_t out_len = (in_len * 293) % (max_len + 1);
+
+			shake(alg, in, in_len, out, out_len);
+			sha3_update(&main_ctx, out, out_len);
+		}
+		sha3_final(&main_ctx, main_hash);
+		if (alg == 0)
+			KUNIT_ASSERT_MEMEQ_MSG(test, main_hash,
+					       shake128_testvec_consolidated,
+					       sizeof(main_hash),
+					       "shake128() gives wrong output");
+		else
+			KUNIT_ASSERT_MEMEQ_MSG(test, main_hash,
+					       shake256_testvec_consolidated,
+					       sizeof(main_hash),
+					       "shake256() gives wrong output");
+	}
+}
+
+/*
+ * Test that a sequence of SHAKE squeezes gives the same output as a single
+ * squeeze of the same total length.
+ */
+static void test_shake_multiple_squeezes(struct kunit *test)
+{
+	const size_t max_len = 512;
+	u8 *ref_out;
+
+	KUNIT_ASSERT_GE(test, TEST_BUF_LEN, 2 * max_len);
+
+	ref_out = kunit_kzalloc(test, max_len, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_NULL(test, ref_out);
+
+	for (int i = 0; i < 2000; i++) {
+		const int alg = rand32() % 2;
+		const size_t in_len = rand_length(max_len);
+		const size_t out_len = rand_length(max_len);
+		const size_t in_offs = rand_offset(max_len - in_len);
+		const size_t out_offs = rand_offset(max_len - out_len);
+		u8 *const in = &test_buf[in_offs];
+		u8 *const out = &test_buf[out_offs];
+		struct shake_ctx ctx;
+		size_t remaining_len, j, num_parts;
+
+		rand_bytes(in, in_len);
+		rand_bytes(out, out_len);
+
+		/* Compute the output using the one-shot function. */
+		shake(alg, in, in_len, ref_out, out_len);
+
+		/* Compute the output using a random sequence of squeezes. */
+		shake_init(&ctx, alg);
+		shake_update(&ctx, in, in_len);
+		remaining_len = out_len;
+		j = 0;
+		num_parts = 0;
+		while (rand_bool()) {
+			size_t part_len = rand_length(remaining_len);
+
+			shake_squeeze(&ctx, &out[j], part_len);
+			num_parts++;
+			j += part_len;
+			remaining_len -= part_len;
+		}
+		if (remaining_len != 0 || rand_bool()) {
+			shake_squeeze(&ctx, &out[j], remaining_len);
+			num_parts++;
+		}
+
+		/* Verify that the outputs are the same. */
+		KUNIT_ASSERT_MEMEQ_MSG(
+			test, out, ref_out, out_len,
+			"Multi-squeeze test failed with in_len=%zu in_offs=%zu out_len=%zu out_offs=%zu num_parts=%zu alg=%d",
+			in_len, in_offs, out_len, out_offs, num_parts, alg);
+	}
+}
+
+/*
+ * Test that SHAKE operations on buffers immediately followed by an unmapped
+ * page work as expected.  This catches out-of-bounds memory accesses even if
+ * they occur in assembly code.
+ */
+static void test_shake_with_guarded_bufs(struct kunit *test)
+{
+	const size_t max_len = 512;
+	u8 *reg_buf;
+
+	KUNIT_ASSERT_GE(test, TEST_BUF_LEN, max_len);
+
+	reg_buf = kunit_kzalloc(test, max_len, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_NULL(test, reg_buf);
+
+	for (int alg = 0; alg < 2; alg++) {
+		for (size_t len = 0; len <= max_len; len++) {
+			u8 *guarded_buf = &test_buf[TEST_BUF_LEN - len];
+
+			rand_bytes(reg_buf, len);
+			memcpy(guarded_buf, reg_buf, len);
+
+			shake(alg, reg_buf, len, reg_buf, len);
+			shake(alg, guarded_buf, len, guarded_buf, len);
+
+			KUNIT_ASSERT_MEMEQ_MSG(
+				test, reg_buf, guarded_buf, len,
+				"Guard page test failed with len=%zu alg=%d",
+				len, alg);
+		}
+	}
+}
+
+static struct kunit_case sha3_test_cases[] = {
+	HASH_KUNIT_CASES,
+	KUNIT_CASE(test_sha3_224_basic),
+	KUNIT_CASE(test_sha3_256_basic),
+	KUNIT_CASE(test_sha3_384_basic),
+	KUNIT_CASE(test_sha3_512_basic),
+	KUNIT_CASE(test_shake128_basic),
+	KUNIT_CASE(test_shake256_basic),
+	KUNIT_CASE(test_shake128_nist),
+	KUNIT_CASE(test_shake256_nist),
+	KUNIT_CASE(test_shake_all_lens_up_to_4096),
+	KUNIT_CASE(test_shake_multiple_squeezes),
+	KUNIT_CASE(test_shake_with_guarded_bufs),
+	KUNIT_CASE(benchmark_hash),
+	{},
+};
+
+static struct kunit_suite sha3_test_suite = {
+	.name = "sha3",
+	.test_cases = sha3_test_cases,
+	.suite_init = hash_suite_init,
+	.suite_exit = hash_suite_exit,
+};
+kunit_test_suite(sha3_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for SHA3");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/sha512-testvecs.h b/lib/crypto/tests/sha512-testvecs.h
new file mode 100644
index 000000000000..c506aaf72ba7
--- /dev/null
+++ b/lib/crypto/tests/sha512-testvecs.h
@@ -0,0 +1,342 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha512 */
+
+static const struct {
+	size_t data_len;
+	u8 digest[SHA512_DIGEST_SIZE];
+} hash_testvecs[] = {
+	{
+		.data_len = 0,
+		.digest = {
+			0xcf, 0x83, 0xe1, 0x35, 0x7e, 0xef, 0xb8, 0xbd,
+			0xf1, 0x54, 0x28, 0x50, 0xd6, 0x6d, 0x80, 0x07,
+			0xd6, 0x20, 0xe4, 0x05, 0x0b, 0x57, 0x15, 0xdc,
+			0x83, 0xf4, 0xa9, 0x21, 0xd3, 0x6c, 0xe9, 0xce,
+			0x47, 0xd0, 0xd1, 0x3c, 0x5d, 0x85, 0xf2, 0xb0,
+			0xff, 0x83, 0x18, 0xd2, 0x87, 0x7e, 0xec, 0x2f,
+			0x63, 0xb9, 0x31, 0xbd, 0x47, 0x41, 0x7a, 0x81,
+			0xa5, 0x38, 0x32, 0x7a, 0xf9, 0x27, 0xda, 0x3e,
+		},
+	},
+	{
+		.data_len = 1,
+		.digest = {
+			0x12, 0xf2, 0xb6, 0xec, 0x84, 0xa0, 0x8e, 0xcf,
+			0x0d, 0xec, 0x17, 0xfd, 0x1f, 0x91, 0x15, 0x69,
+			0xd2, 0xb9, 0x89, 0xff, 0x89, 0x9d, 0xd9, 0x0b,
+			0x7a, 0x0f, 0x82, 0x94, 0x57, 0x5b, 0xf3, 0x08,
+			0x42, 0x45, 0x23, 0x08, 0x44, 0x54, 0x35, 0x36,
+			0xed, 0x4b, 0xb3, 0xa5, 0xbf, 0x17, 0xc1, 0x3c,
+			0xdd, 0x25, 0x4a, 0x30, 0x64, 0xed, 0x66, 0x06,
+			0x72, 0x05, 0xc2, 0x71, 0x5a, 0x6c, 0xd0, 0x75,
+		},
+	},
+	{
+		.data_len = 2,
+		.digest = {
+			0x01, 0x37, 0x97, 0xcc, 0x0a, 0xcb, 0x61, 0xa4,
+			0x93, 0x26, 0x36, 0x4b, 0xd2, 0x27, 0xea, 0xaf,
+			0xda, 0xfa, 0x8f, 0x86, 0x12, 0x99, 0x7b, 0xc8,
+			0x94, 0xa9, 0x1c, 0x70, 0x3f, 0x43, 0xf3, 0x9a,
+			0x02, 0xc5, 0x0d, 0x8e, 0x01, 0xf8, 0x3a, 0xa6,
+			0xec, 0xaa, 0xc5, 0xc7, 0x9d, 0x3d, 0x7f, 0x9d,
+			0x47, 0x0e, 0x58, 0x2d, 0x9a, 0x2d, 0x51, 0x1d,
+			0xc3, 0x77, 0xb2, 0x7f, 0x69, 0x9a, 0xc3, 0x50,
+		},
+	},
+	{
+		.data_len = 3,
+		.digest = {
+			0xd4, 0xa3, 0xc2, 0x50, 0xa0, 0x33, 0xc6, 0xe4,
+			0x50, 0x08, 0xea, 0xc9, 0xb8, 0x35, 0x55, 0x34,
+			0x61, 0xb8, 0x2e, 0xa2, 0xe5, 0xdc, 0x04, 0x70,
+			0x99, 0x86, 0x5f, 0xee, 0x2e, 0x1e, 0xd4, 0x40,
+			0xf8, 0x88, 0x01, 0x84, 0x97, 0x16, 0x6a, 0xd3,
+			0x0a, 0xb4, 0xe2, 0x34, 0xca, 0x1f, 0xfc, 0x6b,
+			0x61, 0xf3, 0x7f, 0x72, 0xfa, 0x8d, 0x22, 0xcf,
+			0x7f, 0x2d, 0x87, 0x9d, 0xbb, 0x59, 0xac, 0x53,
+		},
+	},
+	{
+		.data_len = 16,
+		.digest = {
+			0x3b, 0xae, 0x66, 0x48, 0x0f, 0x35, 0x3c, 0xcd,
+			0x92, 0xa9, 0xb6, 0xb9, 0xfe, 0x19, 0x90, 0x92,
+			0x52, 0xa5, 0x02, 0xd1, 0x89, 0xcf, 0x98, 0x86,
+			0x29, 0x28, 0xab, 0xc4, 0x9e, 0xcc, 0x75, 0x38,
+			0x95, 0xa7, 0x59, 0x43, 0xef, 0x8c, 0x3a, 0xeb,
+			0x40, 0xa3, 0xbe, 0x2b, 0x75, 0x0d, 0xfd, 0xc3,
+			0xaf, 0x69, 0x08, 0xad, 0x9f, 0xc9, 0xf4, 0x96,
+			0xa9, 0xc2, 0x2b, 0x1b, 0x66, 0x6f, 0x1d, 0x28,
+		},
+	},
+	{
+		.data_len = 32,
+		.digest = {
+			0x9c, 0xfb, 0x3c, 0x40, 0xd5, 0x3b, 0xc4, 0xff,
+			0x07, 0xa7, 0xf0, 0x24, 0xb7, 0xd6, 0x5e, 0x12,
+			0x5b, 0x85, 0xb5, 0xa5, 0xe0, 0x82, 0xa6, 0xda,
+			0x30, 0x13, 0x2f, 0x1a, 0xe3, 0xd0, 0x55, 0xcb,
+			0x14, 0x19, 0xe2, 0x09, 0x91, 0x96, 0x26, 0xf9,
+			0x38, 0xd7, 0xfa, 0x4a, 0xfb, 0x2f, 0x6f, 0xc0,
+			0xf4, 0x95, 0xc3, 0x40, 0xf6, 0xdb, 0xe7, 0xc2,
+			0x79, 0x23, 0xa4, 0x20, 0x96, 0x3a, 0x00, 0xbb,
+		},
+	},
+	{
+		.data_len = 48,
+		.digest = {
+			0x92, 0x1a, 0x21, 0x06, 0x6e, 0x08, 0x84, 0x09,
+			0x23, 0x8d, 0x63, 0xec, 0xd6, 0x72, 0xd3, 0x21,
+			0x51, 0xe8, 0x65, 0x94, 0xf8, 0x1f, 0x5f, 0xa7,
+			0xab, 0x6b, 0xae, 0x1c, 0x2c, 0xaf, 0xf9, 0x0c,
+			0x7c, 0x5a, 0x74, 0x1d, 0x90, 0x26, 0x4a, 0xc3,
+			0xa1, 0x60, 0xf4, 0x1d, 0xd5, 0x3c, 0x86, 0xe8,
+			0x00, 0xb3, 0x99, 0x27, 0xb8, 0x9d, 0x3e, 0x17,
+			0x32, 0x5a, 0x34, 0x3e, 0xc2, 0xb2, 0x6e, 0xbd,
+		},
+	},
+	{
+		.data_len = 49,
+		.digest = {
+			0x5a, 0x1f, 0x40, 0x5f, 0xee, 0xf2, 0xdd, 0x67,
+			0x01, 0xcb, 0x26, 0x58, 0xf5, 0x1b, 0xe8, 0x7e,
+			0xeb, 0x7d, 0x9d, 0xef, 0xd3, 0x55, 0xd6, 0x89,
+			0x2e, 0xfc, 0x14, 0xe2, 0x98, 0x4c, 0x31, 0xaa,
+			0x69, 0x00, 0xf9, 0x4e, 0xb0, 0x75, 0x1b, 0x71,
+			0x93, 0x60, 0xdf, 0xa1, 0xaf, 0xba, 0xc2, 0xd3,
+			0x6a, 0x22, 0xa0, 0xff, 0xb5, 0x66, 0x15, 0x66,
+			0xd2, 0x24, 0x9a, 0x7e, 0xe4, 0xe5, 0x84, 0xdb,
+		},
+	},
+	{
+		.data_len = 63,
+		.digest = {
+			0x32, 0xbd, 0xcf, 0x72, 0xa9, 0x74, 0x87, 0xe6,
+			0x2a, 0x53, 0x7e, 0x6d, 0xac, 0xc2, 0xdd, 0x2c,
+			0x87, 0xb3, 0xf7, 0x90, 0x29, 0xc9, 0x16, 0x59,
+			0xd2, 0x7e, 0x6e, 0x84, 0x1d, 0xa6, 0xaf, 0x3c,
+			0xca, 0xd6, 0x1a, 0x24, 0xa4, 0xcd, 0x59, 0x44,
+			0x20, 0xd7, 0xd2, 0x5b, 0x97, 0xda, 0xd5, 0xa9,
+			0x23, 0xb1, 0xa4, 0x60, 0xb8, 0x05, 0x98, 0xdc,
+			0xef, 0x89, 0x81, 0xe3, 0x3a, 0xf9, 0x24, 0x37,
+		},
+	},
+	{
+		.data_len = 64,
+		.digest = {
+			0x96, 0x3a, 0x1a, 0xdd, 0x1b, 0xeb, 0x1a, 0x55,
+			0x24, 0x52, 0x3d, 0xec, 0x9d, 0x52, 0x2e, 0xa6,
+			0xfe, 0x81, 0xd6, 0x98, 0xac, 0xcc, 0x60, 0x56,
+			0x04, 0x9d, 0xa3, 0xf3, 0x56, 0x05, 0xe4, 0x8a,
+			0x61, 0xaf, 0x6f, 0x6e, 0x8e, 0x75, 0x67, 0x3a,
+			0xd2, 0xb0, 0x85, 0x2d, 0x17, 0xd2, 0x86, 0x8c,
+			0x50, 0x4b, 0xdd, 0xef, 0x35, 0x00, 0xde, 0x29,
+			0x3d, 0x4b, 0x04, 0x12, 0x8a, 0x81, 0xe2, 0xcc,
+		},
+	},
+	{
+		.data_len = 65,
+		.digest = {
+			0x9c, 0x6e, 0xf0, 0x6f, 0x71, 0x77, 0xd5, 0xd0,
+			0xbb, 0x70, 0x1f, 0xcb, 0xbd, 0xd3, 0xfe, 0x23,
+			0x71, 0x78, 0xad, 0x3a, 0xd2, 0x1e, 0x34, 0xf4,
+			0x6d, 0xb4, 0xa2, 0x0a, 0x24, 0xcb, 0xe1, 0x99,
+			0x07, 0xd0, 0x79, 0x8f, 0x7e, 0x69, 0x31, 0x68,
+			0x29, 0xb5, 0x85, 0x82, 0x67, 0xdc, 0x4a, 0x8d,
+			0x44, 0x04, 0x02, 0xc0, 0xfb, 0xd2, 0x19, 0x66,
+			0x1e, 0x25, 0x8b, 0xd2, 0x5a, 0x59, 0x68, 0xc0,
+		},
+	},
+	{
+		.data_len = 127,
+		.digest = {
+			0xb8, 0x8f, 0xa8, 0x29, 0x4d, 0xcf, 0x5f, 0x73,
+			0x3c, 0x55, 0x43, 0xd9, 0x1c, 0xbc, 0x0c, 0x17,
+			0x75, 0x0b, 0xc7, 0xb1, 0x1d, 0x9f, 0x7b, 0x2f,
+			0x4c, 0x3d, 0x2a, 0x71, 0xfb, 0x1b, 0x0e, 0xca,
+			0x4e, 0x96, 0xa0, 0x95, 0xee, 0xf4, 0x93, 0x76,
+			0x36, 0xfb, 0x5d, 0xd3, 0x46, 0xc4, 0x1d, 0x41,
+			0x32, 0x92, 0x9d, 0xed, 0xdb, 0x7f, 0xfa, 0xb3,
+			0x91, 0x61, 0x3e, 0xd6, 0xb2, 0xca, 0x8d, 0x81,
+		},
+	},
+	{
+		.data_len = 128,
+		.digest = {
+			0x54, 0xac, 0x1a, 0xa1, 0xa6, 0xa3, 0x47, 0x2a,
+			0x5a, 0xac, 0x1a, 0x3a, 0x4b, 0xa1, 0x11, 0x08,
+			0xa7, 0x90, 0xec, 0x52, 0xcb, 0xaf, 0x68, 0x41,
+			0x38, 0x44, 0x53, 0x94, 0x93, 0x30, 0xaf, 0x3a,
+			0xec, 0x99, 0x3a, 0x7d, 0x2a, 0xd5, 0xb6, 0x05,
+			0xf5, 0xa6, 0xbb, 0x9b, 0x82, 0xc2, 0xbd, 0x98,
+			0x28, 0x62, 0x98, 0x3e, 0xe4, 0x27, 0x9b, 0xaa,
+			0xce, 0x0a, 0x6f, 0xab, 0x1b, 0x16, 0xf3, 0xdd,
+		},
+	},
+	{
+		.data_len = 129,
+		.digest = {
+			0x04, 0x37, 0x60, 0xbc, 0xb3, 0xb1, 0xc6, 0x2d,
+			0x42, 0xc5, 0xd7, 0x7e, 0xd9, 0x86, 0x82, 0xe0,
+			0xf4, 0x62, 0xad, 0x75, 0x68, 0x0b, 0xc7, 0xa8,
+			0xd6, 0x9a, 0x76, 0xe5, 0x29, 0xb8, 0x37, 0x30,
+			0x0f, 0xc0, 0xbc, 0x81, 0x94, 0x7c, 0x13, 0xf4,
+			0x9c, 0x27, 0xbc, 0x59, 0xa1, 0x70, 0x6a, 0x87,
+			0x20, 0x12, 0x0a, 0x2a, 0x62, 0x5e, 0x6f, 0xca,
+			0x91, 0x6b, 0x34, 0x7e, 0x4c, 0x0d, 0xf0, 0x6c,
+		},
+	},
+	{
+		.data_len = 256,
+		.digest = {
+			0x4b, 0x7c, 0x1f, 0x53, 0x52, 0xcc, 0x30, 0xed,
+			0x91, 0x44, 0x6f, 0x0d, 0xb5, 0x41, 0x79, 0x99,
+			0xaf, 0x82, 0x65, 0x52, 0x03, 0xf8, 0x55, 0x74,
+			0x7c, 0xd9, 0x41, 0xd6, 0xe8, 0x91, 0xa4, 0x85,
+			0xcb, 0x0a, 0x60, 0x08, 0x76, 0x07, 0x60, 0x99,
+			0x89, 0x76, 0xba, 0x84, 0xbd, 0x0b, 0xf2, 0xb3,
+			0xdc, 0xf3, 0x33, 0xd1, 0x9b, 0x0b, 0x2e, 0x5d,
+			0xf6, 0x9d, 0x0f, 0x67, 0xf4, 0x86, 0xb3, 0xd5,
+		},
+	},
+	{
+		.data_len = 511,
+		.digest = {
+			0x7d, 0x83, 0x78, 0x6a, 0x5d, 0x52, 0x42, 0x2a,
+			0xb1, 0x97, 0xc6, 0x62, 0xa2, 0x2a, 0x7c, 0x8b,
+			0xcd, 0x4f, 0xa4, 0x86, 0x19, 0xa4, 0x5b, 0x1d,
+			0xc7, 0x6f, 0x2f, 0x9c, 0x03, 0xc3, 0x45, 0x2e,
+			0xa7, 0x8e, 0x38, 0xf2, 0x57, 0x55, 0x89, 0x47,
+			0xed, 0xeb, 0x81, 0xe2, 0xe0, 0x55, 0x9f, 0xe6,
+			0xca, 0x03, 0x59, 0xd3, 0xd4, 0xba, 0xc9, 0x2d,
+			0xaf, 0xbb, 0x62, 0x2e, 0xe6, 0x89, 0xe4, 0x11,
+		},
+	},
+	{
+		.data_len = 513,
+		.digest = {
+			0xe9, 0x14, 0xe7, 0x01, 0xd0, 0x81, 0x09, 0x51,
+			0x78, 0x1c, 0x8e, 0x6c, 0x00, 0xd3, 0x28, 0xa0,
+			0x2a, 0x7b, 0xd6, 0x25, 0xca, 0xd0, 0xf9, 0xb8,
+			0xd8, 0xcf, 0xd0, 0xb7, 0x48, 0x25, 0xb7, 0x6a,
+			0x53, 0x8e, 0xf8, 0x52, 0x9c, 0x1f, 0x7d, 0xae,
+			0x4c, 0x22, 0xd5, 0x9d, 0xf0, 0xaf, 0x98, 0x91,
+			0x19, 0x1f, 0x99, 0xbd, 0xa6, 0xc2, 0x0f, 0x05,
+			0xa5, 0x9f, 0x3e, 0x87, 0xed, 0xc3, 0xab, 0x92,
+		},
+	},
+	{
+		.data_len = 1000,
+		.digest = {
+			0x2e, 0xf4, 0x72, 0xd2, 0xd9, 0x4a, 0xd5, 0xf9,
+			0x20, 0x03, 0x4a, 0xad, 0xed, 0xa9, 0x1b, 0x64,
+			0x73, 0x38, 0xc6, 0x30, 0xa8, 0x7f, 0xf9, 0x3b,
+			0x8c, 0xbc, 0xa1, 0x2d, 0x22, 0x7b, 0x84, 0x37,
+			0xf5, 0xba, 0xee, 0xf0, 0x80, 0x9d, 0xe3, 0x82,
+			0xbd, 0x07, 0x68, 0x15, 0x01, 0x22, 0xf6, 0x88,
+			0x07, 0x0b, 0xfd, 0xb7, 0xb1, 0xc0, 0x68, 0x4b,
+			0x8d, 0x05, 0xec, 0xfb, 0xcd, 0xde, 0xa4, 0x2a,
+		},
+	},
+	{
+		.data_len = 3333,
+		.digest = {
+			0x73, 0xe3, 0xe5, 0x87, 0x01, 0x0a, 0x29, 0x4d,
+			0xad, 0x92, 0x67, 0x64, 0xc7, 0x71, 0x0b, 0x22,
+			0x80, 0x8a, 0x6e, 0x8b, 0x20, 0x73, 0xb2, 0xd7,
+			0x98, 0x20, 0x35, 0x42, 0x42, 0x5d, 0x85, 0x12,
+			0xb0, 0x06, 0x69, 0x63, 0x5f, 0x5b, 0xe7, 0x63,
+			0x6f, 0xe6, 0x18, 0xa6, 0xc1, 0xa6, 0xae, 0x27,
+			0xa7, 0x6a, 0x73, 0x6b, 0x27, 0xd5, 0x47, 0xe1,
+			0xa2, 0x7d, 0xe4, 0x0d, 0xbd, 0x23, 0x7b, 0x7a,
+		},
+	},
+	{
+		.data_len = 4096,
+		.digest = {
+			0x11, 0x5b, 0x77, 0x36, 0x6b, 0x3b, 0xe4, 0x42,
+			0xe4, 0x92, 0x23, 0xcb, 0x0c, 0x06, 0xff, 0xb7,
+			0x0c, 0x71, 0x64, 0xd9, 0x8a, 0x57, 0x75, 0x7b,
+			0xa2, 0xd2, 0x17, 0x19, 0xbb, 0xb5, 0x3c, 0xb3,
+			0x5f, 0xae, 0x35, 0x75, 0x8e, 0xa8, 0x97, 0x43,
+			0xce, 0xfe, 0x41, 0x84, 0xfe, 0xcb, 0x18, 0x70,
+			0x68, 0x2e, 0x16, 0x19, 0xd5, 0x10, 0x0d, 0x2f,
+			0x61, 0x87, 0x79, 0xee, 0x5f, 0x24, 0xdd, 0x76,
+		},
+	},
+	{
+		.data_len = 4128,
+		.digest = {
+			0x9e, 0x96, 0xe1, 0x0a, 0xb2, 0xd5, 0xba, 0xcf,
+			0x27, 0xba, 0x6f, 0x85, 0xe9, 0xbf, 0x96, 0xb9,
+			0x5a, 0x00, 0x00, 0x06, 0xdc, 0xb7, 0xaf, 0x0a,
+			0x8f, 0x1d, 0x31, 0xf6, 0xce, 0xc3, 0x50, 0x2e,
+			0x61, 0x3a, 0x8b, 0x28, 0xaf, 0xb2, 0x50, 0x0d,
+			0x00, 0x98, 0x02, 0x11, 0x6b, 0xfa, 0x51, 0xc1,
+			0xde, 0xe1, 0x34, 0x9f, 0xda, 0x11, 0x63, 0xfa,
+			0x0a, 0xa0, 0xa2, 0x67, 0x39, 0xeb, 0x9b, 0xf1,
+		},
+	},
+	{
+		.data_len = 4160,
+		.digest = {
+			0x46, 0x4e, 0x81, 0xd1, 0x08, 0x2a, 0x46, 0x12,
+			0x4e, 0xae, 0x1f, 0x5d, 0x57, 0xe5, 0x19, 0xbc,
+			0x76, 0x38, 0xb6, 0xa7, 0xe3, 0x72, 0x6d, 0xaf,
+			0x80, 0x3b, 0xd0, 0xbc, 0x06, 0xe8, 0xab, 0xab,
+			0x86, 0x4b, 0x0b, 0x7a, 0x61, 0xa6, 0x13, 0xff,
+			0x64, 0x47, 0x89, 0xb7, 0x63, 0x8a, 0xa5, 0x4c,
+			0x9f, 0x52, 0x70, 0xeb, 0x21, 0xe5, 0x2d, 0xe9,
+			0xe7, 0xab, 0x1c, 0x0e, 0x74, 0xf5, 0x72, 0xec,
+		},
+	},
+	{
+		.data_len = 4224,
+		.digest = {
+			0xfa, 0x6e, 0xff, 0x3c, 0xc1, 0x98, 0x49, 0x42,
+			0x34, 0x67, 0xd4, 0xd3, 0xfa, 0xae, 0x27, 0xe4,
+			0x77, 0x11, 0x84, 0xd2, 0x57, 0x99, 0xf8, 0xfd,
+			0x41, 0x50, 0x84, 0x80, 0x7f, 0xf7, 0xb2, 0xd3,
+			0x88, 0x21, 0x9c, 0xe8, 0xb9, 0x05, 0xd3, 0x48,
+			0x64, 0xc5, 0xb7, 0x29, 0xd9, 0x21, 0x17, 0xad,
+			0x89, 0x9c, 0x79, 0x55, 0x51, 0x0b, 0x96, 0x3e,
+			0x10, 0x40, 0xe1, 0xdd, 0x7b, 0x39, 0x40, 0x86,
+		},
+	},
+	{
+		.data_len = 16384,
+		.digest = {
+			0x41, 0xb3, 0xd2, 0x93, 0xcd, 0x79, 0x84, 0xc2,
+			0xf5, 0xea, 0xf3, 0xb3, 0x94, 0x23, 0xaa, 0x76,
+			0x87, 0x5f, 0xe3, 0xd2, 0x03, 0xd8, 0x00, 0xbb,
+			0xa1, 0x55, 0xe4, 0xcb, 0x16, 0x04, 0x5b, 0xdf,
+			0xf8, 0xd2, 0x63, 0x51, 0x02, 0x22, 0xc6, 0x0f,
+			0x98, 0x2b, 0x12, 0x52, 0x25, 0x64, 0x93, 0xd9,
+			0xab, 0xe9, 0x4d, 0x16, 0x4b, 0xf6, 0x09, 0x83,
+			0x5c, 0x63, 0x1c, 0x41, 0x19, 0xf6, 0x76, 0xe3,
+		},
+	},
+};
+
+static const u8 hash_testvec_consolidated[SHA512_DIGEST_SIZE] = {
+	0x5b, 0x9d, 0xf9, 0xab, 0x8c, 0x8e, 0x52, 0xdb,
+	0x02, 0xa0, 0x4c, 0x24, 0x2d, 0xc4, 0xa8, 0x4e,
+	0x9c, 0x93, 0x2f, 0x72, 0xa8, 0x75, 0xfb, 0xb5,
+	0xdb, 0xef, 0x52, 0xc6, 0xa3, 0xfe, 0xeb, 0x6b,
+	0x92, 0x79, 0x18, 0x05, 0xf6, 0xd7, 0xaf, 0x7b,
+	0x36, 0xfc, 0x83, 0x2c, 0x7e, 0x7b, 0x59, 0x8b,
+	0xf9, 0x81, 0xaa, 0x98, 0x38, 0x11, 0x97, 0x56,
+	0x34, 0xe5, 0x2a, 0x4b, 0xf2, 0x9e, 0xf3, 0xdb,
+};
+
+static const u8 hmac_testvec_consolidated[SHA512_DIGEST_SIZE] = {
+	0x40, 0xe7, 0xbc, 0x03, 0xdf, 0x22, 0xd4, 0x76,
+	0x66, 0x45, 0xf8, 0x1d, 0x25, 0xdf, 0xbe, 0xa2,
+	0x93, 0x06, 0x8c, 0x1d, 0x14, 0x23, 0x9b, 0x5c,
+	0xfa, 0xac, 0xdf, 0xbd, 0xa2, 0x24, 0xe5, 0xf7,
+	0xdc, 0xf7, 0xae, 0x96, 0xc1, 0x34, 0xe5, 0x24,
+	0x16, 0x24, 0xdc, 0xee, 0x4f, 0x62, 0x1c, 0x67,
+	0x4e, 0x02, 0x31, 0x4b, 0x9b, 0x65, 0x25, 0xeb,
+	0x32, 0x2e, 0x24, 0xfb, 0xcd, 0x2b, 0x59, 0xd8,
+};
diff --git a/lib/crypto/tests/sha512_kunit.c b/lib/crypto/tests/sha512_kunit.c
new file mode 100644
index 000000000000..8923e2d7d3d4
--- /dev/null
+++ b/lib/crypto/tests/sha512_kunit.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/sha2.h>
+#include "sha512-testvecs.h"
+
+#define HASH sha512
+#define HASH_CTX sha512_ctx
+#define HASH_SIZE SHA512_DIGEST_SIZE
+#define HASH_INIT sha512_init
+#define HASH_UPDATE sha512_update
+#define HASH_FINAL sha512_final
+#define HMAC_KEY hmac_sha512_key
+#define HMAC_CTX hmac_sha512_ctx
+#define HMAC_PREPAREKEY hmac_sha512_preparekey
+#define HMAC_INIT hmac_sha512_init
+#define HMAC_UPDATE hmac_sha512_update
+#define HMAC_FINAL hmac_sha512_final
+#define HMAC hmac_sha512
+#define HMAC_USINGRAWKEY hmac_sha512_usingrawkey
+#include "hash-test-template.h"
+
+static struct kunit_case hash_test_cases[] = {
+	HASH_KUNIT_CASES,
+	KUNIT_CASE(benchmark_hash),
+	{},
+};
+
+static struct kunit_suite hash_test_suite = {
+	.name = "sha512",
+	.test_cases = hash_test_cases,
+	.suite_init = hash_suite_init,
+	.suite_exit = hash_suite_exit,
+};
+kunit_test_suite(hash_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-512 and HMAC-SHA512");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/utils.c b/lib/crypto/utils.c
index 53230ab1b195..dec381d5e906 100644
--- a/lib/crypto/utils.c
+++ b/lib/crypto/utils.c
@@ -5,9 +5,10 @@
  * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
  */
 
-#include <asm/unaligned.h>
-#include <crypto/algapi.h>
+#include <crypto/utils.h>
+#include <linux/export.h>
 #include <linux/module.h>
+#include <linux/unaligned.h>
 
 /*
  * XOR @len bytes from @src1 and @src2 together, writing the result to @dst
@@ -85,4 +86,5 @@ void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int len)
 }
 EXPORT_SYMBOL_GPL(__crypto_xor);
 
+MODULE_DESCRIPTION("Crypto library utility functions");
 MODULE_LICENSE("GPL");
diff --git a/lib/crypto/x86/.gitignore b/lib/crypto/x86/.gitignore
new file mode 100644
index 000000000000..580c839bb177
--- /dev/null
+++ b/lib/crypto/x86/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+poly1305-x86_64-cryptogams.S
diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S
new file mode 100644
index 000000000000..7b1d98ca7482
--- /dev/null
+++ b/lib/crypto/x86/blake2s-core.S
@@ -0,0 +1,291 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.iv, "aM", @progbits, 32
+.align 32
+.Liv:
+	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
+	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
+
+.section .rodata.cst16.ror16, "aM", @progbits, 16
+.align 16
+.Lror16:
+	.octa 0x0D0C0F0E09080B0A0504070601000302
+
+.section .rodata.cst16.ror8, "aM", @progbits, 16
+.align 16
+.Lror8:
+	.octa 0x0C0F0E0D080B0A090407060500030201
+
+.section .rodata.cst64.sigma, "aM", @progbits, 160
+.align 64
+.Lsigma:
+.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
+.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
+.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
+.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
+.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
+.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
+.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
+.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
+.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
+.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
+
+.section .rodata.cst64.sigma2, "aM", @progbits, 160
+.align 64
+.Lsigma2:
+.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
+.byte  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
+.byte 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
+.byte 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
+.byte  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
+.byte  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
+.byte  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
+.byte  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
+.byte 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
+.byte  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
+
+#define CTX		%rdi
+#define DATA		%rsi
+#define NBLOCKS		%rdx
+#define INC		%ecx
+
+.text
+//
+// void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
+//			       const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_ctx are used:
+//	u32 h[8];	(inout)
+//	u32 t[2];	(inout)
+//	u32 f[2];	(in)
+//
+SYM_FUNC_START(blake2s_compress_ssse3)
+	movdqu		(CTX),%xmm0		// Load h[0..3]
+	movdqu		16(CTX),%xmm1		// Load h[4..7]
+	movdqa		.Lror16(%rip),%xmm12
+	movdqa		.Lror8(%rip),%xmm13
+	movdqu		32(CTX),%xmm14		// Load t and f
+	movd		INC,%xmm15		// Load inc
+	leaq		.Lsigma+160(%rip),%r8
+	jmp		.Lssse3_mainloop
+
+	.align		32
+.Lssse3_mainloop:
+	// Main loop: each iteration processes one 64-byte block.
+	movdqa		%xmm0,%xmm10		// Save h[0..3] and let v[0..3] = h[0..3]
+	movdqa		%xmm1,%xmm11		// Save h[4..7] and let v[4..7] = h[4..7]
+	paddq		%xmm15,%xmm14		// t += inc (64-bit addition)
+	movdqa		.Liv(%rip),%xmm2	// v[8..11] = iv[0..3]
+	movdqa		%xmm14,%xmm3
+	pxor		.Liv+16(%rip),%xmm3	// v[12..15] = iv[4..7] ^ [t, f]
+	leaq		.Lsigma(%rip),%rcx
+
+.Lssse3_roundloop:
+	// Round loop: each iteration does 1 round (of 10 rounds total).
+	movzbl		(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm4
+	movzbl		1(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm5
+	movzbl		2(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm6
+	movzbl		3(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm7
+	punpckldq	%xmm5,%xmm4
+	punpckldq	%xmm7,%xmm6
+	punpcklqdq	%xmm6,%xmm4
+	paddd		%xmm4,%xmm0
+	paddd		%xmm1,%xmm0
+	pxor		%xmm0,%xmm3
+	pshufb		%xmm12,%xmm3
+	paddd		%xmm3,%xmm2
+	pxor		%xmm2,%xmm1
+	movdqa		%xmm1,%xmm8
+	psrld		$12,%xmm1
+	pslld		$20,%xmm8
+	por		%xmm8,%xmm1
+	movzbl		4(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm5
+	movzbl		5(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm6
+	movzbl		6(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm7
+	movzbl		7(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm4
+	punpckldq	%xmm6,%xmm5
+	punpckldq	%xmm4,%xmm7
+	punpcklqdq	%xmm7,%xmm5
+	paddd		%xmm5,%xmm0
+	paddd		%xmm1,%xmm0
+	pxor		%xmm0,%xmm3
+	pshufb		%xmm13,%xmm3
+	paddd		%xmm3,%xmm2
+	pxor		%xmm2,%xmm1
+	movdqa		%xmm1,%xmm8
+	psrld		$7,%xmm1
+	pslld		$25,%xmm8
+	por		%xmm8,%xmm1
+	pshufd		$0x93,%xmm0,%xmm0
+	pshufd		$0x4e,%xmm3,%xmm3
+	pshufd		$0x39,%xmm2,%xmm2
+	movzbl		8(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm6
+	movzbl		9(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm7
+	movzbl		10(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm4
+	movzbl		11(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm5
+	punpckldq	%xmm7,%xmm6
+	punpckldq	%xmm5,%xmm4
+	punpcklqdq	%xmm4,%xmm6
+	paddd		%xmm6,%xmm0
+	paddd		%xmm1,%xmm0
+	pxor		%xmm0,%xmm3
+	pshufb		%xmm12,%xmm3
+	paddd		%xmm3,%xmm2
+	pxor		%xmm2,%xmm1
+	movdqa		%xmm1,%xmm8
+	psrld		$12,%xmm1
+	pslld		$20,%xmm8
+	por		%xmm8,%xmm1
+	movzbl		12(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm7
+	movzbl		13(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm4
+	movzbl		14(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm5
+	movzbl		15(%rcx),%eax
+	movd		(DATA,%rax,4),%xmm6
+	punpckldq	%xmm4,%xmm7
+	punpckldq	%xmm6,%xmm5
+	punpcklqdq	%xmm5,%xmm7
+	paddd		%xmm7,%xmm0
+	paddd		%xmm1,%xmm0
+	pxor		%xmm0,%xmm3
+	pshufb		%xmm13,%xmm3
+	paddd		%xmm3,%xmm2
+	pxor		%xmm2,%xmm1
+	movdqa		%xmm1,%xmm8
+	psrld		$7,%xmm1
+	pslld		$25,%xmm8
+	por		%xmm8,%xmm1
+	pshufd		$0x39,%xmm0,%xmm0
+	pshufd		$0x4e,%xmm3,%xmm3
+	pshufd		$0x93,%xmm2,%xmm2
+	addq		$16,%rcx
+	cmpq		%r8,%rcx
+	jnz		.Lssse3_roundloop
+
+	// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
+	pxor		%xmm2,%xmm0
+	pxor		%xmm3,%xmm1
+	pxor		%xmm10,%xmm0
+	pxor		%xmm11,%xmm1
+	addq		$64,DATA
+	decq		NBLOCKS
+	jnz		.Lssse3_mainloop
+
+	movdqu		%xmm0,(CTX)		// Store new h[0..3]
+	movdqu		%xmm1,16(CTX)		// Store new h[4..7]
+	movq		%xmm14,32(CTX)		// Store new t (f is unchanged)
+	RET
+SYM_FUNC_END(blake2s_compress_ssse3)
+
+//
+// void blake2s_compress_avx512(struct blake2s_ctx *ctx,
+//				const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_ctx are used:
+//	u32 h[8];	(inout)
+//	u32 t[2];	(inout)
+//	u32 f[2];	(in)
+//
+SYM_FUNC_START(blake2s_compress_avx512)
+	vmovdqu		(CTX),%xmm0		// Load h[0..3]
+	vmovdqu		16(CTX),%xmm1		// Load h[4..7]
+	vmovdqu		32(CTX),%xmm4		// Load t and f
+	vmovd		INC,%xmm5		// Load inc
+	vmovdqa		.Liv(%rip),%xmm14	// Load iv[0..3]
+	vmovdqa		.Liv+16(%rip),%xmm15	// Load iv[4..7]
+	jmp		.Lavx512_mainloop
+
+	.align		32
+.Lavx512_mainloop:
+	// Main loop: each iteration processes one 64-byte block.
+	vmovdqa		%xmm0,%xmm10		// Save h[0..3] and let v[0..3] = h[0..3]
+	vmovdqa		%xmm1,%xmm11		// Save h[4..7] and let v[4..7] = h[4..7]
+	vpaddq		%xmm5,%xmm4,%xmm4	// t += inc (64-bit addition)
+	vmovdqa		%xmm14,%xmm2		// v[8..11] = iv[0..3]
+	vpxor		%xmm15,%xmm4,%xmm3	// v[12..15] = iv[4..7] ^ [t, f]
+	vmovdqu		(DATA),%ymm6		// Load first 8 data words
+	vmovdqu		32(DATA),%ymm7		// Load second 8 data words
+	addq		$64,DATA
+	leaq		.Lsigma2(%rip),%rax
+	movb		$10,%cl			// Set num rounds remaining
+
+.Lavx512_roundloop:
+	// Round loop: each iteration does 1 round (of 10 rounds total).
+	vpmovzxbd	(%rax),%ymm8
+	vpmovzxbd	8(%rax),%ymm9
+	addq		$16,%rax
+	vpermi2d	%ymm7,%ymm6,%ymm8
+	vpermi2d	%ymm7,%ymm6,%ymm9
+	vmovdqa		%ymm8,%ymm6
+	vmovdqa		%ymm9,%ymm7
+	vpaddd		%xmm8,%xmm0,%xmm0
+	vpaddd		%xmm1,%xmm0,%xmm0
+	vpxor		%xmm0,%xmm3,%xmm3
+	vprord		$16,%xmm3,%xmm3
+	vpaddd		%xmm3,%xmm2,%xmm2
+	vpxor		%xmm2,%xmm1,%xmm1
+	vprord		$12,%xmm1,%xmm1
+	vextracti128	$1,%ymm8,%xmm8
+	vpaddd		%xmm8,%xmm0,%xmm0
+	vpaddd		%xmm1,%xmm0,%xmm0
+	vpxor		%xmm0,%xmm3,%xmm3
+	vprord		$8,%xmm3,%xmm3
+	vpaddd		%xmm3,%xmm2,%xmm2
+	vpxor		%xmm2,%xmm1,%xmm1
+	vprord		$7,%xmm1,%xmm1
+	vpshufd		$0x93,%xmm0,%xmm0
+	vpshufd		$0x4e,%xmm3,%xmm3
+	vpshufd		$0x39,%xmm2,%xmm2
+	vpaddd		%xmm9,%xmm0,%xmm0
+	vpaddd		%xmm1,%xmm0,%xmm0
+	vpxor		%xmm0,%xmm3,%xmm3
+	vprord		$16,%xmm3,%xmm3
+	vpaddd		%xmm3,%xmm2,%xmm2
+	vpxor		%xmm2,%xmm1,%xmm1
+	vprord		$12,%xmm1,%xmm1
+	vextracti128	$1,%ymm9,%xmm9
+	vpaddd		%xmm9,%xmm0,%xmm0
+	vpaddd		%xmm1,%xmm0,%xmm0
+	vpxor		%xmm0,%xmm3,%xmm3
+	vprord		$8,%xmm3,%xmm3
+	vpaddd		%xmm3,%xmm2,%xmm2
+	vpxor		%xmm2,%xmm1,%xmm1
+	vprord		$7,%xmm1,%xmm1
+	vpshufd		$0x39,%xmm0,%xmm0
+	vpshufd		$0x4e,%xmm3,%xmm3
+	vpshufd		$0x93,%xmm2,%xmm2
+	decb		%cl
+	jne		.Lavx512_roundloop
+
+	// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
+	vpternlogd	$0x96,%xmm10,%xmm2,%xmm0
+	vpternlogd	$0x96,%xmm11,%xmm3,%xmm1
+	decq		NBLOCKS
+	jne		.Lavx512_mainloop
+
+	vmovdqu		%xmm0,(CTX)		// Store new h[0..3]
+	vmovdqu		%xmm1,16(CTX)		// Store new h[4..7]
+	vmovq		%xmm4,32(CTX)		// Store new t (f is unchanged)
+	vzeroupper
+	RET
+SYM_FUNC_END(blake2s_compress_avx512)
diff --git a/lib/crypto/x86/blake2s.h b/lib/crypto/x86/blake2s.h
new file mode 100644
index 000000000000..f8eed6cb042e
--- /dev/null
+++ b/lib/crypto/x86/blake2s.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <asm/cpufeature.h>
+#include <asm/fpu/api.h>
+#include <asm/processor.h>
+#include <asm/simd.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+
+asmlinkage void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
+				       const u8 *data, size_t nblocks, u32 inc);
+asmlinkage void blake2s_compress_avx512(struct blake2s_ctx *ctx,
+					const u8 *data, size_t nblocks, u32 inc);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
+
+static void blake2s_compress(struct blake2s_ctx *ctx,
+			     const u8 *data, size_t nblocks, u32 inc)
+{
+	/* SIMD disables preemption, so relax after processing each page. */
+	BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
+
+	if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
+		blake2s_compress_generic(ctx, data, nblocks, inc);
+		return;
+	}
+
+	do {
+		const size_t blocks = min_t(size_t, nblocks,
+					    SZ_4K / BLAKE2S_BLOCK_SIZE);
+
+		kernel_fpu_begin();
+		if (static_branch_likely(&blake2s_use_avx512))
+			blake2s_compress_avx512(ctx, data, blocks, inc);
+		else
+			blake2s_compress_ssse3(ctx, data, blocks, inc);
+		kernel_fpu_end();
+
+		data += blocks * BLAKE2S_BLOCK_SIZE;
+		nblocks -= blocks;
+	} while (nblocks);
+}
+
+#define blake2s_mod_init_arch blake2s_mod_init_arch
+static void blake2s_mod_init_arch(void)
+{
+	if (boot_cpu_has(X86_FEATURE_SSSE3))
+		static_branch_enable(&blake2s_use_ssse3);
+
+	if (boot_cpu_has(X86_FEATURE_AVX) &&
+	    boot_cpu_has(X86_FEATURE_AVX2) &&
+	    boot_cpu_has(X86_FEATURE_AVX512F) &&
+	    boot_cpu_has(X86_FEATURE_AVX512VL) &&
+	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+			      XFEATURE_MASK_AVX512, NULL))
+		static_branch_enable(&blake2s_use_avx512);
+}
diff --git a/lib/crypto/x86/chacha-avx2-x86_64.S b/lib/crypto/x86/chacha-avx2-x86_64.S
new file mode 100644
index 000000000000..f3d8fc018249
--- /dev/null
+++ b/lib/crypto/x86/chacha-avx2-x86_64.S
@@ -0,0 +1,1021 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <linux/linkage.h>
+
+.section	.rodata.cst32.ROT8, "aM", @progbits, 32
+.align 32
+ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
+	.octa 0x0e0d0c0f0a09080b0605040702010003
+
+.section	.rodata.cst32.ROT16, "aM", @progbits, 32
+.align 32
+ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
+	.octa 0x0d0c0f0e09080b0a0504070601000302
+
+.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
+.align 32
+CTRINC:	.octa 0x00000003000000020000000100000000
+	.octa 0x00000007000000060000000500000004
+
+.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL:	.octa 0x00000000000000000000000000000000
+	.octa 0x00000000000000000000000000000001
+
+.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL:	.octa 0x00000000000000000000000000000002
+	.octa 0x00000000000000000000000000000003
+
+.text
+
+SYM_FUNC_START(chacha_2block_xor_avx2)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 2 data blocks output, o
+	# %rdx: up to 2 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts two ChaCha blocks by loading the state
+	# matrix twice across four AVX registers. It performs matrix operations
+	# on four words in each matrix in parallel, but requires shuffling to
+	# rearrange the words after each round.
+
+	vzeroupper
+
+	# x0..3[0-2] = s0..3
+	vbroadcasti128	0x00(%rdi),%ymm0
+	vbroadcasti128	0x10(%rdi),%ymm1
+	vbroadcasti128	0x20(%rdi),%ymm2
+	vbroadcasti128	0x30(%rdi),%ymm3
+
+	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
+
+	vmovdqa		%ymm0,%ymm8
+	vmovdqa		%ymm1,%ymm9
+	vmovdqa		%ymm2,%ymm10
+	vmovdqa		%ymm3,%ymm11
+
+	vmovdqa		ROT8(%rip),%ymm4
+	vmovdqa		ROT16(%rip),%ymm5
+
+	mov		%rcx,%rax
+
+.Ldoubleround:
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm5,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm6
+	vpslld		$12,%ymm6,%ymm6
+	vpsrld		$20,%ymm1,%ymm1
+	vpor		%ymm6,%ymm1,%ymm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm4,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm7
+	vpslld		$7,%ymm7,%ymm7
+	vpsrld		$25,%ymm1,%ymm1
+	vpor		%ymm7,%ymm1,%ymm1
+
+	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm1,%ymm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm3,%ymm3
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm5,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm6
+	vpslld		$12,%ymm6,%ymm6
+	vpsrld		$20,%ymm1,%ymm1
+	vpor		%ymm6,%ymm1,%ymm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm4,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm7
+	vpslld		$7,%ymm7,%ymm7
+	vpsrld		$25,%ymm1,%ymm1
+	vpor		%ymm7,%ymm1,%ymm1
+
+	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm1,%ymm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm3,%ymm3
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround
+
+	# o0 = i0 ^ (x0 + s0)
+	vpaddd		%ymm8,%ymm0,%ymm7
+	cmp		$0x10,%rax
+	jl		.Lxorpart2
+	vpxor		0x00(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x00(%rsi)
+	vextracti128	$1,%ymm7,%xmm0
+	# o1 = i1 ^ (x1 + s1)
+	vpaddd		%ymm9,%ymm1,%ymm7
+	cmp		$0x20,%rax
+	jl		.Lxorpart2
+	vpxor		0x10(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x10(%rsi)
+	vextracti128	$1,%ymm7,%xmm1
+	# o2 = i2 ^ (x2 + s2)
+	vpaddd		%ymm10,%ymm2,%ymm7
+	cmp		$0x30,%rax
+	jl		.Lxorpart2
+	vpxor		0x20(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x20(%rsi)
+	vextracti128	$1,%ymm7,%xmm2
+	# o3 = i3 ^ (x3 + s3)
+	vpaddd		%ymm11,%ymm3,%ymm7
+	cmp		$0x40,%rax
+	jl		.Lxorpart2
+	vpxor		0x30(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x30(%rsi)
+	vextracti128	$1,%ymm7,%xmm3
+
+	# xor and write second block
+	vmovdqa		%xmm0,%xmm7
+	cmp		$0x50,%rax
+	jl		.Lxorpart2
+	vpxor		0x40(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x40(%rsi)
+
+	vmovdqa		%xmm1,%xmm7
+	cmp		$0x60,%rax
+	jl		.Lxorpart2
+	vpxor		0x50(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x50(%rsi)
+
+	vmovdqa		%xmm2,%xmm7
+	cmp		$0x70,%rax
+	jl		.Lxorpart2
+	vpxor		0x60(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x60(%rsi)
+
+	vmovdqa		%xmm3,%xmm7
+	cmp		$0x80,%rax
+	jl		.Lxorpart2
+	vpxor		0x70(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x70(%rsi)
+
+.Ldone2:
+	vzeroupper
+	RET
+
+.Lxorpart2:
+	# xor remaining bytes from partial register into output
+	mov		%rax,%r9
+	and		$0x0f,%r9
+	jz		.Ldone2
+	and		$~0x0f,%rax
+
+	mov		%rsi,%r11
+
+	lea		8(%rsp),%r10
+	sub		$0x10,%rsp
+	and		$~31,%rsp
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	vpxor		0x00(%rsp),%xmm7,%xmm7
+	vmovdqa		%xmm7,0x00(%rsp)
+
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	lea		-8(%r10),%rsp
+	jmp		.Ldone2
+
+SYM_FUNC_END(chacha_2block_xor_avx2)
+
+SYM_FUNC_START(chacha_4block_xor_avx2)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 4 data blocks output, o
+	# %rdx: up to 4 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts four ChaCha blocks by loading the state
+	# matrix four times across eight AVX registers. It performs matrix
+	# operations on four words in two matrices in parallel, sequentially
+	# to the operations on the four words of the other two matrices. The
+	# required word shuffling has a rather high latency, we can do the
+	# arithmetic on two matrix-pairs without much slowdown.
+
+	vzeroupper
+
+	# x0..3[0-4] = s0..3
+	vbroadcasti128	0x00(%rdi),%ymm0
+	vbroadcasti128	0x10(%rdi),%ymm1
+	vbroadcasti128	0x20(%rdi),%ymm2
+	vbroadcasti128	0x30(%rdi),%ymm3
+
+	vmovdqa		%ymm0,%ymm4
+	vmovdqa		%ymm1,%ymm5
+	vmovdqa		%ymm2,%ymm6
+	vmovdqa		%ymm3,%ymm7
+
+	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
+	vpaddd		CTR4BL(%rip),%ymm7,%ymm7
+
+	vmovdqa		%ymm0,%ymm11
+	vmovdqa		%ymm1,%ymm12
+	vmovdqa		%ymm2,%ymm13
+	vmovdqa		%ymm3,%ymm14
+	vmovdqa		%ymm7,%ymm15
+
+	vmovdqa		ROT8(%rip),%ymm8
+	vmovdqa		ROT16(%rip),%ymm9
+
+	mov		%rcx,%rax
+
+.Ldoubleround4:
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm9,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxor		%ymm4,%ymm7,%ymm7
+	vpshufb		%ymm9,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm10
+	vpslld		$12,%ymm10,%ymm10
+	vpsrld		$20,%ymm1,%ymm1
+	vpor		%ymm10,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxor		%ymm6,%ymm5,%ymm5
+	vmovdqa		%ymm5,%ymm10
+	vpslld		$12,%ymm10,%ymm10
+	vpsrld		$20,%ymm5,%ymm5
+	vpor		%ymm10,%ymm5,%ymm5
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm8,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxor		%ymm4,%ymm7,%ymm7
+	vpshufb		%ymm8,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm10
+	vpslld		$7,%ymm10,%ymm10
+	vpsrld		$25,%ymm1,%ymm1
+	vpor		%ymm10,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxor		%ymm6,%ymm5,%ymm5
+	vmovdqa		%ymm5,%ymm10
+	vpslld		$7,%ymm10,%ymm10
+	vpsrld		$25,%ymm5,%ymm5
+	vpor		%ymm10,%ymm5,%ymm5
+
+	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm1,%ymm1
+	vpshufd		$0x39,%ymm5,%ymm5
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	vpshufd		$0x4e,%ymm6,%ymm6
+	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm3,%ymm3
+	vpshufd		$0x93,%ymm7,%ymm7
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm9,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxor		%ymm4,%ymm7,%ymm7
+	vpshufb		%ymm9,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm10
+	vpslld		$12,%ymm10,%ymm10
+	vpsrld		$20,%ymm1,%ymm1
+	vpor		%ymm10,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxor		%ymm6,%ymm5,%ymm5
+	vmovdqa		%ymm5,%ymm10
+	vpslld		$12,%ymm10,%ymm10
+	vpsrld		$20,%ymm5,%ymm5
+	vpor		%ymm10,%ymm5,%ymm5
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxor		%ymm0,%ymm3,%ymm3
+	vpshufb		%ymm8,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxor		%ymm4,%ymm7,%ymm7
+	vpshufb		%ymm8,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxor		%ymm2,%ymm1,%ymm1
+	vmovdqa		%ymm1,%ymm10
+	vpslld		$7,%ymm10,%ymm10
+	vpsrld		$25,%ymm1,%ymm1
+	vpor		%ymm10,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxor		%ymm6,%ymm5,%ymm5
+	vmovdqa		%ymm5,%ymm10
+	vpslld		$7,%ymm10,%ymm10
+	vpsrld		$25,%ymm5,%ymm5
+	vpor		%ymm10,%ymm5,%ymm5
+
+	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm1,%ymm1
+	vpshufd		$0x93,%ymm5,%ymm5
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	vpshufd		$0x4e,%ymm6,%ymm6
+	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm3,%ymm3
+	vpshufd		$0x39,%ymm7,%ymm7
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround4
+
+	# o0 = i0 ^ (x0 + s0), first block
+	vpaddd		%ymm11,%ymm0,%ymm10
+	cmp		$0x10,%rax
+	jl		.Lxorpart4
+	vpxor		0x00(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x00(%rsi)
+	vextracti128	$1,%ymm10,%xmm0
+	# o1 = i1 ^ (x1 + s1), first block
+	vpaddd		%ymm12,%ymm1,%ymm10
+	cmp		$0x20,%rax
+	jl		.Lxorpart4
+	vpxor		0x10(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x10(%rsi)
+	vextracti128	$1,%ymm10,%xmm1
+	# o2 = i2 ^ (x2 + s2), first block
+	vpaddd		%ymm13,%ymm2,%ymm10
+	cmp		$0x30,%rax
+	jl		.Lxorpart4
+	vpxor		0x20(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x20(%rsi)
+	vextracti128	$1,%ymm10,%xmm2
+	# o3 = i3 ^ (x3 + s3), first block
+	vpaddd		%ymm14,%ymm3,%ymm10
+	cmp		$0x40,%rax
+	jl		.Lxorpart4
+	vpxor		0x30(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x30(%rsi)
+	vextracti128	$1,%ymm10,%xmm3
+
+	# xor and write second block
+	vmovdqa		%xmm0,%xmm10
+	cmp		$0x50,%rax
+	jl		.Lxorpart4
+	vpxor		0x40(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x40(%rsi)
+
+	vmovdqa		%xmm1,%xmm10
+	cmp		$0x60,%rax
+	jl		.Lxorpart4
+	vpxor		0x50(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x50(%rsi)
+
+	vmovdqa		%xmm2,%xmm10
+	cmp		$0x70,%rax
+	jl		.Lxorpart4
+	vpxor		0x60(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x60(%rsi)
+
+	vmovdqa		%xmm3,%xmm10
+	cmp		$0x80,%rax
+	jl		.Lxorpart4
+	vpxor		0x70(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x70(%rsi)
+
+	# o0 = i0 ^ (x0 + s0), third block
+	vpaddd		%ymm11,%ymm4,%ymm10
+	cmp		$0x90,%rax
+	jl		.Lxorpart4
+	vpxor		0x80(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x80(%rsi)
+	vextracti128	$1,%ymm10,%xmm4
+	# o1 = i1 ^ (x1 + s1), third block
+	vpaddd		%ymm12,%ymm5,%ymm10
+	cmp		$0xa0,%rax
+	jl		.Lxorpart4
+	vpxor		0x90(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x90(%rsi)
+	vextracti128	$1,%ymm10,%xmm5
+	# o2 = i2 ^ (x2 + s2), third block
+	vpaddd		%ymm13,%ymm6,%ymm10
+	cmp		$0xb0,%rax
+	jl		.Lxorpart4
+	vpxor		0xa0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xa0(%rsi)
+	vextracti128	$1,%ymm10,%xmm6
+	# o3 = i3 ^ (x3 + s3), third block
+	vpaddd		%ymm15,%ymm7,%ymm10
+	cmp		$0xc0,%rax
+	jl		.Lxorpart4
+	vpxor		0xb0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xb0(%rsi)
+	vextracti128	$1,%ymm10,%xmm7
+
+	# xor and write fourth block
+	vmovdqa		%xmm4,%xmm10
+	cmp		$0xd0,%rax
+	jl		.Lxorpart4
+	vpxor		0xc0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xc0(%rsi)
+
+	vmovdqa		%xmm5,%xmm10
+	cmp		$0xe0,%rax
+	jl		.Lxorpart4
+	vpxor		0xd0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xd0(%rsi)
+
+	vmovdqa		%xmm6,%xmm10
+	cmp		$0xf0,%rax
+	jl		.Lxorpart4
+	vpxor		0xe0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xe0(%rsi)
+
+	vmovdqa		%xmm7,%xmm10
+	cmp		$0x100,%rax
+	jl		.Lxorpart4
+	vpxor		0xf0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xf0(%rsi)
+
+.Ldone4:
+	vzeroupper
+	RET
+
+.Lxorpart4:
+	# xor remaining bytes from partial register into output
+	mov		%rax,%r9
+	and		$0x0f,%r9
+	jz		.Ldone4
+	and		$~0x0f,%rax
+
+	mov		%rsi,%r11
+
+	lea		8(%rsp),%r10
+	sub		$0x10,%rsp
+	and		$~31,%rsp
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	vpxor		0x00(%rsp),%xmm10,%xmm10
+	vmovdqa		%xmm10,0x00(%rsp)
+
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	lea		-8(%r10),%rsp
+	jmp		.Ldone4
+
+SYM_FUNC_END(chacha_4block_xor_avx2)
+
+SYM_FUNC_START(chacha_8block_xor_avx2)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 8 data blocks output, o
+	# %rdx: up to 8 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts eight consecutive ChaCha blocks by loading
+	# the state matrix in AVX registers eight times. As we need some
+	# scratch registers, we save the first four registers on the stack. The
+	# algorithm performs each operation on the corresponding word of each
+	# state matrix, hence requires no word shuffling. For final XORing step
+	# we transpose the matrix by interleaving 32-, 64- and then 128-bit
+	# words, which allows us to do XOR in AVX registers. 8/16-bit word
+	# rotation is done with the slightly better performing byte shuffling,
+	# 7/12-bit word rotation uses traditional shift+OR.
+
+	vzeroupper
+	# 4 * 32 byte stack, 32-byte aligned
+	lea		8(%rsp),%r10
+	and		$~31, %rsp
+	sub		$0x80, %rsp
+	mov		%rcx,%rax
+
+	# x0..15[0-7] = s[0..15]
+	vpbroadcastd	0x00(%rdi),%ymm0
+	vpbroadcastd	0x04(%rdi),%ymm1
+	vpbroadcastd	0x08(%rdi),%ymm2
+	vpbroadcastd	0x0c(%rdi),%ymm3
+	vpbroadcastd	0x10(%rdi),%ymm4
+	vpbroadcastd	0x14(%rdi),%ymm5
+	vpbroadcastd	0x18(%rdi),%ymm6
+	vpbroadcastd	0x1c(%rdi),%ymm7
+	vpbroadcastd	0x20(%rdi),%ymm8
+	vpbroadcastd	0x24(%rdi),%ymm9
+	vpbroadcastd	0x28(%rdi),%ymm10
+	vpbroadcastd	0x2c(%rdi),%ymm11
+	vpbroadcastd	0x30(%rdi),%ymm12
+	vpbroadcastd	0x34(%rdi),%ymm13
+	vpbroadcastd	0x38(%rdi),%ymm14
+	vpbroadcastd	0x3c(%rdi),%ymm15
+	# x0..3 on stack
+	vmovdqa		%ymm0,0x00(%rsp)
+	vmovdqa		%ymm1,0x20(%rsp)
+	vmovdqa		%ymm2,0x40(%rsp)
+	vmovdqa		%ymm3,0x60(%rsp)
+
+	vmovdqa		CTRINC(%rip),%ymm1
+	vmovdqa		ROT8(%rip),%ymm2
+	vmovdqa		ROT16(%rip),%ymm3
+
+	# x12 += counter values 0-3
+	vpaddd		%ymm1,%ymm12,%ymm12
+
+.Ldoubleround8:
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	vpaddd		0x00(%rsp),%ymm4,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpxor		%ymm0,%ymm12,%ymm12
+	vpshufb		%ymm3,%ymm12,%ymm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	vpaddd		0x20(%rsp),%ymm5,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpxor		%ymm0,%ymm13,%ymm13
+	vpshufb		%ymm3,%ymm13,%ymm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	vpaddd		0x40(%rsp),%ymm6,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpxor		%ymm0,%ymm14,%ymm14
+	vpshufb		%ymm3,%ymm14,%ymm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	vpaddd		0x60(%rsp),%ymm7,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpxor		%ymm0,%ymm15,%ymm15
+	vpshufb		%ymm3,%ymm15,%ymm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	vpaddd		%ymm12,%ymm8,%ymm8
+	vpxor		%ymm8,%ymm4,%ymm4
+	vpslld		$12,%ymm4,%ymm0
+	vpsrld		$20,%ymm4,%ymm4
+	vpor		%ymm0,%ymm4,%ymm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	vpaddd		%ymm13,%ymm9,%ymm9
+	vpxor		%ymm9,%ymm5,%ymm5
+	vpslld		$12,%ymm5,%ymm0
+	vpsrld		$20,%ymm5,%ymm5
+	vpor		%ymm0,%ymm5,%ymm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	vpaddd		%ymm14,%ymm10,%ymm10
+	vpxor		%ymm10,%ymm6,%ymm6
+	vpslld		$12,%ymm6,%ymm0
+	vpsrld		$20,%ymm6,%ymm6
+	vpor		%ymm0,%ymm6,%ymm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	vpaddd		%ymm15,%ymm11,%ymm11
+	vpxor		%ymm11,%ymm7,%ymm7
+	vpslld		$12,%ymm7,%ymm0
+	vpsrld		$20,%ymm7,%ymm7
+	vpor		%ymm0,%ymm7,%ymm7
+
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	vpaddd		0x00(%rsp),%ymm4,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpxor		%ymm0,%ymm12,%ymm12
+	vpshufb		%ymm2,%ymm12,%ymm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	vpaddd		0x20(%rsp),%ymm5,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpxor		%ymm0,%ymm13,%ymm13
+	vpshufb		%ymm2,%ymm13,%ymm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	vpaddd		0x40(%rsp),%ymm6,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpxor		%ymm0,%ymm14,%ymm14
+	vpshufb		%ymm2,%ymm14,%ymm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	vpaddd		0x60(%rsp),%ymm7,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpxor		%ymm0,%ymm15,%ymm15
+	vpshufb		%ymm2,%ymm15,%ymm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	vpaddd		%ymm12,%ymm8,%ymm8
+	vpxor		%ymm8,%ymm4,%ymm4
+	vpslld		$7,%ymm4,%ymm0
+	vpsrld		$25,%ymm4,%ymm4
+	vpor		%ymm0,%ymm4,%ymm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	vpaddd		%ymm13,%ymm9,%ymm9
+	vpxor		%ymm9,%ymm5,%ymm5
+	vpslld		$7,%ymm5,%ymm0
+	vpsrld		$25,%ymm5,%ymm5
+	vpor		%ymm0,%ymm5,%ymm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	vpaddd		%ymm14,%ymm10,%ymm10
+	vpxor		%ymm10,%ymm6,%ymm6
+	vpslld		$7,%ymm6,%ymm0
+	vpsrld		$25,%ymm6,%ymm6
+	vpor		%ymm0,%ymm6,%ymm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	vpaddd		%ymm15,%ymm11,%ymm11
+	vpxor		%ymm11,%ymm7,%ymm7
+	vpslld		$7,%ymm7,%ymm0
+	vpsrld		$25,%ymm7,%ymm7
+	vpor		%ymm0,%ymm7,%ymm7
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	vpaddd		0x00(%rsp),%ymm5,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpxor		%ymm0,%ymm15,%ymm15
+	vpshufb		%ymm3,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
+	vpaddd		0x20(%rsp),%ymm6,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpxor		%ymm0,%ymm12,%ymm12
+	vpshufb		%ymm3,%ymm12,%ymm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	vpaddd		0x40(%rsp),%ymm7,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpxor		%ymm0,%ymm13,%ymm13
+	vpshufb		%ymm3,%ymm13,%ymm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	vpaddd		0x60(%rsp),%ymm4,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpxor		%ymm0,%ymm14,%ymm14
+	vpshufb		%ymm3,%ymm14,%ymm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	vpaddd		%ymm15,%ymm10,%ymm10
+	vpxor		%ymm10,%ymm5,%ymm5
+	vpslld		$12,%ymm5,%ymm0
+	vpsrld		$20,%ymm5,%ymm5
+	vpor		%ymm0,%ymm5,%ymm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	vpaddd		%ymm12,%ymm11,%ymm11
+	vpxor		%ymm11,%ymm6,%ymm6
+	vpslld		$12,%ymm6,%ymm0
+	vpsrld		$20,%ymm6,%ymm6
+	vpor		%ymm0,%ymm6,%ymm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	vpaddd		%ymm13,%ymm8,%ymm8
+	vpxor		%ymm8,%ymm7,%ymm7
+	vpslld		$12,%ymm7,%ymm0
+	vpsrld		$20,%ymm7,%ymm7
+	vpor		%ymm0,%ymm7,%ymm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	vpaddd		%ymm14,%ymm9,%ymm9
+	vpxor		%ymm9,%ymm4,%ymm4
+	vpslld		$12,%ymm4,%ymm0
+	vpsrld		$20,%ymm4,%ymm4
+	vpor		%ymm0,%ymm4,%ymm4
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	vpaddd		0x00(%rsp),%ymm5,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpxor		%ymm0,%ymm15,%ymm15
+	vpshufb		%ymm2,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	vpaddd		0x20(%rsp),%ymm6,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpxor		%ymm0,%ymm12,%ymm12
+	vpshufb		%ymm2,%ymm12,%ymm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	vpaddd		0x40(%rsp),%ymm7,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpxor		%ymm0,%ymm13,%ymm13
+	vpshufb		%ymm2,%ymm13,%ymm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	vpaddd		0x60(%rsp),%ymm4,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpxor		%ymm0,%ymm14,%ymm14
+	vpshufb		%ymm2,%ymm14,%ymm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	vpaddd		%ymm15,%ymm10,%ymm10
+	vpxor		%ymm10,%ymm5,%ymm5
+	vpslld		$7,%ymm5,%ymm0
+	vpsrld		$25,%ymm5,%ymm5
+	vpor		%ymm0,%ymm5,%ymm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	vpaddd		%ymm12,%ymm11,%ymm11
+	vpxor		%ymm11,%ymm6,%ymm6
+	vpslld		$7,%ymm6,%ymm0
+	vpsrld		$25,%ymm6,%ymm6
+	vpor		%ymm0,%ymm6,%ymm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	vpaddd		%ymm13,%ymm8,%ymm8
+	vpxor		%ymm8,%ymm7,%ymm7
+	vpslld		$7,%ymm7,%ymm0
+	vpsrld		$25,%ymm7,%ymm7
+	vpor		%ymm0,%ymm7,%ymm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	vpaddd		%ymm14,%ymm9,%ymm9
+	vpxor		%ymm9,%ymm4,%ymm4
+	vpslld		$7,%ymm4,%ymm0
+	vpsrld		$25,%ymm4,%ymm4
+	vpor		%ymm0,%ymm4,%ymm4
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround8
+
+	# x0..15[0-3] += s[0..15]
+	vpbroadcastd	0x00(%rdi),%ymm0
+	vpaddd		0x00(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+	vpbroadcastd	0x04(%rdi),%ymm0
+	vpaddd		0x20(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x20(%rsp)
+	vpbroadcastd	0x08(%rdi),%ymm0
+	vpaddd		0x40(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x40(%rsp)
+	vpbroadcastd	0x0c(%rdi),%ymm0
+	vpaddd		0x60(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x60(%rsp)
+	vpbroadcastd	0x10(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm4,%ymm4
+	vpbroadcastd	0x14(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm5,%ymm5
+	vpbroadcastd	0x18(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm6,%ymm6
+	vpbroadcastd	0x1c(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm7,%ymm7
+	vpbroadcastd	0x20(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm8,%ymm8
+	vpbroadcastd	0x24(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm9,%ymm9
+	vpbroadcastd	0x28(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm10,%ymm10
+	vpbroadcastd	0x2c(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm11,%ymm11
+	vpbroadcastd	0x30(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm12,%ymm12
+	vpbroadcastd	0x34(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm13,%ymm13
+	vpbroadcastd	0x38(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm14,%ymm14
+	vpbroadcastd	0x3c(%rdi),%ymm0
+	vpaddd		%ymm0,%ymm15,%ymm15
+
+	# x12 += counter values 0-3
+	vpaddd		%ymm1,%ymm12,%ymm12
+
+	# interleave 32-bit words in state n, n+1
+	vmovdqa		0x00(%rsp),%ymm0
+	vmovdqa		0x20(%rsp),%ymm1
+	vpunpckldq	%ymm1,%ymm0,%ymm2
+	vpunpckhdq	%ymm1,%ymm0,%ymm1
+	vmovdqa		%ymm2,0x00(%rsp)
+	vmovdqa		%ymm1,0x20(%rsp)
+	vmovdqa		0x40(%rsp),%ymm0
+	vmovdqa		0x60(%rsp),%ymm1
+	vpunpckldq	%ymm1,%ymm0,%ymm2
+	vpunpckhdq	%ymm1,%ymm0,%ymm1
+	vmovdqa		%ymm2,0x40(%rsp)
+	vmovdqa		%ymm1,0x60(%rsp)
+	vmovdqa		%ymm4,%ymm0
+	vpunpckldq	%ymm5,%ymm0,%ymm4
+	vpunpckhdq	%ymm5,%ymm0,%ymm5
+	vmovdqa		%ymm6,%ymm0
+	vpunpckldq	%ymm7,%ymm0,%ymm6
+	vpunpckhdq	%ymm7,%ymm0,%ymm7
+	vmovdqa		%ymm8,%ymm0
+	vpunpckldq	%ymm9,%ymm0,%ymm8
+	vpunpckhdq	%ymm9,%ymm0,%ymm9
+	vmovdqa		%ymm10,%ymm0
+	vpunpckldq	%ymm11,%ymm0,%ymm10
+	vpunpckhdq	%ymm11,%ymm0,%ymm11
+	vmovdqa		%ymm12,%ymm0
+	vpunpckldq	%ymm13,%ymm0,%ymm12
+	vpunpckhdq	%ymm13,%ymm0,%ymm13
+	vmovdqa		%ymm14,%ymm0
+	vpunpckldq	%ymm15,%ymm0,%ymm14
+	vpunpckhdq	%ymm15,%ymm0,%ymm15
+
+	# interleave 64-bit words in state n, n+2
+	vmovdqa		0x00(%rsp),%ymm0
+	vmovdqa		0x40(%rsp),%ymm2
+	vpunpcklqdq	%ymm2,%ymm0,%ymm1
+	vpunpckhqdq	%ymm2,%ymm0,%ymm2
+	vmovdqa		%ymm1,0x00(%rsp)
+	vmovdqa		%ymm2,0x40(%rsp)
+	vmovdqa		0x20(%rsp),%ymm0
+	vmovdqa		0x60(%rsp),%ymm2
+	vpunpcklqdq	%ymm2,%ymm0,%ymm1
+	vpunpckhqdq	%ymm2,%ymm0,%ymm2
+	vmovdqa		%ymm1,0x20(%rsp)
+	vmovdqa		%ymm2,0x60(%rsp)
+	vmovdqa		%ymm4,%ymm0
+	vpunpcklqdq	%ymm6,%ymm0,%ymm4
+	vpunpckhqdq	%ymm6,%ymm0,%ymm6
+	vmovdqa		%ymm5,%ymm0
+	vpunpcklqdq	%ymm7,%ymm0,%ymm5
+	vpunpckhqdq	%ymm7,%ymm0,%ymm7
+	vmovdqa		%ymm8,%ymm0
+	vpunpcklqdq	%ymm10,%ymm0,%ymm8
+	vpunpckhqdq	%ymm10,%ymm0,%ymm10
+	vmovdqa		%ymm9,%ymm0
+	vpunpcklqdq	%ymm11,%ymm0,%ymm9
+	vpunpckhqdq	%ymm11,%ymm0,%ymm11
+	vmovdqa		%ymm12,%ymm0
+	vpunpcklqdq	%ymm14,%ymm0,%ymm12
+	vpunpckhqdq	%ymm14,%ymm0,%ymm14
+	vmovdqa		%ymm13,%ymm0
+	vpunpcklqdq	%ymm15,%ymm0,%ymm13
+	vpunpckhqdq	%ymm15,%ymm0,%ymm15
+
+	# interleave 128-bit words in state n, n+4
+	# xor/write first four blocks
+	vmovdqa		0x00(%rsp),%ymm1
+	vperm2i128	$0x20,%ymm4,%ymm1,%ymm0
+	cmp		$0x0020,%rax
+	jl		.Lxorpart8
+	vpxor		0x0000(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0000(%rsi)
+	vperm2i128	$0x31,%ymm4,%ymm1,%ymm4
+
+	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
+	cmp		$0x0040,%rax
+	jl		.Lxorpart8
+	vpxor		0x0020(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0020(%rsi)
+	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
+
+	vmovdqa		0x40(%rsp),%ymm1
+	vperm2i128	$0x20,%ymm6,%ymm1,%ymm0
+	cmp		$0x0060,%rax
+	jl		.Lxorpart8
+	vpxor		0x0040(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0040(%rsi)
+	vperm2i128	$0x31,%ymm6,%ymm1,%ymm6
+
+	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
+	cmp		$0x0080,%rax
+	jl		.Lxorpart8
+	vpxor		0x0060(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0060(%rsi)
+	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14
+
+	vmovdqa		0x20(%rsp),%ymm1
+	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
+	cmp		$0x00a0,%rax
+	jl		.Lxorpart8
+	vpxor		0x0080(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0080(%rsi)
+	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5
+
+	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
+	cmp		$0x00c0,%rax
+	jl		.Lxorpart8
+	vpxor		0x00a0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x00a0(%rsi)
+	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
+
+	vmovdqa		0x60(%rsp),%ymm1
+	vperm2i128	$0x20,%ymm7,%ymm1,%ymm0
+	cmp		$0x00e0,%rax
+	jl		.Lxorpart8
+	vpxor		0x00c0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x00c0(%rsi)
+	vperm2i128	$0x31,%ymm7,%ymm1,%ymm7
+
+	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
+	cmp		$0x0100,%rax
+	jl		.Lxorpart8
+	vpxor		0x00e0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x00e0(%rsi)
+	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
+
+	# xor remaining blocks, write to output
+	vmovdqa		%ymm4,%ymm0
+	cmp		$0x0120,%rax
+	jl		.Lxorpart8
+	vpxor		0x0100(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0100(%rsi)
+
+	vmovdqa		%ymm12,%ymm0
+	cmp		$0x0140,%rax
+	jl		.Lxorpart8
+	vpxor		0x0120(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0120(%rsi)
+
+	vmovdqa		%ymm6,%ymm0
+	cmp		$0x0160,%rax
+	jl		.Lxorpart8
+	vpxor		0x0140(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0140(%rsi)
+
+	vmovdqa		%ymm14,%ymm0
+	cmp		$0x0180,%rax
+	jl		.Lxorpart8
+	vpxor		0x0160(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0160(%rsi)
+
+	vmovdqa		%ymm5,%ymm0
+	cmp		$0x01a0,%rax
+	jl		.Lxorpart8
+	vpxor		0x0180(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x0180(%rsi)
+
+	vmovdqa		%ymm13,%ymm0
+	cmp		$0x01c0,%rax
+	jl		.Lxorpart8
+	vpxor		0x01a0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x01a0(%rsi)
+
+	vmovdqa		%ymm7,%ymm0
+	cmp		$0x01e0,%rax
+	jl		.Lxorpart8
+	vpxor		0x01c0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x01c0(%rsi)
+
+	vmovdqa		%ymm15,%ymm0
+	cmp		$0x0200,%rax
+	jl		.Lxorpart8
+	vpxor		0x01e0(%rdx),%ymm0,%ymm0
+	vmovdqu		%ymm0,0x01e0(%rsi)
+
+.Ldone8:
+	vzeroupper
+	lea		-8(%r10),%rsp
+	RET
+
+.Lxorpart8:
+	# xor remaining bytes from partial register into output
+	mov		%rax,%r9
+	and		$0x1f,%r9
+	jz		.Ldone8
+	and		$~0x1f,%rax
+
+	mov		%rsi,%r11
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	vpxor		0x00(%rsp),%ymm0,%ymm0
+	vmovdqa		%ymm0,0x00(%rsp)
+
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	jmp		.Ldone8
+
+SYM_FUNC_END(chacha_8block_xor_avx2)
diff --git a/lib/crypto/x86/chacha-avx512vl-x86_64.S b/lib/crypto/x86/chacha-avx512vl-x86_64.S
new file mode 100644
index 000000000000..259383e1ad44
--- /dev/null
+++ b/lib/crypto/x86/chacha-avx512vl-x86_64.S
@@ -0,0 +1,836 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
+ *
+ * Copyright (C) 2018 Martin Willi
+ */
+
+#include <linux/linkage.h>
+
+.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL:	.octa 0x00000000000000000000000000000000
+	.octa 0x00000000000000000000000000000001
+
+.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL:	.octa 0x00000000000000000000000000000002
+	.octa 0x00000000000000000000000000000003
+
+.section	.rodata.cst32.CTR8BL, "aM", @progbits, 32
+.align 32
+CTR8BL:	.octa 0x00000003000000020000000100000000
+	.octa 0x00000007000000060000000500000004
+
+.text
+
+SYM_FUNC_START(chacha_2block_xor_avx512vl)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 2 data blocks output, o
+	# %rdx: up to 2 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts two ChaCha blocks by loading the state
+	# matrix twice across four AVX registers. It performs matrix operations
+	# on four words in each matrix in parallel, but requires shuffling to
+	# rearrange the words after each round.
+
+	vzeroupper
+
+	# x0..3[0-2] = s0..3
+	vbroadcasti128	0x00(%rdi),%ymm0
+	vbroadcasti128	0x10(%rdi),%ymm1
+	vbroadcasti128	0x20(%rdi),%ymm2
+	vbroadcasti128	0x30(%rdi),%ymm3
+
+	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
+
+	vmovdqa		%ymm0,%ymm8
+	vmovdqa		%ymm1,%ymm9
+	vmovdqa		%ymm2,%ymm10
+	vmovdqa		%ymm3,%ymm11
+
+.Ldoubleround:
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$16,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$12,%ymm1,%ymm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$8,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$7,%ymm1,%ymm1
+
+	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm1,%ymm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm3,%ymm3
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$16,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$12,%ymm1,%ymm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$8,%ymm3,%ymm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$7,%ymm1,%ymm1
+
+	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm1,%ymm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm3,%ymm3
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround
+
+	# o0 = i0 ^ (x0 + s0)
+	vpaddd		%ymm8,%ymm0,%ymm7
+	cmp		$0x10,%rcx
+	jl		.Lxorpart2
+	vpxord		0x00(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x00(%rsi)
+	vextracti128	$1,%ymm7,%xmm0
+	# o1 = i1 ^ (x1 + s1)
+	vpaddd		%ymm9,%ymm1,%ymm7
+	cmp		$0x20,%rcx
+	jl		.Lxorpart2
+	vpxord		0x10(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x10(%rsi)
+	vextracti128	$1,%ymm7,%xmm1
+	# o2 = i2 ^ (x2 + s2)
+	vpaddd		%ymm10,%ymm2,%ymm7
+	cmp		$0x30,%rcx
+	jl		.Lxorpart2
+	vpxord		0x20(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x20(%rsi)
+	vextracti128	$1,%ymm7,%xmm2
+	# o3 = i3 ^ (x3 + s3)
+	vpaddd		%ymm11,%ymm3,%ymm7
+	cmp		$0x40,%rcx
+	jl		.Lxorpart2
+	vpxord		0x30(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x30(%rsi)
+	vextracti128	$1,%ymm7,%xmm3
+
+	# xor and write second block
+	vmovdqa		%xmm0,%xmm7
+	cmp		$0x50,%rcx
+	jl		.Lxorpart2
+	vpxord		0x40(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x40(%rsi)
+
+	vmovdqa		%xmm1,%xmm7
+	cmp		$0x60,%rcx
+	jl		.Lxorpart2
+	vpxord		0x50(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x50(%rsi)
+
+	vmovdqa		%xmm2,%xmm7
+	cmp		$0x70,%rcx
+	jl		.Lxorpart2
+	vpxord		0x60(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x60(%rsi)
+
+	vmovdqa		%xmm3,%xmm7
+	cmp		$0x80,%rcx
+	jl		.Lxorpart2
+	vpxord		0x70(%rdx),%xmm7,%xmm6
+	vmovdqu		%xmm6,0x70(%rsi)
+
+.Ldone2:
+	vzeroupper
+	RET
+
+.Lxorpart2:
+	# xor remaining bytes from partial register into output
+	mov		%rcx,%rax
+	and		$0xf,%rcx
+	jz		.Ldone2
+	mov		%rax,%r9
+	and		$~0xf,%r9
+
+	mov		$1,%rax
+	shld		%cl,%rax,%rax
+	sub		$1,%rax
+	kmovq		%rax,%k1
+
+	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
+	vpxord		%xmm7,%xmm1,%xmm1
+	vmovdqu8	%xmm1,(%rsi,%r9){%k1}
+
+	jmp		.Ldone2
+
+SYM_FUNC_END(chacha_2block_xor_avx512vl)
+
+SYM_FUNC_START(chacha_4block_xor_avx512vl)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 4 data blocks output, o
+	# %rdx: up to 4 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts four ChaCha blocks by loading the state
+	# matrix four times across eight AVX registers. It performs matrix
+	# operations on four words in two matrices in parallel, sequentially
+	# to the operations on the four words of the other two matrices. The
+	# required word shuffling has a rather high latency, we can do the
+	# arithmetic on two matrix-pairs without much slowdown.
+
+	vzeroupper
+
+	# x0..3[0-4] = s0..3
+	vbroadcasti128	0x00(%rdi),%ymm0
+	vbroadcasti128	0x10(%rdi),%ymm1
+	vbroadcasti128	0x20(%rdi),%ymm2
+	vbroadcasti128	0x30(%rdi),%ymm3
+
+	vmovdqa		%ymm0,%ymm4
+	vmovdqa		%ymm1,%ymm5
+	vmovdqa		%ymm2,%ymm6
+	vmovdqa		%ymm3,%ymm7
+
+	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
+	vpaddd		CTR4BL(%rip),%ymm7,%ymm7
+
+	vmovdqa		%ymm0,%ymm11
+	vmovdqa		%ymm1,%ymm12
+	vmovdqa		%ymm2,%ymm13
+	vmovdqa		%ymm3,%ymm14
+	vmovdqa		%ymm7,%ymm15
+
+.Ldoubleround4:
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$16,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxord		%ymm4,%ymm7,%ymm7
+	vprold		$16,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$12,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxord		%ymm6,%ymm5,%ymm5
+	vprold		$12,%ymm5,%ymm5
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$8,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxord		%ymm4,%ymm7,%ymm7
+	vprold		$8,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$7,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxord		%ymm6,%ymm5,%ymm5
+	vprold		$7,%ymm5,%ymm5
+
+	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm1,%ymm1
+	vpshufd		$0x39,%ymm5,%ymm5
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	vpshufd		$0x4e,%ymm6,%ymm6
+	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm3,%ymm3
+	vpshufd		$0x93,%ymm7,%ymm7
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$16,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxord		%ymm4,%ymm7,%ymm7
+	vprold		$16,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$12,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxord		%ymm6,%ymm5,%ymm5
+	vprold		$12,%ymm5,%ymm5
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vpaddd		%ymm1,%ymm0,%ymm0
+	vpxord		%ymm0,%ymm3,%ymm3
+	vprold		$8,%ymm3,%ymm3
+
+	vpaddd		%ymm5,%ymm4,%ymm4
+	vpxord		%ymm4,%ymm7,%ymm7
+	vprold		$8,%ymm7,%ymm7
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vpaddd		%ymm3,%ymm2,%ymm2
+	vpxord		%ymm2,%ymm1,%ymm1
+	vprold		$7,%ymm1,%ymm1
+
+	vpaddd		%ymm7,%ymm6,%ymm6
+	vpxord		%ymm6,%ymm5,%ymm5
+	vprold		$7,%ymm5,%ymm5
+
+	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vpshufd		$0x93,%ymm1,%ymm1
+	vpshufd		$0x93,%ymm5,%ymm5
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vpshufd		$0x4e,%ymm2,%ymm2
+	vpshufd		$0x4e,%ymm6,%ymm6
+	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vpshufd		$0x39,%ymm3,%ymm3
+	vpshufd		$0x39,%ymm7,%ymm7
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround4
+
+	# o0 = i0 ^ (x0 + s0), first block
+	vpaddd		%ymm11,%ymm0,%ymm10
+	cmp		$0x10,%rcx
+	jl		.Lxorpart4
+	vpxord		0x00(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x00(%rsi)
+	vextracti128	$1,%ymm10,%xmm0
+	# o1 = i1 ^ (x1 + s1), first block
+	vpaddd		%ymm12,%ymm1,%ymm10
+	cmp		$0x20,%rcx
+	jl		.Lxorpart4
+	vpxord		0x10(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x10(%rsi)
+	vextracti128	$1,%ymm10,%xmm1
+	# o2 = i2 ^ (x2 + s2), first block
+	vpaddd		%ymm13,%ymm2,%ymm10
+	cmp		$0x30,%rcx
+	jl		.Lxorpart4
+	vpxord		0x20(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x20(%rsi)
+	vextracti128	$1,%ymm10,%xmm2
+	# o3 = i3 ^ (x3 + s3), first block
+	vpaddd		%ymm14,%ymm3,%ymm10
+	cmp		$0x40,%rcx
+	jl		.Lxorpart4
+	vpxord		0x30(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x30(%rsi)
+	vextracti128	$1,%ymm10,%xmm3
+
+	# xor and write second block
+	vmovdqa		%xmm0,%xmm10
+	cmp		$0x50,%rcx
+	jl		.Lxorpart4
+	vpxord		0x40(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x40(%rsi)
+
+	vmovdqa		%xmm1,%xmm10
+	cmp		$0x60,%rcx
+	jl		.Lxorpart4
+	vpxord		0x50(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x50(%rsi)
+
+	vmovdqa		%xmm2,%xmm10
+	cmp		$0x70,%rcx
+	jl		.Lxorpart4
+	vpxord		0x60(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x60(%rsi)
+
+	vmovdqa		%xmm3,%xmm10
+	cmp		$0x80,%rcx
+	jl		.Lxorpart4
+	vpxord		0x70(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x70(%rsi)
+
+	# o0 = i0 ^ (x0 + s0), third block
+	vpaddd		%ymm11,%ymm4,%ymm10
+	cmp		$0x90,%rcx
+	jl		.Lxorpart4
+	vpxord		0x80(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x80(%rsi)
+	vextracti128	$1,%ymm10,%xmm4
+	# o1 = i1 ^ (x1 + s1), third block
+	vpaddd		%ymm12,%ymm5,%ymm10
+	cmp		$0xa0,%rcx
+	jl		.Lxorpart4
+	vpxord		0x90(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0x90(%rsi)
+	vextracti128	$1,%ymm10,%xmm5
+	# o2 = i2 ^ (x2 + s2), third block
+	vpaddd		%ymm13,%ymm6,%ymm10
+	cmp		$0xb0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xa0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xa0(%rsi)
+	vextracti128	$1,%ymm10,%xmm6
+	# o3 = i3 ^ (x3 + s3), third block
+	vpaddd		%ymm15,%ymm7,%ymm10
+	cmp		$0xc0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xb0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xb0(%rsi)
+	vextracti128	$1,%ymm10,%xmm7
+
+	# xor and write fourth block
+	vmovdqa		%xmm4,%xmm10
+	cmp		$0xd0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xc0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xc0(%rsi)
+
+	vmovdqa		%xmm5,%xmm10
+	cmp		$0xe0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xd0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xd0(%rsi)
+
+	vmovdqa		%xmm6,%xmm10
+	cmp		$0xf0,%rcx
+	jl		.Lxorpart4
+	vpxord		0xe0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xe0(%rsi)
+
+	vmovdqa		%xmm7,%xmm10
+	cmp		$0x100,%rcx
+	jl		.Lxorpart4
+	vpxord		0xf0(%rdx),%xmm10,%xmm9
+	vmovdqu		%xmm9,0xf0(%rsi)
+
+.Ldone4:
+	vzeroupper
+	RET
+
+.Lxorpart4:
+	# xor remaining bytes from partial register into output
+	mov		%rcx,%rax
+	and		$0xf,%rcx
+	jz		.Ldone4
+	mov		%rax,%r9
+	and		$~0xf,%r9
+
+	mov		$1,%rax
+	shld		%cl,%rax,%rax
+	sub		$1,%rax
+	kmovq		%rax,%k1
+
+	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
+	vpxord		%xmm10,%xmm1,%xmm1
+	vmovdqu8	%xmm1,(%rsi,%r9){%k1}
+
+	jmp		.Ldone4
+
+SYM_FUNC_END(chacha_4block_xor_avx512vl)
+
+SYM_FUNC_START(chacha_8block_xor_avx512vl)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 8 data blocks output, o
+	# %rdx: up to 8 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts eight consecutive ChaCha blocks by loading
+	# the state matrix in AVX registers eight times. Compared to AVX2, this
+	# mostly benefits from the new rotate instructions in VL and the
+	# additional registers.
+
+	vzeroupper
+
+	# x0..15[0-7] = s[0..15]
+	vpbroadcastd	0x00(%rdi),%ymm0
+	vpbroadcastd	0x04(%rdi),%ymm1
+	vpbroadcastd	0x08(%rdi),%ymm2
+	vpbroadcastd	0x0c(%rdi),%ymm3
+	vpbroadcastd	0x10(%rdi),%ymm4
+	vpbroadcastd	0x14(%rdi),%ymm5
+	vpbroadcastd	0x18(%rdi),%ymm6
+	vpbroadcastd	0x1c(%rdi),%ymm7
+	vpbroadcastd	0x20(%rdi),%ymm8
+	vpbroadcastd	0x24(%rdi),%ymm9
+	vpbroadcastd	0x28(%rdi),%ymm10
+	vpbroadcastd	0x2c(%rdi),%ymm11
+	vpbroadcastd	0x30(%rdi),%ymm12
+	vpbroadcastd	0x34(%rdi),%ymm13
+	vpbroadcastd	0x38(%rdi),%ymm14
+	vpbroadcastd	0x3c(%rdi),%ymm15
+
+	# x12 += counter values 0-3
+	vpaddd		CTR8BL(%rip),%ymm12,%ymm12
+
+	vmovdqa64	%ymm0,%ymm16
+	vmovdqa64	%ymm1,%ymm17
+	vmovdqa64	%ymm2,%ymm18
+	vmovdqa64	%ymm3,%ymm19
+	vmovdqa64	%ymm4,%ymm20
+	vmovdqa64	%ymm5,%ymm21
+	vmovdqa64	%ymm6,%ymm22
+	vmovdqa64	%ymm7,%ymm23
+	vmovdqa64	%ymm8,%ymm24
+	vmovdqa64	%ymm9,%ymm25
+	vmovdqa64	%ymm10,%ymm26
+	vmovdqa64	%ymm11,%ymm27
+	vmovdqa64	%ymm12,%ymm28
+	vmovdqa64	%ymm13,%ymm29
+	vmovdqa64	%ymm14,%ymm30
+	vmovdqa64	%ymm15,%ymm31
+
+.Ldoubleround8:
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	vpaddd		%ymm0,%ymm4,%ymm0
+	vpxord		%ymm0,%ymm12,%ymm12
+	vprold		$16,%ymm12,%ymm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	vpaddd		%ymm1,%ymm5,%ymm1
+	vpxord		%ymm1,%ymm13,%ymm13
+	vprold		$16,%ymm13,%ymm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	vpaddd		%ymm2,%ymm6,%ymm2
+	vpxord		%ymm2,%ymm14,%ymm14
+	vprold		$16,%ymm14,%ymm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	vpaddd		%ymm3,%ymm7,%ymm3
+	vpxord		%ymm3,%ymm15,%ymm15
+	vprold		$16,%ymm15,%ymm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	vpaddd		%ymm12,%ymm8,%ymm8
+	vpxord		%ymm8,%ymm4,%ymm4
+	vprold		$12,%ymm4,%ymm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	vpaddd		%ymm13,%ymm9,%ymm9
+	vpxord		%ymm9,%ymm5,%ymm5
+	vprold		$12,%ymm5,%ymm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	vpaddd		%ymm14,%ymm10,%ymm10
+	vpxord		%ymm10,%ymm6,%ymm6
+	vprold		$12,%ymm6,%ymm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	vpaddd		%ymm15,%ymm11,%ymm11
+	vpxord		%ymm11,%ymm7,%ymm7
+	vprold		$12,%ymm7,%ymm7
+
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	vpaddd		%ymm0,%ymm4,%ymm0
+	vpxord		%ymm0,%ymm12,%ymm12
+	vprold		$8,%ymm12,%ymm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	vpaddd		%ymm1,%ymm5,%ymm1
+	vpxord		%ymm1,%ymm13,%ymm13
+	vprold		$8,%ymm13,%ymm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	vpaddd		%ymm2,%ymm6,%ymm2
+	vpxord		%ymm2,%ymm14,%ymm14
+	vprold		$8,%ymm14,%ymm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	vpaddd		%ymm3,%ymm7,%ymm3
+	vpxord		%ymm3,%ymm15,%ymm15
+	vprold		$8,%ymm15,%ymm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	vpaddd		%ymm12,%ymm8,%ymm8
+	vpxord		%ymm8,%ymm4,%ymm4
+	vprold		$7,%ymm4,%ymm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	vpaddd		%ymm13,%ymm9,%ymm9
+	vpxord		%ymm9,%ymm5,%ymm5
+	vprold		$7,%ymm5,%ymm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	vpaddd		%ymm14,%ymm10,%ymm10
+	vpxord		%ymm10,%ymm6,%ymm6
+	vprold		$7,%ymm6,%ymm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	vpaddd		%ymm15,%ymm11,%ymm11
+	vpxord		%ymm11,%ymm7,%ymm7
+	vprold		$7,%ymm7,%ymm7
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	vpaddd		%ymm0,%ymm5,%ymm0
+	vpxord		%ymm0,%ymm15,%ymm15
+	vprold		$16,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	vpaddd		%ymm1,%ymm6,%ymm1
+	vpxord		%ymm1,%ymm12,%ymm12
+	vprold		$16,%ymm12,%ymm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	vpaddd		%ymm2,%ymm7,%ymm2
+	vpxord		%ymm2,%ymm13,%ymm13
+	vprold		$16,%ymm13,%ymm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	vpaddd		%ymm3,%ymm4,%ymm3
+	vpxord		%ymm3,%ymm14,%ymm14
+	vprold		$16,%ymm14,%ymm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	vpaddd		%ymm15,%ymm10,%ymm10
+	vpxord		%ymm10,%ymm5,%ymm5
+	vprold		$12,%ymm5,%ymm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	vpaddd		%ymm12,%ymm11,%ymm11
+	vpxord		%ymm11,%ymm6,%ymm6
+	vprold		$12,%ymm6,%ymm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	vpaddd		%ymm13,%ymm8,%ymm8
+	vpxord		%ymm8,%ymm7,%ymm7
+	vprold		$12,%ymm7,%ymm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	vpaddd		%ymm14,%ymm9,%ymm9
+	vpxord		%ymm9,%ymm4,%ymm4
+	vprold		$12,%ymm4,%ymm4
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	vpaddd		%ymm0,%ymm5,%ymm0
+	vpxord		%ymm0,%ymm15,%ymm15
+	vprold		$8,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	vpaddd		%ymm1,%ymm6,%ymm1
+	vpxord		%ymm1,%ymm12,%ymm12
+	vprold		$8,%ymm12,%ymm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	vpaddd		%ymm2,%ymm7,%ymm2
+	vpxord		%ymm2,%ymm13,%ymm13
+	vprold		$8,%ymm13,%ymm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	vpaddd		%ymm3,%ymm4,%ymm3
+	vpxord		%ymm3,%ymm14,%ymm14
+	vprold		$8,%ymm14,%ymm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	vpaddd		%ymm15,%ymm10,%ymm10
+	vpxord		%ymm10,%ymm5,%ymm5
+	vprold		$7,%ymm5,%ymm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	vpaddd		%ymm12,%ymm11,%ymm11
+	vpxord		%ymm11,%ymm6,%ymm6
+	vprold		$7,%ymm6,%ymm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	vpaddd		%ymm13,%ymm8,%ymm8
+	vpxord		%ymm8,%ymm7,%ymm7
+	vprold		$7,%ymm7,%ymm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	vpaddd		%ymm14,%ymm9,%ymm9
+	vpxord		%ymm9,%ymm4,%ymm4
+	vprold		$7,%ymm4,%ymm4
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround8
+
+	# x0..15[0-3] += s[0..15]
+	vpaddd		%ymm16,%ymm0,%ymm0
+	vpaddd		%ymm17,%ymm1,%ymm1
+	vpaddd		%ymm18,%ymm2,%ymm2
+	vpaddd		%ymm19,%ymm3,%ymm3
+	vpaddd		%ymm20,%ymm4,%ymm4
+	vpaddd		%ymm21,%ymm5,%ymm5
+	vpaddd		%ymm22,%ymm6,%ymm6
+	vpaddd		%ymm23,%ymm7,%ymm7
+	vpaddd		%ymm24,%ymm8,%ymm8
+	vpaddd		%ymm25,%ymm9,%ymm9
+	vpaddd		%ymm26,%ymm10,%ymm10
+	vpaddd		%ymm27,%ymm11,%ymm11
+	vpaddd		%ymm28,%ymm12,%ymm12
+	vpaddd		%ymm29,%ymm13,%ymm13
+	vpaddd		%ymm30,%ymm14,%ymm14
+	vpaddd		%ymm31,%ymm15,%ymm15
+
+	# interleave 32-bit words in state n, n+1
+	vpunpckldq	%ymm1,%ymm0,%ymm16
+	vpunpckhdq	%ymm1,%ymm0,%ymm17
+	vpunpckldq	%ymm3,%ymm2,%ymm18
+	vpunpckhdq	%ymm3,%ymm2,%ymm19
+	vpunpckldq	%ymm5,%ymm4,%ymm20
+	vpunpckhdq	%ymm5,%ymm4,%ymm21
+	vpunpckldq	%ymm7,%ymm6,%ymm22
+	vpunpckhdq	%ymm7,%ymm6,%ymm23
+	vpunpckldq	%ymm9,%ymm8,%ymm24
+	vpunpckhdq	%ymm9,%ymm8,%ymm25
+	vpunpckldq	%ymm11,%ymm10,%ymm26
+	vpunpckhdq	%ymm11,%ymm10,%ymm27
+	vpunpckldq	%ymm13,%ymm12,%ymm28
+	vpunpckhdq	%ymm13,%ymm12,%ymm29
+	vpunpckldq	%ymm15,%ymm14,%ymm30
+	vpunpckhdq	%ymm15,%ymm14,%ymm31
+
+	# interleave 64-bit words in state n, n+2
+	vpunpcklqdq	%ymm18,%ymm16,%ymm0
+	vpunpcklqdq	%ymm19,%ymm17,%ymm1
+	vpunpckhqdq	%ymm18,%ymm16,%ymm2
+	vpunpckhqdq	%ymm19,%ymm17,%ymm3
+	vpunpcklqdq	%ymm22,%ymm20,%ymm4
+	vpunpcklqdq	%ymm23,%ymm21,%ymm5
+	vpunpckhqdq	%ymm22,%ymm20,%ymm6
+	vpunpckhqdq	%ymm23,%ymm21,%ymm7
+	vpunpcklqdq	%ymm26,%ymm24,%ymm8
+	vpunpcklqdq	%ymm27,%ymm25,%ymm9
+	vpunpckhqdq	%ymm26,%ymm24,%ymm10
+	vpunpckhqdq	%ymm27,%ymm25,%ymm11
+	vpunpcklqdq	%ymm30,%ymm28,%ymm12
+	vpunpcklqdq	%ymm31,%ymm29,%ymm13
+	vpunpckhqdq	%ymm30,%ymm28,%ymm14
+	vpunpckhqdq	%ymm31,%ymm29,%ymm15
+
+	# interleave 128-bit words in state n, n+4
+	# xor/write first four blocks
+	vmovdqa64	%ymm0,%ymm16
+	vperm2i128	$0x20,%ymm4,%ymm0,%ymm0
+	cmp		$0x0020,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0000(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0000(%rsi)
+	vmovdqa64	%ymm16,%ymm0
+	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4
+
+	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
+	cmp		$0x0040,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0020(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0020(%rsi)
+	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
+
+	vperm2i128	$0x20,%ymm6,%ymm2,%ymm0
+	cmp		$0x0060,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0040(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0040(%rsi)
+	vperm2i128	$0x31,%ymm6,%ymm2,%ymm6
+
+	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
+	cmp		$0x0080,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0060(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0060(%rsi)
+	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14
+
+	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
+	cmp		$0x00a0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0080(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0080(%rsi)
+	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5
+
+	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
+	cmp		$0x00c0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x00a0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x00a0(%rsi)
+	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
+
+	vperm2i128	$0x20,%ymm7,%ymm3,%ymm0
+	cmp		$0x00e0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x00c0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x00c0(%rsi)
+	vperm2i128	$0x31,%ymm7,%ymm3,%ymm7
+
+	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
+	cmp		$0x0100,%rcx
+	jl		.Lxorpart8
+	vpxord		0x00e0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x00e0(%rsi)
+	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
+
+	# xor remaining blocks, write to output
+	vmovdqa64	%ymm4,%ymm0
+	cmp		$0x0120,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0100(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0100(%rsi)
+
+	vmovdqa64	%ymm12,%ymm0
+	cmp		$0x0140,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0120(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0120(%rsi)
+
+	vmovdqa64	%ymm6,%ymm0
+	cmp		$0x0160,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0140(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0140(%rsi)
+
+	vmovdqa64	%ymm14,%ymm0
+	cmp		$0x0180,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0160(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0160(%rsi)
+
+	vmovdqa64	%ymm5,%ymm0
+	cmp		$0x01a0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x0180(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x0180(%rsi)
+
+	vmovdqa64	%ymm13,%ymm0
+	cmp		$0x01c0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x01a0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x01a0(%rsi)
+
+	vmovdqa64	%ymm7,%ymm0
+	cmp		$0x01e0,%rcx
+	jl		.Lxorpart8
+	vpxord		0x01c0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x01c0(%rsi)
+
+	vmovdqa64	%ymm15,%ymm0
+	cmp		$0x0200,%rcx
+	jl		.Lxorpart8
+	vpxord		0x01e0(%rdx),%ymm0,%ymm0
+	vmovdqu64	%ymm0,0x01e0(%rsi)
+
+.Ldone8:
+	vzeroupper
+	RET
+
+.Lxorpart8:
+	# xor remaining bytes from partial register into output
+	mov		%rcx,%rax
+	and		$0x1f,%rcx
+	jz		.Ldone8
+	mov		%rax,%r9
+	and		$~0x1f,%r9
+
+	mov		$1,%rax
+	shld		%cl,%rax,%rax
+	sub		$1,%rax
+	kmovq		%rax,%k1
+
+	vmovdqu8	(%rdx,%r9),%ymm1{%k1}{z}
+	vpxord		%ymm0,%ymm1,%ymm1
+	vmovdqu8	%ymm1,(%rsi,%r9){%k1}
+
+	jmp		.Ldone8
+
+SYM_FUNC_END(chacha_8block_xor_avx512vl)
diff --git a/lib/crypto/x86/chacha-ssse3-x86_64.S b/lib/crypto/x86/chacha-ssse3-x86_64.S
new file mode 100644
index 000000000000..7111949cd5b9
--- /dev/null
+++ b/lib/crypto/x86/chacha-ssse3-x86_64.S
@@ -0,0 +1,791 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+.section	.rodata.cst16.ROT8, "aM", @progbits, 16
+.align 16
+ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
+.section	.rodata.cst16.ROT16, "aM", @progbits, 16
+.align 16
+ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
+.section	.rodata.cst16.CTRINC, "aM", @progbits, 16
+.align 16
+CTRINC:	.octa 0x00000003000000020000000100000000
+
+.text
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3.  This
+ * function performs matrix operations on four words in parallel, but requires
+ * shuffling to rearrange the words after each round.  8/16-bit word rotation is
+ * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
+ * rotation uses traditional shift+OR.
+ *
+ * The round count is given in %r8d.
+ *
+ * Clobbers: %r8d, %xmm4-%xmm7
+ */
+SYM_FUNC_START_LOCAL(chacha_permute)
+
+	movdqa		ROT8(%rip),%xmm4
+	movdqa		ROT16(%rip),%xmm5
+
+.Ldoubleround:
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	paddd		%xmm1,%xmm0
+	pxor		%xmm0,%xmm3
+	pshufb		%xmm5,%xmm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	paddd		%xmm3,%xmm2
+	pxor		%xmm2,%xmm1
+	movdqa		%xmm1,%xmm6
+	pslld		$12,%xmm6
+	psrld		$20,%xmm1
+	por		%xmm6,%xmm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	paddd		%xmm1,%xmm0
+	pxor		%xmm0,%xmm3
+	pshufb		%xmm4,%xmm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	paddd		%xmm3,%xmm2
+	pxor		%xmm2,%xmm1
+	movdqa		%xmm1,%xmm7
+	pslld		$7,%xmm7
+	psrld		$25,%xmm1
+	por		%xmm7,%xmm1
+
+	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	pshufd		$0x39,%xmm1,%xmm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	pshufd		$0x4e,%xmm2,%xmm2
+	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	pshufd		$0x93,%xmm3,%xmm3
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	paddd		%xmm1,%xmm0
+	pxor		%xmm0,%xmm3
+	pshufb		%xmm5,%xmm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	paddd		%xmm3,%xmm2
+	pxor		%xmm2,%xmm1
+	movdqa		%xmm1,%xmm6
+	pslld		$12,%xmm6
+	psrld		$20,%xmm1
+	por		%xmm6,%xmm1
+
+	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	paddd		%xmm1,%xmm0
+	pxor		%xmm0,%xmm3
+	pshufb		%xmm4,%xmm3
+
+	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	paddd		%xmm3,%xmm2
+	pxor		%xmm2,%xmm1
+	movdqa		%xmm1,%xmm7
+	pslld		$7,%xmm7
+	psrld		$25,%xmm1
+	por		%xmm7,%xmm1
+
+	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	pshufd		$0x93,%xmm1,%xmm1
+	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	pshufd		$0x4e,%xmm2,%xmm2
+	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	pshufd		$0x39,%xmm3,%xmm3
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround
+
+	RET
+SYM_FUNC_END(chacha_permute)
+
+SYM_FUNC_START(chacha_block_xor_ssse3)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 1 data block output, o
+	# %rdx: up to 1 data block input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+	FRAME_BEGIN
+
+	# x0..3 = s0..3
+	movdqu		0x00(%rdi),%xmm0
+	movdqu		0x10(%rdi),%xmm1
+	movdqu		0x20(%rdi),%xmm2
+	movdqu		0x30(%rdi),%xmm3
+	movdqa		%xmm0,%xmm8
+	movdqa		%xmm1,%xmm9
+	movdqa		%xmm2,%xmm10
+	movdqa		%xmm3,%xmm11
+
+	mov		%rcx,%rax
+	call		chacha_permute
+
+	# o0 = i0 ^ (x0 + s0)
+	paddd		%xmm8,%xmm0
+	cmp		$0x10,%rax
+	jl		.Lxorpart
+	movdqu		0x00(%rdx),%xmm4
+	pxor		%xmm4,%xmm0
+	movdqu		%xmm0,0x00(%rsi)
+	# o1 = i1 ^ (x1 + s1)
+	paddd		%xmm9,%xmm1
+	movdqa		%xmm1,%xmm0
+	cmp		$0x20,%rax
+	jl		.Lxorpart
+	movdqu		0x10(%rdx),%xmm0
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x10(%rsi)
+	# o2 = i2 ^ (x2 + s2)
+	paddd		%xmm10,%xmm2
+	movdqa		%xmm2,%xmm0
+	cmp		$0x30,%rax
+	jl		.Lxorpart
+	movdqu		0x20(%rdx),%xmm0
+	pxor		%xmm2,%xmm0
+	movdqu		%xmm0,0x20(%rsi)
+	# o3 = i3 ^ (x3 + s3)
+	paddd		%xmm11,%xmm3
+	movdqa		%xmm3,%xmm0
+	cmp		$0x40,%rax
+	jl		.Lxorpart
+	movdqu		0x30(%rdx),%xmm0
+	pxor		%xmm3,%xmm0
+	movdqu		%xmm0,0x30(%rsi)
+
+.Ldone:
+	FRAME_END
+	RET
+
+.Lxorpart:
+	# xor remaining bytes from partial register into output
+	mov		%rax,%r9
+	and		$0x0f,%r9
+	jz		.Ldone
+	and		$~0x0f,%rax
+
+	mov		%rsi,%r11
+
+	lea		8(%rsp),%r10
+	sub		$0x10,%rsp
+	and		$~31,%rsp
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	pxor		0x00(%rsp),%xmm0
+	movdqa		%xmm0,0x00(%rsp)
+
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	lea		-8(%r10),%rsp
+	jmp		.Ldone
+
+SYM_FUNC_END(chacha_block_xor_ssse3)
+
+SYM_FUNC_START(hchacha_block_ssse3)
+	# %rdi: Input state matrix, s
+	# %rsi: output (8 32-bit words)
+	# %edx: nrounds
+	FRAME_BEGIN
+
+	movdqu		0x00(%rdi),%xmm0
+	movdqu		0x10(%rdi),%xmm1
+	movdqu		0x20(%rdi),%xmm2
+	movdqu		0x30(%rdi),%xmm3
+
+	mov		%edx,%r8d
+	call		chacha_permute
+
+	movdqu		%xmm0,0x00(%rsi)
+	movdqu		%xmm3,0x10(%rsi)
+
+	FRAME_END
+	RET
+SYM_FUNC_END(hchacha_block_ssse3)
+
+SYM_FUNC_START(chacha_4block_xor_ssse3)
+	# %rdi: Input state matrix, s
+	# %rsi: up to 4 data blocks output, o
+	# %rdx: up to 4 data blocks input, i
+	# %rcx: input/output length in bytes
+	# %r8d: nrounds
+
+	# This function encrypts four consecutive ChaCha blocks by loading the
+	# the state matrix in SSE registers four times. As we need some scratch
+	# registers, we save the first four registers on the stack. The
+	# algorithm performs each operation on the corresponding word of each
+	# state matrix, hence requires no word shuffling. For final XORing step
+	# we transpose the matrix by interleaving 32- and then 64-bit words,
+	# which allows us to do XOR in SSE registers. 8/16-bit word rotation is
+	# done with the slightly better performing SSSE3 byte shuffling,
+	# 7/12-bit word rotation uses traditional shift+OR.
+
+	lea		8(%rsp),%r10
+	sub		$0x80,%rsp
+	and		$~63,%rsp
+	mov		%rcx,%rax
+
+	# x0..15[0-3] = s0..3[0..3]
+	movq		0x00(%rdi),%xmm1
+	pshufd		$0x00,%xmm1,%xmm0
+	pshufd		$0x55,%xmm1,%xmm1
+	movq		0x08(%rdi),%xmm3
+	pshufd		$0x00,%xmm3,%xmm2
+	pshufd		$0x55,%xmm3,%xmm3
+	movq		0x10(%rdi),%xmm5
+	pshufd		$0x00,%xmm5,%xmm4
+	pshufd		$0x55,%xmm5,%xmm5
+	movq		0x18(%rdi),%xmm7
+	pshufd		$0x00,%xmm7,%xmm6
+	pshufd		$0x55,%xmm7,%xmm7
+	movq		0x20(%rdi),%xmm9
+	pshufd		$0x00,%xmm9,%xmm8
+	pshufd		$0x55,%xmm9,%xmm9
+	movq		0x28(%rdi),%xmm11
+	pshufd		$0x00,%xmm11,%xmm10
+	pshufd		$0x55,%xmm11,%xmm11
+	movq		0x30(%rdi),%xmm13
+	pshufd		$0x00,%xmm13,%xmm12
+	pshufd		$0x55,%xmm13,%xmm13
+	movq		0x38(%rdi),%xmm15
+	pshufd		$0x00,%xmm15,%xmm14
+	pshufd		$0x55,%xmm15,%xmm15
+	# x0..3 on stack
+	movdqa		%xmm0,0x00(%rsp)
+	movdqa		%xmm1,0x10(%rsp)
+	movdqa		%xmm2,0x20(%rsp)
+	movdqa		%xmm3,0x30(%rsp)
+
+	movdqa		CTRINC(%rip),%xmm1
+	movdqa		ROT8(%rip),%xmm2
+	movdqa		ROT16(%rip),%xmm3
+
+	# x12 += counter values 0-3
+	paddd		%xmm1,%xmm12
+
+.Ldoubleround4:
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	movdqa		0x00(%rsp),%xmm0
+	paddd		%xmm4,%xmm0
+	movdqa		%xmm0,0x00(%rsp)
+	pxor		%xmm0,%xmm12
+	pshufb		%xmm3,%xmm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	movdqa		0x10(%rsp),%xmm0
+	paddd		%xmm5,%xmm0
+	movdqa		%xmm0,0x10(%rsp)
+	pxor		%xmm0,%xmm13
+	pshufb		%xmm3,%xmm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	movdqa		0x20(%rsp),%xmm0
+	paddd		%xmm6,%xmm0
+	movdqa		%xmm0,0x20(%rsp)
+	pxor		%xmm0,%xmm14
+	pshufb		%xmm3,%xmm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	movdqa		0x30(%rsp),%xmm0
+	paddd		%xmm7,%xmm0
+	movdqa		%xmm0,0x30(%rsp)
+	pxor		%xmm0,%xmm15
+	pshufb		%xmm3,%xmm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	paddd		%xmm12,%xmm8
+	pxor		%xmm8,%xmm4
+	movdqa		%xmm4,%xmm0
+	pslld		$12,%xmm0
+	psrld		$20,%xmm4
+	por		%xmm0,%xmm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	paddd		%xmm13,%xmm9
+	pxor		%xmm9,%xmm5
+	movdqa		%xmm5,%xmm0
+	pslld		$12,%xmm0
+	psrld		$20,%xmm5
+	por		%xmm0,%xmm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	paddd		%xmm14,%xmm10
+	pxor		%xmm10,%xmm6
+	movdqa		%xmm6,%xmm0
+	pslld		$12,%xmm0
+	psrld		$20,%xmm6
+	por		%xmm0,%xmm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	paddd		%xmm15,%xmm11
+	pxor		%xmm11,%xmm7
+	movdqa		%xmm7,%xmm0
+	pslld		$12,%xmm0
+	psrld		$20,%xmm7
+	por		%xmm0,%xmm7
+
+	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	movdqa		0x00(%rsp),%xmm0
+	paddd		%xmm4,%xmm0
+	movdqa		%xmm0,0x00(%rsp)
+	pxor		%xmm0,%xmm12
+	pshufb		%xmm2,%xmm12
+	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	movdqa		0x10(%rsp),%xmm0
+	paddd		%xmm5,%xmm0
+	movdqa		%xmm0,0x10(%rsp)
+	pxor		%xmm0,%xmm13
+	pshufb		%xmm2,%xmm13
+	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	movdqa		0x20(%rsp),%xmm0
+	paddd		%xmm6,%xmm0
+	movdqa		%xmm0,0x20(%rsp)
+	pxor		%xmm0,%xmm14
+	pshufb		%xmm2,%xmm14
+	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	movdqa		0x30(%rsp),%xmm0
+	paddd		%xmm7,%xmm0
+	movdqa		%xmm0,0x30(%rsp)
+	pxor		%xmm0,%xmm15
+	pshufb		%xmm2,%xmm15
+
+	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	paddd		%xmm12,%xmm8
+	pxor		%xmm8,%xmm4
+	movdqa		%xmm4,%xmm0
+	pslld		$7,%xmm0
+	psrld		$25,%xmm4
+	por		%xmm0,%xmm4
+	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	paddd		%xmm13,%xmm9
+	pxor		%xmm9,%xmm5
+	movdqa		%xmm5,%xmm0
+	pslld		$7,%xmm0
+	psrld		$25,%xmm5
+	por		%xmm0,%xmm5
+	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	paddd		%xmm14,%xmm10
+	pxor		%xmm10,%xmm6
+	movdqa		%xmm6,%xmm0
+	pslld		$7,%xmm0
+	psrld		$25,%xmm6
+	por		%xmm0,%xmm6
+	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	paddd		%xmm15,%xmm11
+	pxor		%xmm11,%xmm7
+	movdqa		%xmm7,%xmm0
+	pslld		$7,%xmm0
+	psrld		$25,%xmm7
+	por		%xmm0,%xmm7
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	movdqa		0x00(%rsp),%xmm0
+	paddd		%xmm5,%xmm0
+	movdqa		%xmm0,0x00(%rsp)
+	pxor		%xmm0,%xmm15
+	pshufb		%xmm3,%xmm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	movdqa		0x10(%rsp),%xmm0
+	paddd		%xmm6,%xmm0
+	movdqa		%xmm0,0x10(%rsp)
+	pxor		%xmm0,%xmm12
+	pshufb		%xmm3,%xmm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	movdqa		0x20(%rsp),%xmm0
+	paddd		%xmm7,%xmm0
+	movdqa		%xmm0,0x20(%rsp)
+	pxor		%xmm0,%xmm13
+	pshufb		%xmm3,%xmm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	movdqa		0x30(%rsp),%xmm0
+	paddd		%xmm4,%xmm0
+	movdqa		%xmm0,0x30(%rsp)
+	pxor		%xmm0,%xmm14
+	pshufb		%xmm3,%xmm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	paddd		%xmm15,%xmm10
+	pxor		%xmm10,%xmm5
+	movdqa		%xmm5,%xmm0
+	pslld		$12,%xmm0
+	psrld		$20,%xmm5
+	por		%xmm0,%xmm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	paddd		%xmm12,%xmm11
+	pxor		%xmm11,%xmm6
+	movdqa		%xmm6,%xmm0
+	pslld		$12,%xmm0
+	psrld		$20,%xmm6
+	por		%xmm0,%xmm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	paddd		%xmm13,%xmm8
+	pxor		%xmm8,%xmm7
+	movdqa		%xmm7,%xmm0
+	pslld		$12,%xmm0
+	psrld		$20,%xmm7
+	por		%xmm0,%xmm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	paddd		%xmm14,%xmm9
+	pxor		%xmm9,%xmm4
+	movdqa		%xmm4,%xmm0
+	pslld		$12,%xmm0
+	psrld		$20,%xmm4
+	por		%xmm0,%xmm4
+
+	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	movdqa		0x00(%rsp),%xmm0
+	paddd		%xmm5,%xmm0
+	movdqa		%xmm0,0x00(%rsp)
+	pxor		%xmm0,%xmm15
+	pshufb		%xmm2,%xmm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	movdqa		0x10(%rsp),%xmm0
+	paddd		%xmm6,%xmm0
+	movdqa		%xmm0,0x10(%rsp)
+	pxor		%xmm0,%xmm12
+	pshufb		%xmm2,%xmm12
+	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	movdqa		0x20(%rsp),%xmm0
+	paddd		%xmm7,%xmm0
+	movdqa		%xmm0,0x20(%rsp)
+	pxor		%xmm0,%xmm13
+	pshufb		%xmm2,%xmm13
+	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	movdqa		0x30(%rsp),%xmm0
+	paddd		%xmm4,%xmm0
+	movdqa		%xmm0,0x30(%rsp)
+	pxor		%xmm0,%xmm14
+	pshufb		%xmm2,%xmm14
+
+	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	paddd		%xmm15,%xmm10
+	pxor		%xmm10,%xmm5
+	movdqa		%xmm5,%xmm0
+	pslld		$7,%xmm0
+	psrld		$25,%xmm5
+	por		%xmm0,%xmm5
+	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	paddd		%xmm12,%xmm11
+	pxor		%xmm11,%xmm6
+	movdqa		%xmm6,%xmm0
+	pslld		$7,%xmm0
+	psrld		$25,%xmm6
+	por		%xmm0,%xmm6
+	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	paddd		%xmm13,%xmm8
+	pxor		%xmm8,%xmm7
+	movdqa		%xmm7,%xmm0
+	pslld		$7,%xmm0
+	psrld		$25,%xmm7
+	por		%xmm0,%xmm7
+	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	paddd		%xmm14,%xmm9
+	pxor		%xmm9,%xmm4
+	movdqa		%xmm4,%xmm0
+	pslld		$7,%xmm0
+	psrld		$25,%xmm4
+	por		%xmm0,%xmm4
+
+	sub		$2,%r8d
+	jnz		.Ldoubleround4
+
+	# x0[0-3] += s0[0]
+	# x1[0-3] += s0[1]
+	movq		0x00(%rdi),%xmm3
+	pshufd		$0x00,%xmm3,%xmm2
+	pshufd		$0x55,%xmm3,%xmm3
+	paddd		0x00(%rsp),%xmm2
+	movdqa		%xmm2,0x00(%rsp)
+	paddd		0x10(%rsp),%xmm3
+	movdqa		%xmm3,0x10(%rsp)
+	# x2[0-3] += s0[2]
+	# x3[0-3] += s0[3]
+	movq		0x08(%rdi),%xmm3
+	pshufd		$0x00,%xmm3,%xmm2
+	pshufd		$0x55,%xmm3,%xmm3
+	paddd		0x20(%rsp),%xmm2
+	movdqa		%xmm2,0x20(%rsp)
+	paddd		0x30(%rsp),%xmm3
+	movdqa		%xmm3,0x30(%rsp)
+
+	# x4[0-3] += s1[0]
+	# x5[0-3] += s1[1]
+	movq		0x10(%rdi),%xmm3
+	pshufd		$0x00,%xmm3,%xmm2
+	pshufd		$0x55,%xmm3,%xmm3
+	paddd		%xmm2,%xmm4
+	paddd		%xmm3,%xmm5
+	# x6[0-3] += s1[2]
+	# x7[0-3] += s1[3]
+	movq		0x18(%rdi),%xmm3
+	pshufd		$0x00,%xmm3,%xmm2
+	pshufd		$0x55,%xmm3,%xmm3
+	paddd		%xmm2,%xmm6
+	paddd		%xmm3,%xmm7
+
+	# x8[0-3] += s2[0]
+	# x9[0-3] += s2[1]
+	movq		0x20(%rdi),%xmm3
+	pshufd		$0x00,%xmm3,%xmm2
+	pshufd		$0x55,%xmm3,%xmm3
+	paddd		%xmm2,%xmm8
+	paddd		%xmm3,%xmm9
+	# x10[0-3] += s2[2]
+	# x11[0-3] += s2[3]
+	movq		0x28(%rdi),%xmm3
+	pshufd		$0x00,%xmm3,%xmm2
+	pshufd		$0x55,%xmm3,%xmm3
+	paddd		%xmm2,%xmm10
+	paddd		%xmm3,%xmm11
+
+	# x12[0-3] += s3[0]
+	# x13[0-3] += s3[1]
+	movq		0x30(%rdi),%xmm3
+	pshufd		$0x00,%xmm3,%xmm2
+	pshufd		$0x55,%xmm3,%xmm3
+	paddd		%xmm2,%xmm12
+	paddd		%xmm3,%xmm13
+	# x14[0-3] += s3[2]
+	# x15[0-3] += s3[3]
+	movq		0x38(%rdi),%xmm3
+	pshufd		$0x00,%xmm3,%xmm2
+	pshufd		$0x55,%xmm3,%xmm3
+	paddd		%xmm2,%xmm14
+	paddd		%xmm3,%xmm15
+
+	# x12 += counter values 0-3
+	paddd		%xmm1,%xmm12
+
+	# interleave 32-bit words in state n, n+1
+	movdqa		0x00(%rsp),%xmm0
+	movdqa		0x10(%rsp),%xmm1
+	movdqa		%xmm0,%xmm2
+	punpckldq	%xmm1,%xmm2
+	punpckhdq	%xmm1,%xmm0
+	movdqa		%xmm2,0x00(%rsp)
+	movdqa		%xmm0,0x10(%rsp)
+	movdqa		0x20(%rsp),%xmm0
+	movdqa		0x30(%rsp),%xmm1
+	movdqa		%xmm0,%xmm2
+	punpckldq	%xmm1,%xmm2
+	punpckhdq	%xmm1,%xmm0
+	movdqa		%xmm2,0x20(%rsp)
+	movdqa		%xmm0,0x30(%rsp)
+	movdqa		%xmm4,%xmm0
+	punpckldq	%xmm5,%xmm4
+	punpckhdq	%xmm5,%xmm0
+	movdqa		%xmm0,%xmm5
+	movdqa		%xmm6,%xmm0
+	punpckldq	%xmm7,%xmm6
+	punpckhdq	%xmm7,%xmm0
+	movdqa		%xmm0,%xmm7
+	movdqa		%xmm8,%xmm0
+	punpckldq	%xmm9,%xmm8
+	punpckhdq	%xmm9,%xmm0
+	movdqa		%xmm0,%xmm9
+	movdqa		%xmm10,%xmm0
+	punpckldq	%xmm11,%xmm10
+	punpckhdq	%xmm11,%xmm0
+	movdqa		%xmm0,%xmm11
+	movdqa		%xmm12,%xmm0
+	punpckldq	%xmm13,%xmm12
+	punpckhdq	%xmm13,%xmm0
+	movdqa		%xmm0,%xmm13
+	movdqa		%xmm14,%xmm0
+	punpckldq	%xmm15,%xmm14
+	punpckhdq	%xmm15,%xmm0
+	movdqa		%xmm0,%xmm15
+
+	# interleave 64-bit words in state n, n+2
+	movdqa		0x00(%rsp),%xmm0
+	movdqa		0x20(%rsp),%xmm1
+	movdqa		%xmm0,%xmm2
+	punpcklqdq	%xmm1,%xmm2
+	punpckhqdq	%xmm1,%xmm0
+	movdqa		%xmm2,0x00(%rsp)
+	movdqa		%xmm0,0x20(%rsp)
+	movdqa		0x10(%rsp),%xmm0
+	movdqa		0x30(%rsp),%xmm1
+	movdqa		%xmm0,%xmm2
+	punpcklqdq	%xmm1,%xmm2
+	punpckhqdq	%xmm1,%xmm0
+	movdqa		%xmm2,0x10(%rsp)
+	movdqa		%xmm0,0x30(%rsp)
+	movdqa		%xmm4,%xmm0
+	punpcklqdq	%xmm6,%xmm4
+	punpckhqdq	%xmm6,%xmm0
+	movdqa		%xmm0,%xmm6
+	movdqa		%xmm5,%xmm0
+	punpcklqdq	%xmm7,%xmm5
+	punpckhqdq	%xmm7,%xmm0
+	movdqa		%xmm0,%xmm7
+	movdqa		%xmm8,%xmm0
+	punpcklqdq	%xmm10,%xmm8
+	punpckhqdq	%xmm10,%xmm0
+	movdqa		%xmm0,%xmm10
+	movdqa		%xmm9,%xmm0
+	punpcklqdq	%xmm11,%xmm9
+	punpckhqdq	%xmm11,%xmm0
+	movdqa		%xmm0,%xmm11
+	movdqa		%xmm12,%xmm0
+	punpcklqdq	%xmm14,%xmm12
+	punpckhqdq	%xmm14,%xmm0
+	movdqa		%xmm0,%xmm14
+	movdqa		%xmm13,%xmm0
+	punpcklqdq	%xmm15,%xmm13
+	punpckhqdq	%xmm15,%xmm0
+	movdqa		%xmm0,%xmm15
+
+	# xor with corresponding input, write to output
+	movdqa		0x00(%rsp),%xmm0
+	cmp		$0x10,%rax
+	jl		.Lxorpart4
+	movdqu		0x00(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x00(%rsi)
+
+	movdqu		%xmm4,%xmm0
+	cmp		$0x20,%rax
+	jl		.Lxorpart4
+	movdqu		0x10(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x10(%rsi)
+
+	movdqu		%xmm8,%xmm0
+	cmp		$0x30,%rax
+	jl		.Lxorpart4
+	movdqu		0x20(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x20(%rsi)
+
+	movdqu		%xmm12,%xmm0
+	cmp		$0x40,%rax
+	jl		.Lxorpart4
+	movdqu		0x30(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x30(%rsi)
+
+	movdqa		0x20(%rsp),%xmm0
+	cmp		$0x50,%rax
+	jl		.Lxorpart4
+	movdqu		0x40(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x40(%rsi)
+
+	movdqu		%xmm6,%xmm0
+	cmp		$0x60,%rax
+	jl		.Lxorpart4
+	movdqu		0x50(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x50(%rsi)
+
+	movdqu		%xmm10,%xmm0
+	cmp		$0x70,%rax
+	jl		.Lxorpart4
+	movdqu		0x60(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x60(%rsi)
+
+	movdqu		%xmm14,%xmm0
+	cmp		$0x80,%rax
+	jl		.Lxorpart4
+	movdqu		0x70(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x70(%rsi)
+
+	movdqa		0x10(%rsp),%xmm0
+	cmp		$0x90,%rax
+	jl		.Lxorpart4
+	movdqu		0x80(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x80(%rsi)
+
+	movdqu		%xmm5,%xmm0
+	cmp		$0xa0,%rax
+	jl		.Lxorpart4
+	movdqu		0x90(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0x90(%rsi)
+
+	movdqu		%xmm9,%xmm0
+	cmp		$0xb0,%rax
+	jl		.Lxorpart4
+	movdqu		0xa0(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xa0(%rsi)
+
+	movdqu		%xmm13,%xmm0
+	cmp		$0xc0,%rax
+	jl		.Lxorpart4
+	movdqu		0xb0(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xb0(%rsi)
+
+	movdqa		0x30(%rsp),%xmm0
+	cmp		$0xd0,%rax
+	jl		.Lxorpart4
+	movdqu		0xc0(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xc0(%rsi)
+
+	movdqu		%xmm7,%xmm0
+	cmp		$0xe0,%rax
+	jl		.Lxorpart4
+	movdqu		0xd0(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xd0(%rsi)
+
+	movdqu		%xmm11,%xmm0
+	cmp		$0xf0,%rax
+	jl		.Lxorpart4
+	movdqu		0xe0(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xe0(%rsi)
+
+	movdqu		%xmm15,%xmm0
+	cmp		$0x100,%rax
+	jl		.Lxorpart4
+	movdqu		0xf0(%rdx),%xmm1
+	pxor		%xmm1,%xmm0
+	movdqu		%xmm0,0xf0(%rsi)
+
+.Ldone4:
+	lea		-8(%r10),%rsp
+	RET
+
+.Lxorpart4:
+	# xor remaining bytes from partial register into output
+	mov		%rax,%r9
+	and		$0x0f,%r9
+	jz		.Ldone4
+	and		$~0x0f,%rax
+
+	mov		%rsi,%r11
+
+	lea		(%rdx,%rax),%rsi
+	mov		%rsp,%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	pxor		0x00(%rsp),%xmm0
+	movdqa		%xmm0,0x00(%rsp)
+
+	mov		%rsp,%rsi
+	lea		(%r11,%rax),%rdi
+	mov		%r9,%rcx
+	rep movsb
+
+	jmp		.Ldone4
+
+SYM_FUNC_END(chacha_4block_xor_ssse3)
diff --git a/lib/crypto/x86/chacha.h b/lib/crypto/x86/chacha.h
new file mode 100644
index 000000000000..10cf8f1c569d
--- /dev/null
+++ b/lib/crypto/x86/chacha.h
@@ -0,0 +1,176 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ChaCha and HChaCha functions (x86_64 optimized)
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <asm/simd.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+
+asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state,
+				       u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state,
+					u8 *dst, const u8 *src,
+					unsigned int len, int nrounds);
+asmlinkage void hchacha_block_ssse3(const struct chacha_state *state,
+				    u32 out[HCHACHA_OUT_WORDS], int nrounds);
+
+asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state,
+				       u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state,
+				       u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state,
+				       u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+
+asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state,
+					   u8 *dst, const u8 *src,
+					   unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state,
+					   u8 *dst, const u8 *src,
+					   unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state,
+					   u8 *dst, const u8 *src,
+					   unsigned int len, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
+
+static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
+{
+	len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
+	return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
+}
+
+static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src,
+			  unsigned int bytes, int nrounds)
+{
+	if (static_branch_likely(&chacha_use_avx512vl)) {
+		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+			chacha_8block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			bytes -= CHACHA_BLOCK_SIZE * 8;
+			src += CHACHA_BLOCK_SIZE * 8;
+			dst += CHACHA_BLOCK_SIZE * 8;
+			state->x[12] += 8;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 4) {
+			chacha_8block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			state->x[12] += chacha_advance(bytes, 8);
+			return;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 2) {
+			chacha_4block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			state->x[12] += chacha_advance(bytes, 4);
+			return;
+		}
+		if (bytes) {
+			chacha_2block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			state->x[12] += chacha_advance(bytes, 2);
+			return;
+		}
+	}
+
+	if (static_branch_likely(&chacha_use_avx2)) {
+		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+			bytes -= CHACHA_BLOCK_SIZE * 8;
+			src += CHACHA_BLOCK_SIZE * 8;
+			dst += CHACHA_BLOCK_SIZE * 8;
+			state->x[12] += 8;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 4) {
+			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+			state->x[12] += chacha_advance(bytes, 8);
+			return;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 2) {
+			chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
+			state->x[12] += chacha_advance(bytes, 4);
+			return;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE) {
+			chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
+			state->x[12] += chacha_advance(bytes, 2);
+			return;
+		}
+	}
+
+	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+		bytes -= CHACHA_BLOCK_SIZE * 4;
+		src += CHACHA_BLOCK_SIZE * 4;
+		dst += CHACHA_BLOCK_SIZE * 4;
+		state->x[12] += 4;
+	}
+	if (bytes > CHACHA_BLOCK_SIZE) {
+		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+		state->x[12] += chacha_advance(bytes, 4);
+		return;
+	}
+	if (bytes) {
+		chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
+		state->x[12]++;
+	}
+}
+
+static void hchacha_block_arch(const struct chacha_state *state,
+			       u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+	if (!static_branch_likely(&chacha_use_simd)) {
+		hchacha_block_generic(state, out, nrounds);
+	} else {
+		kernel_fpu_begin();
+		hchacha_block_ssse3(state, out, nrounds);
+		kernel_fpu_end();
+	}
+}
+
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+			      const u8 *src, unsigned int bytes, int nrounds)
+{
+	if (!static_branch_likely(&chacha_use_simd) ||
+	    bytes <= CHACHA_BLOCK_SIZE)
+		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+	do {
+		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+		kernel_fpu_begin();
+		chacha_dosimd(state, dst, src, todo, nrounds);
+		kernel_fpu_end();
+
+		bytes -= todo;
+		src += todo;
+		dst += todo;
+	} while (bytes);
+}
+
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_SSSE3))
+		return;
+
+	static_branch_enable(&chacha_use_simd);
+
+	if (boot_cpu_has(X86_FEATURE_AVX) &&
+	    boot_cpu_has(X86_FEATURE_AVX2) &&
+	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
+		static_branch_enable(&chacha_use_avx2);
+
+		if (boot_cpu_has(X86_FEATURE_AVX512VL) &&
+		    boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
+			static_branch_enable(&chacha_use_avx512vl);
+	}
+}
diff --git a/lib/crypto/x86/curve25519.h b/lib/crypto/x86/curve25519.h
new file mode 100644
index 000000000000..5c0b8408852d
--- /dev/null
+++ b/lib/crypto/x86/curve25519.h
@@ -0,0 +1,1613 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation
+ */
+
+#include <linux/types.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+
+static __always_inline u64 eq_mask(u64 a, u64 b)
+{
+	u64 x = a ^ b;
+	u64 minus_x = ~x + (u64)1U;
+	u64 x_or_minus_x = x | minus_x;
+	u64 xnx = x_or_minus_x >> (u32)63U;
+	return xnx - (u64)1U;
+}
+
+static __always_inline u64 gte_mask(u64 a, u64 b)
+{
+	u64 x = a;
+	u64 y = b;
+	u64 x_xor_y = x ^ y;
+	u64 x_sub_y = x - y;
+	u64 x_sub_y_xor_y = x_sub_y ^ y;
+	u64 q = x_xor_y | x_sub_y_xor_y;
+	u64 x_xor_q = x ^ q;
+	u64 x_xor_q_ = x_xor_q >> (u32)63U;
+	return x_xor_q_ - (u64)1U;
+}
+
+/* Computes the addition of four-element f1 with value in f2
+ * and returns the carry (if any) */
+static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
+{
+	u64 carry_r;
+
+	asm volatile(
+		/* Clear registers to propagate the carry bit */
+		"  xor %%r8d, %%r8d;"
+		"  xor %%r9d, %%r9d;"
+		"  xor %%r10d, %%r10d;"
+		"  xor %%r11d, %%r11d;"
+		"  xor %k1, %k1;"
+
+		/* Begin addition chain */
+		"  addq 0(%3), %0;"
+		"  movq %0, 0(%2);"
+		"  adcxq 8(%3), %%r8;"
+		"  movq %%r8, 8(%2);"
+		"  adcxq 16(%3), %%r9;"
+		"  movq %%r9, 16(%2);"
+		"  adcxq 24(%3), %%r10;"
+		"  movq %%r10, 24(%2);"
+
+		/* Return the carry bit in a register */
+		"  adcx %%r11, %1;"
+		: "+&r"(f2), "=&r"(carry_r)
+		: "r"(out), "r"(f1)
+		: "%r8", "%r9", "%r10", "%r11", "memory", "cc");
+
+	return carry_r;
+}
+
+/* Computes the field addition of two field elements */
+static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
+{
+	asm volatile(
+		/* Compute the raw addition of f1 + f2 */
+		"  movq 0(%0), %%r8;"
+		"  addq 0(%2), %%r8;"
+		"  movq 8(%0), %%r9;"
+		"  adcxq 8(%2), %%r9;"
+		"  movq 16(%0), %%r10;"
+		"  adcxq 16(%2), %%r10;"
+		"  movq 24(%0), %%r11;"
+		"  adcxq 24(%2), %%r11;"
+
+		/* Wrap the result back into the field */
+
+		/* Step 1: Compute carry*38 */
+		"  mov $0, %%rax;"
+		"  mov $38, %0;"
+		"  cmovc %0, %%rax;"
+
+		/* Step 2: Add carry*38 to the original sum */
+		"  xor %%ecx, %%ecx;"
+		"  add %%rax, %%r8;"
+		"  adcx %%rcx, %%r9;"
+		"  movq %%r9, 8(%1);"
+		"  adcx %%rcx, %%r10;"
+		"  movq %%r10, 16(%1);"
+		"  adcx %%rcx, %%r11;"
+		"  movq %%r11, 24(%1);"
+
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+		"  mov $0, %%rax;"
+		"  cmovc %0, %%rax;"
+		"  add %%rax, %%r8;"
+		"  movq %%r8, 0(%1);"
+		: "+&r"(f2)
+		: "r"(out), "r"(f1)
+		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
+}
+
+/* Computes the field subtraction of two field elements */
+static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
+{
+	asm volatile(
+		/* Compute the raw subtraction of f1-f2 */
+		"  movq 0(%1), %%r8;"
+		"  subq 0(%2), %%r8;"
+		"  movq 8(%1), %%r9;"
+		"  sbbq 8(%2), %%r9;"
+		"  movq 16(%1), %%r10;"
+		"  sbbq 16(%2), %%r10;"
+		"  movq 24(%1), %%r11;"
+		"  sbbq 24(%2), %%r11;"
+
+		/* Wrap the result back into the field */
+
+		/* Step 1: Compute carry*38 */
+		"  mov $0, %%rax;"
+		"  mov $38, %%rcx;"
+		"  cmovc %%rcx, %%rax;"
+
+		/* Step 2: Subtract carry*38 from the original difference */
+		"  sub %%rax, %%r8;"
+		"  sbb $0, %%r9;"
+		"  sbb $0, %%r10;"
+		"  sbb $0, %%r11;"
+
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+		"  mov $0, %%rax;"
+		"  cmovc %%rcx, %%rax;"
+		"  sub %%rax, %%r8;"
+
+		/* Store the result */
+		"  movq %%r8, 0(%0);"
+		"  movq %%r9, 8(%0);"
+		"  movq %%r10, 16(%0);"
+		"  movq %%r11, 24(%0);"
+		:
+		: "r"(out), "r"(f1), "r"(f2)
+		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
+}
+
+/* Computes a field multiplication: out <- f1 * f2
+ * Uses the 8-element buffer tmp for intermediate results */
+static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
+{
+	asm volatile(
+
+		/* Compute the raw multiplication: tmp <- src1 * src2 */
+
+		/* Compute src1[0] * src2 */
+		"  movq 0(%0), %%rdx;"
+		"  mulxq 0(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  movq %%r8, 0(%2);"
+		"  mulxq 8(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  movq %%r10, 8(%2);"
+		"  mulxq 16(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  mulxq 24(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+
+		/* Compute src1[1] * src2 */
+		"  movq 8(%0), %%rdx;"
+		"  mulxq 0(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  adcxq 8(%2), %%r8;"
+		"  movq %%r8, 8(%2);"
+		"  mulxq 8(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  adcx %%rbx, %%r10;"
+		"  movq %%r10, 16(%2);"
+		"  mulxq 16(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  adcx %%r14, %%rbx;"
+		"  mov $0, %%r8;"
+		"  mulxq 24(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  adcx %%rax, %%r14;"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+		"  adcx %%r8, %%rax;"
+
+		/* Compute src1[2] * src2 */
+		"  movq 16(%0), %%rdx;"
+		"  mulxq 0(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  adcxq 16(%2), %%r8;"
+		"  movq %%r8, 16(%2);"
+		"  mulxq 8(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  adcx %%rbx, %%r10;"
+		"  movq %%r10, 24(%2);"
+		"  mulxq 16(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  adcx %%r14, %%rbx;"
+		"  mov $0, %%r8;"
+		"  mulxq 24(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  adcx %%rax, %%r14;"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+		"  adcx %%r8, %%rax;"
+
+		/* Compute src1[3] * src2 */
+		"  movq 24(%0), %%rdx;"
+		"  mulxq 0(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  adcxq 24(%2), %%r8;"
+		"  movq %%r8, 24(%2);"
+		"  mulxq 8(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  adcx %%rbx, %%r10;"
+		"  movq %%r10, 32(%2);"
+		"  mulxq 16(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  adcx %%r14, %%rbx;"
+		"  movq %%rbx, 40(%2);"
+		"  mov $0, %%r8;"
+		"  mulxq 24(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  adcx %%rax, %%r14;"
+		"  movq %%r14, 48(%2);"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+		"  adcx %%r8, %%rax;"
+		"  movq %%rax, 56(%2);"
+
+		/* Line up pointers */
+		"  mov %2, %0;"
+		"  mov %3, %2;"
+
+		/* Wrap the result back into the field */
+
+		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+		"  mov $38, %%rdx;"
+		"  mulxq 32(%0), %%r8, %%r13;"
+		"  xor %k1, %k1;"
+		"  adoxq 0(%0), %%r8;"
+		"  mulxq 40(%0), %%r9, %%rbx;"
+		"  adcx %%r13, %%r9;"
+		"  adoxq 8(%0), %%r9;"
+		"  mulxq 48(%0), %%r10, %%r13;"
+		"  adcx %%rbx, %%r10;"
+		"  adoxq 16(%0), %%r10;"
+		"  mulxq 56(%0), %%r11, %%rax;"
+		"  adcx %%r13, %%r11;"
+		"  adoxq 24(%0), %%r11;"
+		"  adcx %1, %%rax;"
+		"  adox %1, %%rax;"
+		"  imul %%rdx, %%rax;"
+
+		/* Step 2: Fold the carry back into dst */
+		"  add %%rax, %%r8;"
+		"  adcx %1, %%r9;"
+		"  movq %%r9, 8(%2);"
+		"  adcx %1, %%r10;"
+		"  movq %%r10, 16(%2);"
+		"  adcx %1, %%r11;"
+		"  movq %%r11, 24(%2);"
+
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+		"  mov $0, %%rax;"
+		"  cmovc %%rdx, %%rax;"
+		"  add %%rax, %%r8;"
+		"  movq %%r8, 0(%2);"
+		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
+		: "r"(out)
+		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
+		  "%r14", "memory", "cc");
+}
+
+/* Computes two field multiplications:
+ *   out[0] <- f1[0] * f2[0]
+ *   out[1] <- f1[1] * f2[1]
+ * Uses the 16-element buffer tmp for intermediate results: */
+static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
+{
+	asm volatile(
+
+		/* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */
+
+		/* Compute src1[0] * src2 */
+		"  movq 0(%0), %%rdx;"
+		"  mulxq 0(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  movq %%r8, 0(%2);"
+		"  mulxq 8(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  movq %%r10, 8(%2);"
+		"  mulxq 16(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  mulxq 24(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+
+		/* Compute src1[1] * src2 */
+		"  movq 8(%0), %%rdx;"
+		"  mulxq 0(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  adcxq 8(%2), %%r8;"
+		"  movq %%r8, 8(%2);"
+		"  mulxq 8(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  adcx %%rbx, %%r10;"
+		"  movq %%r10, 16(%2);"
+		"  mulxq 16(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  adcx %%r14, %%rbx;"
+		"  mov $0, %%r8;"
+		"  mulxq 24(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  adcx %%rax, %%r14;"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+		"  adcx %%r8, %%rax;"
+
+		/* Compute src1[2] * src2 */
+		"  movq 16(%0), %%rdx;"
+		"  mulxq 0(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  adcxq 16(%2), %%r8;"
+		"  movq %%r8, 16(%2);"
+		"  mulxq 8(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  adcx %%rbx, %%r10;"
+		"  movq %%r10, 24(%2);"
+		"  mulxq 16(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  adcx %%r14, %%rbx;"
+		"  mov $0, %%r8;"
+		"  mulxq 24(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  adcx %%rax, %%r14;"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+		"  adcx %%r8, %%rax;"
+
+		/* Compute src1[3] * src2 */
+		"  movq 24(%0), %%rdx;"
+		"  mulxq 0(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  adcxq 24(%2), %%r8;"
+		"  movq %%r8, 24(%2);"
+		"  mulxq 8(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  adcx %%rbx, %%r10;"
+		"  movq %%r10, 32(%2);"
+		"  mulxq 16(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  adcx %%r14, %%rbx;"
+		"  movq %%rbx, 40(%2);"
+		"  mov $0, %%r8;"
+		"  mulxq 24(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  adcx %%rax, %%r14;"
+		"  movq %%r14, 48(%2);"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+		"  adcx %%r8, %%rax;"
+		"  movq %%rax, 56(%2);"
+
+		/* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */
+
+		/* Compute src1[0] * src2 */
+		"  movq 32(%0), %%rdx;"
+		"  mulxq 32(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  movq %%r8, 64(%2);"
+		"  mulxq 40(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  movq %%r10, 72(%2);"
+		"  mulxq 48(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  mulxq 56(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+
+		/* Compute src1[1] * src2 */
+		"  movq 40(%0), %%rdx;"
+		"  mulxq 32(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  adcxq 72(%2), %%r8;"
+		"  movq %%r8, 72(%2);"
+		"  mulxq 40(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  adcx %%rbx, %%r10;"
+		"  movq %%r10, 80(%2);"
+		"  mulxq 48(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  adcx %%r14, %%rbx;"
+		"  mov $0, %%r8;"
+		"  mulxq 56(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  adcx %%rax, %%r14;"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+		"  adcx %%r8, %%rax;"
+
+		/* Compute src1[2] * src2 */
+		"  movq 48(%0), %%rdx;"
+		"  mulxq 32(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  adcxq 80(%2), %%r8;"
+		"  movq %%r8, 80(%2);"
+		"  mulxq 40(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  adcx %%rbx, %%r10;"
+		"  movq %%r10, 88(%2);"
+		"  mulxq 48(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  adcx %%r14, %%rbx;"
+		"  mov $0, %%r8;"
+		"  mulxq 56(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  adcx %%rax, %%r14;"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+		"  adcx %%r8, %%rax;"
+
+		/* Compute src1[3] * src2 */
+		"  movq 56(%0), %%rdx;"
+		"  mulxq 32(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  adcxq 88(%2), %%r8;"
+		"  movq %%r8, 88(%2);"
+		"  mulxq 40(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  adcx %%rbx, %%r10;"
+		"  movq %%r10, 96(%2);"
+		"  mulxq 48(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  adcx %%r14, %%rbx;"
+		"  movq %%rbx, 104(%2);"
+		"  mov $0, %%r8;"
+		"  mulxq 56(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  adcx %%rax, %%r14;"
+		"  movq %%r14, 112(%2);"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+		"  adcx %%r8, %%rax;"
+		"  movq %%rax, 120(%2);"
+
+		/* Line up pointers */
+		"  mov %2, %0;"
+		"  mov %3, %2;"
+
+		/* Wrap the results back into the field */
+
+		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+		"  mov $38, %%rdx;"
+		"  mulxq 32(%0), %%r8, %%r13;"
+		"  xor %k1, %k1;"
+		"  adoxq 0(%0), %%r8;"
+		"  mulxq 40(%0), %%r9, %%rbx;"
+		"  adcx %%r13, %%r9;"
+		"  adoxq 8(%0), %%r9;"
+		"  mulxq 48(%0), %%r10, %%r13;"
+		"  adcx %%rbx, %%r10;"
+		"  adoxq 16(%0), %%r10;"
+		"  mulxq 56(%0), %%r11, %%rax;"
+		"  adcx %%r13, %%r11;"
+		"  adoxq 24(%0), %%r11;"
+		"  adcx %1, %%rax;"
+		"  adox %1, %%rax;"
+		"  imul %%rdx, %%rax;"
+
+		/* Step 2: Fold the carry back into dst */
+		"  add %%rax, %%r8;"
+		"  adcx %1, %%r9;"
+		"  movq %%r9, 8(%2);"
+		"  adcx %1, %%r10;"
+		"  movq %%r10, 16(%2);"
+		"  adcx %1, %%r11;"
+		"  movq %%r11, 24(%2);"
+
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+		"  mov $0, %%rax;"
+		"  cmovc %%rdx, %%rax;"
+		"  add %%rax, %%r8;"
+		"  movq %%r8, 0(%2);"
+
+		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+		"  mov $38, %%rdx;"
+		"  mulxq 96(%0), %%r8, %%r13;"
+		"  xor %k1, %k1;"
+		"  adoxq 64(%0), %%r8;"
+		"  mulxq 104(%0), %%r9, %%rbx;"
+		"  adcx %%r13, %%r9;"
+		"  adoxq 72(%0), %%r9;"
+		"  mulxq 112(%0), %%r10, %%r13;"
+		"  adcx %%rbx, %%r10;"
+		"  adoxq 80(%0), %%r10;"
+		"  mulxq 120(%0), %%r11, %%rax;"
+		"  adcx %%r13, %%r11;"
+		"  adoxq 88(%0), %%r11;"
+		"  adcx %1, %%rax;"
+		"  adox %1, %%rax;"
+		"  imul %%rdx, %%rax;"
+
+		/* Step 2: Fold the carry back into dst */
+		"  add %%rax, %%r8;"
+		"  adcx %1, %%r9;"
+		"  movq %%r9, 40(%2);"
+		"  adcx %1, %%r10;"
+		"  movq %%r10, 48(%2);"
+		"  adcx %1, %%r11;"
+		"  movq %%r11, 56(%2);"
+
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+		"  mov $0, %%rax;"
+		"  cmovc %%rdx, %%rax;"
+		"  add %%rax, %%r8;"
+		"  movq %%r8, 32(%2);"
+		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
+		: "r"(out)
+		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
+		  "%r14", "memory", "cc");
+}
+
+/* Computes the field multiplication of four-element f1 with value in f2
+ * Requires f2 to be smaller than 2^17 */
+static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
+{
+	register u64 f2_r asm("rdx") = f2;
+
+	asm volatile(
+		/* Compute the raw multiplication of f1*f2 */
+		"  mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
+		"  mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
+		"  add %%rcx, %%r9;"
+		"  mov $0, %%rcx;"
+		"  mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
+		"  adcx %%rbx, %%r10;"
+		"  mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
+		"  adcx %%r13, %%r11;"
+		"  adcx %%rcx, %%rax;"
+
+		/* Wrap the result back into the field */
+
+		/* Step 1: Compute carry*38 */
+		"  mov $38, %%rdx;"
+		"  imul %%rdx, %%rax;"
+
+		/* Step 2: Fold the carry back into dst */
+		"  add %%rax, %%r8;"
+		"  adcx %%rcx, %%r9;"
+		"  movq %%r9, 8(%1);"
+		"  adcx %%rcx, %%r10;"
+		"  movq %%r10, 16(%1);"
+		"  adcx %%rcx, %%r11;"
+		"  movq %%r11, 24(%1);"
+
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+		"  mov $0, %%rax;"
+		"  cmovc %%rdx, %%rax;"
+		"  add %%rax, %%r8;"
+		"  movq %%r8, 0(%1);"
+		: "+&r"(f2_r)
+		: "r"(out), "r"(f1)
+		: "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13",
+		  "memory", "cc");
+}
+
+/* Computes p1 <- bit ? p2 : p1 in constant time */
+static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
+{
+	asm volatile(
+		/* Transfer bit into CF flag */
+		"  add $18446744073709551615, %0;"
+
+		/* cswap p1[0], p2[0] */
+		"  movq 0(%1), %%r8;"
+		"  movq 0(%2), %%r9;"
+		"  mov %%r8, %%r10;"
+		"  cmovc %%r9, %%r8;"
+		"  cmovc %%r10, %%r9;"
+		"  movq %%r8, 0(%1);"
+		"  movq %%r9, 0(%2);"
+
+		/* cswap p1[1], p2[1] */
+		"  movq 8(%1), %%r8;"
+		"  movq 8(%2), %%r9;"
+		"  mov %%r8, %%r10;"
+		"  cmovc %%r9, %%r8;"
+		"  cmovc %%r10, %%r9;"
+		"  movq %%r8, 8(%1);"
+		"  movq %%r9, 8(%2);"
+
+		/* cswap p1[2], p2[2] */
+		"  movq 16(%1), %%r8;"
+		"  movq 16(%2), %%r9;"
+		"  mov %%r8, %%r10;"
+		"  cmovc %%r9, %%r8;"
+		"  cmovc %%r10, %%r9;"
+		"  movq %%r8, 16(%1);"
+		"  movq %%r9, 16(%2);"
+
+		/* cswap p1[3], p2[3] */
+		"  movq 24(%1), %%r8;"
+		"  movq 24(%2), %%r9;"
+		"  mov %%r8, %%r10;"
+		"  cmovc %%r9, %%r8;"
+		"  cmovc %%r10, %%r9;"
+		"  movq %%r8, 24(%1);"
+		"  movq %%r9, 24(%2);"
+
+		/* cswap p1[4], p2[4] */
+		"  movq 32(%1), %%r8;"
+		"  movq 32(%2), %%r9;"
+		"  mov %%r8, %%r10;"
+		"  cmovc %%r9, %%r8;"
+		"  cmovc %%r10, %%r9;"
+		"  movq %%r8, 32(%1);"
+		"  movq %%r9, 32(%2);"
+
+		/* cswap p1[5], p2[5] */
+		"  movq 40(%1), %%r8;"
+		"  movq 40(%2), %%r9;"
+		"  mov %%r8, %%r10;"
+		"  cmovc %%r9, %%r8;"
+		"  cmovc %%r10, %%r9;"
+		"  movq %%r8, 40(%1);"
+		"  movq %%r9, 40(%2);"
+
+		/* cswap p1[6], p2[6] */
+		"  movq 48(%1), %%r8;"
+		"  movq 48(%2), %%r9;"
+		"  mov %%r8, %%r10;"
+		"  cmovc %%r9, %%r8;"
+		"  cmovc %%r10, %%r9;"
+		"  movq %%r8, 48(%1);"
+		"  movq %%r9, 48(%2);"
+
+		/* cswap p1[7], p2[7] */
+		"  movq 56(%1), %%r8;"
+		"  movq 56(%2), %%r9;"
+		"  mov %%r8, %%r10;"
+		"  cmovc %%r9, %%r8;"
+		"  cmovc %%r10, %%r9;"
+		"  movq %%r8, 56(%1);"
+		"  movq %%r9, 56(%2);"
+		: "+&r"(bit)
+		: "r"(p1), "r"(p2)
+		: "%r8", "%r9", "%r10", "memory", "cc");
+}
+
+/* Computes the square of a field element: out <- f * f
+ * Uses the 8-element buffer tmp for intermediate results */
+static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
+{
+	asm volatile(
+		/* Compute the raw multiplication: tmp <- f * f */
+
+		/* Step 1: Compute all partial products */
+		"  movq 0(%0), %%rdx;" /* f[0] */
+		"  mulxq 8(%0), %%r8, %%r14;"
+		"  xor %%r15d, %%r15d;" /* f[1]*f[0] */
+		"  mulxq 16(%0), %%r9, %%r10;"
+		"  adcx %%r14, %%r9;" /* f[2]*f[0] */
+		"  mulxq 24(%0), %%rax, %%rcx;"
+		"  adcx %%rax, %%r10;" /* f[3]*f[0] */
+		"  movq 24(%0), %%rdx;" /* f[3] */
+		"  mulxq 8(%0), %%r11, %%rbx;"
+		"  adcx %%rcx, %%r11;" /* f[1]*f[3] */
+		"  mulxq 16(%0), %%rax, %%r13;"
+		"  adcx %%rax, %%rbx;" /* f[2]*f[3] */
+		"  movq 8(%0), %%rdx;"
+		"  adcx %%r15, %%r13;" /* f1 */
+		"  mulxq 16(%0), %%rax, %%rcx;"
+		"  mov $0, %%r14;" /* f[2]*f[1] */
+
+		/* Step 2: Compute two parallel carry chains */
+		"  xor %%r15d, %%r15d;"
+		"  adox %%rax, %%r10;"
+		"  adcx %%r8, %%r8;"
+		"  adox %%rcx, %%r11;"
+		"  adcx %%r9, %%r9;"
+		"  adox %%r15, %%rbx;"
+		"  adcx %%r10, %%r10;"
+		"  adox %%r15, %%r13;"
+		"  adcx %%r11, %%r11;"
+		"  adox %%r15, %%r14;"
+		"  adcx %%rbx, %%rbx;"
+		"  adcx %%r13, %%r13;"
+		"  adcx %%r14, %%r14;"
+
+		/* Step 3: Compute intermediate squares */
+		"  movq 0(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+		"  movq %%rax, 0(%1);"
+		"  add %%rcx, %%r8;"
+		"  movq %%r8, 8(%1);"
+		"  movq 8(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+		"  adcx %%rax, %%r9;"
+		"  movq %%r9, 16(%1);"
+		"  adcx %%rcx, %%r10;"
+		"  movq %%r10, 24(%1);"
+		"  movq 16(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+		"  adcx %%rax, %%r11;"
+		"  movq %%r11, 32(%1);"
+		"  adcx %%rcx, %%rbx;"
+		"  movq %%rbx, 40(%1);"
+		"  movq 24(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+		"  adcx %%rax, %%r13;"
+		"  movq %%r13, 48(%1);"
+		"  adcx %%rcx, %%r14;"
+		"  movq %%r14, 56(%1);"
+
+		/* Line up pointers */
+		"  mov %1, %0;"
+		"  mov %2, %1;"
+
+		/* Wrap the result back into the field */
+
+		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+		"  mov $38, %%rdx;"
+		"  mulxq 32(%0), %%r8, %%r13;"
+		"  xor %%ecx, %%ecx;"
+		"  adoxq 0(%0), %%r8;"
+		"  mulxq 40(%0), %%r9, %%rbx;"
+		"  adcx %%r13, %%r9;"
+		"  adoxq 8(%0), %%r9;"
+		"  mulxq 48(%0), %%r10, %%r13;"
+		"  adcx %%rbx, %%r10;"
+		"  adoxq 16(%0), %%r10;"
+		"  mulxq 56(%0), %%r11, %%rax;"
+		"  adcx %%r13, %%r11;"
+		"  adoxq 24(%0), %%r11;"
+		"  adcx %%rcx, %%rax;"
+		"  adox %%rcx, %%rax;"
+		"  imul %%rdx, %%rax;"
+
+		/* Step 2: Fold the carry back into dst */
+		"  add %%rax, %%r8;"
+		"  adcx %%rcx, %%r9;"
+		"  movq %%r9, 8(%1);"
+		"  adcx %%rcx, %%r10;"
+		"  movq %%r10, 16(%1);"
+		"  adcx %%rcx, %%r11;"
+		"  movq %%r11, 24(%1);"
+
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+		"  mov $0, %%rax;"
+		"  cmovc %%rdx, %%rax;"
+		"  add %%rax, %%r8;"
+		"  movq %%r8, 0(%1);"
+		: "+&r"(f), "+&r"(tmp)
+		: "r"(out)
+		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
+		  "%r13", "%r14", "%r15", "memory", "cc");
+}
+
+/* Computes two field squarings:
+ *   out[0] <- f[0] * f[0]
+ *   out[1] <- f[1] * f[1]
+ * Uses the 16-element buffer tmp for intermediate results */
+static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
+{
+	asm volatile(
+		/* Step 1: Compute all partial products */
+		"  movq 0(%0), %%rdx;" /* f[0] */
+		"  mulxq 8(%0), %%r8, %%r14;"
+		"  xor %%r15d, %%r15d;" /* f[1]*f[0] */
+		"  mulxq 16(%0), %%r9, %%r10;"
+		"  adcx %%r14, %%r9;" /* f[2]*f[0] */
+		"  mulxq 24(%0), %%rax, %%rcx;"
+		"  adcx %%rax, %%r10;" /* f[3]*f[0] */
+		"  movq 24(%0), %%rdx;" /* f[3] */
+		"  mulxq 8(%0), %%r11, %%rbx;"
+		"  adcx %%rcx, %%r11;" /* f[1]*f[3] */
+		"  mulxq 16(%0), %%rax, %%r13;"
+		"  adcx %%rax, %%rbx;" /* f[2]*f[3] */
+		"  movq 8(%0), %%rdx;"
+		"  adcx %%r15, %%r13;" /* f1 */
+		"  mulxq 16(%0), %%rax, %%rcx;"
+		"  mov $0, %%r14;" /* f[2]*f[1] */
+
+		/* Step 2: Compute two parallel carry chains */
+		"  xor %%r15d, %%r15d;"
+		"  adox %%rax, %%r10;"
+		"  adcx %%r8, %%r8;"
+		"  adox %%rcx, %%r11;"
+		"  adcx %%r9, %%r9;"
+		"  adox %%r15, %%rbx;"
+		"  adcx %%r10, %%r10;"
+		"  adox %%r15, %%r13;"
+		"  adcx %%r11, %%r11;"
+		"  adox %%r15, %%r14;"
+		"  adcx %%rbx, %%rbx;"
+		"  adcx %%r13, %%r13;"
+		"  adcx %%r14, %%r14;"
+
+		/* Step 3: Compute intermediate squares */
+		"  movq 0(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+		"  movq %%rax, 0(%1);"
+		"  add %%rcx, %%r8;"
+		"  movq %%r8, 8(%1);"
+		"  movq 8(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+		"  adcx %%rax, %%r9;"
+		"  movq %%r9, 16(%1);"
+		"  adcx %%rcx, %%r10;"
+		"  movq %%r10, 24(%1);"
+		"  movq 16(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+		"  adcx %%rax, %%r11;"
+		"  movq %%r11, 32(%1);"
+		"  adcx %%rcx, %%rbx;"
+		"  movq %%rbx, 40(%1);"
+		"  movq 24(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+		"  adcx %%rax, %%r13;"
+		"  movq %%r13, 48(%1);"
+		"  adcx %%rcx, %%r14;"
+		"  movq %%r14, 56(%1);"
+
+		/* Step 1: Compute all partial products */
+		"  movq 32(%0), %%rdx;" /* f[0] */
+		"  mulxq 40(%0), %%r8, %%r14;"
+		"  xor %%r15d, %%r15d;" /* f[1]*f[0] */
+		"  mulxq 48(%0), %%r9, %%r10;"
+		"  adcx %%r14, %%r9;" /* f[2]*f[0] */
+		"  mulxq 56(%0), %%rax, %%rcx;"
+		"  adcx %%rax, %%r10;" /* f[3]*f[0] */
+		"  movq 56(%0), %%rdx;" /* f[3] */
+		"  mulxq 40(%0), %%r11, %%rbx;"
+		"  adcx %%rcx, %%r11;" /* f[1]*f[3] */
+		"  mulxq 48(%0), %%rax, %%r13;"
+		"  adcx %%rax, %%rbx;" /* f[2]*f[3] */
+		"  movq 40(%0), %%rdx;"
+		"  adcx %%r15, %%r13;" /* f1 */
+		"  mulxq 48(%0), %%rax, %%rcx;"
+		"  mov $0, %%r14;" /* f[2]*f[1] */
+
+		/* Step 2: Compute two parallel carry chains */
+		"  xor %%r15d, %%r15d;"
+		"  adox %%rax, %%r10;"
+		"  adcx %%r8, %%r8;"
+		"  adox %%rcx, %%r11;"
+		"  adcx %%r9, %%r9;"
+		"  adox %%r15, %%rbx;"
+		"  adcx %%r10, %%r10;"
+		"  adox %%r15, %%r13;"
+		"  adcx %%r11, %%r11;"
+		"  adox %%r15, %%r14;"
+		"  adcx %%rbx, %%rbx;"
+		"  adcx %%r13, %%r13;"
+		"  adcx %%r14, %%r14;"
+
+		/* Step 3: Compute intermediate squares */
+		"  movq 32(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+		"  movq %%rax, 64(%1);"
+		"  add %%rcx, %%r8;"
+		"  movq %%r8, 72(%1);"
+		"  movq 40(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+		"  adcx %%rax, %%r9;"
+		"  movq %%r9, 80(%1);"
+		"  adcx %%rcx, %%r10;"
+		"  movq %%r10, 88(%1);"
+		"  movq 48(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+		"  adcx %%rax, %%r11;"
+		"  movq %%r11, 96(%1);"
+		"  adcx %%rcx, %%rbx;"
+		"  movq %%rbx, 104(%1);"
+		"  movq 56(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+		"  adcx %%rax, %%r13;"
+		"  movq %%r13, 112(%1);"
+		"  adcx %%rcx, %%r14;"
+		"  movq %%r14, 120(%1);"
+
+		/* Line up pointers */
+		"  mov %1, %0;"
+		"  mov %2, %1;"
+
+		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+		"  mov $38, %%rdx;"
+		"  mulxq 32(%0), %%r8, %%r13;"
+		"  xor %%ecx, %%ecx;"
+		"  adoxq 0(%0), %%r8;"
+		"  mulxq 40(%0), %%r9, %%rbx;"
+		"  adcx %%r13, %%r9;"
+		"  adoxq 8(%0), %%r9;"
+		"  mulxq 48(%0), %%r10, %%r13;"
+		"  adcx %%rbx, %%r10;"
+		"  adoxq 16(%0), %%r10;"
+		"  mulxq 56(%0), %%r11, %%rax;"
+		"  adcx %%r13, %%r11;"
+		"  adoxq 24(%0), %%r11;"
+		"  adcx %%rcx, %%rax;"
+		"  adox %%rcx, %%rax;"
+		"  imul %%rdx, %%rax;"
+
+		/* Step 2: Fold the carry back into dst */
+		"  add %%rax, %%r8;"
+		"  adcx %%rcx, %%r9;"
+		"  movq %%r9, 8(%1);"
+		"  adcx %%rcx, %%r10;"
+		"  movq %%r10, 16(%1);"
+		"  adcx %%rcx, %%r11;"
+		"  movq %%r11, 24(%1);"
+
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+		"  mov $0, %%rax;"
+		"  cmovc %%rdx, %%rax;"
+		"  add %%rax, %%r8;"
+		"  movq %%r8, 0(%1);"
+
+		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+		"  mov $38, %%rdx;"
+		"  mulxq 96(%0), %%r8, %%r13;"
+		"  xor %%ecx, %%ecx;"
+		"  adoxq 64(%0), %%r8;"
+		"  mulxq 104(%0), %%r9, %%rbx;"
+		"  adcx %%r13, %%r9;"
+		"  adoxq 72(%0), %%r9;"
+		"  mulxq 112(%0), %%r10, %%r13;"
+		"  adcx %%rbx, %%r10;"
+		"  adoxq 80(%0), %%r10;"
+		"  mulxq 120(%0), %%r11, %%rax;"
+		"  adcx %%r13, %%r11;"
+		"  adoxq 88(%0), %%r11;"
+		"  adcx %%rcx, %%rax;"
+		"  adox %%rcx, %%rax;"
+		"  imul %%rdx, %%rax;"
+
+		/* Step 2: Fold the carry back into dst */
+		"  add %%rax, %%r8;"
+		"  adcx %%rcx, %%r9;"
+		"  movq %%r9, 40(%1);"
+		"  adcx %%rcx, %%r10;"
+		"  movq %%r10, 48(%1);"
+		"  adcx %%rcx, %%r11;"
+		"  movq %%r11, 56(%1);"
+
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+		"  mov $0, %%rax;"
+		"  cmovc %%rdx, %%rax;"
+		"  add %%rax, %%r8;"
+		"  movq %%r8, 32(%1);"
+		: "+&r"(f), "+&r"(tmp)
+		: "r"(out)
+		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
+		  "%r13", "%r14", "%r15", "memory", "cc");
+}
+
+static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
+{
+	u64 *nq = p01_tmp1;
+	u64 *nq_p1 = p01_tmp1 + (u32)8U;
+	u64 *tmp1 = p01_tmp1 + (u32)16U;
+	u64 *x1 = q;
+	u64 *x2 = nq;
+	u64 *z2 = nq + (u32)4U;
+	u64 *z3 = nq_p1 + (u32)4U;
+	u64 *a = tmp1;
+	u64 *b = tmp1 + (u32)4U;
+	u64 *ab = tmp1;
+	u64 *dc = tmp1 + (u32)8U;
+	u64 *x3;
+	u64 *z31;
+	u64 *d0;
+	u64 *c0;
+	u64 *a1;
+	u64 *b1;
+	u64 *d;
+	u64 *c;
+	u64 *ab1;
+	u64 *dc1;
+	fadd(a, x2, z2);
+	fsub(b, x2, z2);
+	x3 = nq_p1;
+	z31 = nq_p1 + (u32)4U;
+	d0 = dc;
+	c0 = dc + (u32)4U;
+	fadd(c0, x3, z31);
+	fsub(d0, x3, z31);
+	fmul2(dc, dc, ab, tmp2);
+	fadd(x3, d0, c0);
+	fsub(z31, d0, c0);
+	a1 = tmp1;
+	b1 = tmp1 + (u32)4U;
+	d = tmp1 + (u32)8U;
+	c = tmp1 + (u32)12U;
+	ab1 = tmp1;
+	dc1 = tmp1 + (u32)8U;
+	fsqr2(dc1, ab1, tmp2);
+	fsqr2(nq_p1, nq_p1, tmp2);
+	a1[0U] = c[0U];
+	a1[1U] = c[1U];
+	a1[2U] = c[2U];
+	a1[3U] = c[3U];
+	fsub(c, d, c);
+	fmul_scalar(b1, c, (u64)121665U);
+	fadd(b1, b1, d);
+	fmul2(nq, dc1, ab1, tmp2);
+	fmul(z3, z3, x1, tmp2);
+}
+
+static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2)
+{
+	u64 *x2 = nq;
+	u64 *z2 = nq + (u32)4U;
+	u64 *a = tmp1;
+	u64 *b = tmp1 + (u32)4U;
+	u64 *d = tmp1 + (u32)8U;
+	u64 *c = tmp1 + (u32)12U;
+	u64 *ab = tmp1;
+	u64 *dc = tmp1 + (u32)8U;
+	fadd(a, x2, z2);
+	fsub(b, x2, z2);
+	fsqr2(dc, ab, tmp2);
+	a[0U] = c[0U];
+	a[1U] = c[1U];
+	a[2U] = c[2U];
+	a[3U] = c[3U];
+	fsub(c, d, c);
+	fmul_scalar(b, c, (u64)121665U);
+	fadd(b, b, d);
+	fmul2(nq, dc, ab, tmp2);
+}
+
+static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
+{
+	u64 tmp2[16U] = { 0U };
+	u64 p01_tmp1_swap[33U] = { 0U };
+	u64 *p0 = p01_tmp1_swap;
+	u64 *p01 = p01_tmp1_swap;
+	u64 *p03 = p01;
+	u64 *p11 = p01 + (u32)8U;
+	u64 *x0;
+	u64 *z0;
+	u64 *p01_tmp1;
+	u64 *p01_tmp11;
+	u64 *nq10;
+	u64 *nq_p11;
+	u64 *swap1;
+	u64 sw0;
+	u64 *nq1;
+	u64 *tmp1;
+	memcpy(p11, init1, (u32)8U * sizeof(init1[0U]));
+	x0 = p03;
+	z0 = p03 + (u32)4U;
+	x0[0U] = (u64)1U;
+	x0[1U] = (u64)0U;
+	x0[2U] = (u64)0U;
+	x0[3U] = (u64)0U;
+	z0[0U] = (u64)0U;
+	z0[1U] = (u64)0U;
+	z0[2U] = (u64)0U;
+	z0[3U] = (u64)0U;
+	p01_tmp1 = p01_tmp1_swap;
+	p01_tmp11 = p01_tmp1_swap;
+	nq10 = p01_tmp1_swap;
+	nq_p11 = p01_tmp1_swap + (u32)8U;
+	swap1 = p01_tmp1_swap + (u32)32U;
+	cswap2((u64)1U, nq10, nq_p11);
+	point_add_and_double(init1, p01_tmp11, tmp2);
+	swap1[0U] = (u64)1U;
+	{
+		u32 i;
+		for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) {
+			u64 *p01_tmp12 = p01_tmp1_swap;
+			u64 *swap2 = p01_tmp1_swap + (u32)32U;
+			u64 *nq2 = p01_tmp12;
+			u64 *nq_p12 = p01_tmp12 + (u32)8U;
+			u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U);
+			u64 sw = swap2[0U] ^ bit;
+			cswap2(sw, nq2, nq_p12);
+			point_add_and_double(init1, p01_tmp12, tmp2);
+			swap2[0U] = bit;
+		}
+	}
+	sw0 = swap1[0U];
+	cswap2(sw0, nq10, nq_p11);
+	nq1 = p01_tmp1;
+	tmp1 = p01_tmp1 + (u32)16U;
+	point_double(nq1, tmp1, tmp2);
+	point_double(nq1, tmp1, tmp2);
+	point_double(nq1, tmp1, tmp2);
+	memcpy(out, p0, (u32)8U * sizeof(p0[0U]));
+
+	memzero_explicit(tmp2, sizeof(tmp2));
+	memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap));
+}
+
+static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
+{
+	u32 i;
+	fsqr(o, inp, tmp);
+	for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U)
+		fsqr(o, o, tmp);
+}
+
+static void finv(u64 *o, const u64 *i, u64 *tmp)
+{
+	u64 t1[16U] = { 0U };
+	u64 *a0 = t1;
+	u64 *b = t1 + (u32)4U;
+	u64 *c = t1 + (u32)8U;
+	u64 *t00 = t1 + (u32)12U;
+	u64 *tmp1 = tmp;
+	u64 *a;
+	u64 *t0;
+	fsquare_times(a0, i, tmp1, (u32)1U);
+	fsquare_times(t00, a0, tmp1, (u32)2U);
+	fmul(b, t00, i, tmp);
+	fmul(a0, b, a0, tmp);
+	fsquare_times(t00, a0, tmp1, (u32)1U);
+	fmul(b, t00, b, tmp);
+	fsquare_times(t00, b, tmp1, (u32)5U);
+	fmul(b, t00, b, tmp);
+	fsquare_times(t00, b, tmp1, (u32)10U);
+	fmul(c, t00, b, tmp);
+	fsquare_times(t00, c, tmp1, (u32)20U);
+	fmul(t00, t00, c, tmp);
+	fsquare_times(t00, t00, tmp1, (u32)10U);
+	fmul(b, t00, b, tmp);
+	fsquare_times(t00, b, tmp1, (u32)50U);
+	fmul(c, t00, b, tmp);
+	fsquare_times(t00, c, tmp1, (u32)100U);
+	fmul(t00, t00, c, tmp);
+	fsquare_times(t00, t00, tmp1, (u32)50U);
+	fmul(t00, t00, b, tmp);
+	fsquare_times(t00, t00, tmp1, (u32)5U);
+	a = t1;
+	t0 = t1 + (u32)12U;
+	fmul(o, t0, a, tmp);
+}
+
+static void store_felem(u64 *b, u64 *f)
+{
+	u64 f30 = f[3U];
+	u64 top_bit0 = f30 >> (u32)63U;
+	u64 f31;
+	u64 top_bit;
+	u64 f0;
+	u64 f1;
+	u64 f2;
+	u64 f3;
+	u64 m0;
+	u64 m1;
+	u64 m2;
+	u64 m3;
+	u64 mask;
+	u64 f0_;
+	u64 f1_;
+	u64 f2_;
+	u64 f3_;
+	u64 o0;
+	u64 o1;
+	u64 o2;
+	u64 o3;
+	f[3U] = f30 & (u64)0x7fffffffffffffffU;
+	add_scalar(f, f, (u64)19U * top_bit0);
+	f31 = f[3U];
+	top_bit = f31 >> (u32)63U;
+	f[3U] = f31 & (u64)0x7fffffffffffffffU;
+	add_scalar(f, f, (u64)19U * top_bit);
+	f0 = f[0U];
+	f1 = f[1U];
+	f2 = f[2U];
+	f3 = f[3U];
+	m0 = gte_mask(f0, (u64)0xffffffffffffffedU);
+	m1 = eq_mask(f1, (u64)0xffffffffffffffffU);
+	m2 = eq_mask(f2, (u64)0xffffffffffffffffU);
+	m3 = eq_mask(f3, (u64)0x7fffffffffffffffU);
+	mask = ((m0 & m1) & m2) & m3;
+	f0_ = f0 - (mask & (u64)0xffffffffffffffedU);
+	f1_ = f1 - (mask & (u64)0xffffffffffffffffU);
+	f2_ = f2 - (mask & (u64)0xffffffffffffffffU);
+	f3_ = f3 - (mask & (u64)0x7fffffffffffffffU);
+	o0 = f0_;
+	o1 = f1_;
+	o2 = f2_;
+	o3 = f3_;
+	b[0U] = o0;
+	b[1U] = o1;
+	b[2U] = o2;
+	b[3U] = o3;
+}
+
+static void encode_point(u8 *o, const u64 *i)
+{
+	const u64 *x = i;
+	const u64 *z = i + (u32)4U;
+	u64 tmp[4U] = { 0U };
+	u64 tmp_w[16U] = { 0U };
+	finv(tmp, z, tmp_w);
+	fmul(tmp, tmp, x, tmp_w);
+	store_felem((u64 *)o, tmp);
+}
+
+static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub)
+{
+	u64 init1[8U] = { 0U };
+	u64 tmp[4U] = { 0U };
+	u64 tmp3;
+	u64 *x;
+	u64 *z;
+	{
+		u32 i;
+		for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) {
+			u64 *os = tmp;
+			const u8 *bj = pub + i * (u32)8U;
+			u64 u = *(u64 *)bj;
+			u64 r = u;
+			u64 x0 = r;
+			os[i] = x0;
+		}
+	}
+	tmp3 = tmp[3U];
+	tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU;
+	x = init1;
+	z = init1 + (u32)4U;
+	z[0U] = (u64)1U;
+	z[1U] = (u64)0U;
+	z[2U] = (u64)0U;
+	z[3U] = (u64)0U;
+	x[0U] = tmp[0U];
+	x[1U] = tmp[1U];
+	x[2U] = tmp[2U];
+	x[3U] = tmp[3U];
+	montgomery_ladder(init1, priv, init1);
+	encode_point(out, init1);
+}
+
+/* The below constants were generated using this sage script:
+ *
+ * #!/usr/bin/env sage
+ * import sys
+ * from sage.all import *
+ * def limbs(n):
+ * 	n = int(n)
+ * 	l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64)
+ * 	return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l
+ * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0])
+ * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0]
+ * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s))
+ * print("static const u64 table_ladder[] = {")
+ * p = ec.lift_x(9)
+ * for i in range(252):
+ * 	l = (p[0] + p[2]) / (p[0] - p[2])
+ * 	print(("\t%s" + ("," if i != 251 else "")) % limbs(l))
+ * 	p = p * 2
+ * print("};")
+ *
+ */
+
+static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL };
+
+static const u64 table_ladder[] = {
+	0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL,
+	0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL,
+	0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL,
+	0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL,
+	0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL,
+	0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL,
+	0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL,
+	0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL,
+	0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL,
+	0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL,
+	0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL,
+	0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL,
+	0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL,
+	0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL,
+	0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL,
+	0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL,
+	0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL,
+	0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL,
+	0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL,
+	0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL,
+	0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL,
+	0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL,
+	0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL,
+	0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL,
+	0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL,
+	0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL,
+	0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL,
+	0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL,
+	0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL,
+	0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL,
+	0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL,
+	0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL,
+	0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL,
+	0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL,
+	0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL,
+	0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL,
+	0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL,
+	0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL,
+	0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL,
+	0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL,
+	0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL,
+	0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL,
+	0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL,
+	0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL,
+	0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL,
+	0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL,
+	0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL,
+	0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL,
+	0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL,
+	0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL,
+	0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL,
+	0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL,
+	0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL,
+	0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL,
+	0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL,
+	0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL,
+	0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL,
+	0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL,
+	0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL,
+	0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL,
+	0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL,
+	0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL,
+	0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL,
+	0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL,
+	0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL,
+	0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL,
+	0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL,
+	0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL,
+	0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL,
+	0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL,
+	0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL,
+	0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL,
+	0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL,
+	0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL,
+	0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL,
+	0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL,
+	0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL,
+	0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL,
+	0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL,
+	0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL,
+	0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL,
+	0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL,
+	0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL,
+	0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL,
+	0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL,
+	0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL,
+	0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL,
+	0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL,
+	0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL,
+	0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL,
+	0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL,
+	0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL,
+	0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL,
+	0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL,
+	0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL,
+	0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL,
+	0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL,
+	0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL,
+	0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL,
+	0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL,
+	0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL,
+	0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL,
+	0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL,
+	0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL,
+	0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL,
+	0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL,
+	0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL,
+	0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL,
+	0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL,
+	0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL,
+	0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL,
+	0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL,
+	0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL,
+	0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL,
+	0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL,
+	0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL,
+	0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL,
+	0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL,
+	0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL,
+	0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL,
+	0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL,
+	0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL,
+	0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL,
+	0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL,
+	0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL,
+	0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL,
+	0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL,
+	0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL,
+	0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL,
+	0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL,
+	0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL,
+	0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL,
+	0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL,
+	0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL,
+	0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL,
+	0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL,
+	0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL,
+	0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL,
+	0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL,
+	0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL,
+	0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL,
+	0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL,
+	0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL,
+	0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL,
+	0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL,
+	0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL,
+	0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL,
+	0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL,
+	0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL,
+	0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL,
+	0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL,
+	0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL,
+	0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL,
+	0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL,
+	0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL,
+	0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL,
+	0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL,
+	0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL,
+	0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL,
+	0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL,
+	0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL,
+	0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL,
+	0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL,
+	0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL,
+	0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL,
+	0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL,
+	0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL,
+	0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL,
+	0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL,
+	0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL,
+	0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL,
+	0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL,
+	0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL,
+	0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL,
+	0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL,
+	0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL,
+	0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL,
+	0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL,
+	0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL,
+	0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL,
+	0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL,
+	0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL,
+	0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL,
+	0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL,
+	0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL,
+	0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL,
+	0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL,
+	0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL,
+	0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL,
+	0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL,
+	0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL,
+	0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL,
+	0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL,
+	0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL,
+	0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL,
+	0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL,
+	0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL,
+	0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL,
+	0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL,
+	0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL,
+	0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL,
+	0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL,
+	0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL,
+	0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL,
+	0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL,
+	0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL,
+	0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL,
+	0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL,
+	0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL,
+	0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL,
+	0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL,
+	0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL,
+	0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL,
+	0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL,
+	0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL,
+	0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL,
+	0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL,
+	0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL,
+	0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL,
+	0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL,
+	0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL,
+	0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL,
+	0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL,
+	0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL,
+	0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL,
+	0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL,
+	0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL,
+	0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL,
+	0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL,
+	0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL,
+	0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL,
+	0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL,
+	0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL,
+	0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL,
+	0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL,
+	0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL,
+	0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL,
+	0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL,
+	0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL,
+	0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL,
+	0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL,
+	0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL,
+	0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL,
+	0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL,
+	0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL,
+	0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL,
+	0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL,
+	0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL,
+	0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL,
+	0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL,
+	0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL,
+	0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL
+};
+
+static void curve25519_ever64_base(u8 *out, const u8 *priv)
+{
+	u64 swap = 1;
+	int i, j, k;
+	u64 tmp[16 + 32 + 4];
+	u64 *x1 = &tmp[0];
+	u64 *z1 = &tmp[4];
+	u64 *x2 = &tmp[8];
+	u64 *z2 = &tmp[12];
+	u64 *xz1 = &tmp[0];
+	u64 *xz2 = &tmp[8];
+	u64 *a = &tmp[0 + 16];
+	u64 *b = &tmp[4 + 16];
+	u64 *c = &tmp[8 + 16];
+	u64 *ab = &tmp[0 + 16];
+	u64 *abcd = &tmp[0 + 16];
+	u64 *ef = &tmp[16 + 16];
+	u64 *efgh = &tmp[16 + 16];
+	u64 *key = &tmp[0 + 16 + 32];
+
+	memcpy(key, priv, 32);
+	((u8 *)key)[0] &= 248;
+	((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64;
+
+	x1[0] = 1, x1[1] = x1[2] = x1[3] = 0;
+	z1[0] = 1, z1[1] = z1[2] = z1[3] = 0;
+	z2[0] = 1, z2[1] = z2[2] = z2[3] = 0;
+	memcpy(x2, p_minus_s, sizeof(p_minus_s));
+
+	j = 3;
+	for (i = 0; i < 4; ++i) {
+		while (j < (const int[]){ 64, 64, 64, 63 }[i]) {
+			u64 bit = (key[i] >> j) & 1;
+			k = (64 * i + j - 3);
+			swap = swap ^ bit;
+			cswap2(swap, xz1, xz2);
+			swap = bit;
+			fsub(b, x1, z1);
+			fadd(a, x1, z1);
+			fmul(c, &table_ladder[4 * k], b, ef);
+			fsub(b, a, c);
+			fadd(a, a, c);
+			fsqr2(ab, ab, efgh);
+			fmul2(xz1, xz2, ab, efgh);
+			++j;
+		}
+		j = 0;
+	}
+
+	point_double(xz1, abcd, efgh);
+	point_double(xz1, abcd, efgh);
+	point_double(xz1, abcd, efgh);
+	encode_point(out, xz1);
+
+	memzero_explicit(tmp, sizeof(tmp));
+}
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx);
+
+static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
+			    const u8 secret[CURVE25519_KEY_SIZE],
+			    const u8 basepoint[CURVE25519_KEY_SIZE])
+{
+	if (static_branch_likely(&curve25519_use_bmi2_adx))
+		curve25519_ever64(mypublic, secret, basepoint);
+	else
+		curve25519_generic(mypublic, secret, basepoint);
+}
+
+static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+				 const u8 secret[CURVE25519_KEY_SIZE])
+{
+	if (static_branch_likely(&curve25519_use_bmi2_adx))
+		curve25519_ever64_base(pub, secret);
+	else
+		curve25519_generic(pub, secret, curve25519_base_point);
+}
+
+#define curve25519_mod_init_arch curve25519_mod_init_arch
+static void curve25519_mod_init_arch(void)
+{
+	if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
+		static_branch_enable(&curve25519_use_bmi2_adx);
+}
diff --git a/lib/crypto/x86/poly1305-x86_64-cryptogams.pl b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl
new file mode 100644
index 000000000000..409ec6955733
--- /dev/null
+++ b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl
@@ -0,0 +1,4240 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+#
+# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+#
+# This code is taken from the OpenSSL project but the author, Andy Polyakov,
+# has relicensed it under the licenses specified in the SPDX header above.
+# The original headers, including the original license headers, are
+# included below for completeness.
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements Poly1305 hash for x86_64.
+#
+# March 2015
+#
+# Initial release.
+#
+# December 2016
+#
+# Add AVX512F+VL+BW code path.
+#
+# November 2017
+#
+# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
+# executed even on Knights Landing. Trigger for modification was
+# observation that AVX512 code paths can negatively affect overall
+# Skylake-X system performance. Since we are likely to suppress
+# AVX512F capability flag [at least on Skylake-X], conversion serves
+# as kind of "investment protection". Note that next *lake processor,
+# Cannonlake, has AVX512IFMA code path to execute...
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone,
+# measured with rdtsc at fixed clock frequency.
+#
+#		IALU/gcc-4.8(*)	AVX(**)		AVX2	AVX-512
+# P4		4.46/+120%	-
+# Core 2	2.41/+90%	-
+# Westmere	1.88/+120%	-
+# Sandy Bridge	1.39/+140%	1.10
+# Haswell	1.14/+175%	1.11		0.65
+# Skylake[-X]	1.13/+120%	0.96		0.51	[0.35]
+# Silvermont	2.83/+95%	-
+# Knights L	3.60/?		1.65		1.10	0.41(***)
+# Goldmont	1.70/+180%	-
+# VIA Nano	1.82/+150%	-
+# Sledgehammer	1.38/+160%	-
+# Bulldozer	2.30/+130%	0.97
+# Ryzen		1.15/+200%	1.08		1.18
+#
+# (*)	improvement coefficients relative to clang are more modest and
+#	are ~50% on most processors, in both cases we are comparing to
+#	__int128 code;
+# (**)	SSE2 implementation was attempted, but among non-AVX processors
+#	it was faster than integer-only code only on older Intel P4 and
+#	Core processors, 50-30%, less newer processor is, but slower on
+#	contemporary ones, for example almost 2x slower on Atom, and as
+#	former are naturally disappearing, SSE2 is deemed unnecessary;
+# (***)	strangely enough performance seems to vary from core to core,
+#	listed result is best case;
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+$kernel=0; $kernel=1 if (!$flavour && !$output);
+
+if (!$kernel) {
+	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+	die "can't locate x86_64-xlate.pl";
+
+	open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+	*STDOUT=*OUT;
+
+	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+	    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+		$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
+	}
+
+	if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
+		$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
+		$avx += 1 if ($1==2.11 && $2>=8);
+	}
+
+	if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+		$avx = ($1>=10) + ($1>=11);
+	}
+
+	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
+		$avx = ($2>=3.0) + ($2>3.0);
+	}
+} else {
+	$avx = 4; # The kernel uses ifdefs for this.
+}
+
+sub declare_function() {
+	my ($name, $align, $nargs) = @_;
+	if($kernel) {
+		$code .= "SYM_FUNC_START($name)\n";
+		$code .= ".L$name:\n";
+	} else {
+		$code .= ".globl	$name\n";
+		$code .= ".type	$name,\@function,$nargs\n";
+		$code .= ".align	$align\n";
+		$code .= "$name:\n";
+	}
+}
+
+sub end_function() {
+	my ($name) = @_;
+	if($kernel) {
+		$code .= "SYM_FUNC_END($name)\n";
+	} else {
+		$code .= ".size   $name,.-$name\n";
+	}
+}
+
+$code.=<<___ if $kernel;
+#include <linux/linkage.h>
+___
+
+if ($avx) {
+$code.=<<___ if $kernel;
+.section .rodata
+___
+$code.=<<___;
+.align	64
+.Lconst:
+.Lmask24:
+.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
+.L129:
+.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
+.Lmask26:
+.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
+.Lpermd_avx2:
+.long	2,2,2,3,2,0,2,1
+.Lpermd_avx512:
+.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
+
+.L2_44_inp_permd:
+.long	0,1,1,2,2,3,7,7
+.L2_44_inp_shift:
+.quad	0,12,24,64
+.L2_44_mask:
+.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
+.L2_44_shift_rgt:
+.quad	44,44,42,64
+.L2_44_shift_lft:
+.quad	8,8,10,64
+
+.align	64
+.Lx_mask44:
+.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.Lx_mask42:
+.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+___
+}
+$code.=<<___ if (!$kernel);
+.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	16
+___
+
+my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
+my ($mac,$nonce)=($inp,$len);	# *_emit arguments
+my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
+my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
+
+sub poly1305_iteration {
+# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
+# output:	$h0-$h2 *= $r0-$r1
+$code.=<<___;
+	mulq	$h0			# h0*r1
+	mov	%rax,$d2
+	 mov	$r0,%rax
+	mov	%rdx,$d3
+
+	mulq	$h0			# h0*r0
+	mov	%rax,$h0		# future $h0
+	 mov	$r0,%rax
+	mov	%rdx,$d1
+
+	mulq	$h1			# h1*r0
+	add	%rax,$d2
+	 mov	$s1,%rax
+	adc	%rdx,$d3
+
+	mulq	$h1			# h1*s1
+	 mov	$h2,$h1			# borrow $h1
+	add	%rax,$h0
+	adc	%rdx,$d1
+
+	imulq	$s1,$h1			# h2*s1
+	add	$h1,$d2
+	 mov	$d1,$h1
+	adc	\$0,$d3
+
+	imulq	$r0,$h2			# h2*r0
+	add	$d2,$h1
+	mov	\$-4,%rax		# mask value
+	adc	$h2,$d3
+
+	and	$d3,%rax		# last reduction step
+	mov	$d3,$h2
+	shr	\$2,$d3
+	and	\$3,$h2
+	add	$d3,%rax
+	add	%rax,$h0
+	adc	\$0,$h1
+	adc	\$0,$h2
+___
+}
+
+########################################################################
+# Layout of opaque area is following.
+#
+#	unsigned __int64 h[3];		# current hash value base 2^64
+#	unsigned __int64 r[2];		# key value base 2^64
+
+$code.=<<___;
+.text
+___
+$code.=<<___ if (!$kernel);
+.extern	OPENSSL_ia32cap_P
+
+.globl	poly1305_init_x86_64
+.hidden	poly1305_init_x86_64
+.globl	poly1305_blocks_x86_64
+.hidden	poly1305_blocks_x86_64
+.globl	poly1305_emit_x86_64
+.hidden	poly1305_emit_x86_64
+___
+&declare_function("poly1305_init_x86_64", 32, 3);
+$code.=<<___;
+	xor	%eax,%eax
+	mov	%rax,0($ctx)		# initialize hash value
+	mov	%rax,8($ctx)
+	mov	%rax,16($ctx)
+
+	test	$inp,$inp
+	je	.Lno_key
+___
+$code.=<<___ if (!$kernel);
+	lea	poly1305_blocks_x86_64(%rip),%r10
+	lea	poly1305_emit_x86_64(%rip),%r11
+___
+$code.=<<___	if (!$kernel && $avx);
+	mov	OPENSSL_ia32cap_P+4(%rip),%r9
+	lea	poly1305_blocks_avx(%rip),%rax
+	lea	poly1305_emit_avx(%rip),%rcx
+	bt	\$`60-32`,%r9		# AVX?
+	cmovc	%rax,%r10
+	cmovc	%rcx,%r11
+___
+$code.=<<___	if (!$kernel && $avx>1);
+	lea	poly1305_blocks_avx2(%rip),%rax
+	bt	\$`5+32`,%r9		# AVX2?
+	cmovc	%rax,%r10
+___
+$code.=<<___	if (!$kernel && $avx>3);
+	mov	\$`(1<<31|1<<21|1<<16)`,%rax
+	shr	\$32,%r9
+	and	%rax,%r9
+	cmp	%rax,%r9
+	je	.Linit_base2_44
+___
+$code.=<<___;
+	mov	\$0x0ffffffc0fffffff,%rax
+	mov	\$0x0ffffffc0ffffffc,%rcx
+	and	0($inp),%rax
+	and	8($inp),%rcx
+	mov	%rax,24($ctx)
+	mov	%rcx,32($ctx)
+___
+$code.=<<___	if (!$kernel && $flavour !~ /elf32/);
+	mov	%r10,0(%rdx)
+	mov	%r11,8(%rdx)
+___
+$code.=<<___	if (!$kernel && $flavour =~ /elf32/);
+	mov	%r10d,0(%rdx)
+	mov	%r11d,4(%rdx)
+___
+$code.=<<___;
+	mov	\$1,%eax
+.Lno_key:
+	RET
+___
+&end_function("poly1305_init_x86_64");
+
+&declare_function("poly1305_blocks_x86_64", 32, 4);
+$code.=<<___;
+.cfi_startproc
+.Lblocks:
+	shr	\$4,$len
+	jz	.Lno_data		# too short
+
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+	push	$ctx
+.cfi_push	$ctx
+.Lblocks_body:
+
+	mov	$len,%r15		# reassign $len
+
+	mov	24($ctx),$r0		# load r
+	mov	32($ctx),$s1
+
+	mov	0($ctx),$h0		# load hash value
+	mov	8($ctx),$h1
+	mov	16($ctx),$h2
+
+	mov	$s1,$r1
+	shr	\$2,$s1
+	mov	$r1,%rax
+	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
+	jmp	.Loop
+
+.align	32
+.Loop:
+	add	0($inp),$h0		# accumulate input
+	adc	8($inp),$h1
+	lea	16($inp),$inp
+	adc	$padbit,$h2
+___
+
+	&poly1305_iteration();
+
+$code.=<<___;
+	mov	$r1,%rax
+	dec	%r15			# len-=16
+	jnz	.Loop
+
+	mov	0(%rsp),$ctx
+.cfi_restore	$ctx
+
+	mov	$h0,0($ctx)		# store hash value
+	mov	$h1,8($ctx)
+	mov	$h2,16($ctx)
+
+	mov	8(%rsp),%r15
+.cfi_restore	%r15
+	mov	16(%rsp),%r14
+.cfi_restore	%r14
+	mov	24(%rsp),%r13
+.cfi_restore	%r13
+	mov	32(%rsp),%r12
+.cfi_restore	%r12
+	mov	40(%rsp),%rbx
+.cfi_restore	%rbx
+	lea	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lno_data:
+.Lblocks_epilogue:
+	RET
+.cfi_endproc
+___
+&end_function("poly1305_blocks_x86_64");
+
+&declare_function("poly1305_emit_x86_64", 32, 3);
+$code.=<<___;
+.Lemit:
+	mov	0($ctx),%r8	# load hash value
+	mov	8($ctx),%r9
+	mov	16($ctx),%r10
+
+	mov	%r8,%rax
+	add	\$5,%r8		# compare to modulus
+	mov	%r9,%rcx
+	adc	\$0,%r9
+	adc	\$0,%r10
+	shr	\$2,%r10	# did 130-bit value overflow?
+	cmovnz	%r8,%rax
+	cmovnz	%r9,%rcx
+
+	add	0($nonce),%rax	# accumulate nonce
+	adc	8($nonce),%rcx
+	mov	%rax,0($mac)	# write result
+	mov	%rcx,8($mac)
+
+	RET
+___
+&end_function("poly1305_emit_x86_64");
+if ($avx) {
+
+########################################################################
+# Layout of opaque area is following.
+#
+#	unsigned __int32 h[5];		# current hash value base 2^26
+#	unsigned __int32 is_base2_26;
+#	unsigned __int64 r[2];		# key value base 2^64
+#	unsigned __int64 pad;
+#	struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
+#
+# where r^n are base 2^26 digits of degrees of multiplier key. There are
+# 5 digits, but last four are interleaved with multiples of 5, totalling
+# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
+
+my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
+    map("%xmm$_",(0..15));
+
+$code.=<<___;
+.type	__poly1305_block,\@abi-omnipotent
+.align	32
+__poly1305_block:
+	push $ctx
+___
+	&poly1305_iteration();
+$code.=<<___;
+	pop $ctx
+	RET
+.size	__poly1305_block,.-__poly1305_block
+
+.type	__poly1305_init_avx,\@abi-omnipotent
+.align	32
+__poly1305_init_avx:
+	push %rbp
+	mov %rsp,%rbp
+	mov	$r0,$h0
+	mov	$r1,$h1
+	xor	$h2,$h2
+
+	lea	48+64($ctx),$ctx	# size optimization
+
+	mov	$r1,%rax
+	call	__poly1305_block	# r^2
+
+	mov	\$0x3ffffff,%eax	# save interleaved r^2 and r base 2^26
+	mov	\$0x3ffffff,%edx
+	mov	$h0,$d1
+	and	$h0#d,%eax
+	mov	$r0,$d2
+	and	$r0#d,%edx
+	mov	%eax,`16*0+0-64`($ctx)
+	shr	\$26,$d1
+	mov	%edx,`16*0+4-64`($ctx)
+	shr	\$26,$d2
+
+	mov	\$0x3ffffff,%eax
+	mov	\$0x3ffffff,%edx
+	and	$d1#d,%eax
+	and	$d2#d,%edx
+	mov	%eax,`16*1+0-64`($ctx)
+	lea	(%rax,%rax,4),%eax	# *5
+	mov	%edx,`16*1+4-64`($ctx)
+	lea	(%rdx,%rdx,4),%edx	# *5
+	mov	%eax,`16*2+0-64`($ctx)
+	shr	\$26,$d1
+	mov	%edx,`16*2+4-64`($ctx)
+	shr	\$26,$d2
+
+	mov	$h1,%rax
+	mov	$r1,%rdx
+	shl	\$12,%rax
+	shl	\$12,%rdx
+	or	$d1,%rax
+	or	$d2,%rdx
+	and	\$0x3ffffff,%eax
+	and	\$0x3ffffff,%edx
+	mov	%eax,`16*3+0-64`($ctx)
+	lea	(%rax,%rax,4),%eax	# *5
+	mov	%edx,`16*3+4-64`($ctx)
+	lea	(%rdx,%rdx,4),%edx	# *5
+	mov	%eax,`16*4+0-64`($ctx)
+	mov	$h1,$d1
+	mov	%edx,`16*4+4-64`($ctx)
+	mov	$r1,$d2
+
+	mov	\$0x3ffffff,%eax
+	mov	\$0x3ffffff,%edx
+	shr	\$14,$d1
+	shr	\$14,$d2
+	and	$d1#d,%eax
+	and	$d2#d,%edx
+	mov	%eax,`16*5+0-64`($ctx)
+	lea	(%rax,%rax,4),%eax	# *5
+	mov	%edx,`16*5+4-64`($ctx)
+	lea	(%rdx,%rdx,4),%edx	# *5
+	mov	%eax,`16*6+0-64`($ctx)
+	shr	\$26,$d1
+	mov	%edx,`16*6+4-64`($ctx)
+	shr	\$26,$d2
+
+	mov	$h2,%rax
+	shl	\$24,%rax
+	or	%rax,$d1
+	mov	$d1#d,`16*7+0-64`($ctx)
+	lea	($d1,$d1,4),$d1		# *5
+	mov	$d2#d,`16*7+4-64`($ctx)
+	lea	($d2,$d2,4),$d2		# *5
+	mov	$d1#d,`16*8+0-64`($ctx)
+	mov	$d2#d,`16*8+4-64`($ctx)
+
+	mov	$r1,%rax
+	call	__poly1305_block	# r^3
+
+	mov	\$0x3ffffff,%eax	# save r^3 base 2^26
+	mov	$h0,$d1
+	and	$h0#d,%eax
+	shr	\$26,$d1
+	mov	%eax,`16*0+12-64`($ctx)
+
+	mov	\$0x3ffffff,%edx
+	and	$d1#d,%edx
+	mov	%edx,`16*1+12-64`($ctx)
+	lea	(%rdx,%rdx,4),%edx	# *5
+	shr	\$26,$d1
+	mov	%edx,`16*2+12-64`($ctx)
+
+	mov	$h1,%rax
+	shl	\$12,%rax
+	or	$d1,%rax
+	and	\$0x3ffffff,%eax
+	mov	%eax,`16*3+12-64`($ctx)
+	lea	(%rax,%rax,4),%eax	# *5
+	mov	$h1,$d1
+	mov	%eax,`16*4+12-64`($ctx)
+
+	mov	\$0x3ffffff,%edx
+	shr	\$14,$d1
+	and	$d1#d,%edx
+	mov	%edx,`16*5+12-64`($ctx)
+	lea	(%rdx,%rdx,4),%edx	# *5
+	shr	\$26,$d1
+	mov	%edx,`16*6+12-64`($ctx)
+
+	mov	$h2,%rax
+	shl	\$24,%rax
+	or	%rax,$d1
+	mov	$d1#d,`16*7+12-64`($ctx)
+	lea	($d1,$d1,4),$d1		# *5
+	mov	$d1#d,`16*8+12-64`($ctx)
+
+	mov	$r1,%rax
+	call	__poly1305_block	# r^4
+
+	mov	\$0x3ffffff,%eax	# save r^4 base 2^26
+	mov	$h0,$d1
+	and	$h0#d,%eax
+	shr	\$26,$d1
+	mov	%eax,`16*0+8-64`($ctx)
+
+	mov	\$0x3ffffff,%edx
+	and	$d1#d,%edx
+	mov	%edx,`16*1+8-64`($ctx)
+	lea	(%rdx,%rdx,4),%edx	# *5
+	shr	\$26,$d1
+	mov	%edx,`16*2+8-64`($ctx)
+
+	mov	$h1,%rax
+	shl	\$12,%rax
+	or	$d1,%rax
+	and	\$0x3ffffff,%eax
+	mov	%eax,`16*3+8-64`($ctx)
+	lea	(%rax,%rax,4),%eax	# *5
+	mov	$h1,$d1
+	mov	%eax,`16*4+8-64`($ctx)
+
+	mov	\$0x3ffffff,%edx
+	shr	\$14,$d1
+	and	$d1#d,%edx
+	mov	%edx,`16*5+8-64`($ctx)
+	lea	(%rdx,%rdx,4),%edx	# *5
+	shr	\$26,$d1
+	mov	%edx,`16*6+8-64`($ctx)
+
+	mov	$h2,%rax
+	shl	\$24,%rax
+	or	%rax,$d1
+	mov	$d1#d,`16*7+8-64`($ctx)
+	lea	($d1,$d1,4),$d1		# *5
+	mov	$d1#d,`16*8+8-64`($ctx)
+
+	lea	-48-64($ctx),$ctx	# size [de-]optimization
+	pop %rbp
+	RET
+.size	__poly1305_init_avx,.-__poly1305_init_avx
+___
+
+&declare_function("poly1305_blocks_avx", 32, 4);
+$code.=<<___;
+.cfi_startproc
+	mov	20($ctx),%r8d		# is_base2_26
+	cmp	\$128,$len
+	jae	.Lblocks_avx
+	test	%r8d,%r8d
+	jz	.Lblocks
+
+.Lblocks_avx:
+	and	\$-16,$len
+	jz	.Lno_data_avx
+
+	vzeroupper
+
+	test	%r8d,%r8d
+	jz	.Lbase2_64_avx
+
+	test	\$31,$len
+	jz	.Leven_avx
+
+	push	%rbp
+.cfi_push	%rbp
+	mov 	%rsp,%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+.Lblocks_avx_body:
+
+	mov	$len,%r15		# reassign $len
+
+	mov	0($ctx),$d1		# load hash value
+	mov	8($ctx),$d2
+	mov	16($ctx),$h2#d
+
+	mov	24($ctx),$r0		# load r
+	mov	32($ctx),$s1
+
+	################################# base 2^26 -> base 2^64
+	mov	$d1#d,$h0#d
+	and	\$`-1*(1<<31)`,$d1
+	mov	$d2,$r1			# borrow $r1
+	mov	$d2#d,$h1#d
+	and	\$`-1*(1<<31)`,$d2
+
+	shr	\$6,$d1
+	shl	\$52,$r1
+	add	$d1,$h0
+	shr	\$12,$h1
+	shr	\$18,$d2
+	add	$r1,$h0
+	adc	$d2,$h1
+
+	mov	$h2,$d1
+	shl	\$40,$d1
+	shr	\$24,$h2
+	add	$d1,$h1
+	adc	\$0,$h2			# can be partially reduced...
+
+	mov	\$-4,$d2		# ... so reduce
+	mov	$h2,$d1
+	and	$h2,$d2
+	shr	\$2,$d1
+	and	\$3,$h2
+	add	$d2,$d1			# =*5
+	add	$d1,$h0
+	adc	\$0,$h1
+	adc	\$0,$h2
+
+	mov	$s1,$r1
+	mov	$s1,%rax
+	shr	\$2,$s1
+	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
+
+	add	0($inp),$h0		# accumulate input
+	adc	8($inp),$h1
+	lea	16($inp),$inp
+	adc	$padbit,$h2
+
+	call	__poly1305_block
+
+	test	$padbit,$padbit		# if $padbit is zero,
+	jz	.Lstore_base2_64_avx	# store hash in base 2^64 format
+
+	################################# base 2^64 -> base 2^26
+	mov	$h0,%rax
+	mov	$h0,%rdx
+	shr	\$52,$h0
+	mov	$h1,$r0
+	mov	$h1,$r1
+	shr	\$26,%rdx
+	and	\$0x3ffffff,%rax	# h[0]
+	shl	\$12,$r0
+	and	\$0x3ffffff,%rdx	# h[1]
+	shr	\$14,$h1
+	or	$r0,$h0
+	shl	\$24,$h2
+	and	\$0x3ffffff,$h0		# h[2]
+	shr	\$40,$r1
+	and	\$0x3ffffff,$h1		# h[3]
+	or	$r1,$h2			# h[4]
+
+	sub	\$16,%r15
+	jz	.Lstore_base2_26_avx
+
+	vmovd	%rax#d,$H0
+	vmovd	%rdx#d,$H1
+	vmovd	$h0#d,$H2
+	vmovd	$h1#d,$H3
+	vmovd	$h2#d,$H4
+	jmp	.Lproceed_avx
+
+.align	32
+.Lstore_base2_64_avx:
+	mov	$h0,0($ctx)
+	mov	$h1,8($ctx)
+	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
+	jmp	.Ldone_avx
+
+.align	16
+.Lstore_base2_26_avx:
+	mov	%rax#d,0($ctx)		# store hash value base 2^26
+	mov	%rdx#d,4($ctx)
+	mov	$h0#d,8($ctx)
+	mov	$h1#d,12($ctx)
+	mov	$h2#d,16($ctx)
+.align	16
+.Ldone_avx:
+	pop 		%r15
+.cfi_restore	%r15
+	pop 		%r14
+.cfi_restore	%r14
+	pop 		%r13
+.cfi_restore	%r13
+	pop 		%r12
+.cfi_restore	%r12
+	pop 		%rbx
+.cfi_restore	%rbx
+	pop 		%rbp
+.cfi_restore	%rbp
+.Lno_data_avx:
+.Lblocks_avx_epilogue:
+	RET
+.cfi_endproc
+
+.align	32
+.Lbase2_64_avx:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	mov 	%rsp,%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+.Lbase2_64_avx_body:
+
+	mov	$len,%r15		# reassign $len
+
+	mov	24($ctx),$r0		# load r
+	mov	32($ctx),$s1
+
+	mov	0($ctx),$h0		# load hash value
+	mov	8($ctx),$h1
+	mov	16($ctx),$h2#d
+
+	mov	$s1,$r1
+	mov	$s1,%rax
+	shr	\$2,$s1
+	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
+
+	test	\$31,$len
+	jz	.Linit_avx
+
+	add	0($inp),$h0		# accumulate input
+	adc	8($inp),$h1
+	lea	16($inp),$inp
+	adc	$padbit,$h2
+	sub	\$16,%r15
+
+	call	__poly1305_block
+
+.Linit_avx:
+	################################# base 2^64 -> base 2^26
+	mov	$h0,%rax
+	mov	$h0,%rdx
+	shr	\$52,$h0
+	mov	$h1,$d1
+	mov	$h1,$d2
+	shr	\$26,%rdx
+	and	\$0x3ffffff,%rax	# h[0]
+	shl	\$12,$d1
+	and	\$0x3ffffff,%rdx	# h[1]
+	shr	\$14,$h1
+	or	$d1,$h0
+	shl	\$24,$h2
+	and	\$0x3ffffff,$h0		# h[2]
+	shr	\$40,$d2
+	and	\$0x3ffffff,$h1		# h[3]
+	or	$d2,$h2			# h[4]
+
+	vmovd	%rax#d,$H0
+	vmovd	%rdx#d,$H1
+	vmovd	$h0#d,$H2
+	vmovd	$h1#d,$H3
+	vmovd	$h2#d,$H4
+	movl	\$1,20($ctx)		# set is_base2_26
+
+	call	__poly1305_init_avx
+
+.Lproceed_avx:
+	mov	%r15,$len
+	pop 		%r15
+.cfi_restore	%r15
+	pop 		%r14
+.cfi_restore	%r14
+	pop 		%r13
+.cfi_restore	%r13
+	pop 		%r12
+.cfi_restore	%r12
+	pop 		%rbx
+.cfi_restore	%rbx
+	pop 		%rbp
+.cfi_restore	%rbp
+.Lbase2_64_avx_epilogue:
+	jmp	.Ldo_avx
+.cfi_endproc
+
+.align	32
+.Leven_avx:
+.cfi_startproc
+	vmovd		4*0($ctx),$H0		# load hash value
+	vmovd		4*1($ctx),$H1
+	vmovd		4*2($ctx),$H2
+	vmovd		4*3($ctx),$H3
+	vmovd		4*4($ctx),$H4
+
+.Ldo_avx:
+___
+$code.=<<___	if (!$win64);
+	lea		8(%rsp),%r10
+.cfi_def_cfa_register	%r10
+	and		\$-32,%rsp
+	sub		\$-8,%rsp
+	lea		-0x58(%rsp),%r11
+	sub		\$0x178,%rsp
+___
+$code.=<<___	if ($win64);
+	lea		-0xf8(%rsp),%r11
+	sub		\$0x218,%rsp
+	vmovdqa		%xmm6,0x50(%r11)
+	vmovdqa		%xmm7,0x60(%r11)
+	vmovdqa		%xmm8,0x70(%r11)
+	vmovdqa		%xmm9,0x80(%r11)
+	vmovdqa		%xmm10,0x90(%r11)
+	vmovdqa		%xmm11,0xa0(%r11)
+	vmovdqa		%xmm12,0xb0(%r11)
+	vmovdqa		%xmm13,0xc0(%r11)
+	vmovdqa		%xmm14,0xd0(%r11)
+	vmovdqa		%xmm15,0xe0(%r11)
+.Ldo_avx_body:
+___
+$code.=<<___;
+	sub		\$64,$len
+	lea		-32($inp),%rax
+	cmovc		%rax,$inp
+
+	vmovdqu		`16*3`($ctx),$D4	# preload r0^2
+	lea		`16*3+64`($ctx),$ctx	# size optimization
+	lea		.Lconst(%rip),%rcx
+
+	################################################################
+	# load input
+	vmovdqu		16*2($inp),$T0
+	vmovdqu		16*3($inp),$T1
+	vmovdqa		64(%rcx),$MASK		# .Lmask26
+
+	vpsrldq		\$6,$T0,$T2		# splat input
+	vpsrldq		\$6,$T1,$T3
+	vpunpckhqdq	$T1,$T0,$T4		# 4
+	vpunpcklqdq	$T1,$T0,$T0		# 0:1
+	vpunpcklqdq	$T3,$T2,$T3		# 2:3
+
+	vpsrlq		\$40,$T4,$T4		# 4
+	vpsrlq		\$26,$T0,$T1
+	vpand		$MASK,$T0,$T0		# 0
+	vpsrlq		\$4,$T3,$T2
+	vpand		$MASK,$T1,$T1		# 1
+	vpsrlq		\$30,$T3,$T3
+	vpand		$MASK,$T2,$T2		# 2
+	vpand		$MASK,$T3,$T3		# 3
+	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
+
+	jbe		.Lskip_loop_avx
+
+	# expand and copy pre-calculated table to stack
+	vmovdqu		`16*1-64`($ctx),$D1
+	vmovdqu		`16*2-64`($ctx),$D2
+	vpshufd		\$0xEE,$D4,$D3		# 34xx -> 3434
+	vpshufd		\$0x44,$D4,$D0		# xx12 -> 1212
+	vmovdqa		$D3,-0x90(%r11)
+	vmovdqa		$D0,0x00(%rsp)
+	vpshufd		\$0xEE,$D1,$D4
+	vmovdqu		`16*3-64`($ctx),$D0
+	vpshufd		\$0x44,$D1,$D1
+	vmovdqa		$D4,-0x80(%r11)
+	vmovdqa		$D1,0x10(%rsp)
+	vpshufd		\$0xEE,$D2,$D3
+	vmovdqu		`16*4-64`($ctx),$D1
+	vpshufd		\$0x44,$D2,$D2
+	vmovdqa		$D3,-0x70(%r11)
+	vmovdqa		$D2,0x20(%rsp)
+	vpshufd		\$0xEE,$D0,$D4
+	vmovdqu		`16*5-64`($ctx),$D2
+	vpshufd		\$0x44,$D0,$D0
+	vmovdqa		$D4,-0x60(%r11)
+	vmovdqa		$D0,0x30(%rsp)
+	vpshufd		\$0xEE,$D1,$D3
+	vmovdqu		`16*6-64`($ctx),$D0
+	vpshufd		\$0x44,$D1,$D1
+	vmovdqa		$D3,-0x50(%r11)
+	vmovdqa		$D1,0x40(%rsp)
+	vpshufd		\$0xEE,$D2,$D4
+	vmovdqu		`16*7-64`($ctx),$D1
+	vpshufd		\$0x44,$D2,$D2
+	vmovdqa		$D4,-0x40(%r11)
+	vmovdqa		$D2,0x50(%rsp)
+	vpshufd		\$0xEE,$D0,$D3
+	vmovdqu		`16*8-64`($ctx),$D2
+	vpshufd		\$0x44,$D0,$D0
+	vmovdqa		$D3,-0x30(%r11)
+	vmovdqa		$D0,0x60(%rsp)
+	vpshufd		\$0xEE,$D1,$D4
+	vpshufd		\$0x44,$D1,$D1
+	vmovdqa		$D4,-0x20(%r11)
+	vmovdqa		$D1,0x70(%rsp)
+	vpshufd		\$0xEE,$D2,$D3
+	 vmovdqa	0x00(%rsp),$D4		# preload r0^2
+	vpshufd		\$0x44,$D2,$D2
+	vmovdqa		$D3,-0x10(%r11)
+	vmovdqa		$D2,0x80(%rsp)
+
+	jmp		.Loop_avx
+
+.align	32
+.Loop_avx:
+	################################################################
+	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+	#   \___________________/
+	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+	#   \___________________/ \____________________/
+	#
+	# Note that we start with inp[2:3]*r^2. This is because it
+	# doesn't depend on reduction in previous iteration.
+	################################################################
+	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+	#
+	# though note that $Tx and $Hx are "reversed" in this section,
+	# and $D4 is preloaded with r0^2...
+
+	vpmuludq	$T0,$D4,$D0		# d0 = h0*r0
+	vpmuludq	$T1,$D4,$D1		# d1 = h1*r0
+	  vmovdqa	$H2,0x20(%r11)				# offload hash
+	vpmuludq	$T2,$D4,$D2		# d3 = h2*r0
+	 vmovdqa	0x10(%rsp),$H2		# r1^2
+	vpmuludq	$T3,$D4,$D3		# d3 = h3*r0
+	vpmuludq	$T4,$D4,$D4		# d4 = h4*r0
+
+	  vmovdqa	$H0,0x00(%r11)				#
+	vpmuludq	0x20(%rsp),$T4,$H0	# h4*s1
+	  vmovdqa	$H1,0x10(%r11)				#
+	vpmuludq	$T3,$H2,$H1		# h3*r1
+	vpaddq		$H0,$D0,$D0		# d0 += h4*s1
+	vpaddq		$H1,$D4,$D4		# d4 += h3*r1
+	  vmovdqa	$H3,0x30(%r11)				#
+	vpmuludq	$T2,$H2,$H0		# h2*r1
+	vpmuludq	$T1,$H2,$H1		# h1*r1
+	vpaddq		$H0,$D3,$D3		# d3 += h2*r1
+	 vmovdqa	0x30(%rsp),$H3		# r2^2
+	vpaddq		$H1,$D2,$D2		# d2 += h1*r1
+	  vmovdqa	$H4,0x40(%r11)				#
+	vpmuludq	$T0,$H2,$H2		# h0*r1
+	 vpmuludq	$T2,$H3,$H0		# h2*r2
+	vpaddq		$H2,$D1,$D1		# d1 += h0*r1
+
+	 vmovdqa	0x40(%rsp),$H4		# s2^2
+	vpaddq		$H0,$D4,$D4		# d4 += h2*r2
+	vpmuludq	$T1,$H3,$H1		# h1*r2
+	vpmuludq	$T0,$H3,$H3		# h0*r2
+	vpaddq		$H1,$D3,$D3		# d3 += h1*r2
+	 vmovdqa	0x50(%rsp),$H2		# r3^2
+	vpaddq		$H3,$D2,$D2		# d2 += h0*r2
+	vpmuludq	$T4,$H4,$H0		# h4*s2
+	vpmuludq	$T3,$H4,$H4		# h3*s2
+	vpaddq		$H0,$D1,$D1		# d1 += h4*s2
+	 vmovdqa	0x60(%rsp),$H3		# s3^2
+	vpaddq		$H4,$D0,$D0		# d0 += h3*s2
+
+	 vmovdqa	0x80(%rsp),$H4		# s4^2
+	vpmuludq	$T1,$H2,$H1		# h1*r3
+	vpmuludq	$T0,$H2,$H2		# h0*r3
+	vpaddq		$H1,$D4,$D4		# d4 += h1*r3
+	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
+	vpmuludq	$T4,$H3,$H0		# h4*s3
+	vpmuludq	$T3,$H3,$H1		# h3*s3
+	vpaddq		$H0,$D2,$D2		# d2 += h4*s3
+	 vmovdqu	16*0($inp),$H0				# load input
+	vpaddq		$H1,$D1,$D1		# d1 += h3*s3
+	vpmuludq	$T2,$H3,$H3		# h2*s3
+	 vpmuludq	$T2,$H4,$T2		# h2*s4
+	vpaddq		$H3,$D0,$D0		# d0 += h2*s3
+
+	 vmovdqu	16*1($inp),$H1				#
+	vpaddq		$T2,$D1,$D1		# d1 += h2*s4
+	vpmuludq	$T3,$H4,$T3		# h3*s4
+	vpmuludq	$T4,$H4,$T4		# h4*s4
+	 vpsrldq	\$6,$H0,$H2				# splat input
+	vpaddq		$T3,$D2,$D2		# d2 += h3*s4
+	vpaddq		$T4,$D3,$D3		# d3 += h4*s4
+	 vpsrldq	\$6,$H1,$H3				#
+	vpmuludq	0x70(%rsp),$T0,$T4	# h0*r4
+	vpmuludq	$T1,$H4,$T0		# h1*s4
+	 vpunpckhqdq	$H1,$H0,$H4		# 4
+	vpaddq		$T4,$D4,$D4		# d4 += h0*r4
+	 vmovdqa	-0x90(%r11),$T4		# r0^4
+	vpaddq		$T0,$D0,$D0		# d0 += h1*s4
+
+	vpunpcklqdq	$H1,$H0,$H0		# 0:1
+	vpunpcklqdq	$H3,$H2,$H3		# 2:3
+
+	#vpsrlq		\$40,$H4,$H4		# 4
+	vpsrldq		\$`40/8`,$H4,$H4	# 4
+	vpsrlq		\$26,$H0,$H1
+	vpand		$MASK,$H0,$H0		# 0
+	vpsrlq		\$4,$H3,$H2
+	vpand		$MASK,$H1,$H1		# 1
+	vpand		0(%rcx),$H4,$H4		# .Lmask24
+	vpsrlq		\$30,$H3,$H3
+	vpand		$MASK,$H2,$H2		# 2
+	vpand		$MASK,$H3,$H3		# 3
+	vpor		32(%rcx),$H4,$H4	# padbit, yes, always
+
+	vpaddq		0x00(%r11),$H0,$H0	# add hash value
+	vpaddq		0x10(%r11),$H1,$H1
+	vpaddq		0x20(%r11),$H2,$H2
+	vpaddq		0x30(%r11),$H3,$H3
+	vpaddq		0x40(%r11),$H4,$H4
+
+	lea		16*2($inp),%rax
+	lea		16*4($inp),$inp
+	sub		\$64,$len
+	cmovc		%rax,$inp
+
+	################################################################
+	# Now we accumulate (inp[0:1]+hash)*r^4
+	################################################################
+	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+	vpmuludq	$H0,$T4,$T0		# h0*r0
+	vpmuludq	$H1,$T4,$T1		# h1*r0
+	vpaddq		$T0,$D0,$D0
+	vpaddq		$T1,$D1,$D1
+	 vmovdqa	-0x80(%r11),$T2		# r1^4
+	vpmuludq	$H2,$T4,$T0		# h2*r0
+	vpmuludq	$H3,$T4,$T1		# h3*r0
+	vpaddq		$T0,$D2,$D2
+	vpaddq		$T1,$D3,$D3
+	vpmuludq	$H4,$T4,$T4		# h4*r0
+	 vpmuludq	-0x70(%r11),$H4,$T0	# h4*s1
+	vpaddq		$T4,$D4,$D4
+
+	vpaddq		$T0,$D0,$D0		# d0 += h4*s1
+	vpmuludq	$H2,$T2,$T1		# h2*r1
+	vpmuludq	$H3,$T2,$T0		# h3*r1
+	vpaddq		$T1,$D3,$D3		# d3 += h2*r1
+	 vmovdqa	-0x60(%r11),$T3		# r2^4
+	vpaddq		$T0,$D4,$D4		# d4 += h3*r1
+	vpmuludq	$H1,$T2,$T1		# h1*r1
+	vpmuludq	$H0,$T2,$T2		# h0*r1
+	vpaddq		$T1,$D2,$D2		# d2 += h1*r1
+	vpaddq		$T2,$D1,$D1		# d1 += h0*r1
+
+	 vmovdqa	-0x50(%r11),$T4		# s2^4
+	vpmuludq	$H2,$T3,$T0		# h2*r2
+	vpmuludq	$H1,$T3,$T1		# h1*r2
+	vpaddq		$T0,$D4,$D4		# d4 += h2*r2
+	vpaddq		$T1,$D3,$D3		# d3 += h1*r2
+	 vmovdqa	-0x40(%r11),$T2		# r3^4
+	vpmuludq	$H0,$T3,$T3		# h0*r2
+	vpmuludq	$H4,$T4,$T0		# h4*s2
+	vpaddq		$T3,$D2,$D2		# d2 += h0*r2
+	vpaddq		$T0,$D1,$D1		# d1 += h4*s2
+	 vmovdqa	-0x30(%r11),$T3		# s3^4
+	vpmuludq	$H3,$T4,$T4		# h3*s2
+	 vpmuludq	$H1,$T2,$T1		# h1*r3
+	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
+
+	 vmovdqa	-0x10(%r11),$T4		# s4^4
+	vpaddq		$T1,$D4,$D4		# d4 += h1*r3
+	vpmuludq	$H0,$T2,$T2		# h0*r3
+	vpmuludq	$H4,$T3,$T0		# h4*s3
+	vpaddq		$T2,$D3,$D3		# d3 += h0*r3
+	vpaddq		$T0,$D2,$D2		# d2 += h4*s3
+	 vmovdqu	16*2($inp),$T0				# load input
+	vpmuludq	$H3,$T3,$T2		# h3*s3
+	vpmuludq	$H2,$T3,$T3		# h2*s3
+	vpaddq		$T2,$D1,$D1		# d1 += h3*s3
+	 vmovdqu	16*3($inp),$T1				#
+	vpaddq		$T3,$D0,$D0		# d0 += h2*s3
+
+	vpmuludq	$H2,$T4,$H2		# h2*s4
+	vpmuludq	$H3,$T4,$H3		# h3*s4
+	 vpsrldq	\$6,$T0,$T2				# splat input
+	vpaddq		$H2,$D1,$D1		# d1 += h2*s4
+	vpmuludq	$H4,$T4,$H4		# h4*s4
+	 vpsrldq	\$6,$T1,$T3				#
+	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*s4
+	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*s4
+	vpmuludq	-0x20(%r11),$H0,$H4	# h0*r4
+	vpmuludq	$H1,$T4,$H0
+	 vpunpckhqdq	$T1,$T0,$T4		# 4
+	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
+	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
+
+	vpunpcklqdq	$T1,$T0,$T0		# 0:1
+	vpunpcklqdq	$T3,$T2,$T3		# 2:3
+
+	#vpsrlq		\$40,$T4,$T4		# 4
+	vpsrldq		\$`40/8`,$T4,$T4	# 4
+	vpsrlq		\$26,$T0,$T1
+	 vmovdqa	0x00(%rsp),$D4		# preload r0^2
+	vpand		$MASK,$T0,$T0		# 0
+	vpsrlq		\$4,$T3,$T2
+	vpand		$MASK,$T1,$T1		# 1
+	vpand		0(%rcx),$T4,$T4		# .Lmask24
+	vpsrlq		\$30,$T3,$T3
+	vpand		$MASK,$T2,$T2		# 2
+	vpand		$MASK,$T3,$T3		# 3
+	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
+
+	################################################################
+	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+	# and P. Schwabe
+
+	vpsrlq		\$26,$H3,$D3
+	vpand		$MASK,$H3,$H3
+	vpaddq		$D3,$H4,$H4		# h3 -> h4
+
+	vpsrlq		\$26,$H0,$D0
+	vpand		$MASK,$H0,$H0
+	vpaddq		$D0,$D1,$H1		# h0 -> h1
+
+	vpsrlq		\$26,$H4,$D0
+	vpand		$MASK,$H4,$H4
+
+	vpsrlq		\$26,$H1,$D1
+	vpand		$MASK,$H1,$H1
+	vpaddq		$D1,$H2,$H2		# h1 -> h2
+
+	vpaddq		$D0,$H0,$H0
+	vpsllq		\$2,$D0,$D0
+	vpaddq		$D0,$H0,$H0		# h4 -> h0
+
+	vpsrlq		\$26,$H2,$D2
+	vpand		$MASK,$H2,$H2
+	vpaddq		$D2,$H3,$H3		# h2 -> h3
+
+	vpsrlq		\$26,$H0,$D0
+	vpand		$MASK,$H0,$H0
+	vpaddq		$D0,$H1,$H1		# h0 -> h1
+
+	vpsrlq		\$26,$H3,$D3
+	vpand		$MASK,$H3,$H3
+	vpaddq		$D3,$H4,$H4		# h3 -> h4
+
+	ja		.Loop_avx
+
+.Lskip_loop_avx:
+	################################################################
+	# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+	vpshufd		\$0x10,$D4,$D4		# r0^n, xx12 -> x1x2
+	add		\$32,$len
+	jnz		.Long_tail_avx
+
+	vpaddq		$H2,$T2,$T2
+	vpaddq		$H0,$T0,$T0
+	vpaddq		$H1,$T1,$T1
+	vpaddq		$H3,$T3,$T3
+	vpaddq		$H4,$T4,$T4
+
+.Long_tail_avx:
+	vmovdqa		$H2,0x20(%r11)
+	vmovdqa		$H0,0x00(%r11)
+	vmovdqa		$H1,0x10(%r11)
+	vmovdqa		$H3,0x30(%r11)
+	vmovdqa		$H4,0x40(%r11)
+
+	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+	vpmuludq	$T2,$D4,$D2		# d2 = h2*r0
+	vpmuludq	$T0,$D4,$D0		# d0 = h0*r0
+	 vpshufd	\$0x10,`16*1-64`($ctx),$H2		# r1^n
+	vpmuludq	$T1,$D4,$D1		# d1 = h1*r0
+	vpmuludq	$T3,$D4,$D3		# d3 = h3*r0
+	vpmuludq	$T4,$D4,$D4		# d4 = h4*r0
+
+	vpmuludq	$T3,$H2,$H0		# h3*r1
+	vpaddq		$H0,$D4,$D4		# d4 += h3*r1
+	 vpshufd	\$0x10,`16*2-64`($ctx),$H3		# s1^n
+	vpmuludq	$T2,$H2,$H1		# h2*r1
+	vpaddq		$H1,$D3,$D3		# d3 += h2*r1
+	 vpshufd	\$0x10,`16*3-64`($ctx),$H4		# r2^n
+	vpmuludq	$T1,$H2,$H0		# h1*r1
+	vpaddq		$H0,$D2,$D2		# d2 += h1*r1
+	vpmuludq	$T0,$H2,$H2		# h0*r1
+	vpaddq		$H2,$D1,$D1		# d1 += h0*r1
+	vpmuludq	$T4,$H3,$H3		# h4*s1
+	vpaddq		$H3,$D0,$D0		# d0 += h4*s1
+
+	 vpshufd	\$0x10,`16*4-64`($ctx),$H2		# s2^n
+	vpmuludq	$T2,$H4,$H1		# h2*r2
+	vpaddq		$H1,$D4,$D4		# d4 += h2*r2
+	vpmuludq	$T1,$H4,$H0		# h1*r2
+	vpaddq		$H0,$D3,$D3		# d3 += h1*r2
+	 vpshufd	\$0x10,`16*5-64`($ctx),$H3		# r3^n
+	vpmuludq	$T0,$H4,$H4		# h0*r2
+	vpaddq		$H4,$D2,$D2		# d2 += h0*r2
+	vpmuludq	$T4,$H2,$H1		# h4*s2
+	vpaddq		$H1,$D1,$D1		# d1 += h4*s2
+	 vpshufd	\$0x10,`16*6-64`($ctx),$H4		# s3^n
+	vpmuludq	$T3,$H2,$H2		# h3*s2
+	vpaddq		$H2,$D0,$D0		# d0 += h3*s2
+
+	vpmuludq	$T1,$H3,$H0		# h1*r3
+	vpaddq		$H0,$D4,$D4		# d4 += h1*r3
+	vpmuludq	$T0,$H3,$H3		# h0*r3
+	vpaddq		$H3,$D3,$D3		# d3 += h0*r3
+	 vpshufd	\$0x10,`16*7-64`($ctx),$H2		# r4^n
+	vpmuludq	$T4,$H4,$H1		# h4*s3
+	vpaddq		$H1,$D2,$D2		# d2 += h4*s3
+	 vpshufd	\$0x10,`16*8-64`($ctx),$H3		# s4^n
+	vpmuludq	$T3,$H4,$H0		# h3*s3
+	vpaddq		$H0,$D1,$D1		# d1 += h3*s3
+	vpmuludq	$T2,$H4,$H4		# h2*s3
+	vpaddq		$H4,$D0,$D0		# d0 += h2*s3
+
+	vpmuludq	$T0,$H2,$H2		# h0*r4
+	vpaddq		$H2,$D4,$D4		# h4 = d4 + h0*r4
+	vpmuludq	$T4,$H3,$H1		# h4*s4
+	vpaddq		$H1,$D3,$D3		# h3 = d3 + h4*s4
+	vpmuludq	$T3,$H3,$H0		# h3*s4
+	vpaddq		$H0,$D2,$D2		# h2 = d2 + h3*s4
+	vpmuludq	$T2,$H3,$H1		# h2*s4
+	vpaddq		$H1,$D1,$D1		# h1 = d1 + h2*s4
+	vpmuludq	$T1,$H3,$H3		# h1*s4
+	vpaddq		$H3,$D0,$D0		# h0 = d0 + h1*s4
+
+	jz		.Lshort_tail_avx
+
+	vmovdqu		16*0($inp),$H0		# load input
+	vmovdqu		16*1($inp),$H1
+
+	vpsrldq		\$6,$H0,$H2		# splat input
+	vpsrldq		\$6,$H1,$H3
+	vpunpckhqdq	$H1,$H0,$H4		# 4
+	vpunpcklqdq	$H1,$H0,$H0		# 0:1
+	vpunpcklqdq	$H3,$H2,$H3		# 2:3
+
+	vpsrlq		\$40,$H4,$H4		# 4
+	vpsrlq		\$26,$H0,$H1
+	vpand		$MASK,$H0,$H0		# 0
+	vpsrlq		\$4,$H3,$H2
+	vpand		$MASK,$H1,$H1		# 1
+	vpsrlq		\$30,$H3,$H3
+	vpand		$MASK,$H2,$H2		# 2
+	vpand		$MASK,$H3,$H3		# 3
+	vpor		32(%rcx),$H4,$H4	# padbit, yes, always
+
+	vpshufd		\$0x32,`16*0-64`($ctx),$T4	# r0^n, 34xx -> x3x4
+	vpaddq		0x00(%r11),$H0,$H0
+	vpaddq		0x10(%r11),$H1,$H1
+	vpaddq		0x20(%r11),$H2,$H2
+	vpaddq		0x30(%r11),$H3,$H3
+	vpaddq		0x40(%r11),$H4,$H4
+
+	################################################################
+	# multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
+
+	vpmuludq	$H0,$T4,$T0		# h0*r0
+	vpaddq		$T0,$D0,$D0		# d0 += h0*r0
+	vpmuludq	$H1,$T4,$T1		# h1*r0
+	vpaddq		$T1,$D1,$D1		# d1 += h1*r0
+	vpmuludq	$H2,$T4,$T0		# h2*r0
+	vpaddq		$T0,$D2,$D2		# d2 += h2*r0
+	 vpshufd	\$0x32,`16*1-64`($ctx),$T2		# r1^n
+	vpmuludq	$H3,$T4,$T1		# h3*r0
+	vpaddq		$T1,$D3,$D3		# d3 += h3*r0
+	vpmuludq	$H4,$T4,$T4		# h4*r0
+	vpaddq		$T4,$D4,$D4		# d4 += h4*r0
+
+	vpmuludq	$H3,$T2,$T0		# h3*r1
+	vpaddq		$T0,$D4,$D4		# d4 += h3*r1
+	 vpshufd	\$0x32,`16*2-64`($ctx),$T3		# s1
+	vpmuludq	$H2,$T2,$T1		# h2*r1
+	vpaddq		$T1,$D3,$D3		# d3 += h2*r1
+	 vpshufd	\$0x32,`16*3-64`($ctx),$T4		# r2
+	vpmuludq	$H1,$T2,$T0		# h1*r1
+	vpaddq		$T0,$D2,$D2		# d2 += h1*r1
+	vpmuludq	$H0,$T2,$T2		# h0*r1
+	vpaddq		$T2,$D1,$D1		# d1 += h0*r1
+	vpmuludq	$H4,$T3,$T3		# h4*s1
+	vpaddq		$T3,$D0,$D0		# d0 += h4*s1
+
+	 vpshufd	\$0x32,`16*4-64`($ctx),$T2		# s2
+	vpmuludq	$H2,$T4,$T1		# h2*r2
+	vpaddq		$T1,$D4,$D4		# d4 += h2*r2
+	vpmuludq	$H1,$T4,$T0		# h1*r2
+	vpaddq		$T0,$D3,$D3		# d3 += h1*r2
+	 vpshufd	\$0x32,`16*5-64`($ctx),$T3		# r3
+	vpmuludq	$H0,$T4,$T4		# h0*r2
+	vpaddq		$T4,$D2,$D2		# d2 += h0*r2
+	vpmuludq	$H4,$T2,$T1		# h4*s2
+	vpaddq		$T1,$D1,$D1		# d1 += h4*s2
+	 vpshufd	\$0x32,`16*6-64`($ctx),$T4		# s3
+	vpmuludq	$H3,$T2,$T2		# h3*s2
+	vpaddq		$T2,$D0,$D0		# d0 += h3*s2
+
+	vpmuludq	$H1,$T3,$T0		# h1*r3
+	vpaddq		$T0,$D4,$D4		# d4 += h1*r3
+	vpmuludq	$H0,$T3,$T3		# h0*r3
+	vpaddq		$T3,$D3,$D3		# d3 += h0*r3
+	 vpshufd	\$0x32,`16*7-64`($ctx),$T2		# r4
+	vpmuludq	$H4,$T4,$T1		# h4*s3
+	vpaddq		$T1,$D2,$D2		# d2 += h4*s3
+	 vpshufd	\$0x32,`16*8-64`($ctx),$T3		# s4
+	vpmuludq	$H3,$T4,$T0		# h3*s3
+	vpaddq		$T0,$D1,$D1		# d1 += h3*s3
+	vpmuludq	$H2,$T4,$T4		# h2*s3
+	vpaddq		$T4,$D0,$D0		# d0 += h2*s3
+
+	vpmuludq	$H0,$T2,$T2		# h0*r4
+	vpaddq		$T2,$D4,$D4		# d4 += h0*r4
+	vpmuludq	$H4,$T3,$T1		# h4*s4
+	vpaddq		$T1,$D3,$D3		# d3 += h4*s4
+	vpmuludq	$H3,$T3,$T0		# h3*s4
+	vpaddq		$T0,$D2,$D2		# d2 += h3*s4
+	vpmuludq	$H2,$T3,$T1		# h2*s4
+	vpaddq		$T1,$D1,$D1		# d1 += h2*s4
+	vpmuludq	$H1,$T3,$T3		# h1*s4
+	vpaddq		$T3,$D0,$D0		# d0 += h1*s4
+
+.Lshort_tail_avx:
+	################################################################
+	# horizontal addition
+
+	vpsrldq		\$8,$D4,$T4
+	vpsrldq		\$8,$D3,$T3
+	vpsrldq		\$8,$D1,$T1
+	vpsrldq		\$8,$D0,$T0
+	vpsrldq		\$8,$D2,$T2
+	vpaddq		$T3,$D3,$D3
+	vpaddq		$T4,$D4,$D4
+	vpaddq		$T0,$D0,$D0
+	vpaddq		$T1,$D1,$D1
+	vpaddq		$T2,$D2,$D2
+
+	################################################################
+	# lazy reduction
+
+	vpsrlq		\$26,$D3,$H3
+	vpand		$MASK,$D3,$D3
+	vpaddq		$H3,$D4,$D4		# h3 -> h4
+
+	vpsrlq		\$26,$D0,$H0
+	vpand		$MASK,$D0,$D0
+	vpaddq		$H0,$D1,$D1		# h0 -> h1
+
+	vpsrlq		\$26,$D4,$H4
+	vpand		$MASK,$D4,$D4
+
+	vpsrlq		\$26,$D1,$H1
+	vpand		$MASK,$D1,$D1
+	vpaddq		$H1,$D2,$D2		# h1 -> h2
+
+	vpaddq		$H4,$D0,$D0
+	vpsllq		\$2,$H4,$H4
+	vpaddq		$H4,$D0,$D0		# h4 -> h0
+
+	vpsrlq		\$26,$D2,$H2
+	vpand		$MASK,$D2,$D2
+	vpaddq		$H2,$D3,$D3		# h2 -> h3
+
+	vpsrlq		\$26,$D0,$H0
+	vpand		$MASK,$D0,$D0
+	vpaddq		$H0,$D1,$D1		# h0 -> h1
+
+	vpsrlq		\$26,$D3,$H3
+	vpand		$MASK,$D3,$D3
+	vpaddq		$H3,$D4,$D4		# h3 -> h4
+
+	vmovd		$D0,`4*0-48-64`($ctx)	# save partially reduced
+	vmovd		$D1,`4*1-48-64`($ctx)
+	vmovd		$D2,`4*2-48-64`($ctx)
+	vmovd		$D3,`4*3-48-64`($ctx)
+	vmovd		$D4,`4*4-48-64`($ctx)
+___
+$code.=<<___	if ($win64);
+	vmovdqa		0x50(%r11),%xmm6
+	vmovdqa		0x60(%r11),%xmm7
+	vmovdqa		0x70(%r11),%xmm8
+	vmovdqa		0x80(%r11),%xmm9
+	vmovdqa		0x90(%r11),%xmm10
+	vmovdqa		0xa0(%r11),%xmm11
+	vmovdqa		0xb0(%r11),%xmm12
+	vmovdqa		0xc0(%r11),%xmm13
+	vmovdqa		0xd0(%r11),%xmm14
+	vmovdqa		0xe0(%r11),%xmm15
+	lea		0xf8(%r11),%rsp
+.Ldo_avx_epilogue:
+___
+$code.=<<___	if (!$win64);
+	lea		-8(%r10),%rsp
+.cfi_def_cfa_register	%rsp
+___
+$code.=<<___;
+	vzeroupper
+	RET
+.cfi_endproc
+___
+&end_function("poly1305_blocks_avx");
+
+&declare_function("poly1305_emit_avx", 32, 3);
+$code.=<<___;
+	cmpl	\$0,20($ctx)	# is_base2_26?
+	je	.Lemit
+
+	mov	0($ctx),%eax	# load hash value base 2^26
+	mov	4($ctx),%ecx
+	mov	8($ctx),%r8d
+	mov	12($ctx),%r11d
+	mov	16($ctx),%r10d
+
+	shl	\$26,%rcx	# base 2^26 -> base 2^64
+	mov	%r8,%r9
+	shl	\$52,%r8
+	add	%rcx,%rax
+	shr	\$12,%r9
+	add	%rax,%r8	# h0
+	adc	\$0,%r9
+
+	shl	\$14,%r11
+	mov	%r10,%rax
+	shr	\$24,%r10
+	add	%r11,%r9
+	shl	\$40,%rax
+	add	%rax,%r9	# h1
+	adc	\$0,%r10	# h2
+
+	mov	%r10,%rax	# could be partially reduced, so reduce
+	mov	%r10,%rcx
+	and	\$3,%r10
+	shr	\$2,%rax
+	and	\$-4,%rcx
+	add	%rcx,%rax
+	add	%rax,%r8
+	adc	\$0,%r9
+	adc	\$0,%r10
+
+	mov	%r8,%rax
+	add	\$5,%r8		# compare to modulus
+	mov	%r9,%rcx
+	adc	\$0,%r9
+	adc	\$0,%r10
+	shr	\$2,%r10	# did 130-bit value overflow?
+	cmovnz	%r8,%rax
+	cmovnz	%r9,%rcx
+
+	add	0($nonce),%rax	# accumulate nonce
+	adc	8($nonce),%rcx
+	mov	%rax,0($mac)	# write result
+	mov	%rcx,8($mac)
+
+	RET
+___
+&end_function("poly1305_emit_avx");
+
+if ($avx>1) {
+
+my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
+    map("%ymm$_",(0..15));
+my $S4=$MASK;
+
+sub poly1305_blocks_avxN {
+	my ($avx512) = @_;
+	my $suffix = $avx512 ? "_avx512" : "";
+$code.=<<___;
+.cfi_startproc
+	mov	20($ctx),%r8d		# is_base2_26
+	cmp	\$128,$len
+	jae	.Lblocks_avx2$suffix
+	test	%r8d,%r8d
+	jz	.Lblocks
+
+.Lblocks_avx2$suffix:
+	and	\$-16,$len
+	jz	.Lno_data_avx2$suffix
+
+	vzeroupper
+
+	test	%r8d,%r8d
+	jz	.Lbase2_64_avx2$suffix
+
+	test	\$63,$len
+	jz	.Leven_avx2$suffix
+
+	push	%rbp
+.cfi_push	%rbp
+	mov 	%rsp,%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+.Lblocks_avx2_body$suffix:
+
+	mov	$len,%r15		# reassign $len
+
+	mov	0($ctx),$d1		# load hash value
+	mov	8($ctx),$d2
+	mov	16($ctx),$h2#d
+
+	mov	24($ctx),$r0		# load r
+	mov	32($ctx),$s1
+
+	################################# base 2^26 -> base 2^64
+	mov	$d1#d,$h0#d
+	and	\$`-1*(1<<31)`,$d1
+	mov	$d2,$r1			# borrow $r1
+	mov	$d2#d,$h1#d
+	and	\$`-1*(1<<31)`,$d2
+
+	shr	\$6,$d1
+	shl	\$52,$r1
+	add	$d1,$h0
+	shr	\$12,$h1
+	shr	\$18,$d2
+	add	$r1,$h0
+	adc	$d2,$h1
+
+	mov	$h2,$d1
+	shl	\$40,$d1
+	shr	\$24,$h2
+	add	$d1,$h1
+	adc	\$0,$h2			# can be partially reduced...
+
+	mov	\$-4,$d2		# ... so reduce
+	mov	$h2,$d1
+	and	$h2,$d2
+	shr	\$2,$d1
+	and	\$3,$h2
+	add	$d2,$d1			# =*5
+	add	$d1,$h0
+	adc	\$0,$h1
+	adc	\$0,$h2
+
+	mov	$s1,$r1
+	mov	$s1,%rax
+	shr	\$2,$s1
+	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
+
+.Lbase2_26_pre_avx2$suffix:
+	add	0($inp),$h0		# accumulate input
+	adc	8($inp),$h1
+	lea	16($inp),$inp
+	adc	$padbit,$h2
+	sub	\$16,%r15
+
+	call	__poly1305_block
+	mov	$r1,%rax
+
+	test	\$63,%r15
+	jnz	.Lbase2_26_pre_avx2$suffix
+
+	test	$padbit,$padbit		# if $padbit is zero,
+	jz	.Lstore_base2_64_avx2$suffix	# store hash in base 2^64 format
+
+	################################# base 2^64 -> base 2^26
+	mov	$h0,%rax
+	mov	$h0,%rdx
+	shr	\$52,$h0
+	mov	$h1,$r0
+	mov	$h1,$r1
+	shr	\$26,%rdx
+	and	\$0x3ffffff,%rax	# h[0]
+	shl	\$12,$r0
+	and	\$0x3ffffff,%rdx	# h[1]
+	shr	\$14,$h1
+	or	$r0,$h0
+	shl	\$24,$h2
+	and	\$0x3ffffff,$h0		# h[2]
+	shr	\$40,$r1
+	and	\$0x3ffffff,$h1		# h[3]
+	or	$r1,$h2			# h[4]
+
+	test	%r15,%r15
+	jz	.Lstore_base2_26_avx2$suffix
+
+	vmovd	%rax#d,%x#$H0
+	vmovd	%rdx#d,%x#$H1
+	vmovd	$h0#d,%x#$H2
+	vmovd	$h1#d,%x#$H3
+	vmovd	$h2#d,%x#$H4
+	jmp	.Lproceed_avx2$suffix
+
+.align	32
+.Lstore_base2_64_avx2$suffix:
+	mov	$h0,0($ctx)
+	mov	$h1,8($ctx)
+	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
+	jmp	.Ldone_avx2$suffix
+
+.align	16
+.Lstore_base2_26_avx2$suffix:
+	mov	%rax#d,0($ctx)		# store hash value base 2^26
+	mov	%rdx#d,4($ctx)
+	mov	$h0#d,8($ctx)
+	mov	$h1#d,12($ctx)
+	mov	$h2#d,16($ctx)
+.align	16
+.Ldone_avx2$suffix:
+	pop 		%r15
+.cfi_restore	%r15
+	pop 		%r14
+.cfi_restore	%r14
+	pop 		%r13
+.cfi_restore	%r13
+	pop 		%r12
+.cfi_restore	%r12
+	pop 		%rbx
+.cfi_restore	%rbx
+	pop 		%rbp
+.cfi_restore 	%rbp
+.Lno_data_avx2$suffix:
+.Lblocks_avx2_epilogue$suffix:
+	RET
+.cfi_endproc
+
+.align	32
+.Lbase2_64_avx2$suffix:
+.cfi_startproc
+	push	%rbp
+.cfi_push	%rbp
+	mov 	%rsp,%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+.Lbase2_64_avx2_body$suffix:
+
+	mov	$len,%r15		# reassign $len
+
+	mov	24($ctx),$r0		# load r
+	mov	32($ctx),$s1
+
+	mov	0($ctx),$h0		# load hash value
+	mov	8($ctx),$h1
+	mov	16($ctx),$h2#d
+
+	mov	$s1,$r1
+	mov	$s1,%rax
+	shr	\$2,$s1
+	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
+
+	test	\$63,$len
+	jz	.Linit_avx2$suffix
+
+.Lbase2_64_pre_avx2$suffix:
+	add	0($inp),$h0		# accumulate input
+	adc	8($inp),$h1
+	lea	16($inp),$inp
+	adc	$padbit,$h2
+	sub	\$16,%r15
+
+	call	__poly1305_block
+	mov	$r1,%rax
+
+	test	\$63,%r15
+	jnz	.Lbase2_64_pre_avx2$suffix
+
+.Linit_avx2$suffix:
+	################################# base 2^64 -> base 2^26
+	mov	$h0,%rax
+	mov	$h0,%rdx
+	shr	\$52,$h0
+	mov	$h1,$d1
+	mov	$h1,$d2
+	shr	\$26,%rdx
+	and	\$0x3ffffff,%rax	# h[0]
+	shl	\$12,$d1
+	and	\$0x3ffffff,%rdx	# h[1]
+	shr	\$14,$h1
+	or	$d1,$h0
+	shl	\$24,$h2
+	and	\$0x3ffffff,$h0		# h[2]
+	shr	\$40,$d2
+	and	\$0x3ffffff,$h1		# h[3]
+	or	$d2,$h2			# h[4]
+
+	vmovd	%rax#d,%x#$H0
+	vmovd	%rdx#d,%x#$H1
+	vmovd	$h0#d,%x#$H2
+	vmovd	$h1#d,%x#$H3
+	vmovd	$h2#d,%x#$H4
+	movl	\$1,20($ctx)		# set is_base2_26
+
+	call	__poly1305_init_avx
+
+.Lproceed_avx2$suffix:
+	mov	%r15,$len			# restore $len
+___
+$code.=<<___ if (!$kernel);
+	mov	OPENSSL_ia32cap_P+8(%rip),%r9d
+	mov	\$`(1<<31|1<<30|1<<16)`,%r11d
+___
+$code.=<<___;
+	pop 		%r15
+.cfi_restore	%r15
+	pop 		%r14
+.cfi_restore	%r14
+	pop 		%r13
+.cfi_restore	%r13
+	pop 		%r12
+.cfi_restore	%r12
+	pop 		%rbx
+.cfi_restore	%rbx
+	pop 		%rbp
+.cfi_restore 	%rbp
+.Lbase2_64_avx2_epilogue$suffix:
+	jmp	.Ldo_avx2$suffix
+.cfi_endproc
+
+.align	32
+.Leven_avx2$suffix:
+.cfi_startproc
+___
+$code.=<<___ if (!$kernel);
+	mov		OPENSSL_ia32cap_P+8(%rip),%r9d
+___
+$code.=<<___;
+	vmovd		4*0($ctx),%x#$H0	# load hash value base 2^26
+	vmovd		4*1($ctx),%x#$H1
+	vmovd		4*2($ctx),%x#$H2
+	vmovd		4*3($ctx),%x#$H3
+	vmovd		4*4($ctx),%x#$H4
+
+.Ldo_avx2$suffix:
+___
+$code.=<<___		if (!$kernel && $avx>2);
+	cmp		\$512,$len
+	jb		.Lskip_avx512
+	and		%r11d,%r9d
+	test		\$`1<<16`,%r9d		# check for AVX512F
+	jnz		.Lblocks_avx512
+.Lskip_avx512$suffix:
+___
+$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
+	cmp		\$512,$len
+	jae		.Lblocks_avx512
+___
+$code.=<<___	if (!$win64);
+	lea		8(%rsp),%r10
+.cfi_def_cfa_register	%r10
+	sub		\$0x128,%rsp
+___
+$code.=<<___	if ($win64);
+	lea		8(%rsp),%r10
+	sub		\$0x1c8,%rsp
+	vmovdqa		%xmm6,-0xb0(%r10)
+	vmovdqa		%xmm7,-0xa0(%r10)
+	vmovdqa		%xmm8,-0x90(%r10)
+	vmovdqa		%xmm9,-0x80(%r10)
+	vmovdqa		%xmm10,-0x70(%r10)
+	vmovdqa		%xmm11,-0x60(%r10)
+	vmovdqa		%xmm12,-0x50(%r10)
+	vmovdqa		%xmm13,-0x40(%r10)
+	vmovdqa		%xmm14,-0x30(%r10)
+	vmovdqa		%xmm15,-0x20(%r10)
+.Ldo_avx2_body$suffix:
+___
+$code.=<<___;
+	lea		.Lconst(%rip),%rcx
+	lea		48+64($ctx),$ctx	# size optimization
+	vmovdqa		96(%rcx),$T0		# .Lpermd_avx2
+
+	# expand and copy pre-calculated table to stack
+	vmovdqu		`16*0-64`($ctx),%x#$T2
+	and		\$-512,%rsp
+	vmovdqu		`16*1-64`($ctx),%x#$T3
+	vmovdqu		`16*2-64`($ctx),%x#$T4
+	vmovdqu		`16*3-64`($ctx),%x#$D0
+	vmovdqu		`16*4-64`($ctx),%x#$D1
+	vmovdqu		`16*5-64`($ctx),%x#$D2
+	lea		0x90(%rsp),%rax		# size optimization
+	vmovdqu		`16*6-64`($ctx),%x#$D3
+	vpermd		$T2,$T0,$T2		# 00003412 -> 14243444
+	vmovdqu		`16*7-64`($ctx),%x#$D4
+	vpermd		$T3,$T0,$T3
+	vmovdqu		`16*8-64`($ctx),%x#$MASK
+	vpermd		$T4,$T0,$T4
+	vmovdqa		$T2,0x00(%rsp)
+	vpermd		$D0,$T0,$D0
+	vmovdqa		$T3,0x20-0x90(%rax)
+	vpermd		$D1,$T0,$D1
+	vmovdqa		$T4,0x40-0x90(%rax)
+	vpermd		$D2,$T0,$D2
+	vmovdqa		$D0,0x60-0x90(%rax)
+	vpermd		$D3,$T0,$D3
+	vmovdqa		$D1,0x80-0x90(%rax)
+	vpermd		$D4,$T0,$D4
+	vmovdqa		$D2,0xa0-0x90(%rax)
+	vpermd		$MASK,$T0,$MASK
+	vmovdqa		$D3,0xc0-0x90(%rax)
+	vmovdqa		$D4,0xe0-0x90(%rax)
+	vmovdqa		$MASK,0x100-0x90(%rax)
+	vmovdqa		64(%rcx),$MASK		# .Lmask26
+
+	################################################################
+	# load input
+	vmovdqu		16*0($inp),%x#$T0
+	vmovdqu		16*1($inp),%x#$T1
+	vinserti128	\$1,16*2($inp),$T0,$T0
+	vinserti128	\$1,16*3($inp),$T1,$T1
+	lea		16*4($inp),$inp
+
+	vpsrldq		\$6,$T0,$T2		# splat input
+	vpsrldq		\$6,$T1,$T3
+	vpunpckhqdq	$T1,$T0,$T4		# 4
+	vpunpcklqdq	$T3,$T2,$T2		# 2:3
+	vpunpcklqdq	$T1,$T0,$T0		# 0:1
+
+	vpsrlq		\$30,$T2,$T3
+	vpsrlq		\$4,$T2,$T2
+	vpsrlq		\$26,$T0,$T1
+	vpsrlq		\$40,$T4,$T4		# 4
+	vpand		$MASK,$T2,$T2		# 2
+	vpand		$MASK,$T0,$T0		# 0
+	vpand		$MASK,$T1,$T1		# 1
+	vpand		$MASK,$T3,$T3		# 3
+	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
+
+	vpaddq		$H2,$T2,$H2		# accumulate input
+	sub		\$64,$len
+	jz		.Ltail_avx2$suffix
+	jmp		.Loop_avx2$suffix
+
+.align	32
+.Loop_avx2$suffix:
+	################################################################
+	# ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
+	# ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
+	# ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
+	# ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
+	#   \________/\__________/
+	################################################################
+	#vpaddq		$H2,$T2,$H2		# accumulate input
+	vpaddq		$H0,$T0,$H0
+	vmovdqa		`32*0`(%rsp),$T0	# r0^4
+	vpaddq		$H1,$T1,$H1
+	vmovdqa		`32*1`(%rsp),$T1	# r1^4
+	vpaddq		$H3,$T3,$H3
+	vmovdqa		`32*3`(%rsp),$T2	# r2^4
+	vpaddq		$H4,$T4,$H4
+	vmovdqa		`32*6-0x90`(%rax),$T3	# s3^4
+	vmovdqa		`32*8-0x90`(%rax),$S4	# s4^4
+
+	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+	#
+	# however, as h2 is "chronologically" first one available pull
+	# corresponding operations up, so it's
+	#
+	# d4 = h2*r2   + h4*r0 + h3*r1             + h1*r3   + h0*r4
+	# d3 = h2*r1   + h3*r0           + h1*r2   + h0*r3   + h4*5*r4
+	# d2 = h2*r0           + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+	# d1 = h2*5*r4 + h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3
+	# d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2           + h1*5*r4
+
+	vpmuludq	$H2,$T0,$D2		# d2 = h2*r0
+	vpmuludq	$H2,$T1,$D3		# d3 = h2*r1
+	vpmuludq	$H2,$T2,$D4		# d4 = h2*r2
+	vpmuludq	$H2,$T3,$D0		# d0 = h2*s3
+	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
+
+	vpmuludq	$H0,$T1,$T4		# h0*r1
+	vpmuludq	$H1,$T1,$H2		# h1*r1, borrow $H2 as temp
+	vpaddq		$T4,$D1,$D1		# d1 += h0*r1
+	vpaddq		$H2,$D2,$D2		# d2 += h1*r1
+	vpmuludq	$H3,$T1,$T4		# h3*r1
+	vpmuludq	`32*2`(%rsp),$H4,$H2	# h4*s1
+	vpaddq		$T4,$D4,$D4		# d4 += h3*r1
+	vpaddq		$H2,$D0,$D0		# d0 += h4*s1
+	 vmovdqa	`32*4-0x90`(%rax),$T1	# s2
+
+	vpmuludq	$H0,$T0,$T4		# h0*r0
+	vpmuludq	$H1,$T0,$H2		# h1*r0
+	vpaddq		$T4,$D0,$D0		# d0 += h0*r0
+	vpaddq		$H2,$D1,$D1		# d1 += h1*r0
+	vpmuludq	$H3,$T0,$T4		# h3*r0
+	vpmuludq	$H4,$T0,$H2		# h4*r0
+	 vmovdqu	16*0($inp),%x#$T0	# load input
+	vpaddq		$T4,$D3,$D3		# d3 += h3*r0
+	vpaddq		$H2,$D4,$D4		# d4 += h4*r0
+	 vinserti128	\$1,16*2($inp),$T0,$T0
+
+	vpmuludq	$H3,$T1,$T4		# h3*s2
+	vpmuludq	$H4,$T1,$H2		# h4*s2
+	 vmovdqu	16*1($inp),%x#$T1
+	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
+	vpaddq		$H2,$D1,$D1		# d1 += h4*s2
+	 vmovdqa	`32*5-0x90`(%rax),$H2	# r3
+	vpmuludq	$H1,$T2,$T4		# h1*r2
+	vpmuludq	$H0,$T2,$T2		# h0*r2
+	vpaddq		$T4,$D3,$D3		# d3 += h1*r2
+	vpaddq		$T2,$D2,$D2		# d2 += h0*r2
+	 vinserti128	\$1,16*3($inp),$T1,$T1
+	 lea		16*4($inp),$inp
+
+	vpmuludq	$H1,$H2,$T4		# h1*r3
+	vpmuludq	$H0,$H2,$H2		# h0*r3
+	 vpsrldq	\$6,$T0,$T2		# splat input
+	vpaddq		$T4,$D4,$D4		# d4 += h1*r3
+	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
+	vpmuludq	$H3,$T3,$T4		# h3*s3
+	vpmuludq	$H4,$T3,$H2		# h4*s3
+	 vpsrldq	\$6,$T1,$T3
+	vpaddq		$T4,$D1,$D1		# d1 += h3*s3
+	vpaddq		$H2,$D2,$D2		# d2 += h4*s3
+	 vpunpckhqdq	$T1,$T0,$T4		# 4
+
+	vpmuludq	$H3,$S4,$H3		# h3*s4
+	vpmuludq	$H4,$S4,$H4		# h4*s4
+	 vpunpcklqdq	$T1,$T0,$T0		# 0:1
+	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*r4
+	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*r4
+	 vpunpcklqdq	$T3,$T2,$T3		# 2:3
+	vpmuludq	`32*7-0x90`(%rax),$H0,$H4	# h0*r4
+	vpmuludq	$H1,$S4,$H0		# h1*s4
+	vmovdqa		64(%rcx),$MASK		# .Lmask26
+	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
+	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
+
+	################################################################
+	# lazy reduction (interleaved with tail of input splat)
+
+	vpsrlq		\$26,$H3,$D3
+	vpand		$MASK,$H3,$H3
+	vpaddq		$D3,$H4,$H4		# h3 -> h4
+
+	vpsrlq		\$26,$H0,$D0
+	vpand		$MASK,$H0,$H0
+	vpaddq		$D0,$D1,$H1		# h0 -> h1
+
+	vpsrlq		\$26,$H4,$D4
+	vpand		$MASK,$H4,$H4
+
+	 vpsrlq		\$4,$T3,$T2
+
+	vpsrlq		\$26,$H1,$D1
+	vpand		$MASK,$H1,$H1
+	vpaddq		$D1,$H2,$H2		# h1 -> h2
+
+	vpaddq		$D4,$H0,$H0
+	vpsllq		\$2,$D4,$D4
+	vpaddq		$D4,$H0,$H0		# h4 -> h0
+
+	 vpand		$MASK,$T2,$T2		# 2
+	 vpsrlq		\$26,$T0,$T1
+
+	vpsrlq		\$26,$H2,$D2
+	vpand		$MASK,$H2,$H2
+	vpaddq		$D2,$H3,$H3		# h2 -> h3
+
+	 vpaddq		$T2,$H2,$H2		# modulo-scheduled
+	 vpsrlq		\$30,$T3,$T3
+
+	vpsrlq		\$26,$H0,$D0
+	vpand		$MASK,$H0,$H0
+	vpaddq		$D0,$H1,$H1		# h0 -> h1
+
+	 vpsrlq		\$40,$T4,$T4		# 4
+
+	vpsrlq		\$26,$H3,$D3
+	vpand		$MASK,$H3,$H3
+	vpaddq		$D3,$H4,$H4		# h3 -> h4
+
+	 vpand		$MASK,$T0,$T0		# 0
+	 vpand		$MASK,$T1,$T1		# 1
+	 vpand		$MASK,$T3,$T3		# 3
+	 vpor		32(%rcx),$T4,$T4	# padbit, yes, always
+
+	sub		\$64,$len
+	jnz		.Loop_avx2$suffix
+
+	.byte		0x66,0x90
+.Ltail_avx2$suffix:
+	################################################################
+	# while above multiplications were by r^4 in all lanes, in last
+	# iteration we multiply least significant lane by r^4 and most
+	# significant one by r, so copy of above except that references
+	# to the precomputed table are displaced by 4...
+
+	#vpaddq		$H2,$T2,$H2		# accumulate input
+	vpaddq		$H0,$T0,$H0
+	vmovdqu		`32*0+4`(%rsp),$T0	# r0^4
+	vpaddq		$H1,$T1,$H1
+	vmovdqu		`32*1+4`(%rsp),$T1	# r1^4
+	vpaddq		$H3,$T3,$H3
+	vmovdqu		`32*3+4`(%rsp),$T2	# r2^4
+	vpaddq		$H4,$T4,$H4
+	vmovdqu		`32*6+4-0x90`(%rax),$T3	# s3^4
+	vmovdqu		`32*8+4-0x90`(%rax),$S4	# s4^4
+
+	vpmuludq	$H2,$T0,$D2		# d2 = h2*r0
+	vpmuludq	$H2,$T1,$D3		# d3 = h2*r1
+	vpmuludq	$H2,$T2,$D4		# d4 = h2*r2
+	vpmuludq	$H2,$T3,$D0		# d0 = h2*s3
+	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
+
+	vpmuludq	$H0,$T1,$T4		# h0*r1
+	vpmuludq	$H1,$T1,$H2		# h1*r1
+	vpaddq		$T4,$D1,$D1		# d1 += h0*r1
+	vpaddq		$H2,$D2,$D2		# d2 += h1*r1
+	vpmuludq	$H3,$T1,$T4		# h3*r1
+	vpmuludq	`32*2+4`(%rsp),$H4,$H2	# h4*s1
+	vpaddq		$T4,$D4,$D4		# d4 += h3*r1
+	vpaddq		$H2,$D0,$D0		# d0 += h4*s1
+
+	vpmuludq	$H0,$T0,$T4		# h0*r0
+	vpmuludq	$H1,$T0,$H2		# h1*r0
+	vpaddq		$T4,$D0,$D0		# d0 += h0*r0
+	 vmovdqu	`32*4+4-0x90`(%rax),$T1	# s2
+	vpaddq		$H2,$D1,$D1		# d1 += h1*r0
+	vpmuludq	$H3,$T0,$T4		# h3*r0
+	vpmuludq	$H4,$T0,$H2		# h4*r0
+	vpaddq		$T4,$D3,$D3		# d3 += h3*r0
+	vpaddq		$H2,$D4,$D4		# d4 += h4*r0
+
+	vpmuludq	$H3,$T1,$T4		# h3*s2
+	vpmuludq	$H4,$T1,$H2		# h4*s2
+	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
+	vpaddq		$H2,$D1,$D1		# d1 += h4*s2
+	 vmovdqu	`32*5+4-0x90`(%rax),$H2	# r3
+	vpmuludq	$H1,$T2,$T4		# h1*r2
+	vpmuludq	$H0,$T2,$T2		# h0*r2
+	vpaddq		$T4,$D3,$D3		# d3 += h1*r2
+	vpaddq		$T2,$D2,$D2		# d2 += h0*r2
+
+	vpmuludq	$H1,$H2,$T4		# h1*r3
+	vpmuludq	$H0,$H2,$H2		# h0*r3
+	vpaddq		$T4,$D4,$D4		# d4 += h1*r3
+	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
+	vpmuludq	$H3,$T3,$T4		# h3*s3
+	vpmuludq	$H4,$T3,$H2		# h4*s3
+	vpaddq		$T4,$D1,$D1		# d1 += h3*s3
+	vpaddq		$H2,$D2,$D2		# d2 += h4*s3
+
+	vpmuludq	$H3,$S4,$H3		# h3*s4
+	vpmuludq	$H4,$S4,$H4		# h4*s4
+	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*r4
+	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*r4
+	vpmuludq	`32*7+4-0x90`(%rax),$H0,$H4		# h0*r4
+	vpmuludq	$H1,$S4,$H0		# h1*s4
+	vmovdqa		64(%rcx),$MASK		# .Lmask26
+	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
+	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
+
+	################################################################
+	# horizontal addition
+
+	vpsrldq		\$8,$D1,$T1
+	vpsrldq		\$8,$H2,$T2
+	vpsrldq		\$8,$H3,$T3
+	vpsrldq		\$8,$H4,$T4
+	vpsrldq		\$8,$H0,$T0
+	vpaddq		$T1,$D1,$D1
+	vpaddq		$T2,$H2,$H2
+	vpaddq		$T3,$H3,$H3
+	vpaddq		$T4,$H4,$H4
+	vpaddq		$T0,$H0,$H0
+
+	vpermq		\$0x2,$H3,$T3
+	vpermq		\$0x2,$H4,$T4
+	vpermq		\$0x2,$H0,$T0
+	vpermq		\$0x2,$D1,$T1
+	vpermq		\$0x2,$H2,$T2
+	vpaddq		$T3,$H3,$H3
+	vpaddq		$T4,$H4,$H4
+	vpaddq		$T0,$H0,$H0
+	vpaddq		$T1,$D1,$D1
+	vpaddq		$T2,$H2,$H2
+
+	################################################################
+	# lazy reduction
+
+	vpsrlq		\$26,$H3,$D3
+	vpand		$MASK,$H3,$H3
+	vpaddq		$D3,$H4,$H4		# h3 -> h4
+
+	vpsrlq		\$26,$H0,$D0
+	vpand		$MASK,$H0,$H0
+	vpaddq		$D0,$D1,$H1		# h0 -> h1
+
+	vpsrlq		\$26,$H4,$D4
+	vpand		$MASK,$H4,$H4
+
+	vpsrlq		\$26,$H1,$D1
+	vpand		$MASK,$H1,$H1
+	vpaddq		$D1,$H2,$H2		# h1 -> h2
+
+	vpaddq		$D4,$H0,$H0
+	vpsllq		\$2,$D4,$D4
+	vpaddq		$D4,$H0,$H0		# h4 -> h0
+
+	vpsrlq		\$26,$H2,$D2
+	vpand		$MASK,$H2,$H2
+	vpaddq		$D2,$H3,$H3		# h2 -> h3
+
+	vpsrlq		\$26,$H0,$D0
+	vpand		$MASK,$H0,$H0
+	vpaddq		$D0,$H1,$H1		# h0 -> h1
+
+	vpsrlq		\$26,$H3,$D3
+	vpand		$MASK,$H3,$H3
+	vpaddq		$D3,$H4,$H4		# h3 -> h4
+
+	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
+	vmovd		%x#$H1,`4*1-48-64`($ctx)
+	vmovd		%x#$H2,`4*2-48-64`($ctx)
+	vmovd		%x#$H3,`4*3-48-64`($ctx)
+	vmovd		%x#$H4,`4*4-48-64`($ctx)
+___
+$code.=<<___	if ($win64);
+	vmovdqa		-0xb0(%r10),%xmm6
+	vmovdqa		-0xa0(%r10),%xmm7
+	vmovdqa		-0x90(%r10),%xmm8
+	vmovdqa		-0x80(%r10),%xmm9
+	vmovdqa		-0x70(%r10),%xmm10
+	vmovdqa		-0x60(%r10),%xmm11
+	vmovdqa		-0x50(%r10),%xmm12
+	vmovdqa		-0x40(%r10),%xmm13
+	vmovdqa		-0x30(%r10),%xmm14
+	vmovdqa		-0x20(%r10),%xmm15
+	lea		-8(%r10),%rsp
+.Ldo_avx2_epilogue$suffix:
+___
+$code.=<<___	if (!$win64);
+	lea		-8(%r10),%rsp
+.cfi_def_cfa_register	%rsp
+___
+$code.=<<___;
+	vzeroupper
+	RET
+.cfi_endproc
+___
+if($avx > 2 && $avx512) {
+my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
+my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
+my $PADBIT="%zmm30";
+
+map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));		# switch to %zmm domain
+map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
+map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
+map(s/%y/%z/,($MASK));
+
+$code.=<<___;
+.cfi_startproc
+.Lblocks_avx512:
+	mov		\$15,%eax
+	kmovw		%eax,%k2
+___
+$code.=<<___	if (!$win64);
+	lea		8(%rsp),%r10
+.cfi_def_cfa_register	%r10
+	sub		\$0x128,%rsp
+___
+$code.=<<___	if ($win64);
+	lea		8(%rsp),%r10
+	sub		\$0x1c8,%rsp
+	vmovdqa		%xmm6,-0xb0(%r10)
+	vmovdqa		%xmm7,-0xa0(%r10)
+	vmovdqa		%xmm8,-0x90(%r10)
+	vmovdqa		%xmm9,-0x80(%r10)
+	vmovdqa		%xmm10,-0x70(%r10)
+	vmovdqa		%xmm11,-0x60(%r10)
+	vmovdqa		%xmm12,-0x50(%r10)
+	vmovdqa		%xmm13,-0x40(%r10)
+	vmovdqa		%xmm14,-0x30(%r10)
+	vmovdqa		%xmm15,-0x20(%r10)
+.Ldo_avx512_body:
+___
+$code.=<<___;
+	lea		.Lconst(%rip),%rcx
+	lea		48+64($ctx),$ctx	# size optimization
+	vmovdqa		96(%rcx),%y#$T2		# .Lpermd_avx2
+
+	# expand pre-calculated table
+	vmovdqu		`16*0-64`($ctx),%x#$D0	# will become expanded ${R0}
+	and		\$-512,%rsp
+	vmovdqu		`16*1-64`($ctx),%x#$D1	# will become ... ${R1}
+	mov		\$0x20,%rax
+	vmovdqu		`16*2-64`($ctx),%x#$T0	# ... ${S1}
+	vmovdqu		`16*3-64`($ctx),%x#$D2	# ... ${R2}
+	vmovdqu		`16*4-64`($ctx),%x#$T1	# ... ${S2}
+	vmovdqu		`16*5-64`($ctx),%x#$D3	# ... ${R3}
+	vmovdqu		`16*6-64`($ctx),%x#$T3	# ... ${S3}
+	vmovdqu		`16*7-64`($ctx),%x#$D4	# ... ${R4}
+	vmovdqu		`16*8-64`($ctx),%x#$T4	# ... ${S4}
+	vpermd		$D0,$T2,$R0		# 00003412 -> 14243444
+	vpbroadcastq	64(%rcx),$MASK		# .Lmask26
+	vpermd		$D1,$T2,$R1
+	vpermd		$T0,$T2,$S1
+	vpermd		$D2,$T2,$R2
+	vmovdqa64	$R0,0x00(%rsp){%k2}	# save in case $len%128 != 0
+	 vpsrlq		\$32,$R0,$T0		# 14243444 -> 01020304
+	vpermd		$T1,$T2,$S2
+	vmovdqu64	$R1,0x00(%rsp,%rax){%k2}
+	 vpsrlq		\$32,$R1,$T1
+	vpermd		$D3,$T2,$R3
+	vmovdqa64	$S1,0x40(%rsp){%k2}
+	vpermd		$T3,$T2,$S3
+	vpermd		$D4,$T2,$R4
+	vmovdqu64	$R2,0x40(%rsp,%rax){%k2}
+	vpermd		$T4,$T2,$S4
+	vmovdqa64	$S2,0x80(%rsp){%k2}
+	vmovdqu64	$R3,0x80(%rsp,%rax){%k2}
+	vmovdqa64	$S3,0xc0(%rsp){%k2}
+	vmovdqu64	$R4,0xc0(%rsp,%rax){%k2}
+	vmovdqa64	$S4,0x100(%rsp){%k2}
+
+	################################################################
+	# calculate 5th through 8th powers of the key
+	#
+	# d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
+	# d1 = r0'*r1 + r1'*r0   + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
+	# d2 = r0'*r2 + r1'*r1   + r2'*r0   + r3'*5*r4 + r4'*5*r3
+	# d3 = r0'*r3 + r1'*r2   + r2'*r1   + r3'*r0   + r4'*5*r4
+	# d4 = r0'*r4 + r1'*r3   + r2'*r2   + r3'*r1   + r4'*r0
+
+	vpmuludq	$T0,$R0,$D0		# d0 = r0'*r0
+	vpmuludq	$T0,$R1,$D1		# d1 = r0'*r1
+	vpmuludq	$T0,$R2,$D2		# d2 = r0'*r2
+	vpmuludq	$T0,$R3,$D3		# d3 = r0'*r3
+	vpmuludq	$T0,$R4,$D4		# d4 = r0'*r4
+	 vpsrlq		\$32,$R2,$T2
+
+	vpmuludq	$T1,$S4,$M0
+	vpmuludq	$T1,$R0,$M1
+	vpmuludq	$T1,$R1,$M2
+	vpmuludq	$T1,$R2,$M3
+	vpmuludq	$T1,$R3,$M4
+	 vpsrlq		\$32,$R3,$T3
+	vpaddq		$M0,$D0,$D0		# d0 += r1'*5*r4
+	vpaddq		$M1,$D1,$D1		# d1 += r1'*r0
+	vpaddq		$M2,$D2,$D2		# d2 += r1'*r1
+	vpaddq		$M3,$D3,$D3		# d3 += r1'*r2
+	vpaddq		$M4,$D4,$D4		# d4 += r1'*r3
+
+	vpmuludq	$T2,$S3,$M0
+	vpmuludq	$T2,$S4,$M1
+	vpmuludq	$T2,$R1,$M3
+	vpmuludq	$T2,$R2,$M4
+	vpmuludq	$T2,$R0,$M2
+	 vpsrlq		\$32,$R4,$T4
+	vpaddq		$M0,$D0,$D0		# d0 += r2'*5*r3
+	vpaddq		$M1,$D1,$D1		# d1 += r2'*5*r4
+	vpaddq		$M3,$D3,$D3		# d3 += r2'*r1
+	vpaddq		$M4,$D4,$D4		# d4 += r2'*r2
+	vpaddq		$M2,$D2,$D2		# d2 += r2'*r0
+
+	vpmuludq	$T3,$S2,$M0
+	vpmuludq	$T3,$R0,$M3
+	vpmuludq	$T3,$R1,$M4
+	vpmuludq	$T3,$S3,$M1
+	vpmuludq	$T3,$S4,$M2
+	vpaddq		$M0,$D0,$D0		# d0 += r3'*5*r2
+	vpaddq		$M3,$D3,$D3		# d3 += r3'*r0
+	vpaddq		$M4,$D4,$D4		# d4 += r3'*r1
+	vpaddq		$M1,$D1,$D1		# d1 += r3'*5*r3
+	vpaddq		$M2,$D2,$D2		# d2 += r3'*5*r4
+
+	vpmuludq	$T4,$S4,$M3
+	vpmuludq	$T4,$R0,$M4
+	vpmuludq	$T4,$S1,$M0
+	vpmuludq	$T4,$S2,$M1
+	vpmuludq	$T4,$S3,$M2
+	vpaddq		$M3,$D3,$D3		# d3 += r2'*5*r4
+	vpaddq		$M4,$D4,$D4		# d4 += r2'*r0
+	vpaddq		$M0,$D0,$D0		# d0 += r2'*5*r1
+	vpaddq		$M1,$D1,$D1		# d1 += r2'*5*r2
+	vpaddq		$M2,$D2,$D2		# d2 += r2'*5*r3
+
+	################################################################
+	# load input
+	vmovdqu64	16*0($inp),%z#$T3
+	vmovdqu64	16*4($inp),%z#$T4
+	lea		16*8($inp),$inp
+
+	################################################################
+	# lazy reduction
+
+	vpsrlq		\$26,$D3,$M3
+	vpandq		$MASK,$D3,$D3
+	vpaddq		$M3,$D4,$D4		# d3 -> d4
+
+	vpsrlq		\$26,$D0,$M0
+	vpandq		$MASK,$D0,$D0
+	vpaddq		$M0,$D1,$D1		# d0 -> d1
+
+	vpsrlq		\$26,$D4,$M4
+	vpandq		$MASK,$D4,$D4
+
+	vpsrlq		\$26,$D1,$M1
+	vpandq		$MASK,$D1,$D1
+	vpaddq		$M1,$D2,$D2		# d1 -> d2
+
+	vpaddq		$M4,$D0,$D0
+	vpsllq		\$2,$M4,$M4
+	vpaddq		$M4,$D0,$D0		# d4 -> d0
+
+	vpsrlq		\$26,$D2,$M2
+	vpandq		$MASK,$D2,$D2
+	vpaddq		$M2,$D3,$D3		# d2 -> d3
+
+	vpsrlq		\$26,$D0,$M0
+	vpandq		$MASK,$D0,$D0
+	vpaddq		$M0,$D1,$D1		# d0 -> d1
+
+	vpsrlq		\$26,$D3,$M3
+	vpandq		$MASK,$D3,$D3
+	vpaddq		$M3,$D4,$D4		# d3 -> d4
+
+	################################################################
+	# at this point we have 14243444 in $R0-$S4 and 05060708 in
+	# $D0-$D4, ...
+
+	vpunpcklqdq	$T4,$T3,$T0	# transpose input
+	vpunpckhqdq	$T4,$T3,$T4
+
+	# ... since input 64-bit lanes are ordered as 73625140, we could
+	# "vperm" it to 76543210 (here and in each loop iteration), *or*
+	# we could just flow along, hence the goal for $R0-$S4 is
+	# 1858286838784888 ...
+
+	vmovdqa32	128(%rcx),$M0		# .Lpermd_avx512:
+	mov		\$0x7777,%eax
+	kmovw		%eax,%k1
+
+	vpermd		$R0,$M0,$R0		# 14243444 -> 1---2---3---4---
+	vpermd		$R1,$M0,$R1
+	vpermd		$R2,$M0,$R2
+	vpermd		$R3,$M0,$R3
+	vpermd		$R4,$M0,$R4
+
+	vpermd		$D0,$M0,${R0}{%k1}	# 05060708 -> 1858286838784888
+	vpermd		$D1,$M0,${R1}{%k1}
+	vpermd		$D2,$M0,${R2}{%k1}
+	vpermd		$D3,$M0,${R3}{%k1}
+	vpermd		$D4,$M0,${R4}{%k1}
+
+	vpslld		\$2,$R1,$S1		# *5
+	vpslld		\$2,$R2,$S2
+	vpslld		\$2,$R3,$S3
+	vpslld		\$2,$R4,$S4
+	vpaddd		$R1,$S1,$S1
+	vpaddd		$R2,$S2,$S2
+	vpaddd		$R3,$S3,$S3
+	vpaddd		$R4,$S4,$S4
+
+	vpbroadcastq	32(%rcx),$PADBIT	# .L129
+
+	vpsrlq		\$52,$T0,$T2		# splat input
+	vpsllq		\$12,$T4,$T3
+	vporq		$T3,$T2,$T2
+	vpsrlq		\$26,$T0,$T1
+	vpsrlq		\$14,$T4,$T3
+	vpsrlq		\$40,$T4,$T4		# 4
+	vpandq		$MASK,$T2,$T2		# 2
+	vpandq		$MASK,$T0,$T0		# 0
+	#vpandq		$MASK,$T1,$T1		# 1
+	#vpandq		$MASK,$T3,$T3		# 3
+	#vporq		$PADBIT,$T4,$T4		# padbit, yes, always
+
+	vpaddq		$H2,$T2,$H2		# accumulate input
+	sub		\$192,$len
+	jbe		.Ltail_avx512
+	jmp		.Loop_avx512
+
+.align	32
+.Loop_avx512:
+	################################################################
+	# ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
+	# ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
+	# ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
+	# ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
+	# ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
+	# ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
+	# ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
+	# ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
+	#   \________/\___________/
+	################################################################
+	#vpaddq		$H2,$T2,$H2		# accumulate input
+
+	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+	#
+	# however, as h2 is "chronologically" first one available pull
+	# corresponding operations up, so it's
+	#
+	# d3 = h2*r1   + h0*r3 + h1*r2   + h3*r0 + h4*5*r4
+	# d4 = h2*r2   + h0*r4 + h1*r3   + h3*r1 + h4*r0
+	# d0 = h2*5*r3 + h0*r0 + h1*5*r4         + h3*5*r2 + h4*5*r1
+	# d1 = h2*5*r4 + h0*r1           + h1*r0 + h3*5*r3 + h4*5*r2
+	# d2 = h2*r0           + h0*r2   + h1*r1 + h3*5*r4 + h4*5*r3
+
+	vpmuludq	$H2,$R1,$D3		# d3 = h2*r1
+	 vpaddq		$H0,$T0,$H0
+	vpmuludq	$H2,$R2,$D4		# d4 = h2*r2
+	 vpandq		$MASK,$T1,$T1		# 1
+	vpmuludq	$H2,$S3,$D0		# d0 = h2*s3
+	 vpandq		$MASK,$T3,$T3		# 3
+	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
+	 vporq		$PADBIT,$T4,$T4		# padbit, yes, always
+	vpmuludq	$H2,$R0,$D2		# d2 = h2*r0
+	 vpaddq		$H1,$T1,$H1		# accumulate input
+	 vpaddq		$H3,$T3,$H3
+	 vpaddq		$H4,$T4,$H4
+
+	  vmovdqu64	16*0($inp),$T3		# load input
+	  vmovdqu64	16*4($inp),$T4
+	  lea		16*8($inp),$inp
+	vpmuludq	$H0,$R3,$M3
+	vpmuludq	$H0,$R4,$M4
+	vpmuludq	$H0,$R0,$M0
+	vpmuludq	$H0,$R1,$M1
+	vpaddq		$M3,$D3,$D3		# d3 += h0*r3
+	vpaddq		$M4,$D4,$D4		# d4 += h0*r4
+	vpaddq		$M0,$D0,$D0		# d0 += h0*r0
+	vpaddq		$M1,$D1,$D1		# d1 += h0*r1
+
+	vpmuludq	$H1,$R2,$M3
+	vpmuludq	$H1,$R3,$M4
+	vpmuludq	$H1,$S4,$M0
+	vpmuludq	$H0,$R2,$M2
+	vpaddq		$M3,$D3,$D3		# d3 += h1*r2
+	vpaddq		$M4,$D4,$D4		# d4 += h1*r3
+	vpaddq		$M0,$D0,$D0		# d0 += h1*s4
+	vpaddq		$M2,$D2,$D2		# d2 += h0*r2
+
+	  vpunpcklqdq	$T4,$T3,$T0		# transpose input
+	  vpunpckhqdq	$T4,$T3,$T4
+
+	vpmuludq	$H3,$R0,$M3
+	vpmuludq	$H3,$R1,$M4
+	vpmuludq	$H1,$R0,$M1
+	vpmuludq	$H1,$R1,$M2
+	vpaddq		$M3,$D3,$D3		# d3 += h3*r0
+	vpaddq		$M4,$D4,$D4		# d4 += h3*r1
+	vpaddq		$M1,$D1,$D1		# d1 += h1*r0
+	vpaddq		$M2,$D2,$D2		# d2 += h1*r1
+
+	vpmuludq	$H4,$S4,$M3
+	vpmuludq	$H4,$R0,$M4
+	vpmuludq	$H3,$S2,$M0
+	vpmuludq	$H3,$S3,$M1
+	vpaddq		$M3,$D3,$D3		# d3 += h4*s4
+	vpmuludq	$H3,$S4,$M2
+	vpaddq		$M4,$D4,$D4		# d4 += h4*r0
+	vpaddq		$M0,$D0,$D0		# d0 += h3*s2
+	vpaddq		$M1,$D1,$D1		# d1 += h3*s3
+	vpaddq		$M2,$D2,$D2		# d2 += h3*s4
+
+	vpmuludq	$H4,$S1,$M0
+	vpmuludq	$H4,$S2,$M1
+	vpmuludq	$H4,$S3,$M2
+	vpaddq		$M0,$D0,$H0		# h0 = d0 + h4*s1
+	vpaddq		$M1,$D1,$H1		# h1 = d2 + h4*s2
+	vpaddq		$M2,$D2,$H2		# h2 = d3 + h4*s3
+
+	################################################################
+	# lazy reduction (interleaved with input splat)
+
+	 vpsrlq		\$52,$T0,$T2		# splat input
+	 vpsllq		\$12,$T4,$T3
+
+	vpsrlq		\$26,$D3,$H3
+	vpandq		$MASK,$D3,$D3
+	vpaddq		$H3,$D4,$H4		# h3 -> h4
+
+	 vporq		$T3,$T2,$T2
+
+	vpsrlq		\$26,$H0,$D0
+	vpandq		$MASK,$H0,$H0
+	vpaddq		$D0,$H1,$H1		# h0 -> h1
+
+	 vpandq		$MASK,$T2,$T2		# 2
+
+	vpsrlq		\$26,$H4,$D4
+	vpandq		$MASK,$H4,$H4
+
+	vpsrlq		\$26,$H1,$D1
+	vpandq		$MASK,$H1,$H1
+	vpaddq		$D1,$H2,$H2		# h1 -> h2
+
+	vpaddq		$D4,$H0,$H0
+	vpsllq		\$2,$D4,$D4
+	vpaddq		$D4,$H0,$H0		# h4 -> h0
+
+	 vpaddq		$T2,$H2,$H2		# modulo-scheduled
+	 vpsrlq		\$26,$T0,$T1
+
+	vpsrlq		\$26,$H2,$D2
+	vpandq		$MASK,$H2,$H2
+	vpaddq		$D2,$D3,$H3		# h2 -> h3
+
+	 vpsrlq		\$14,$T4,$T3
+
+	vpsrlq		\$26,$H0,$D0
+	vpandq		$MASK,$H0,$H0
+	vpaddq		$D0,$H1,$H1		# h0 -> h1
+
+	 vpsrlq		\$40,$T4,$T4		# 4
+
+	vpsrlq		\$26,$H3,$D3
+	vpandq		$MASK,$H3,$H3
+	vpaddq		$D3,$H4,$H4		# h3 -> h4
+
+	 vpandq		$MASK,$T0,$T0		# 0
+	 #vpandq	$MASK,$T1,$T1		# 1
+	 #vpandq	$MASK,$T3,$T3		# 3
+	 #vporq		$PADBIT,$T4,$T4		# padbit, yes, always
+
+	sub		\$128,$len
+	ja		.Loop_avx512
+
+.Ltail_avx512:
+	################################################################
+	# while above multiplications were by r^8 in all lanes, in last
+	# iteration we multiply least significant lane by r^8 and most
+	# significant one by r, that's why table gets shifted...
+
+	vpsrlq		\$32,$R0,$R0		# 0105020603070408
+	vpsrlq		\$32,$R1,$R1
+	vpsrlq		\$32,$R2,$R2
+	vpsrlq		\$32,$S3,$S3
+	vpsrlq		\$32,$S4,$S4
+	vpsrlq		\$32,$R3,$R3
+	vpsrlq		\$32,$R4,$R4
+	vpsrlq		\$32,$S1,$S1
+	vpsrlq		\$32,$S2,$S2
+
+	################################################################
+	# load either next or last 64 byte of input
+	lea		($inp,$len),$inp
+
+	#vpaddq		$H2,$T2,$H2		# accumulate input
+	vpaddq		$H0,$T0,$H0
+
+	vpmuludq	$H2,$R1,$D3		# d3 = h2*r1
+	vpmuludq	$H2,$R2,$D4		# d4 = h2*r2
+	vpmuludq	$H2,$S3,$D0		# d0 = h2*s3
+	 vpandq		$MASK,$T1,$T1		# 1
+	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
+	 vpandq		$MASK,$T3,$T3		# 3
+	vpmuludq	$H2,$R0,$D2		# d2 = h2*r0
+	 vporq		$PADBIT,$T4,$T4		# padbit, yes, always
+	 vpaddq		$H1,$T1,$H1		# accumulate input
+	 vpaddq		$H3,$T3,$H3
+	 vpaddq		$H4,$T4,$H4
+
+	  vmovdqu	16*0($inp),%x#$T0
+	vpmuludq	$H0,$R3,$M3
+	vpmuludq	$H0,$R4,$M4
+	vpmuludq	$H0,$R0,$M0
+	vpmuludq	$H0,$R1,$M1
+	vpaddq		$M3,$D3,$D3		# d3 += h0*r3
+	vpaddq		$M4,$D4,$D4		# d4 += h0*r4
+	vpaddq		$M0,$D0,$D0		# d0 += h0*r0
+	vpaddq		$M1,$D1,$D1		# d1 += h0*r1
+
+	  vmovdqu	16*1($inp),%x#$T1
+	vpmuludq	$H1,$R2,$M3
+	vpmuludq	$H1,$R3,$M4
+	vpmuludq	$H1,$S4,$M0
+	vpmuludq	$H0,$R2,$M2
+	vpaddq		$M3,$D3,$D3		# d3 += h1*r2
+	vpaddq		$M4,$D4,$D4		# d4 += h1*r3
+	vpaddq		$M0,$D0,$D0		# d0 += h1*s4
+	vpaddq		$M2,$D2,$D2		# d2 += h0*r2
+
+	  vinserti128	\$1,16*2($inp),%y#$T0,%y#$T0
+	vpmuludq	$H3,$R0,$M3
+	vpmuludq	$H3,$R1,$M4
+	vpmuludq	$H1,$R0,$M1
+	vpmuludq	$H1,$R1,$M2
+	vpaddq		$M3,$D3,$D3		# d3 += h3*r0
+	vpaddq		$M4,$D4,$D4		# d4 += h3*r1
+	vpaddq		$M1,$D1,$D1		# d1 += h1*r0
+	vpaddq		$M2,$D2,$D2		# d2 += h1*r1
+
+	  vinserti128	\$1,16*3($inp),%y#$T1,%y#$T1
+	vpmuludq	$H4,$S4,$M3
+	vpmuludq	$H4,$R0,$M4
+	vpmuludq	$H3,$S2,$M0
+	vpmuludq	$H3,$S3,$M1
+	vpmuludq	$H3,$S4,$M2
+	vpaddq		$M3,$D3,$H3		# h3 = d3 + h4*s4
+	vpaddq		$M4,$D4,$D4		# d4 += h4*r0
+	vpaddq		$M0,$D0,$D0		# d0 += h3*s2
+	vpaddq		$M1,$D1,$D1		# d1 += h3*s3
+	vpaddq		$M2,$D2,$D2		# d2 += h3*s4
+
+	vpmuludq	$H4,$S1,$M0
+	vpmuludq	$H4,$S2,$M1
+	vpmuludq	$H4,$S3,$M2
+	vpaddq		$M0,$D0,$H0		# h0 = d0 + h4*s1
+	vpaddq		$M1,$D1,$H1		# h1 = d2 + h4*s2
+	vpaddq		$M2,$D2,$H2		# h2 = d3 + h4*s3
+
+	################################################################
+	# horizontal addition
+
+	mov		\$1,%eax
+	vpermq		\$0xb1,$H3,$D3
+	vpermq		\$0xb1,$D4,$H4
+	vpermq		\$0xb1,$H0,$D0
+	vpermq		\$0xb1,$H1,$D1
+	vpermq		\$0xb1,$H2,$D2
+	vpaddq		$D3,$H3,$H3
+	vpaddq		$D4,$H4,$H4
+	vpaddq		$D0,$H0,$H0
+	vpaddq		$D1,$H1,$H1
+	vpaddq		$D2,$H2,$H2
+
+	kmovw		%eax,%k3
+	vpermq		\$0x2,$H3,$D3
+	vpermq		\$0x2,$H4,$D4
+	vpermq		\$0x2,$H0,$D0
+	vpermq		\$0x2,$H1,$D1
+	vpermq		\$0x2,$H2,$D2
+	vpaddq		$D3,$H3,$H3
+	vpaddq		$D4,$H4,$H4
+	vpaddq		$D0,$H0,$H0
+	vpaddq		$D1,$H1,$H1
+	vpaddq		$D2,$H2,$H2
+
+	vextracti64x4	\$0x1,$H3,%y#$D3
+	vextracti64x4	\$0x1,$H4,%y#$D4
+	vextracti64x4	\$0x1,$H0,%y#$D0
+	vextracti64x4	\$0x1,$H1,%y#$D1
+	vextracti64x4	\$0x1,$H2,%y#$D2
+	vpaddq		$D3,$H3,${H3}{%k3}{z}	# keep single qword in case
+	vpaddq		$D4,$H4,${H4}{%k3}{z}	# it's passed to .Ltail_avx2
+	vpaddq		$D0,$H0,${H0}{%k3}{z}
+	vpaddq		$D1,$H1,${H1}{%k3}{z}
+	vpaddq		$D2,$H2,${H2}{%k3}{z}
+___
+map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
+map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
+$code.=<<___;
+	################################################################
+	# lazy reduction (interleaved with input splat)
+
+	vpsrlq		\$26,$H3,$D3
+	vpand		$MASK,$H3,$H3
+	 vpsrldq	\$6,$T0,$T2		# splat input
+	 vpsrldq	\$6,$T1,$T3
+	 vpunpckhqdq	$T1,$T0,$T4		# 4
+	vpaddq		$D3,$H4,$H4		# h3 -> h4
+
+	vpsrlq		\$26,$H0,$D0
+	vpand		$MASK,$H0,$H0
+	 vpunpcklqdq	$T3,$T2,$T2		# 2:3
+	 vpunpcklqdq	$T1,$T0,$T0		# 0:1
+	vpaddq		$D0,$H1,$H1		# h0 -> h1
+
+	vpsrlq		\$26,$H4,$D4
+	vpand		$MASK,$H4,$H4
+
+	vpsrlq		\$26,$H1,$D1
+	vpand		$MASK,$H1,$H1
+	 vpsrlq		\$30,$T2,$T3
+	 vpsrlq		\$4,$T2,$T2
+	vpaddq		$D1,$H2,$H2		# h1 -> h2
+
+	vpaddq		$D4,$H0,$H0
+	vpsllq		\$2,$D4,$D4
+	 vpsrlq		\$26,$T0,$T1
+	 vpsrlq		\$40,$T4,$T4		# 4
+	vpaddq		$D4,$H0,$H0		# h4 -> h0
+
+	vpsrlq		\$26,$H2,$D2
+	vpand		$MASK,$H2,$H2
+	 vpand		$MASK,$T2,$T2		# 2
+	 vpand		$MASK,$T0,$T0		# 0
+	vpaddq		$D2,$H3,$H3		# h2 -> h3
+
+	vpsrlq		\$26,$H0,$D0
+	vpand		$MASK,$H0,$H0
+	 vpaddq		$H2,$T2,$H2		# accumulate input for .Ltail_avx2
+	 vpand		$MASK,$T1,$T1		# 1
+	vpaddq		$D0,$H1,$H1		# h0 -> h1
+
+	vpsrlq		\$26,$H3,$D3
+	vpand		$MASK,$H3,$H3
+	 vpand		$MASK,$T3,$T3		# 3
+	 vpor		32(%rcx),$T4,$T4	# padbit, yes, always
+	vpaddq		$D3,$H4,$H4		# h3 -> h4
+
+	lea		0x90(%rsp),%rax		# size optimization for .Ltail_avx2
+	add		\$64,$len
+	jnz		.Ltail_avx2$suffix
+
+	vpsubq		$T2,$H2,$H2		# undo input accumulation
+	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
+	vmovd		%x#$H1,`4*1-48-64`($ctx)
+	vmovd		%x#$H2,`4*2-48-64`($ctx)
+	vmovd		%x#$H3,`4*3-48-64`($ctx)
+	vmovd		%x#$H4,`4*4-48-64`($ctx)
+	vzeroall
+___
+$code.=<<___	if ($win64);
+	movdqa		-0xb0(%r10),%xmm6
+	movdqa		-0xa0(%r10),%xmm7
+	movdqa		-0x90(%r10),%xmm8
+	movdqa		-0x80(%r10),%xmm9
+	movdqa		-0x70(%r10),%xmm10
+	movdqa		-0x60(%r10),%xmm11
+	movdqa		-0x50(%r10),%xmm12
+	movdqa		-0x40(%r10),%xmm13
+	movdqa		-0x30(%r10),%xmm14
+	movdqa		-0x20(%r10),%xmm15
+	lea		-8(%r10),%rsp
+.Ldo_avx512_epilogue:
+___
+$code.=<<___	if (!$win64);
+	lea		-8(%r10),%rsp
+.cfi_def_cfa_register	%rsp
+___
+$code.=<<___;
+	RET
+.cfi_endproc
+___
+
+}
+
+}
+
+&declare_function("poly1305_blocks_avx2", 32, 4);
+poly1305_blocks_avxN(0);
+&end_function("poly1305_blocks_avx2");
+
+#######################################################################
+if ($avx>2) {
+# On entry we have input length divisible by 64. But since inner loop
+# processes 128 bytes per iteration, cases when length is not divisible
+# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
+# reason stack layout is kept identical to poly1305_blocks_avx2. If not
+# for this tail, we wouldn't have to even allocate stack frame...
+
+&declare_function("poly1305_blocks_avx512", 32, 4);
+poly1305_blocks_avxN(1);
+&end_function("poly1305_blocks_avx512");
+
+if (!$kernel && $avx>3) {
+########################################################################
+# VPMADD52 version using 2^44 radix.
+#
+# One can argue that base 2^52 would be more natural. Well, even though
+# some operations would be more natural, one has to recognize couple of
+# things. Base 2^52 doesn't provide advantage over base 2^44 if you look
+# at amount of multiply-n-accumulate operations. Secondly, it makes it
+# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
+# reference implementations], which means that more such operations
+# would have to be performed in inner loop, which in turn makes critical
+# path longer. In other words, even though base 2^44 reduction might
+# look less elegant, overall critical path is actually shorter...
+
+########################################################################
+# Layout of opaque area is following.
+#
+#	unsigned __int64 h[3];		# current hash value base 2^44
+#	unsigned __int64 s[2];		# key value*20 base 2^44
+#	unsigned __int64 r[3];		# key value base 2^44
+#	struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
+#					# r^n positions reflect
+#					# placement in register, not
+#					# memory, R[3] is R[1]*20
+
+$code.=<<___;
+.type	poly1305_init_base2_44,\@function,3
+.align	32
+poly1305_init_base2_44:
+	xor	%eax,%eax
+	mov	%rax,0($ctx)		# initialize hash value
+	mov	%rax,8($ctx)
+	mov	%rax,16($ctx)
+
+.Linit_base2_44:
+	lea	poly1305_blocks_vpmadd52(%rip),%r10
+	lea	poly1305_emit_base2_44(%rip),%r11
+
+	mov	\$0x0ffffffc0fffffff,%rax
+	mov	\$0x0ffffffc0ffffffc,%rcx
+	and	0($inp),%rax
+	mov	\$0x00000fffffffffff,%r8
+	and	8($inp),%rcx
+	mov	\$0x00000fffffffffff,%r9
+	and	%rax,%r8
+	shrd	\$44,%rcx,%rax
+	mov	%r8,40($ctx)		# r0
+	and	%r9,%rax
+	shr	\$24,%rcx
+	mov	%rax,48($ctx)		# r1
+	lea	(%rax,%rax,4),%rax	# *5
+	mov	%rcx,56($ctx)		# r2
+	shl	\$2,%rax		# magic <<2
+	lea	(%rcx,%rcx,4),%rcx	# *5
+	shl	\$2,%rcx		# magic <<2
+	mov	%rax,24($ctx)		# s1
+	mov	%rcx,32($ctx)		# s2
+	movq	\$-1,64($ctx)		# write impossible value
+___
+$code.=<<___	if ($flavour !~ /elf32/);
+	mov	%r10,0(%rdx)
+	mov	%r11,8(%rdx)
+___
+$code.=<<___	if ($flavour =~ /elf32/);
+	mov	%r10d,0(%rdx)
+	mov	%r11d,4(%rdx)
+___
+$code.=<<___;
+	mov	\$1,%eax
+	RET
+.size	poly1305_init_base2_44,.-poly1305_init_base2_44
+___
+{
+my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
+my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
+my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
+
+$code.=<<___;
+.type	poly1305_blocks_vpmadd52,\@function,4
+.align	32
+poly1305_blocks_vpmadd52:
+	shr	\$4,$len
+	jz	.Lno_data_vpmadd52		# too short
+
+	shl	\$40,$padbit
+	mov	64($ctx),%r8			# peek on power of the key
+
+	# if powers of the key are not calculated yet, process up to 3
+	# blocks with this single-block subroutine, otherwise ensure that
+	# length is divisible by 2 blocks and pass the rest down to next
+	# subroutine...
+
+	mov	\$3,%rax
+	mov	\$1,%r10
+	cmp	\$4,$len			# is input long
+	cmovae	%r10,%rax
+	test	%r8,%r8				# is power value impossible?
+	cmovns	%r10,%rax
+
+	and	$len,%rax			# is input of favourable length?
+	jz	.Lblocks_vpmadd52_4x
+
+	sub		%rax,$len
+	mov		\$7,%r10d
+	mov		\$1,%r11d
+	kmovw		%r10d,%k7
+	lea		.L2_44_inp_permd(%rip),%r10
+	kmovw		%r11d,%k1
+
+	vmovq		$padbit,%x#$PAD
+	vmovdqa64	0(%r10),$inp_permd	# .L2_44_inp_permd
+	vmovdqa64	32(%r10),$inp_shift	# .L2_44_inp_shift
+	vpermq		\$0xcf,$PAD,$PAD
+	vmovdqa64	64(%r10),$reduc_mask	# .L2_44_mask
+
+	vmovdqu64	0($ctx),${Dlo}{%k7}{z}		# load hash value
+	vmovdqu64	40($ctx),${r2r1r0}{%k7}{z}	# load keys
+	vmovdqu64	32($ctx),${r1r0s2}{%k7}{z}
+	vmovdqu64	24($ctx),${r0s2s1}{%k7}{z}
+
+	vmovdqa64	96(%r10),$reduc_rght	# .L2_44_shift_rgt
+	vmovdqa64	128(%r10),$reduc_left	# .L2_44_shift_lft
+
+	jmp		.Loop_vpmadd52
+
+.align	32
+.Loop_vpmadd52:
+	vmovdqu32	0($inp),%x#$T0		# load input as ----3210
+	lea		16($inp),$inp
+
+	vpermd		$T0,$inp_permd,$T0	# ----3210 -> --322110
+	vpsrlvq		$inp_shift,$T0,$T0
+	vpandq		$reduc_mask,$T0,$T0
+	vporq		$PAD,$T0,$T0
+
+	vpaddq		$T0,$Dlo,$Dlo		# accumulate input
+
+	vpermq		\$0,$Dlo,${H0}{%k7}{z}	# smash hash value
+	vpermq		\$0b01010101,$Dlo,${H1}{%k7}{z}
+	vpermq		\$0b10101010,$Dlo,${H2}{%k7}{z}
+
+	vpxord		$Dlo,$Dlo,$Dlo
+	vpxord		$Dhi,$Dhi,$Dhi
+
+	vpmadd52luq	$r2r1r0,$H0,$Dlo
+	vpmadd52huq	$r2r1r0,$H0,$Dhi
+
+	vpmadd52luq	$r1r0s2,$H1,$Dlo
+	vpmadd52huq	$r1r0s2,$H1,$Dhi
+
+	vpmadd52luq	$r0s2s1,$H2,$Dlo
+	vpmadd52huq	$r0s2s1,$H2,$Dhi
+
+	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost qword
+	vpsllvq		$reduc_left,$Dhi,$Dhi	# 0 in topmost qword
+	vpandq		$reduc_mask,$Dlo,$Dlo
+
+	vpaddq		$T0,$Dhi,$Dhi
+
+	vpermq		\$0b10010011,$Dhi,$Dhi	# 0 in lowest qword
+
+	vpaddq		$Dhi,$Dlo,$Dlo		# note topmost qword :-)
+
+	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost word
+	vpandq		$reduc_mask,$Dlo,$Dlo
+
+	vpermq		\$0b10010011,$T0,$T0
+
+	vpaddq		$T0,$Dlo,$Dlo
+
+	vpermq		\$0b10010011,$Dlo,${T0}{%k1}{z}
+
+	vpaddq		$T0,$Dlo,$Dlo
+	vpsllq		\$2,$T0,$T0
+
+	vpaddq		$T0,$Dlo,$Dlo
+
+	dec		%rax			# len-=16
+	jnz		.Loop_vpmadd52
+
+	vmovdqu64	$Dlo,0($ctx){%k7}	# store hash value
+
+	test		$len,$len
+	jnz		.Lblocks_vpmadd52_4x
+
+.Lno_data_vpmadd52:
+	RET
+.size	poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
+___
+}
+{
+########################################################################
+# As implied by its name 4x subroutine processes 4 blocks in parallel
+# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power
+# and is handled in 256-bit %ymm registers.
+
+my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
+my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
+my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
+
+$code.=<<___;
+.type	poly1305_blocks_vpmadd52_4x,\@function,4
+.align	32
+poly1305_blocks_vpmadd52_4x:
+	shr	\$4,$len
+	jz	.Lno_data_vpmadd52_4x		# too short
+
+	shl	\$40,$padbit
+	mov	64($ctx),%r8			# peek on power of the key
+
+.Lblocks_vpmadd52_4x:
+	vpbroadcastq	$padbit,$PAD
+
+	vmovdqa64	.Lx_mask44(%rip),$mask44
+	mov		\$5,%eax
+	vmovdqa64	.Lx_mask42(%rip),$mask42
+	kmovw		%eax,%k1		# used in 2x path
+
+	test		%r8,%r8			# is power value impossible?
+	js		.Linit_vpmadd52		# if it is, then init R[4]
+
+	vmovq		0($ctx),%x#$H0		# load current hash value
+	vmovq		8($ctx),%x#$H1
+	vmovq		16($ctx),%x#$H2
+
+	test		\$3,$len		# is length 4*n+2?
+	jnz		.Lblocks_vpmadd52_2x_do
+
+.Lblocks_vpmadd52_4x_do:
+	vpbroadcastq	64($ctx),$R0		# load 4th power of the key
+	vpbroadcastq	96($ctx),$R1
+	vpbroadcastq	128($ctx),$R2
+	vpbroadcastq	160($ctx),$S1
+
+.Lblocks_vpmadd52_4x_key_loaded:
+	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
+	vpaddq		$R2,$S2,$S2
+	vpsllq		\$2,$S2,$S2
+
+	test		\$7,$len		# is len 8*n?
+	jz		.Lblocks_vpmadd52_8x
+
+	vmovdqu64	16*0($inp),$T2		# load data
+	vmovdqu64	16*2($inp),$T3
+	lea		16*4($inp),$inp
+
+	vpunpcklqdq	$T3,$T2,$T1		# transpose data
+	vpunpckhqdq	$T3,$T2,$T3
+
+	# at this point 64-bit lanes are ordered as 3-1-2-0
+
+	vpsrlq		\$24,$T3,$T2		# splat the data
+	vporq		$PAD,$T2,$T2
+	 vpaddq		$T2,$H2,$H2		# accumulate input
+	vpandq		$mask44,$T1,$T0
+	vpsrlq		\$44,$T1,$T1
+	vpsllq		\$20,$T3,$T3
+	vporq		$T3,$T1,$T1
+	vpandq		$mask44,$T1,$T1
+
+	sub		\$4,$len
+	jz		.Ltail_vpmadd52_4x
+	jmp		.Loop_vpmadd52_4x
+	ud2
+
+.align	32
+.Linit_vpmadd52:
+	vmovq		24($ctx),%x#$S1		# load key
+	vmovq		56($ctx),%x#$H2
+	vmovq		32($ctx),%x#$S2
+	vmovq		40($ctx),%x#$R0
+	vmovq		48($ctx),%x#$R1
+
+	vmovdqa		$R0,$H0
+	vmovdqa		$R1,$H1
+	vmovdqa		$H2,$R2
+
+	mov		\$2,%eax
+
+.Lmul_init_vpmadd52:
+	vpxorq		$D0lo,$D0lo,$D0lo
+	vpmadd52luq	$H2,$S1,$D0lo
+	vpxorq		$D0hi,$D0hi,$D0hi
+	vpmadd52huq	$H2,$S1,$D0hi
+	vpxorq		$D1lo,$D1lo,$D1lo
+	vpmadd52luq	$H2,$S2,$D1lo
+	vpxorq		$D1hi,$D1hi,$D1hi
+	vpmadd52huq	$H2,$S2,$D1hi
+	vpxorq		$D2lo,$D2lo,$D2lo
+	vpmadd52luq	$H2,$R0,$D2lo
+	vpxorq		$D2hi,$D2hi,$D2hi
+	vpmadd52huq	$H2,$R0,$D2hi
+
+	vpmadd52luq	$H0,$R0,$D0lo
+	vpmadd52huq	$H0,$R0,$D0hi
+	vpmadd52luq	$H0,$R1,$D1lo
+	vpmadd52huq	$H0,$R1,$D1hi
+	vpmadd52luq	$H0,$R2,$D2lo
+	vpmadd52huq	$H0,$R2,$D2hi
+
+	vpmadd52luq	$H1,$S2,$D0lo
+	vpmadd52huq	$H1,$S2,$D0hi
+	vpmadd52luq	$H1,$R0,$D1lo
+	vpmadd52huq	$H1,$R0,$D1hi
+	vpmadd52luq	$H1,$R1,$D2lo
+	vpmadd52huq	$H1,$R1,$D2hi
+
+	################################################################
+	# partial reduction
+	vpsrlq		\$44,$D0lo,$tmp
+	vpsllq		\$8,$D0hi,$D0hi
+	vpandq		$mask44,$D0lo,$H0
+	vpaddq		$tmp,$D0hi,$D0hi
+
+	vpaddq		$D0hi,$D1lo,$D1lo
+
+	vpsrlq		\$44,$D1lo,$tmp
+	vpsllq		\$8,$D1hi,$D1hi
+	vpandq		$mask44,$D1lo,$H1
+	vpaddq		$tmp,$D1hi,$D1hi
+
+	vpaddq		$D1hi,$D2lo,$D2lo
+
+	vpsrlq		\$42,$D2lo,$tmp
+	vpsllq		\$10,$D2hi,$D2hi
+	vpandq		$mask42,$D2lo,$H2
+	vpaddq		$tmp,$D2hi,$D2hi
+
+	vpaddq		$D2hi,$H0,$H0
+	vpsllq		\$2,$D2hi,$D2hi
+
+	vpaddq		$D2hi,$H0,$H0
+
+	vpsrlq		\$44,$H0,$tmp		# additional step
+	vpandq		$mask44,$H0,$H0
+
+	vpaddq		$tmp,$H1,$H1
+
+	dec		%eax
+	jz		.Ldone_init_vpmadd52
+
+	vpunpcklqdq	$R1,$H1,$R1		# 1,2
+	vpbroadcastq	%x#$H1,%x#$H1		# 2,2
+	vpunpcklqdq	$R2,$H2,$R2
+	vpbroadcastq	%x#$H2,%x#$H2
+	vpunpcklqdq	$R0,$H0,$R0
+	vpbroadcastq	%x#$H0,%x#$H0
+
+	vpsllq		\$2,$R1,$S1		# S1 = R1*5*4
+	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
+	vpaddq		$R1,$S1,$S1
+	vpaddq		$R2,$S2,$S2
+	vpsllq		\$2,$S1,$S1
+	vpsllq		\$2,$S2,$S2
+
+	jmp		.Lmul_init_vpmadd52
+	ud2
+
+.align	32
+.Ldone_init_vpmadd52:
+	vinserti128	\$1,%x#$R1,$H1,$R1	# 1,2,3,4
+	vinserti128	\$1,%x#$R2,$H2,$R2
+	vinserti128	\$1,%x#$R0,$H0,$R0
+
+	vpermq		\$0b11011000,$R1,$R1	# 1,3,2,4
+	vpermq		\$0b11011000,$R2,$R2
+	vpermq		\$0b11011000,$R0,$R0
+
+	vpsllq		\$2,$R1,$S1		# S1 = R1*5*4
+	vpaddq		$R1,$S1,$S1
+	vpsllq		\$2,$S1,$S1
+
+	vmovq		0($ctx),%x#$H0		# load current hash value
+	vmovq		8($ctx),%x#$H1
+	vmovq		16($ctx),%x#$H2
+
+	test		\$3,$len		# is length 4*n+2?
+	jnz		.Ldone_init_vpmadd52_2x
+
+	vmovdqu64	$R0,64($ctx)		# save key powers
+	vpbroadcastq	%x#$R0,$R0		# broadcast 4th power
+	vmovdqu64	$R1,96($ctx)
+	vpbroadcastq	%x#$R1,$R1
+	vmovdqu64	$R2,128($ctx)
+	vpbroadcastq	%x#$R2,$R2
+	vmovdqu64	$S1,160($ctx)
+	vpbroadcastq	%x#$S1,$S1
+
+	jmp		.Lblocks_vpmadd52_4x_key_loaded
+	ud2
+
+.align	32
+.Ldone_init_vpmadd52_2x:
+	vmovdqu64	$R0,64($ctx)		# save key powers
+	vpsrldq		\$8,$R0,$R0		# 0-1-0-2
+	vmovdqu64	$R1,96($ctx)
+	vpsrldq		\$8,$R1,$R1
+	vmovdqu64	$R2,128($ctx)
+	vpsrldq		\$8,$R2,$R2
+	vmovdqu64	$S1,160($ctx)
+	vpsrldq		\$8,$S1,$S1
+	jmp		.Lblocks_vpmadd52_2x_key_loaded
+	ud2
+
+.align	32
+.Lblocks_vpmadd52_2x_do:
+	vmovdqu64	128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
+	vmovdqu64	160+8($ctx),${S1}{%k1}{z}
+	vmovdqu64	64+8($ctx),${R0}{%k1}{z}
+	vmovdqu64	96+8($ctx),${R1}{%k1}{z}
+
+.Lblocks_vpmadd52_2x_key_loaded:
+	vmovdqu64	16*0($inp),$T2		# load data
+	vpxorq		$T3,$T3,$T3
+	lea		16*2($inp),$inp
+
+	vpunpcklqdq	$T3,$T2,$T1		# transpose data
+	vpunpckhqdq	$T3,$T2,$T3
+
+	# at this point 64-bit lanes are ordered as x-1-x-0
+
+	vpsrlq		\$24,$T3,$T2		# splat the data
+	vporq		$PAD,$T2,$T2
+	 vpaddq		$T2,$H2,$H2		# accumulate input
+	vpandq		$mask44,$T1,$T0
+	vpsrlq		\$44,$T1,$T1
+	vpsllq		\$20,$T3,$T3
+	vporq		$T3,$T1,$T1
+	vpandq		$mask44,$T1,$T1
+
+	jmp		.Ltail_vpmadd52_2x
+	ud2
+
+.align	32
+.Loop_vpmadd52_4x:
+	#vpaddq		$T2,$H2,$H2		# accumulate input
+	vpaddq		$T0,$H0,$H0
+	vpaddq		$T1,$H1,$H1
+
+	vpxorq		$D0lo,$D0lo,$D0lo
+	vpmadd52luq	$H2,$S1,$D0lo
+	vpxorq		$D0hi,$D0hi,$D0hi
+	vpmadd52huq	$H2,$S1,$D0hi
+	vpxorq		$D1lo,$D1lo,$D1lo
+	vpmadd52luq	$H2,$S2,$D1lo
+	vpxorq		$D1hi,$D1hi,$D1hi
+	vpmadd52huq	$H2,$S2,$D1hi
+	vpxorq		$D2lo,$D2lo,$D2lo
+	vpmadd52luq	$H2,$R0,$D2lo
+	vpxorq		$D2hi,$D2hi,$D2hi
+	vpmadd52huq	$H2,$R0,$D2hi
+
+	 vmovdqu64	16*0($inp),$T2		# load data
+	 vmovdqu64	16*2($inp),$T3
+	 lea		16*4($inp),$inp
+	vpmadd52luq	$H0,$R0,$D0lo
+	vpmadd52huq	$H0,$R0,$D0hi
+	vpmadd52luq	$H0,$R1,$D1lo
+	vpmadd52huq	$H0,$R1,$D1hi
+	vpmadd52luq	$H0,$R2,$D2lo
+	vpmadd52huq	$H0,$R2,$D2hi
+
+	 vpunpcklqdq	$T3,$T2,$T1		# transpose data
+	 vpunpckhqdq	$T3,$T2,$T3
+	vpmadd52luq	$H1,$S2,$D0lo
+	vpmadd52huq	$H1,$S2,$D0hi
+	vpmadd52luq	$H1,$R0,$D1lo
+	vpmadd52huq	$H1,$R0,$D1hi
+	vpmadd52luq	$H1,$R1,$D2lo
+	vpmadd52huq	$H1,$R1,$D2hi
+
+	################################################################
+	# partial reduction (interleaved with data splat)
+	vpsrlq		\$44,$D0lo,$tmp
+	vpsllq		\$8,$D0hi,$D0hi
+	vpandq		$mask44,$D0lo,$H0
+	vpaddq		$tmp,$D0hi,$D0hi
+
+	 vpsrlq		\$24,$T3,$T2
+	 vporq		$PAD,$T2,$T2
+	vpaddq		$D0hi,$D1lo,$D1lo
+
+	vpsrlq		\$44,$D1lo,$tmp
+	vpsllq		\$8,$D1hi,$D1hi
+	vpandq		$mask44,$D1lo,$H1
+	vpaddq		$tmp,$D1hi,$D1hi
+
+	 vpandq		$mask44,$T1,$T0
+	 vpsrlq		\$44,$T1,$T1
+	 vpsllq		\$20,$T3,$T3
+	vpaddq		$D1hi,$D2lo,$D2lo
+
+	vpsrlq		\$42,$D2lo,$tmp
+	vpsllq		\$10,$D2hi,$D2hi
+	vpandq		$mask42,$D2lo,$H2
+	vpaddq		$tmp,$D2hi,$D2hi
+
+	  vpaddq	$T2,$H2,$H2		# accumulate input
+	vpaddq		$D2hi,$H0,$H0
+	vpsllq		\$2,$D2hi,$D2hi
+
+	vpaddq		$D2hi,$H0,$H0
+	 vporq		$T3,$T1,$T1
+	 vpandq		$mask44,$T1,$T1
+
+	vpsrlq		\$44,$H0,$tmp		# additional step
+	vpandq		$mask44,$H0,$H0
+
+	vpaddq		$tmp,$H1,$H1
+
+	sub		\$4,$len		# len-=64
+	jnz		.Loop_vpmadd52_4x
+
+.Ltail_vpmadd52_4x:
+	vmovdqu64	128($ctx),$R2		# load all key powers
+	vmovdqu64	160($ctx),$S1
+	vmovdqu64	64($ctx),$R0
+	vmovdqu64	96($ctx),$R1
+
+.Ltail_vpmadd52_2x:
+	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
+	vpaddq		$R2,$S2,$S2
+	vpsllq		\$2,$S2,$S2
+
+	#vpaddq		$T2,$H2,$H2		# accumulate input
+	vpaddq		$T0,$H0,$H0
+	vpaddq		$T1,$H1,$H1
+
+	vpxorq		$D0lo,$D0lo,$D0lo
+	vpmadd52luq	$H2,$S1,$D0lo
+	vpxorq		$D0hi,$D0hi,$D0hi
+	vpmadd52huq	$H2,$S1,$D0hi
+	vpxorq		$D1lo,$D1lo,$D1lo
+	vpmadd52luq	$H2,$S2,$D1lo
+	vpxorq		$D1hi,$D1hi,$D1hi
+	vpmadd52huq	$H2,$S2,$D1hi
+	vpxorq		$D2lo,$D2lo,$D2lo
+	vpmadd52luq	$H2,$R0,$D2lo
+	vpxorq		$D2hi,$D2hi,$D2hi
+	vpmadd52huq	$H2,$R0,$D2hi
+
+	vpmadd52luq	$H0,$R0,$D0lo
+	vpmadd52huq	$H0,$R0,$D0hi
+	vpmadd52luq	$H0,$R1,$D1lo
+	vpmadd52huq	$H0,$R1,$D1hi
+	vpmadd52luq	$H0,$R2,$D2lo
+	vpmadd52huq	$H0,$R2,$D2hi
+
+	vpmadd52luq	$H1,$S2,$D0lo
+	vpmadd52huq	$H1,$S2,$D0hi
+	vpmadd52luq	$H1,$R0,$D1lo
+	vpmadd52huq	$H1,$R0,$D1hi
+	vpmadd52luq	$H1,$R1,$D2lo
+	vpmadd52huq	$H1,$R1,$D2hi
+
+	################################################################
+	# horizontal addition
+
+	mov		\$1,%eax
+	kmovw		%eax,%k1
+	vpsrldq		\$8,$D0lo,$T0
+	vpsrldq		\$8,$D0hi,$H0
+	vpsrldq		\$8,$D1lo,$T1
+	vpsrldq		\$8,$D1hi,$H1
+	vpaddq		$T0,$D0lo,$D0lo
+	vpaddq		$H0,$D0hi,$D0hi
+	vpsrldq		\$8,$D2lo,$T2
+	vpsrldq		\$8,$D2hi,$H2
+	vpaddq		$T1,$D1lo,$D1lo
+	vpaddq		$H1,$D1hi,$D1hi
+	 vpermq		\$0x2,$D0lo,$T0
+	 vpermq		\$0x2,$D0hi,$H0
+	vpaddq		$T2,$D2lo,$D2lo
+	vpaddq		$H2,$D2hi,$D2hi
+
+	vpermq		\$0x2,$D1lo,$T1
+	vpermq		\$0x2,$D1hi,$H1
+	vpaddq		$T0,$D0lo,${D0lo}{%k1}{z}
+	vpaddq		$H0,$D0hi,${D0hi}{%k1}{z}
+	vpermq		\$0x2,$D2lo,$T2
+	vpermq		\$0x2,$D2hi,$H2
+	vpaddq		$T1,$D1lo,${D1lo}{%k1}{z}
+	vpaddq		$H1,$D1hi,${D1hi}{%k1}{z}
+	vpaddq		$T2,$D2lo,${D2lo}{%k1}{z}
+	vpaddq		$H2,$D2hi,${D2hi}{%k1}{z}
+
+	################################################################
+	# partial reduction
+	vpsrlq		\$44,$D0lo,$tmp
+	vpsllq		\$8,$D0hi,$D0hi
+	vpandq		$mask44,$D0lo,$H0
+	vpaddq		$tmp,$D0hi,$D0hi
+
+	vpaddq		$D0hi,$D1lo,$D1lo
+
+	vpsrlq		\$44,$D1lo,$tmp
+	vpsllq		\$8,$D1hi,$D1hi
+	vpandq		$mask44,$D1lo,$H1
+	vpaddq		$tmp,$D1hi,$D1hi
+
+	vpaddq		$D1hi,$D2lo,$D2lo
+
+	vpsrlq		\$42,$D2lo,$tmp
+	vpsllq		\$10,$D2hi,$D2hi
+	vpandq		$mask42,$D2lo,$H2
+	vpaddq		$tmp,$D2hi,$D2hi
+
+	vpaddq		$D2hi,$H0,$H0
+	vpsllq		\$2,$D2hi,$D2hi
+
+	vpaddq		$D2hi,$H0,$H0
+
+	vpsrlq		\$44,$H0,$tmp		# additional step
+	vpandq		$mask44,$H0,$H0
+
+	vpaddq		$tmp,$H1,$H1
+						# at this point $len is
+						# either 4*n+2 or 0...
+	sub		\$2,$len		# len-=32
+	ja		.Lblocks_vpmadd52_4x_do
+
+	vmovq		%x#$H0,0($ctx)
+	vmovq		%x#$H1,8($ctx)
+	vmovq		%x#$H2,16($ctx)
+	vzeroall
+
+.Lno_data_vpmadd52_4x:
+	RET
+.size	poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
+___
+}
+{
+########################################################################
+# As implied by its name 8x subroutine processes 8 blocks in parallel...
+# This is intermediate version, as it's used only in cases when input
+# length is either 8*n, 8*n+1 or 8*n+2...
+
+my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
+my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
+my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
+my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
+
+$code.=<<___;
+.type	poly1305_blocks_vpmadd52_8x,\@function,4
+.align	32
+poly1305_blocks_vpmadd52_8x:
+	shr	\$4,$len
+	jz	.Lno_data_vpmadd52_8x		# too short
+
+	shl	\$40,$padbit
+	mov	64($ctx),%r8			# peek on power of the key
+
+	vmovdqa64	.Lx_mask44(%rip),$mask44
+	vmovdqa64	.Lx_mask42(%rip),$mask42
+
+	test	%r8,%r8				# is power value impossible?
+	js	.Linit_vpmadd52			# if it is, then init R[4]
+
+	vmovq	0($ctx),%x#$H0			# load current hash value
+	vmovq	8($ctx),%x#$H1
+	vmovq	16($ctx),%x#$H2
+
+.Lblocks_vpmadd52_8x:
+	################################################################
+	# fist we calculate more key powers
+
+	vmovdqu64	128($ctx),$R2		# load 1-3-2-4 powers
+	vmovdqu64	160($ctx),$S1
+	vmovdqu64	64($ctx),$R0
+	vmovdqu64	96($ctx),$R1
+
+	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
+	vpaddq		$R2,$S2,$S2
+	vpsllq		\$2,$S2,$S2
+
+	vpbroadcastq	%x#$R2,$RR2		# broadcast 4th power
+	vpbroadcastq	%x#$R0,$RR0
+	vpbroadcastq	%x#$R1,$RR1
+
+	vpxorq		$D0lo,$D0lo,$D0lo
+	vpmadd52luq	$RR2,$S1,$D0lo
+	vpxorq		$D0hi,$D0hi,$D0hi
+	vpmadd52huq	$RR2,$S1,$D0hi
+	vpxorq		$D1lo,$D1lo,$D1lo
+	vpmadd52luq	$RR2,$S2,$D1lo
+	vpxorq		$D1hi,$D1hi,$D1hi
+	vpmadd52huq	$RR2,$S2,$D1hi
+	vpxorq		$D2lo,$D2lo,$D2lo
+	vpmadd52luq	$RR2,$R0,$D2lo
+	vpxorq		$D2hi,$D2hi,$D2hi
+	vpmadd52huq	$RR2,$R0,$D2hi
+
+	vpmadd52luq	$RR0,$R0,$D0lo
+	vpmadd52huq	$RR0,$R0,$D0hi
+	vpmadd52luq	$RR0,$R1,$D1lo
+	vpmadd52huq	$RR0,$R1,$D1hi
+	vpmadd52luq	$RR0,$R2,$D2lo
+	vpmadd52huq	$RR0,$R2,$D2hi
+
+	vpmadd52luq	$RR1,$S2,$D0lo
+	vpmadd52huq	$RR1,$S2,$D0hi
+	vpmadd52luq	$RR1,$R0,$D1lo
+	vpmadd52huq	$RR1,$R0,$D1hi
+	vpmadd52luq	$RR1,$R1,$D2lo
+	vpmadd52huq	$RR1,$R1,$D2hi
+
+	################################################################
+	# partial reduction
+	vpsrlq		\$44,$D0lo,$tmp
+	vpsllq		\$8,$D0hi,$D0hi
+	vpandq		$mask44,$D0lo,$RR0
+	vpaddq		$tmp,$D0hi,$D0hi
+
+	vpaddq		$D0hi,$D1lo,$D1lo
+
+	vpsrlq		\$44,$D1lo,$tmp
+	vpsllq		\$8,$D1hi,$D1hi
+	vpandq		$mask44,$D1lo,$RR1
+	vpaddq		$tmp,$D1hi,$D1hi
+
+	vpaddq		$D1hi,$D2lo,$D2lo
+
+	vpsrlq		\$42,$D2lo,$tmp
+	vpsllq		\$10,$D2hi,$D2hi
+	vpandq		$mask42,$D2lo,$RR2
+	vpaddq		$tmp,$D2hi,$D2hi
+
+	vpaddq		$D2hi,$RR0,$RR0
+	vpsllq		\$2,$D2hi,$D2hi
+
+	vpaddq		$D2hi,$RR0,$RR0
+
+	vpsrlq		\$44,$RR0,$tmp		# additional step
+	vpandq		$mask44,$RR0,$RR0
+
+	vpaddq		$tmp,$RR1,$RR1
+
+	################################################################
+	# At this point Rx holds 1324 powers, RRx - 5768, and the goal
+	# is 15263748, which reflects how data is loaded...
+
+	vpunpcklqdq	$R2,$RR2,$T2		# 3748
+	vpunpckhqdq	$R2,$RR2,$R2		# 1526
+	vpunpcklqdq	$R0,$RR0,$T0
+	vpunpckhqdq	$R0,$RR0,$R0
+	vpunpcklqdq	$R1,$RR1,$T1
+	vpunpckhqdq	$R1,$RR1,$R1
+___
+######## switch to %zmm
+map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
+map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
+map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
+map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
+
+$code.=<<___;
+	vshufi64x2	\$0x44,$R2,$T2,$RR2	# 15263748
+	vshufi64x2	\$0x44,$R0,$T0,$RR0
+	vshufi64x2	\$0x44,$R1,$T1,$RR1
+
+	vmovdqu64	16*0($inp),$T2		# load data
+	vmovdqu64	16*4($inp),$T3
+	lea		16*8($inp),$inp
+
+	vpsllq		\$2,$RR2,$SS2		# S2 = R2*5*4
+	vpsllq		\$2,$RR1,$SS1		# S1 = R1*5*4
+	vpaddq		$RR2,$SS2,$SS2
+	vpaddq		$RR1,$SS1,$SS1
+	vpsllq		\$2,$SS2,$SS2
+	vpsllq		\$2,$SS1,$SS1
+
+	vpbroadcastq	$padbit,$PAD
+	vpbroadcastq	%x#$mask44,$mask44
+	vpbroadcastq	%x#$mask42,$mask42
+
+	vpbroadcastq	%x#$SS1,$S1		# broadcast 8th power
+	vpbroadcastq	%x#$SS2,$S2
+	vpbroadcastq	%x#$RR0,$R0
+	vpbroadcastq	%x#$RR1,$R1
+	vpbroadcastq	%x#$RR2,$R2
+
+	vpunpcklqdq	$T3,$T2,$T1		# transpose data
+	vpunpckhqdq	$T3,$T2,$T3
+
+	# at this point 64-bit lanes are ordered as 73625140
+
+	vpsrlq		\$24,$T3,$T2		# splat the data
+	vporq		$PAD,$T2,$T2
+	 vpaddq		$T2,$H2,$H2		# accumulate input
+	vpandq		$mask44,$T1,$T0
+	vpsrlq		\$44,$T1,$T1
+	vpsllq		\$20,$T3,$T3
+	vporq		$T3,$T1,$T1
+	vpandq		$mask44,$T1,$T1
+
+	sub		\$8,$len
+	jz		.Ltail_vpmadd52_8x
+	jmp		.Loop_vpmadd52_8x
+
+.align	32
+.Loop_vpmadd52_8x:
+	#vpaddq		$T2,$H2,$H2		# accumulate input
+	vpaddq		$T0,$H0,$H0
+	vpaddq		$T1,$H1,$H1
+
+	vpxorq		$D0lo,$D0lo,$D0lo
+	vpmadd52luq	$H2,$S1,$D0lo
+	vpxorq		$D0hi,$D0hi,$D0hi
+	vpmadd52huq	$H2,$S1,$D0hi
+	vpxorq		$D1lo,$D1lo,$D1lo
+	vpmadd52luq	$H2,$S2,$D1lo
+	vpxorq		$D1hi,$D1hi,$D1hi
+	vpmadd52huq	$H2,$S2,$D1hi
+	vpxorq		$D2lo,$D2lo,$D2lo
+	vpmadd52luq	$H2,$R0,$D2lo
+	vpxorq		$D2hi,$D2hi,$D2hi
+	vpmadd52huq	$H2,$R0,$D2hi
+
+	 vmovdqu64	16*0($inp),$T2		# load data
+	 vmovdqu64	16*4($inp),$T3
+	 lea		16*8($inp),$inp
+	vpmadd52luq	$H0,$R0,$D0lo
+	vpmadd52huq	$H0,$R0,$D0hi
+	vpmadd52luq	$H0,$R1,$D1lo
+	vpmadd52huq	$H0,$R1,$D1hi
+	vpmadd52luq	$H0,$R2,$D2lo
+	vpmadd52huq	$H0,$R2,$D2hi
+
+	 vpunpcklqdq	$T3,$T2,$T1		# transpose data
+	 vpunpckhqdq	$T3,$T2,$T3
+	vpmadd52luq	$H1,$S2,$D0lo
+	vpmadd52huq	$H1,$S2,$D0hi
+	vpmadd52luq	$H1,$R0,$D1lo
+	vpmadd52huq	$H1,$R0,$D1hi
+	vpmadd52luq	$H1,$R1,$D2lo
+	vpmadd52huq	$H1,$R1,$D2hi
+
+	################################################################
+	# partial reduction (interleaved with data splat)
+	vpsrlq		\$44,$D0lo,$tmp
+	vpsllq		\$8,$D0hi,$D0hi
+	vpandq		$mask44,$D0lo,$H0
+	vpaddq		$tmp,$D0hi,$D0hi
+
+	 vpsrlq		\$24,$T3,$T2
+	 vporq		$PAD,$T2,$T2
+	vpaddq		$D0hi,$D1lo,$D1lo
+
+	vpsrlq		\$44,$D1lo,$tmp
+	vpsllq		\$8,$D1hi,$D1hi
+	vpandq		$mask44,$D1lo,$H1
+	vpaddq		$tmp,$D1hi,$D1hi
+
+	 vpandq		$mask44,$T1,$T0
+	 vpsrlq		\$44,$T1,$T1
+	 vpsllq		\$20,$T3,$T3
+	vpaddq		$D1hi,$D2lo,$D2lo
+
+	vpsrlq		\$42,$D2lo,$tmp
+	vpsllq		\$10,$D2hi,$D2hi
+	vpandq		$mask42,$D2lo,$H2
+	vpaddq		$tmp,$D2hi,$D2hi
+
+	  vpaddq	$T2,$H2,$H2		# accumulate input
+	vpaddq		$D2hi,$H0,$H0
+	vpsllq		\$2,$D2hi,$D2hi
+
+	vpaddq		$D2hi,$H0,$H0
+	 vporq		$T3,$T1,$T1
+	 vpandq		$mask44,$T1,$T1
+
+	vpsrlq		\$44,$H0,$tmp		# additional step
+	vpandq		$mask44,$H0,$H0
+
+	vpaddq		$tmp,$H1,$H1
+
+	sub		\$8,$len		# len-=128
+	jnz		.Loop_vpmadd52_8x
+
+.Ltail_vpmadd52_8x:
+	#vpaddq		$T2,$H2,$H2		# accumulate input
+	vpaddq		$T0,$H0,$H0
+	vpaddq		$T1,$H1,$H1
+
+	vpxorq		$D0lo,$D0lo,$D0lo
+	vpmadd52luq	$H2,$SS1,$D0lo
+	vpxorq		$D0hi,$D0hi,$D0hi
+	vpmadd52huq	$H2,$SS1,$D0hi
+	vpxorq		$D1lo,$D1lo,$D1lo
+	vpmadd52luq	$H2,$SS2,$D1lo
+	vpxorq		$D1hi,$D1hi,$D1hi
+	vpmadd52huq	$H2,$SS2,$D1hi
+	vpxorq		$D2lo,$D2lo,$D2lo
+	vpmadd52luq	$H2,$RR0,$D2lo
+	vpxorq		$D2hi,$D2hi,$D2hi
+	vpmadd52huq	$H2,$RR0,$D2hi
+
+	vpmadd52luq	$H0,$RR0,$D0lo
+	vpmadd52huq	$H0,$RR0,$D0hi
+	vpmadd52luq	$H0,$RR1,$D1lo
+	vpmadd52huq	$H0,$RR1,$D1hi
+	vpmadd52luq	$H0,$RR2,$D2lo
+	vpmadd52huq	$H0,$RR2,$D2hi
+
+	vpmadd52luq	$H1,$SS2,$D0lo
+	vpmadd52huq	$H1,$SS2,$D0hi
+	vpmadd52luq	$H1,$RR0,$D1lo
+	vpmadd52huq	$H1,$RR0,$D1hi
+	vpmadd52luq	$H1,$RR1,$D2lo
+	vpmadd52huq	$H1,$RR1,$D2hi
+
+	################################################################
+	# horizontal addition
+
+	mov		\$1,%eax
+	kmovw		%eax,%k1
+	vpsrldq		\$8,$D0lo,$T0
+	vpsrldq		\$8,$D0hi,$H0
+	vpsrldq		\$8,$D1lo,$T1
+	vpsrldq		\$8,$D1hi,$H1
+	vpaddq		$T0,$D0lo,$D0lo
+	vpaddq		$H0,$D0hi,$D0hi
+	vpsrldq		\$8,$D2lo,$T2
+	vpsrldq		\$8,$D2hi,$H2
+	vpaddq		$T1,$D1lo,$D1lo
+	vpaddq		$H1,$D1hi,$D1hi
+	 vpermq		\$0x2,$D0lo,$T0
+	 vpermq		\$0x2,$D0hi,$H0
+	vpaddq		$T2,$D2lo,$D2lo
+	vpaddq		$H2,$D2hi,$D2hi
+
+	vpermq		\$0x2,$D1lo,$T1
+	vpermq		\$0x2,$D1hi,$H1
+	vpaddq		$T0,$D0lo,$D0lo
+	vpaddq		$H0,$D0hi,$D0hi
+	vpermq		\$0x2,$D2lo,$T2
+	vpermq		\$0x2,$D2hi,$H2
+	vpaddq		$T1,$D1lo,$D1lo
+	vpaddq		$H1,$D1hi,$D1hi
+	 vextracti64x4	\$1,$D0lo,%y#$T0
+	 vextracti64x4	\$1,$D0hi,%y#$H0
+	vpaddq		$T2,$D2lo,$D2lo
+	vpaddq		$H2,$D2hi,$D2hi
+
+	vextracti64x4	\$1,$D1lo,%y#$T1
+	vextracti64x4	\$1,$D1hi,%y#$H1
+	vextracti64x4	\$1,$D2lo,%y#$T2
+	vextracti64x4	\$1,$D2hi,%y#$H2
+___
+######## switch back to %ymm
+map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
+map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
+map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
+
+$code.=<<___;
+	vpaddq		$T0,$D0lo,${D0lo}{%k1}{z}
+	vpaddq		$H0,$D0hi,${D0hi}{%k1}{z}
+	vpaddq		$T1,$D1lo,${D1lo}{%k1}{z}
+	vpaddq		$H1,$D1hi,${D1hi}{%k1}{z}
+	vpaddq		$T2,$D2lo,${D2lo}{%k1}{z}
+	vpaddq		$H2,$D2hi,${D2hi}{%k1}{z}
+
+	################################################################
+	# partial reduction
+	vpsrlq		\$44,$D0lo,$tmp
+	vpsllq		\$8,$D0hi,$D0hi
+	vpandq		$mask44,$D0lo,$H0
+	vpaddq		$tmp,$D0hi,$D0hi
+
+	vpaddq		$D0hi,$D1lo,$D1lo
+
+	vpsrlq		\$44,$D1lo,$tmp
+	vpsllq		\$8,$D1hi,$D1hi
+	vpandq		$mask44,$D1lo,$H1
+	vpaddq		$tmp,$D1hi,$D1hi
+
+	vpaddq		$D1hi,$D2lo,$D2lo
+
+	vpsrlq		\$42,$D2lo,$tmp
+	vpsllq		\$10,$D2hi,$D2hi
+	vpandq		$mask42,$D2lo,$H2
+	vpaddq		$tmp,$D2hi,$D2hi
+
+	vpaddq		$D2hi,$H0,$H0
+	vpsllq		\$2,$D2hi,$D2hi
+
+	vpaddq		$D2hi,$H0,$H0
+
+	vpsrlq		\$44,$H0,$tmp		# additional step
+	vpandq		$mask44,$H0,$H0
+
+	vpaddq		$tmp,$H1,$H1
+
+	################################################################
+
+	vmovq		%x#$H0,0($ctx)
+	vmovq		%x#$H1,8($ctx)
+	vmovq		%x#$H2,16($ctx)
+	vzeroall
+
+.Lno_data_vpmadd52_8x:
+	RET
+.size	poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
+___
+}
+$code.=<<___;
+.type	poly1305_emit_base2_44,\@function,3
+.align	32
+poly1305_emit_base2_44:
+	mov	0($ctx),%r8	# load hash value
+	mov	8($ctx),%r9
+	mov	16($ctx),%r10
+
+	mov	%r9,%rax
+	shr	\$20,%r9
+	shl	\$44,%rax
+	mov	%r10,%rcx
+	shr	\$40,%r10
+	shl	\$24,%rcx
+
+	add	%rax,%r8
+	adc	%rcx,%r9
+	adc	\$0,%r10
+
+	mov	%r8,%rax
+	add	\$5,%r8		# compare to modulus
+	mov	%r9,%rcx
+	adc	\$0,%r9
+	adc	\$0,%r10
+	shr	\$2,%r10	# did 130-bit value overflow?
+	cmovnz	%r8,%rax
+	cmovnz	%r9,%rcx
+
+	add	0($nonce),%rax	# accumulate nonce
+	adc	8($nonce),%rcx
+	mov	%rax,0($mac)	# write result
+	mov	%rcx,8($mac)
+
+	RET
+.size	poly1305_emit_base2_44,.-poly1305_emit_base2_44
+___
+}	}	}
+}
+
+if (!$kernel)
+{	# chacha20-poly1305 helpers
+my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
+                                  ("%rdi","%rsi","%rdx","%rcx");  # Unix order
+$code.=<<___;
+.globl	xor128_encrypt_n_pad
+.type	xor128_encrypt_n_pad,\@abi-omnipotent
+.align	16
+xor128_encrypt_n_pad:
+	sub	$otp,$inp
+	sub	$otp,$out
+	mov	$len,%r10		# put len aside
+	shr	\$4,$len		# len / 16
+	jz	.Ltail_enc
+	nop
+.Loop_enc_xmm:
+	movdqu	($inp,$otp),%xmm0
+	pxor	($otp),%xmm0
+	movdqu	%xmm0,($out,$otp)
+	movdqa	%xmm0,($otp)
+	lea	16($otp),$otp
+	dec	$len
+	jnz	.Loop_enc_xmm
+
+	and	\$15,%r10		# len % 16
+	jz	.Ldone_enc
+
+.Ltail_enc:
+	mov	\$16,$len
+	sub	%r10,$len
+	xor	%eax,%eax
+.Loop_enc_byte:
+	mov	($inp,$otp),%al
+	xor	($otp),%al
+	mov	%al,($out,$otp)
+	mov	%al,($otp)
+	lea	1($otp),$otp
+	dec	%r10
+	jnz	.Loop_enc_byte
+
+	xor	%eax,%eax
+.Loop_enc_pad:
+	mov	%al,($otp)
+	lea	1($otp),$otp
+	dec	$len
+	jnz	.Loop_enc_pad
+
+.Ldone_enc:
+	mov	$otp,%rax
+	RET
+.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
+
+.globl	xor128_decrypt_n_pad
+.type	xor128_decrypt_n_pad,\@abi-omnipotent
+.align	16
+xor128_decrypt_n_pad:
+	sub	$otp,$inp
+	sub	$otp,$out
+	mov	$len,%r10		# put len aside
+	shr	\$4,$len		# len / 16
+	jz	.Ltail_dec
+	nop
+.Loop_dec_xmm:
+	movdqu	($inp,$otp),%xmm0
+	movdqa	($otp),%xmm1
+	pxor	%xmm0,%xmm1
+	movdqu	%xmm1,($out,$otp)
+	movdqa	%xmm0,($otp)
+	lea	16($otp),$otp
+	dec	$len
+	jnz	.Loop_dec_xmm
+
+	pxor	%xmm1,%xmm1
+	and	\$15,%r10		# len % 16
+	jz	.Ldone_dec
+
+.Ltail_dec:
+	mov	\$16,$len
+	sub	%r10,$len
+	xor	%eax,%eax
+	xor	%r11d,%r11d
+.Loop_dec_byte:
+	mov	($inp,$otp),%r11b
+	mov	($otp),%al
+	xor	%r11b,%al
+	mov	%al,($out,$otp)
+	mov	%r11b,($otp)
+	lea	1($otp),$otp
+	dec	%r10
+	jnz	.Loop_dec_byte
+
+	xor	%eax,%eax
+.Loop_dec_pad:
+	mov	%al,($otp)
+	lea	1($otp),$otp
+	dec	$len
+	jnz	.Loop_dec_pad
+
+.Ldone_dec:
+	mov	$otp,%rax
+	RET
+.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
+___
+}
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<.Lprologue
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
+	jae	.Lcommon_seh_tail
+
+	lea	48(%rax),%rax
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R14
+
+	jmp	.Lcommon_seh_tail
+.size	se_handler,.-se_handler
+
+.type	avx_handler,\@abi-omnipotent
+.align	16
+avx_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	mov	208($context),%rax	# pull context->R11
+
+	lea	0x50(%rax),%rsi
+	lea	0xf8(%rax),%rax
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$20,%ecx
+	.long	0xa548f3fc		# cld; rep movsq
+
+.Lcommon_seh_tail:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%ecx,%ecx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	RET
+.size	avx_handler,.-avx_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_poly1305_init_x86_64
+	.rva	.LSEH_end_poly1305_init_x86_64
+	.rva	.LSEH_info_poly1305_init_x86_64
+
+	.rva	.LSEH_begin_poly1305_blocks_x86_64
+	.rva	.LSEH_end_poly1305_blocks_x86_64
+	.rva	.LSEH_info_poly1305_blocks_x86_64
+
+	.rva	.LSEH_begin_poly1305_emit_x86_64
+	.rva	.LSEH_end_poly1305_emit_x86_64
+	.rva	.LSEH_info_poly1305_emit_x86_64
+___
+$code.=<<___ if ($avx);
+	.rva	.LSEH_begin_poly1305_blocks_avx
+	.rva	.Lbase2_64_avx
+	.rva	.LSEH_info_poly1305_blocks_avx_1
+
+	.rva	.Lbase2_64_avx
+	.rva	.Leven_avx
+	.rva	.LSEH_info_poly1305_blocks_avx_2
+
+	.rva	.Leven_avx
+	.rva	.LSEH_end_poly1305_blocks_avx
+	.rva	.LSEH_info_poly1305_blocks_avx_3
+
+	.rva	.LSEH_begin_poly1305_emit_avx
+	.rva	.LSEH_end_poly1305_emit_avx
+	.rva	.LSEH_info_poly1305_emit_avx
+___
+$code.=<<___ if ($avx>1);
+	.rva	.LSEH_begin_poly1305_blocks_avx2
+	.rva	.Lbase2_64_avx2
+	.rva	.LSEH_info_poly1305_blocks_avx2_1
+
+	.rva	.Lbase2_64_avx2
+	.rva	.Leven_avx2
+	.rva	.LSEH_info_poly1305_blocks_avx2_2
+
+	.rva	.Leven_avx2
+	.rva	.LSEH_end_poly1305_blocks_avx2
+	.rva	.LSEH_info_poly1305_blocks_avx2_3
+___
+$code.=<<___ if ($avx>2);
+	.rva	.LSEH_begin_poly1305_blocks_avx512
+	.rva	.LSEH_end_poly1305_blocks_avx512
+	.rva	.LSEH_info_poly1305_blocks_avx512
+___
+$code.=<<___;
+.section	.xdata
+.align	8
+.LSEH_info_poly1305_init_x86_64:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
+
+.LSEH_info_poly1305_blocks_x86_64:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lblocks_body,.Lblocks_epilogue
+
+.LSEH_info_poly1305_emit_x86_64:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
+___
+$code.=<<___ if ($avx);
+.LSEH_info_poly1305_blocks_avx_1:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lblocks_avx_body,.Lblocks_avx_epilogue		# HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx_2:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lbase2_64_avx_body,.Lbase2_64_avx_epilogue	# HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx_3:
+	.byte	9,0,0,0
+	.rva	avx_handler
+	.rva	.Ldo_avx_body,.Ldo_avx_epilogue			# HandlerData[]
+
+.LSEH_info_poly1305_emit_avx:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
+___
+$code.=<<___ if ($avx>1);
+.LSEH_info_poly1305_blocks_avx2_1:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lblocks_avx2_body,.Lblocks_avx2_epilogue	# HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx2_2:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue	# HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx2_3:
+	.byte	9,0,0,0
+	.rva	avx_handler
+	.rva	.Ldo_avx2_body,.Ldo_avx2_epilogue		# HandlerData[]
+___
+$code.=<<___ if ($avx>2);
+.LSEH_info_poly1305_blocks_avx512:
+	.byte	9,0,0,0
+	.rva	avx_handler
+	.rva	.Ldo_avx512_body,.Ldo_avx512_epilogue		# HandlerData[]
+___
+}
+
+open SELF,$0;
+while(<SELF>) {
+	next if (/^#!/);
+	last if (!s/^#/\/\// and !/^$/);
+	print;
+}
+close SELF;
+
+foreach (split('\n',$code)) {
+	s/\`([^\`]*)\`/eval($1)/ge;
+	s/%r([a-z]+)#d/%e$1/g;
+	s/%r([0-9]+)#d/%r$1d/g;
+	s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
+
+	if ($kernel) {
+		s/(^\.type.*),[0-9]+$/\1/;
+		s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
+		next if /^\.cfi.*/;
+	}
+
+	print $_,"\n";
+}
+close STDOUT;
diff --git a/lib/crypto/x86/poly1305.h b/lib/crypto/x86/poly1305.h
new file mode 100644
index 000000000000..ee92e3740a78
--- /dev/null
+++ b/lib/crypto/x86/poly1305.h
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <asm/cpu_device_id.h>
+#include <asm/fpu/api.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+
+struct poly1305_arch_internal {
+	union {
+		struct {
+			u32 h[5];
+			u32 is_base2_26;
+		};
+		u64 hs[3];
+	};
+	u64 r[2];
+	u64 pad;
+	struct { u32 r2, r1, r4, r3; } rn[9];
+};
+
+/*
+ * The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
+ * the unfortunate situation of using AVX and then having to go back to scalar
+ * -- because the user is silly and has called the update function from two
+ * separate contexts -- then we need to convert back to the original base before
+ * proceeding. It is possible to reason that the initial reduction below is
+ * sufficient given the implementation invariants. However, for an avoidance of
+ * doubt and because this is not performance critical, we do the full reduction
+ * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py
+ */
+static void convert_to_base2_64(void *ctx)
+{
+	struct poly1305_arch_internal *state = ctx;
+	u32 cy;
+
+	if (!state->is_base2_26)
+		return;
+
+	cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
+	cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
+	cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
+	cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
+	state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
+	state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
+	state->hs[2] = state->h[4] >> 24;
+	/* Unsigned Less Than: branchlessly produces 1 if a < b, else 0. */
+#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
+	cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
+	state->hs[2] &= 3;
+	state->hs[0] += cy;
+	state->hs[1] += (cy = ULT(state->hs[0], cy));
+	state->hs[2] += ULT(state->hs[1], cy);
+#undef ULT
+	state->is_base2_26 = 0;
+}
+
+asmlinkage void poly1305_init_x86_64(struct poly1305_block_state *state,
+				     const u8 raw_key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx,
+				       const u8 *inp,
+				       const size_t len, const u32 padbit);
+asmlinkage void poly1305_emit_x86_64(const struct poly1305_state *ctx,
+				     u8 mac[POLY1305_DIGEST_SIZE],
+				     const u32 nonce[4]);
+asmlinkage void poly1305_emit_avx(const struct poly1305_state *ctx,
+				  u8 mac[POLY1305_DIGEST_SIZE],
+				  const u32 nonce[4]);
+asmlinkage void poly1305_blocks_avx(struct poly1305_arch_internal *ctx,
+				    const u8 *inp, const size_t len,
+				    const u32 padbit);
+asmlinkage void poly1305_blocks_avx2(struct poly1305_arch_internal *ctx,
+				     const u8 *inp, const size_t len,
+				     const u32 padbit);
+asmlinkage void poly1305_blocks_avx512(struct poly1305_arch_internal *ctx,
+				       const u8 *inp,
+				       const size_t len, const u32 padbit);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
+
+static void poly1305_block_init(struct poly1305_block_state *state,
+				const u8 raw_key[POLY1305_BLOCK_SIZE])
+{
+	poly1305_init_x86_64(state, raw_key);
+}
+
+static void poly1305_blocks(struct poly1305_block_state *state, const u8 *inp,
+			    unsigned int len, u32 padbit)
+{
+	struct poly1305_arch_internal *ctx =
+		container_of(&state->h.h, struct poly1305_arch_internal, h);
+
+	/* SIMD disables preemption, so relax after processing each page. */
+	BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
+		     SZ_4K % POLY1305_BLOCK_SIZE);
+
+	/*
+	 * The AVX implementations have significant setup overhead (e.g. key
+	 * power computation, kernel FPU enabling) which makes them slower for
+	 * short messages.  Fall back to the scalar implementation for messages
+	 * shorter than 288 bytes, unless the AVX-specific key setup has already
+	 * been performed (indicated by ctx->is_base2_26).
+	 */
+	if (!static_branch_likely(&poly1305_use_avx) ||
+	    (len < POLY1305_BLOCK_SIZE * 18 && !ctx->is_base2_26) ||
+	    unlikely(!irq_fpu_usable())) {
+		convert_to_base2_64(ctx);
+		poly1305_blocks_x86_64(ctx, inp, len, padbit);
+		return;
+	}
+
+	do {
+		const unsigned int bytes = min(len, SZ_4K);
+
+		kernel_fpu_begin();
+		if (static_branch_likely(&poly1305_use_avx512))
+			poly1305_blocks_avx512(ctx, inp, bytes, padbit);
+		else if (static_branch_likely(&poly1305_use_avx2))
+			poly1305_blocks_avx2(ctx, inp, bytes, padbit);
+		else
+			poly1305_blocks_avx(ctx, inp, bytes, padbit);
+		kernel_fpu_end();
+
+		len -= bytes;
+		inp += bytes;
+	} while (len);
+}
+
+static void poly1305_emit(const struct poly1305_state *ctx,
+			  u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4])
+{
+	if (!static_branch_likely(&poly1305_use_avx))
+		poly1305_emit_x86_64(ctx, mac, nonce);
+	else
+		poly1305_emit_avx(ctx, mac, nonce);
+}
+
+#define poly1305_mod_init_arch poly1305_mod_init_arch
+static void poly1305_mod_init_arch(void)
+{
+	if (boot_cpu_has(X86_FEATURE_AVX) &&
+	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+		static_branch_enable(&poly1305_use_avx);
+	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
+	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+		static_branch_enable(&poly1305_use_avx2);
+	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
+	    boot_cpu_has(X86_FEATURE_AVX512F) &&
+	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
+	    /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
+	    boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X)
+		static_branch_enable(&poly1305_use_avx512);
+}
diff --git a/lib/crypto/x86/polyval-pclmul-avx.S b/lib/crypto/x86/polyval-pclmul-avx.S
new file mode 100644
index 000000000000..7f739465ad35
--- /dev/null
+++ b/lib/crypto/x86/polyval-pclmul-avx.S
@@ -0,0 +1,319 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2021 Google LLC
+ */
+/*
+ * This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI
+ * instructions. It works on 8 blocks at a time, by precomputing the first 8
+ * keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
+ * allows us to split finite field multiplication into two steps.
+ *
+ * In the first step, we consider h^i, m_i as normal polynomials of degree less
+ * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
+ * is simply polynomial multiplication.
+ *
+ * In the second step, we compute the reduction of p(x) modulo the finite field
+ * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
+ * multiplication is finite field multiplication. The advantage is that the
+ * two-step process  only requires 1 finite field reduction for every 8
+ * polynomial multiplications. Further parallelism is gained by interleaving the
+ * multiplications and polynomial reductions.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define STRIDE_BLOCKS 8
+
+#define GSTAR %xmm7
+#define PL %xmm8
+#define PH %xmm9
+#define TMP_XMM %xmm11
+#define LO %xmm12
+#define HI %xmm13
+#define MI %xmm14
+#define SUM %xmm15
+
+#define ACCUMULATOR %rdi
+#define KEY_POWERS %rsi
+#define MSG %rdx
+#define BLOCKS_LEFT %rcx
+#define TMP %rax
+
+.section    .rodata.cst16.gstar, "aM", @progbits, 16
+.align 16
+
+.Lgstar:
+	.quad 0xc200000000000000, 0xc200000000000000
+
+.text
+
+/*
+ * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
+ * count pointed to by MSG and KEY_POWERS.
+ */
+.macro schoolbook1 count
+	.set i, 0
+	.rept (\count)
+		schoolbook1_iteration i 0
+		.set i, (i +1)
+	.endr
+.endm
+
+/*
+ * Computes the product of two 128-bit polynomials at the memory locations
+ * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
+ * the 256-bit product into LO, MI, HI.
+ *
+ * Given:
+ *   X = [X_1 : X_0]
+ *   Y = [Y_1 : Y_0]
+ *
+ * We compute:
+ *   LO += X_0 * Y_0
+ *   MI += X_0 * Y_1 + X_1 * Y_0
+ *   HI += X_1 * Y_1
+ *
+ * Later, the 256-bit result can be extracted as:
+ *   [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ * This step is done when computing the polynomial reduction for efficiency
+ * reasons.
+ *
+ * If xor_sum == 1, then also XOR the value of SUM into m_0.  This avoids an
+ * extra multiplication of SUM and h^8.
+ */
+.macro schoolbook1_iteration i xor_sum
+	movups (16*\i)(MSG), %xmm0
+	.if (\i == 0 && \xor_sum == 1)
+		pxor SUM, %xmm0
+	.endif
+	vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2
+	vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1
+	vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3
+	vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4
+	vpxor %xmm2, MI, MI
+	vpxor %xmm1, LO, LO
+	vpxor %xmm4, HI, HI
+	vpxor %xmm3, MI, MI
+.endm
+
+/*
+ * Performs the same computation as schoolbook1_iteration, except we expect the
+ * arguments to already be loaded into xmm0 and xmm1 and we set the result
+ * registers LO, MI, and HI directly rather than XOR'ing into them.
+ */
+.macro schoolbook1_noload
+	vpclmulqdq $0x01, %xmm0, %xmm1, MI
+	vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
+	vpclmulqdq $0x00, %xmm0, %xmm1, LO
+	vpclmulqdq $0x11, %xmm0, %xmm1, HI
+	vpxor %xmm2, MI, MI
+.endm
+
+/*
+ * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
+ * the result in PL, PH.
+ *   [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ */
+.macro schoolbook2
+	vpslldq $8, MI, PL
+	vpsrldq $8, MI, PH
+	pxor LO, PL
+	pxor HI, PH
+.endm
+
+/*
+ * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
+ *
+ * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
+ * x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
+ * product of two 128-bit polynomials in Montgomery form.  We need to reduce it
+ * mod g(x).  Also, since polynomials in Montgomery form have an "extra" factor
+ * of x^128, this product has two extra factors of x^128.  To get it back into
+ * Montgomery form, we need to remove one of these factors by dividing by x^128.
+ *
+ * To accomplish both of these goals, we add multiples of g(x) that cancel out
+ * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
+ * bits are zero, the polynomial division by x^128 can be done by right shifting.
+ *
+ * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
+ * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x).  The CPU can
+ * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
+ * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x).  Adding this to
+ * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
+ * = T_1 : T_0 = g*(x) * P_0.  Thus, bits 0-63 got "folded" into bits 64-191.
+ *
+ * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
+ * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
+ * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
+ * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
+ * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
+ *
+ * So our final computation is:
+ *   T = T_1 : T_0 = g*(x) * P_0
+ *   V = V_1 : V_0 = g*(x) * (P_1 + T_0)
+ *   p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
+ *
+ * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
+ * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
+ * T_1 into dest.  This allows us to reuse P_1 + T_0 when computing V.
+ */
+.macro montgomery_reduction dest
+	vpclmulqdq $0x00, PL, GSTAR, TMP_XMM	# TMP_XMM = T_1 : T_0 = P_0 * g*(x)
+	pshufd $0b01001110, TMP_XMM, TMP_XMM	# TMP_XMM = T_0 : T_1
+	pxor PL, TMP_XMM			# TMP_XMM = P_1 + T_0 : P_0 + T_1
+	pxor TMP_XMM, PH			# PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
+	pclmulqdq $0x11, GSTAR, TMP_XMM		# TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
+	vpxor TMP_XMM, PH, \dest
+.endm
+
+/*
+ * Compute schoolbook multiplication for 8 blocks
+ * m_0h^8 + ... + m_7h^1
+ *
+ * If reduce is set, also computes the montgomery reduction of the
+ * previous full_stride call and XORs with the first message block.
+ * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
+ * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
+ */
+.macro full_stride reduce
+	pxor LO, LO
+	pxor HI, HI
+	pxor MI, MI
+
+	schoolbook1_iteration 7 0
+	.if \reduce
+		vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
+	.endif
+
+	schoolbook1_iteration 6 0
+	.if \reduce
+		pshufd $0b01001110, TMP_XMM, TMP_XMM
+	.endif
+
+	schoolbook1_iteration 5 0
+	.if \reduce
+		pxor PL, TMP_XMM
+	.endif
+
+	schoolbook1_iteration 4 0
+	.if \reduce
+		pxor TMP_XMM, PH
+	.endif
+
+	schoolbook1_iteration 3 0
+	.if \reduce
+		pclmulqdq $0x11, GSTAR, TMP_XMM
+	.endif
+
+	schoolbook1_iteration 2 0
+	.if \reduce
+		vpxor TMP_XMM, PH, SUM
+	.endif
+
+	schoolbook1_iteration 1 0
+
+	schoolbook1_iteration 0 1
+
+	addq $(8*16), MSG
+	schoolbook2
+.endm
+
+/*
+ * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
+ */
+.macro partial_stride
+	mov BLOCKS_LEFT, TMP
+	shlq $4, TMP
+	addq $(16*STRIDE_BLOCKS), KEY_POWERS
+	subq TMP, KEY_POWERS
+
+	movups (MSG), %xmm0
+	pxor SUM, %xmm0
+	movups (KEY_POWERS), %xmm1
+	schoolbook1_noload
+	dec BLOCKS_LEFT
+	addq $16, MSG
+	addq $16, KEY_POWERS
+
+	test $4, BLOCKS_LEFT
+	jz .Lpartial4BlocksDone
+	schoolbook1 4
+	addq $(4*16), MSG
+	addq $(4*16), KEY_POWERS
+.Lpartial4BlocksDone:
+	test $2, BLOCKS_LEFT
+	jz .Lpartial2BlocksDone
+	schoolbook1 2
+	addq $(2*16), MSG
+	addq $(2*16), KEY_POWERS
+.Lpartial2BlocksDone:
+	test $1, BLOCKS_LEFT
+	jz .LpartialDone
+	schoolbook1 1
+.LpartialDone:
+	schoolbook2
+	montgomery_reduction SUM
+.endm
+
+/*
+ * Computes a = a * b * x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * void polyval_mul_pclmul_avx(struct polyval_elem *a,
+ *			       const struct polyval_elem *b);
+ */
+SYM_FUNC_START(polyval_mul_pclmul_avx)
+	FRAME_BEGIN
+	vmovdqa .Lgstar(%rip), GSTAR
+	movups (%rdi), %xmm0
+	movups (%rsi), %xmm1
+	schoolbook1_noload
+	schoolbook2
+	montgomery_reduction SUM
+	movups SUM, (%rdi)
+	FRAME_END
+	RET
+SYM_FUNC_END(polyval_mul_pclmul_avx)
+
+/*
+ * Perform polynomial evaluation as specified by POLYVAL.  This computes:
+ *	h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
+ * where n=nblocks, h is the hash key, and m_i are the message blocks.
+ *
+ * rdi - pointer to the accumulator
+ * rsi - pointer to precomputed key powers h^8 ... h^1
+ * rdx - pointer to message blocks
+ * rcx - number of blocks to hash
+ *
+ * void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
+ *				  const struct polyval_key *key,
+ *				  const u8 *data, size_t nblocks);
+ */
+SYM_FUNC_START(polyval_blocks_pclmul_avx)
+	FRAME_BEGIN
+	vmovdqa .Lgstar(%rip), GSTAR
+	movups (ACCUMULATOR), SUM
+	subq $STRIDE_BLOCKS, BLOCKS_LEFT
+	js .LstrideLoopExit
+	full_stride 0
+	subq $STRIDE_BLOCKS, BLOCKS_LEFT
+	js .LstrideLoopExitReduce
+.LstrideLoop:
+	full_stride 1
+	subq $STRIDE_BLOCKS, BLOCKS_LEFT
+	jns .LstrideLoop
+.LstrideLoopExitReduce:
+	montgomery_reduction SUM
+.LstrideLoopExit:
+	add $STRIDE_BLOCKS, BLOCKS_LEFT
+	jz .LskipPartial
+	partial_stride
+.LskipPartial:
+	movups SUM, (ACCUMULATOR)
+	FRAME_END
+	RET
+SYM_FUNC_END(polyval_blocks_pclmul_avx)
diff --git a/lib/crypto/x86/polyval.h b/lib/crypto/x86/polyval.h
new file mode 100644
index 000000000000..ef8797521420
--- /dev/null
+++ b/lib/crypto/x86/polyval.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * POLYVAL library functions, x86_64 optimized
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <linux/cpufeature.h>
+
+#define NUM_H_POWERS 8
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmul_avx);
+
+asmlinkage void polyval_mul_pclmul_avx(struct polyval_elem *a,
+				       const struct polyval_elem *b);
+asmlinkage void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
+					  const struct polyval_key *key,
+					  const u8 *data, size_t nblocks);
+
+static void polyval_preparekey_arch(struct polyval_key *key,
+				    const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+	static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS);
+	memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE);
+	if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+		kernel_fpu_begin();
+		for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+			key->h_powers[i] = key->h_powers[i + 1];
+			polyval_mul_pclmul_avx(
+				&key->h_powers[i],
+				&key->h_powers[NUM_H_POWERS - 1]);
+		}
+		kernel_fpu_end();
+	} else {
+		for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+			key->h_powers[i] = key->h_powers[i + 1];
+			polyval_mul_generic(&key->h_powers[i],
+					    &key->h_powers[NUM_H_POWERS - 1]);
+		}
+	}
+}
+
+static void polyval_mul_arch(struct polyval_elem *acc,
+			     const struct polyval_key *key)
+{
+	if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+		kernel_fpu_begin();
+		polyval_mul_pclmul_avx(acc, &key->h_powers[NUM_H_POWERS - 1]);
+		kernel_fpu_end();
+	} else {
+		polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]);
+	}
+}
+
+static void polyval_blocks_arch(struct polyval_elem *acc,
+				const struct polyval_key *key,
+				const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+		do {
+			/* Allow rescheduling every 4 KiB. */
+			size_t n = min_t(size_t, nblocks,
+					 4096 / POLYVAL_BLOCK_SIZE);
+
+			kernel_fpu_begin();
+			polyval_blocks_pclmul_avx(acc, key, data, n);
+			kernel_fpu_end();
+			data += n * POLYVAL_BLOCK_SIZE;
+			nblocks -= n;
+		} while (nblocks);
+	} else {
+		polyval_blocks_generic(acc, &key->h_powers[NUM_H_POWERS - 1],
+				       data, nblocks);
+	}
+}
+
+#define polyval_mod_init_arch polyval_mod_init_arch
+static void polyval_mod_init_arch(void)
+{
+	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ) &&
+	    boot_cpu_has(X86_FEATURE_AVX))
+		static_branch_enable(&have_pclmul_avx);
+}
diff --git a/lib/crypto/x86/sha1-avx2-asm.S b/lib/crypto/x86/sha1-avx2-asm.S
new file mode 100644
index 000000000000..91b2dc6db179
--- /dev/null
+++ b/lib/crypto/x86/sha1-avx2-asm.S
@@ -0,0 +1,697 @@
+/*
+ *	Implement fast SHA-1 with AVX2 instructions. (x86_64)
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Ilya Albrekht <ilya.albrekht@intel.com>
+ * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
+ * Ronen Zohar <ronen.zohar@intel.com>
+ * Chandramouli Narayanan <mouli@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
+ *
+ *This implementation is based on the previous SSSE3 release:
+ *Visit http://software.intel.com/en-us/articles/
+ *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
+ *
+ * void sha1_transform_avx2(struct sha1_block_state *state,
+ *			    const u8 *data, size_t nblocks);
+ */
+
+#include <linux/linkage.h>
+
+#define	CTX	%rdi	/* arg1 */
+#define BUF	%rsi	/* arg2 */
+#define CNT	%rdx	/* arg3 */
+
+#define	REG_A	%ecx
+#define	REG_B	%esi
+#define	REG_C	%edi
+#define	REG_D	%eax
+#define	REG_E	%edx
+#define	REG_TB	%ebx
+#define	REG_TA	%r12d
+#define	REG_RA	%rcx
+#define	REG_RB	%rsi
+#define	REG_RC	%rdi
+#define	REG_RD	%rax
+#define	REG_RE	%rdx
+#define	REG_RTA	%r12
+#define	REG_RTB	%rbx
+#define	REG_T1	%r11d
+#define	xmm_mov	vmovups
+#define	avx2_zeroupper	vzeroupper
+#define	RND_F1	1
+#define	RND_F2	2
+#define	RND_F3	3
+
+.macro REGALLOC
+	.set A, REG_A
+	.set B, REG_B
+	.set C, REG_C
+	.set D, REG_D
+	.set E, REG_E
+	.set TB, REG_TB
+	.set TA, REG_TA
+
+	.set RA, REG_RA
+	.set RB, REG_RB
+	.set RC, REG_RC
+	.set RD, REG_RD
+	.set RE, REG_RE
+
+	.set RTA, REG_RTA
+	.set RTB, REG_RTB
+
+	.set T1, REG_T1
+.endm
+
+#define HASH_PTR	%r9
+#define BLOCKS_CTR	%r8
+#define BUFFER_PTR	%r10
+#define BUFFER_PTR2	%r13
+
+#define PRECALC_BUF	%r14
+#define WK_BUF		%r15
+
+#define W_TMP		%xmm0
+#define WY_TMP		%ymm0
+#define WY_TMP2		%ymm9
+
+# AVX2 variables
+#define WY0		%ymm3
+#define WY4		%ymm5
+#define WY08		%ymm7
+#define WY12		%ymm8
+#define WY16		%ymm12
+#define WY20		%ymm13
+#define WY24		%ymm14
+#define WY28		%ymm15
+
+#define YMM_SHUFB_BSWAP	%ymm10
+
+/*
+ * Keep 2 iterations precalculated at a time:
+ *    - 80 DWORDs per iteration * 2
+ */
+#define W_SIZE		(80*2*2 +16)
+
+#define WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
+#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)
+
+
+.macro UPDATE_HASH  hash, val
+	add	\hash, \val
+	mov	\val, \hash
+.endm
+
+.macro PRECALC_RESET_WY
+	.set WY_00, WY0
+	.set WY_04, WY4
+	.set WY_08, WY08
+	.set WY_12, WY12
+	.set WY_16, WY16
+	.set WY_20, WY20
+	.set WY_24, WY24
+	.set WY_28, WY28
+	.set WY_32, WY_00
+.endm
+
+.macro PRECALC_ROTATE_WY
+	/* Rotate macros */
+	.set WY_32, WY_28
+	.set WY_28, WY_24
+	.set WY_24, WY_20
+	.set WY_20, WY_16
+	.set WY_16, WY_12
+	.set WY_12, WY_08
+	.set WY_08, WY_04
+	.set WY_04, WY_00
+	.set WY_00, WY_32
+
+	/* Define register aliases */
+	.set WY, WY_00
+	.set WY_minus_04, WY_04
+	.set WY_minus_08, WY_08
+	.set WY_minus_12, WY_12
+	.set WY_minus_16, WY_16
+	.set WY_minus_20, WY_20
+	.set WY_minus_24, WY_24
+	.set WY_minus_28, WY_28
+	.set WY_minus_32, WY
+.endm
+
+.macro PRECALC_00_15
+	.if (i == 0) # Initialize and rotate registers
+		PRECALC_RESET_WY
+		PRECALC_ROTATE_WY
+	.endif
+
+	/* message scheduling pre-compute for rounds 0-15 */
+	.if   ((i & 7) == 0)
+		/*
+		 * blended AVX2 and ALU instruction scheduling
+		 * 1 vector iteration per 8 rounds
+		 */
+		vmovdqu (i * 2)(BUFFER_PTR), W_TMP
+	.elseif ((i & 7) == 1)
+		vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
+			 WY_TMP, WY_TMP
+	.elseif ((i & 7) == 2)
+		vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
+	.elseif ((i & 7) == 4)
+		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
+	.elseif ((i & 7) == 7)
+		vmovdqu  WY_TMP, PRECALC_WK(i&~7)
+
+		PRECALC_ROTATE_WY
+	.endif
+.endm
+
+.macro PRECALC_16_31
+	/*
+	 * message scheduling pre-compute for rounds 16-31
+	 * calculating last 32 w[i] values in 8 XMM registers
+	 * pre-calculate K+w[i] values and store to mem
+	 * for later load by ALU add instruction
+	 *
+	 * "brute force" vectorization for rounds 16-31 only
+	 * due to w[i]->w[i-3] dependency
+	 */
+	.if   ((i & 7) == 0)
+		/*
+		 * blended AVX2 and ALU instruction scheduling
+		 * 1 vector iteration per 8 rounds
+		 */
+		/* w[i-14] */
+		vpalignr	$8, WY_minus_16, WY_minus_12, WY
+		vpsrldq	$4, WY_minus_04, WY_TMP               /* w[i-3] */
+	.elseif ((i & 7) == 1)
+		vpxor	WY_minus_08, WY, WY
+		vpxor	WY_minus_16, WY_TMP, WY_TMP
+	.elseif ((i & 7) == 2)
+		vpxor	WY_TMP, WY, WY
+		vpslldq	$12, WY, WY_TMP2
+	.elseif ((i & 7) == 3)
+		vpslld	$1, WY, WY_TMP
+		vpsrld	$31, WY, WY
+	.elseif ((i & 7) == 4)
+		vpor	WY, WY_TMP, WY_TMP
+		vpslld	$2, WY_TMP2, WY
+	.elseif ((i & 7) == 5)
+		vpsrld	$30, WY_TMP2, WY_TMP2
+		vpxor	WY, WY_TMP, WY_TMP
+	.elseif ((i & 7) == 7)
+		vpxor	WY_TMP2, WY_TMP, WY
+		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
+		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
+
+		PRECALC_ROTATE_WY
+	.endif
+.endm
+
+.macro PRECALC_32_79
+	/*
+	 * in SHA-1 specification:
+	 * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
+	 * instead we do equal:
+	 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+	 * allows more efficient vectorization
+	 * since w[i]=>w[i-3] dependency is broken
+	 */
+
+	.if   ((i & 7) == 0)
+	/*
+	 * blended AVX2 and ALU instruction scheduling
+	 * 1 vector iteration per 8 rounds
+	 */
+		vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
+	.elseif ((i & 7) == 1)
+		/* W is W_minus_32 before xor */
+		vpxor	WY_minus_28, WY, WY
+	.elseif ((i & 7) == 2)
+		vpxor	WY_minus_16, WY_TMP, WY_TMP
+	.elseif ((i & 7) == 3)
+		vpxor	WY_TMP, WY, WY
+	.elseif ((i & 7) == 4)
+		vpslld	$2, WY, WY_TMP
+	.elseif ((i & 7) == 5)
+		vpsrld	$30, WY, WY
+		vpor	WY, WY_TMP, WY
+	.elseif ((i & 7) == 7)
+		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
+		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
+
+		PRECALC_ROTATE_WY
+	.endif
+.endm
+
+.macro PRECALC r, s
+	.set i, \r
+
+	.if (i < 40)
+		.set K_XMM, 32*0
+	.elseif (i < 80)
+		.set K_XMM, 32*1
+	.elseif (i < 120)
+		.set K_XMM, 32*2
+	.else
+		.set K_XMM, 32*3
+	.endif
+
+	.if (i<32)
+		PRECALC_00_15	\s
+	.elseif (i<64)
+		PRECALC_16_31	\s
+	.elseif (i < 160)
+		PRECALC_32_79	\s
+	.endif
+.endm
+
+.macro ROTATE_STATE
+	.set T_REG, E
+	.set E, D
+	.set D, C
+	.set C, B
+	.set B, TB
+	.set TB, A
+	.set A, T_REG
+
+	.set T_REG, RE
+	.set RE, RD
+	.set RD, RC
+	.set RC, RB
+	.set RB, RTB
+	.set RTB, RA
+	.set RA, T_REG
+.endm
+
+/* Macro relies on saved ROUND_Fx */
+
+.macro RND_FUN f, r
+	.if (\f == RND_F1)
+		ROUND_F1	\r
+	.elseif (\f == RND_F2)
+		ROUND_F2	\r
+	.elseif (\f == RND_F3)
+		ROUND_F3	\r
+	.endif
+.endm
+
+.macro RR r
+	.set round_id, (\r % 80)
+
+	.if (round_id == 0)        /* Precalculate F for first round */
+		.set ROUND_FUNC, RND_F1
+		mov	B, TB
+
+		rorx	$(32-30), B, B    /* b>>>2 */
+		andn	D, TB, T1
+		and	C, TB
+		xor	T1, TB
+	.endif
+
+	RND_FUN ROUND_FUNC, \r
+	ROTATE_STATE
+
+	.if   (round_id == 18)
+		.set ROUND_FUNC, RND_F2
+	.elseif (round_id == 38)
+		.set ROUND_FUNC, RND_F3
+	.elseif (round_id == 58)
+		.set ROUND_FUNC, RND_F2
+	.endif
+
+	.set round_id, ( (\r+1) % 80)
+
+	RND_FUN ROUND_FUNC, (\r+1)
+	ROTATE_STATE
+.endm
+
+.macro ROUND_F1 r
+	add	WK(\r), E
+
+	andn	C, A, T1			/* ~b&d */
+	lea	(RE,RTB), E		/* Add F from the previous round */
+
+	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
+	rorx	$(32-30),A, TB		/* b>>>2 for next round */
+
+	PRECALC	(\r)			/* msg scheduling for next 2 blocks */
+
+	/*
+	 * Calculate F for the next round
+	 * (b & c) ^ andn[b, d]
+	 */
+	and	B, A			/* b&c */
+	xor	T1, A			/* F1 = (b&c) ^ (~b&d) */
+
+	lea	(RE,RTA), E		/* E += A >>> 5 */
+.endm
+
+.macro ROUND_F2 r
+	add	WK(\r), E
+	lea	(RE,RTB), E		/* Add F from the previous round */
+
+	/* Calculate F for the next round */
+	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
+	.if ((round_id) < 79)
+		rorx	$(32-30), A, TB	/* b>>>2 for next round */
+	.endif
+	PRECALC	(\r)			/* msg scheduling for next 2 blocks */
+
+	.if ((round_id) < 79)
+		xor	B, A
+	.endif
+
+	add	TA, E			/* E += A >>> 5 */
+
+	.if ((round_id) < 79)
+		xor	C, A
+	.endif
+.endm
+
+.macro ROUND_F3 r
+	add	WK(\r), E
+	PRECALC	(\r)			/* msg scheduling for next 2 blocks */
+
+	lea	(RE,RTB), E		/* Add F from the previous round */
+
+	mov	B, T1
+	or	A, T1
+
+	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
+	rorx	$(32-30), A, TB		/* b>>>2 for next round */
+
+	/* Calculate F for the next round
+	 * (b and c) or (d and (b or c))
+	 */
+	and	C, T1
+	and	B, A
+	or	T1, A
+
+	add	TA, E			/* E += A >>> 5 */
+
+.endm
+
+/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
+ * %1 + %2 >= %3 ? %4 : 0
+ */
+.macro ADD_IF_GE a, b, c, d
+	mov     \a, RTA
+	add     $\d, RTA
+	cmp     $\c, \b
+	cmovge  RTA, \a
+.endm
+
+/*
+ * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
+ */
+.macro SHA1_PIPELINED_MAIN_BODY
+
+	REGALLOC
+
+	mov	(HASH_PTR), A
+	mov	4(HASH_PTR), B
+	mov	8(HASH_PTR), C
+	mov	12(HASH_PTR), D
+	mov	16(HASH_PTR), E
+
+	mov	%rsp, PRECALC_BUF
+	lea	(2*4*80+32)(%rsp), WK_BUF
+
+	# Precalc WK for first 2 blocks
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
+	.set i, 0
+	.rept    160
+		PRECALC i
+		.set i, i + 1
+	.endr
+
+	/* Go to next block if needed */
+	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
+	xchg	WK_BUF, PRECALC_BUF
+
+	.align 32
+.L_loop:
+	/*
+	 * code loops through more than one block
+	 * we use K_BASE value as a signal of a last block,
+	 * it is set below by: cmovae BUFFER_PTR, K_BASE
+	 */
+	test BLOCKS_CTR, BLOCKS_CTR
+	jnz .L_begin
+	.align 32
+	jmp	.L_end
+	.align 32
+.L_begin:
+
+	/*
+	 * Do first block
+	 * rounds: 0,2,4,6,8
+	 */
+	.set j, 0
+	.rept 5
+		RR	j
+		.set j, j+2
+	.endr
+
+	/*
+	 * rounds:
+	 * 10,12,14,16,18
+	 * 20,22,24,26,28
+	 * 30,32,34,36,38
+	 * 40,42,44,46,48
+	 * 50,52,54,56,58
+	 */
+	.rept 25
+		RR	j
+		.set j, j+2
+	.endr
+
+	/* Update Counter */
+	sub $1, BLOCKS_CTR
+	/* Move to the next block only if needed*/
+	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
+	/*
+	 * rounds
+	 * 60,62,64,66,68
+	 * 70,72,74,76,78
+	 */
+	.rept 10
+		RR	j
+		.set j, j+2
+	.endr
+
+	UPDATE_HASH	(HASH_PTR), A
+	UPDATE_HASH	4(HASH_PTR), TB
+	UPDATE_HASH	8(HASH_PTR), C
+	UPDATE_HASH	12(HASH_PTR), D
+	UPDATE_HASH	16(HASH_PTR), E
+
+	test	BLOCKS_CTR, BLOCKS_CTR
+	jz	.L_loop
+
+	mov	TB, B
+
+	/* Process second block */
+	/*
+	 * rounds
+	 *  0+80, 2+80, 4+80, 6+80, 8+80
+	 * 10+80,12+80,14+80,16+80,18+80
+	 */
+
+	.set j, 0
+	.rept 10
+		RR	j+80
+		.set j, j+2
+	.endr
+
+	/*
+	 * rounds
+	 * 20+80,22+80,24+80,26+80,28+80
+	 * 30+80,32+80,34+80,36+80,38+80
+	 */
+	.rept 10
+		RR	j+80
+		.set j, j+2
+	.endr
+
+	/*
+	 * rounds
+	 * 40+80,42+80,44+80,46+80,48+80
+	 * 50+80,52+80,54+80,56+80,58+80
+	 */
+	.rept 10
+		RR	j+80
+		.set j, j+2
+	.endr
+
+	/* update counter */
+	sub     $1, BLOCKS_CTR
+	/* Move to the next block only if needed*/
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
+
+	/*
+	 * rounds
+	 * 60+80,62+80,64+80,66+80,68+80
+	 * 70+80,72+80,74+80,76+80,78+80
+	 */
+	.rept 10
+		RR	j+80
+		.set j, j+2
+	.endr
+
+	UPDATE_HASH	(HASH_PTR), A
+	UPDATE_HASH	4(HASH_PTR), TB
+	UPDATE_HASH	8(HASH_PTR), C
+	UPDATE_HASH	12(HASH_PTR), D
+	UPDATE_HASH	16(HASH_PTR), E
+
+	/* Reset state for AVX2 reg permutation */
+	mov	A, TA
+	mov	TB, A
+	mov	C, TB
+	mov	E, C
+	mov	D, B
+	mov	TA, D
+
+	REGALLOC
+
+	xchg	WK_BUF, PRECALC_BUF
+
+	jmp	.L_loop
+
+	.align 32
+.L_end:
+
+.endm
+/*
+ * macro implements SHA-1 function's body for several 64-byte blocks
+ * param: function's name
+ */
+.macro SHA1_VECTOR_ASM  name
+	SYM_FUNC_START(\name)
+
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	RESERVE_STACK  = (W_SIZE*4 + 8+24)
+
+	/* Align stack */
+	push	%rbp
+	mov	%rsp, %rbp
+	and	$~(0x20-1), %rsp
+	sub	$RESERVE_STACK, %rsp
+
+	avx2_zeroupper
+
+	/* Setup initial values */
+	mov	CTX, HASH_PTR
+	mov	BUF, BUFFER_PTR
+
+	mov	BUF, BUFFER_PTR2
+	mov	CNT, BLOCKS_CTR
+
+	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
+
+	SHA1_PIPELINED_MAIN_BODY
+
+	avx2_zeroupper
+
+	mov	%rbp, %rsp
+	pop	%rbp
+
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbx
+
+	RET
+
+	SYM_FUNC_END(\name)
+.endm
+
+.section .rodata
+
+#define K1 0x5a827999
+#define K2 0x6ed9eba1
+#define K3 0x8f1bbcdc
+#define K4 0xca62c1d6
+
+.align 128
+K_XMM_AR:
+	.long K1, K1, K1, K1
+	.long K1, K1, K1, K1
+	.long K2, K2, K2, K2
+	.long K2, K2, K2, K2
+	.long K3, K3, K3, K3
+	.long K3, K3, K3, K3
+	.long K4, K4, K4, K4
+	.long K4, K4, K4, K4
+
+BSWAP_SHUFB_CTL:
+	.long 0x00010203
+	.long 0x04050607
+	.long 0x08090a0b
+	.long 0x0c0d0e0f
+	.long 0x00010203
+	.long 0x04050607
+	.long 0x08090a0b
+	.long 0x0c0d0e0f
+.text
+
+SHA1_VECTOR_ASM     sha1_transform_avx2
diff --git a/lib/crypto/x86/sha1-ni-asm.S b/lib/crypto/x86/sha1-ni-asm.S
new file mode 100644
index 000000000000..428f9b960594
--- /dev/null
+++ b/lib/crypto/x86/sha1-ni-asm.S
@@ -0,0 +1,152 @@
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 update function
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * 	Sean Gulley <sean.m.gulley@intel.com>
+ * 	Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 	* Redistributions of source code must retain the above copyright
+ * 	  notice, this list of conditions and the following disclaimer.
+ * 	* Redistributions in binary form must reproduce the above copyright
+ * 	  notice, this list of conditions and the following disclaimer in
+ * 	  the documentation and/or other materials provided with the
+ * 	  distribution.
+ * 	* Neither the name of Intel Corporation nor the names of its
+ * 	  contributors may be used to endorse or promote products derived
+ * 	  from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define STATE_PTR	%rdi	/* 1st arg */
+#define DATA_PTR	%rsi	/* 2nd arg */
+#define NUM_BLKS	%rdx	/* 3rd arg */
+
+#define ABCD		%xmm0
+#define E0		%xmm1	/* Need two E's b/c they ping pong */
+#define E1		%xmm2
+#define MSG0		%xmm3
+#define MSG1		%xmm4
+#define MSG2		%xmm5
+#define MSG3		%xmm6
+#define SHUF_MASK	%xmm7
+#define ABCD_SAVED	%xmm8
+#define E0_SAVED	%xmm9
+
+.macro do_4rounds	i, m0, m1, m2, m3, e0, e1
+.if \i < 16
+	movdqu		\i*4(DATA_PTR), \m0
+	pshufb		SHUF_MASK, \m0
+.endif
+.if \i == 0
+	paddd		\m0, \e0
+.else
+	sha1nexte	\m0, \e0
+.endif
+	movdqa		ABCD, \e1
+.if \i >= 12 && \i < 76
+	sha1msg2	\m0, \m1
+.endif
+	sha1rnds4	$\i / 20, \e0, ABCD
+.if \i >= 4 && \i < 68
+	sha1msg1	\m0, \m3
+.endif
+.if \i >= 8 && \i < 72
+	pxor		\m0, \m2
+.endif
+.endm
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 block function
+ *
+ * This function takes a pointer to the current SHA-1 state, a pointer to the
+ * input data, and the number of 64-byte blocks to process.  The number of
+ * blocks to process is assumed to be nonzero.  Once all blocks have been
+ * processed, the state is updated with the new state.  This function only
+ * processes complete blocks.  State initialization, buffering of partial
+ * blocks, and digest finalization are expected to be handled elsewhere.
+ *
+ * void sha1_ni_transform(struct sha1_block_state *state,
+ *			  const u8 *data, size_t nblocks)
+ */
+.text
+SYM_FUNC_START(sha1_ni_transform)
+
+	/* Load the initial state from STATE_PTR. */
+	pxor		E0, E0
+	pinsrd		$3, 16(STATE_PTR), E0
+	movdqu		(STATE_PTR), ABCD
+	pshufd		$0x1B, ABCD, ABCD
+
+	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+
+.Lnext_block:
+	/* Save the state for addition after the rounds. */
+	movdqa		E0, E0_SAVED
+	movdqa		ABCD, ABCD_SAVED
+
+.irp i, 0, 16, 32, 48, 64
+	do_4rounds	(\i + 0),  MSG0, MSG1, MSG2, MSG3, E0, E1
+	do_4rounds	(\i + 4),  MSG1, MSG2, MSG3, MSG0, E1, E0
+	do_4rounds	(\i + 8),  MSG2, MSG3, MSG0, MSG1, E0, E1
+	do_4rounds	(\i + 12), MSG3, MSG0, MSG1, MSG2, E1, E0
+.endr
+
+	/* Add the previous state (before the rounds) to the current state. */
+	sha1nexte	E0_SAVED, E0
+	paddd		ABCD_SAVED, ABCD
+
+	/* Advance to the next block, or break if there are no more blocks. */
+	add		$64, DATA_PTR
+	dec		NUM_BLKS
+	jnz		.Lnext_block
+
+	/* Store the new state to STATE_PTR. */
+	pextrd		$3, E0, 16(STATE_PTR)
+	pshufd		$0x1B, ABCD, ABCD
+	movdqu		ABCD, (STATE_PTR)
+
+	RET
+SYM_FUNC_END(sha1_ni_transform)
+
+.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x000102030405060708090a0b0c0d0e0f
diff --git a/lib/crypto/x86/sha1-ssse3-and-avx.S b/lib/crypto/x86/sha1-ssse3-and-avx.S
new file mode 100644
index 000000000000..78b48cb09c5b
--- /dev/null
+++ b/lib/crypto/x86/sha1-ssse3-and-avx.S
@@ -0,0 +1,551 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
+ * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
+ * processors. CPUs supporting Intel(R) AVX extensions will get an additional
+ * boost.
+ *
+ * This work was inspired by the vectorized implementation of Dean Gaudet.
+ * Additional information on it can be found at:
+ *    http://www.arctic.org/~dean/crypto/sha1.html
+ *
+ * It was improved upon with more efficient vectorization of the message
+ * scheduling. This implementation has also been optimized for all current and
+ * several future generations of Intel CPUs.
+ *
+ * See this article for more information about the implementation details:
+ *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
+ *
+ * Copyright (C) 2010, Intel Corp.
+ *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
+ *            Ronen Zohar <ronen.zohar@intel.com>
+ *
+ * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
+ *   Author: Mathias Krause <minipli@googlemail.com>
+ */
+
+#include <linux/linkage.h>
+
+#define CTX	%rdi	// arg1
+#define BUF	%rsi	// arg2
+#define CNT	%rdx	// arg3
+
+#define REG_A	%ecx
+#define REG_B	%esi
+#define REG_C	%edi
+#define REG_D	%r12d
+#define REG_E	%edx
+
+#define REG_T1	%eax
+#define REG_T2	%ebx
+
+#define K_BASE		%r8
+#define HASH_PTR	%r9
+#define BUFFER_PTR	%r10
+#define BUFFER_END	%r11
+
+#define W_TMP1	%xmm0
+#define W_TMP2	%xmm9
+
+#define W0	%xmm1
+#define W4	%xmm2
+#define W8	%xmm3
+#define W12	%xmm4
+#define W16	%xmm5
+#define W20	%xmm6
+#define W24	%xmm7
+#define W28	%xmm8
+
+#define XMM_SHUFB_BSWAP	%xmm10
+
+/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */
+#define WK(t)	(((t) & 15) * 4)(%rsp)
+#define W_PRECALC_AHEAD	16
+
+/*
+ * This macro implements the SHA-1 function's body for single 64-byte block
+ * param: function's name
+ */
+.macro SHA1_VECTOR_ASM  name
+	SYM_FUNC_START(\name)
+
+	push	%rbx
+	push	%r12
+	push	%rbp
+	mov	%rsp, %rbp
+
+	sub	$64, %rsp		# allocate workspace
+	and	$~15, %rsp		# align stack
+
+	mov	CTX, HASH_PTR
+	mov	BUF, BUFFER_PTR
+
+	shl	$6, CNT			# multiply by 64
+	add	BUF, CNT
+	mov	CNT, BUFFER_END
+
+	lea	K_XMM_AR(%rip), K_BASE
+	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP
+
+	SHA1_PIPELINED_MAIN_BODY
+
+	# cleanup workspace
+	mov	$8, %ecx
+	mov	%rsp, %rdi
+	xor	%eax, %eax
+	rep stosq
+
+	mov	%rbp, %rsp		# deallocate workspace
+	pop	%rbp
+	pop	%r12
+	pop	%rbx
+	RET
+
+	SYM_FUNC_END(\name)
+.endm
+
+/*
+ * This macro implements 80 rounds of SHA-1 for one 64-byte block
+ */
+.macro SHA1_PIPELINED_MAIN_BODY
+	INIT_REGALLOC
+
+	mov	  (HASH_PTR), A
+	mov	 4(HASH_PTR), B
+	mov	 8(HASH_PTR), C
+	mov	12(HASH_PTR), D
+	mov	16(HASH_PTR), E
+
+  .set i, 0
+  .rept W_PRECALC_AHEAD
+	W_PRECALC i
+    .set i, (i+1)
+  .endr
+
+.align 4
+1:
+	RR F1,A,B,C,D,E,0
+	RR F1,D,E,A,B,C,2
+	RR F1,B,C,D,E,A,4
+	RR F1,E,A,B,C,D,6
+	RR F1,C,D,E,A,B,8
+
+	RR F1,A,B,C,D,E,10
+	RR F1,D,E,A,B,C,12
+	RR F1,B,C,D,E,A,14
+	RR F1,E,A,B,C,D,16
+	RR F1,C,D,E,A,B,18
+
+	RR F2,A,B,C,D,E,20
+	RR F2,D,E,A,B,C,22
+	RR F2,B,C,D,E,A,24
+	RR F2,E,A,B,C,D,26
+	RR F2,C,D,E,A,B,28
+
+	RR F2,A,B,C,D,E,30
+	RR F2,D,E,A,B,C,32
+	RR F2,B,C,D,E,A,34
+	RR F2,E,A,B,C,D,36
+	RR F2,C,D,E,A,B,38
+
+	RR F3,A,B,C,D,E,40
+	RR F3,D,E,A,B,C,42
+	RR F3,B,C,D,E,A,44
+	RR F3,E,A,B,C,D,46
+	RR F3,C,D,E,A,B,48
+
+	RR F3,A,B,C,D,E,50
+	RR F3,D,E,A,B,C,52
+	RR F3,B,C,D,E,A,54
+	RR F3,E,A,B,C,D,56
+	RR F3,C,D,E,A,B,58
+
+	add	$64, BUFFER_PTR		# move to the next 64-byte block
+	cmp	BUFFER_END, BUFFER_PTR	# if the current is the last one use
+	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid buffer overrun
+
+	RR F4,A,B,C,D,E,60
+	RR F4,D,E,A,B,C,62
+	RR F4,B,C,D,E,A,64
+	RR F4,E,A,B,C,D,66
+	RR F4,C,D,E,A,B,68
+
+	RR F4,A,B,C,D,E,70
+	RR F4,D,E,A,B,C,72
+	RR F4,B,C,D,E,A,74
+	RR F4,E,A,B,C,D,76
+	RR F4,C,D,E,A,B,78
+
+	UPDATE_HASH   (HASH_PTR), A
+	UPDATE_HASH  4(HASH_PTR), B
+	UPDATE_HASH  8(HASH_PTR), C
+	UPDATE_HASH 12(HASH_PTR), D
+	UPDATE_HASH 16(HASH_PTR), E
+
+	RESTORE_RENAMED_REGS
+	cmp	K_BASE, BUFFER_PTR	# K_BASE means, we reached the end
+	jne	1b
+.endm
+
+.macro INIT_REGALLOC
+  .set A, REG_A
+  .set B, REG_B
+  .set C, REG_C
+  .set D, REG_D
+  .set E, REG_E
+  .set T1, REG_T1
+  .set T2, REG_T2
+.endm
+
+.macro RESTORE_RENAMED_REGS
+	# order is important (REG_C is where it should be)
+	mov	B, REG_B
+	mov	D, REG_D
+	mov	A, REG_A
+	mov	E, REG_E
+.endm
+
+.macro SWAP_REG_NAMES  a, b
+  .set _T, \a
+  .set \a, \b
+  .set \b, _T
+.endm
+
+.macro F1  b, c, d
+	mov	\c, T1
+	SWAP_REG_NAMES \c, T1
+	xor	\d, T1
+	and	\b, T1
+	xor	\d, T1
+.endm
+
+.macro F2  b, c, d
+	mov	\d, T1
+	SWAP_REG_NAMES \d, T1
+	xor	\c, T1
+	xor	\b, T1
+.endm
+
+.macro F3  b, c ,d
+	mov	\c, T1
+	SWAP_REG_NAMES \c, T1
+	mov	\b, T2
+	or	\b, T1
+	and	\c, T2
+	and	\d, T1
+	or	T2, T1
+.endm
+
+.macro F4  b, c, d
+	F2 \b, \c, \d
+.endm
+
+.macro UPDATE_HASH  hash, val
+	add	\hash, \val
+	mov	\val, \hash
+.endm
+
+/*
+ * RR does two rounds of SHA-1 back to back with W[] pre-calc
+ *   t1 = F(b, c, d);   e += w(i)
+ *   e += t1;           b <<= 30;   d  += w(i+1);
+ *   t1 = F(a, b, c);
+ *   d += t1;           a <<= 5;
+ *   e += a;
+ *   t1 = e;            a >>= 7;
+ *   t1 <<= 5;
+ *   d += t1;
+ */
+.macro RR  F, a, b, c, d, e, round
+	add	WK(\round), \e
+	\F   \b, \c, \d		# t1 = F(b, c, d);
+	W_PRECALC (\round + W_PRECALC_AHEAD)
+	rol	$30, \b
+	add	T1, \e
+	add	WK(\round + 1), \d
+
+	\F   \a, \b, \c
+	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
+	rol	$5, \a
+	add	\a, \e
+	add	T1, \d
+	ror	$7, \a		# (a <<r 5) >>r 7) => a <<r 30)
+
+	mov	\e, T1
+	SWAP_REG_NAMES \e, T1
+
+	rol	$5, T1
+	add	T1, \d
+
+	# write:  \a, \b
+	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
+.endm
+
+.macro W_PRECALC  r
+  .set i, \r
+
+  .if (i < 20)
+    .set K_XMM, 0
+  .elseif (i < 40)
+    .set K_XMM, 16
+  .elseif (i < 60)
+    .set K_XMM, 32
+  .elseif (i < 80)
+    .set K_XMM, 48
+  .endif
+
+  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
+    .set i, ((\r) % 80)	    # pre-compute for the next iteration
+    .if (i == 0)
+	W_PRECALC_RESET
+    .endif
+	W_PRECALC_00_15
+  .elseif (i<32)
+	W_PRECALC_16_31
+  .elseif (i < 80)   // rounds 32-79
+	W_PRECALC_32_79
+  .endif
+.endm
+
+.macro W_PRECALC_RESET
+  .set W,          W0
+  .set W_minus_04, W4
+  .set W_minus_08, W8
+  .set W_minus_12, W12
+  .set W_minus_16, W16
+  .set W_minus_20, W20
+  .set W_minus_24, W24
+  .set W_minus_28, W28
+  .set W_minus_32, W
+.endm
+
+.macro W_PRECALC_ROTATE
+  .set W_minus_32, W_minus_28
+  .set W_minus_28, W_minus_24
+  .set W_minus_24, W_minus_20
+  .set W_minus_20, W_minus_16
+  .set W_minus_16, W_minus_12
+  .set W_minus_12, W_minus_08
+  .set W_minus_08, W_minus_04
+  .set W_minus_04, W
+  .set W,          W_minus_32
+.endm
+
+.macro W_PRECALC_SSSE3
+
+.macro W_PRECALC_00_15
+	W_PRECALC_00_15_SSSE3
+.endm
+.macro W_PRECALC_16_31
+	W_PRECALC_16_31_SSSE3
+.endm
+.macro W_PRECALC_32_79
+	W_PRECALC_32_79_SSSE3
+.endm
+
+/* message scheduling pre-compute for rounds 0-15 */
+.macro W_PRECALC_00_15_SSSE3
+  .if ((i & 3) == 0)
+	movdqu	(i*4)(BUFFER_PTR), W_TMP1
+  .elseif ((i & 3) == 1)
+	pshufb	XMM_SHUFB_BSWAP, W_TMP1
+	movdqa	W_TMP1, W
+  .elseif ((i & 3) == 2)
+	paddd	(K_BASE), W_TMP1
+  .elseif ((i & 3) == 3)
+	movdqa  W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+/* message scheduling pre-compute for rounds 16-31
+ *
+ * - calculating last 32 w[i] values in 8 XMM registers
+ * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
+ *   instruction
+ *
+ * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
+ * dependency, but improves for 32-79
+ */
+.macro W_PRECALC_16_31_SSSE3
+  # blended scheduling of vector and scalar instruction streams, one 4-wide
+  # vector iteration / 4 scalar rounds
+  .if ((i & 3) == 0)
+	movdqa	W_minus_12, W
+	palignr	$8, W_minus_16, W	# w[i-14]
+	movdqa	W_minus_04, W_TMP1
+	psrldq	$4, W_TMP1		# w[i-3]
+	pxor	W_minus_08, W
+  .elseif ((i & 3) == 1)
+	pxor	W_minus_16, W_TMP1
+	pxor	W_TMP1, W
+	movdqa	W, W_TMP2
+	movdqa	W, W_TMP1
+	pslldq	$12, W_TMP2
+  .elseif ((i & 3) == 2)
+	psrld	$31, W
+	pslld	$1, W_TMP1
+	por	W, W_TMP1
+	movdqa	W_TMP2, W
+	psrld	$30, W_TMP2
+	pslld	$2, W
+  .elseif ((i & 3) == 3)
+	pxor	W, W_TMP1
+	pxor	W_TMP2, W_TMP1
+	movdqa	W_TMP1, W
+	paddd	K_XMM(K_BASE), W_TMP1
+	movdqa	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+/* message scheduling pre-compute for rounds 32-79
+ *
+ * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
+ * instead we do equal:    w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+ * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
+ */
+.macro W_PRECALC_32_79_SSSE3
+  .if ((i & 3) == 0)
+	movdqa	W_minus_04, W_TMP1
+	pxor	W_minus_28, W		# W is W_minus_32 before xor
+	palignr	$8, W_minus_08, W_TMP1
+  .elseif ((i & 3) == 1)
+	pxor	W_minus_16, W
+	pxor	W_TMP1, W
+	movdqa	W, W_TMP1
+  .elseif ((i & 3) == 2)
+	psrld	$30, W
+	pslld	$2, W_TMP1
+	por	W, W_TMP1
+  .elseif ((i & 3) == 3)
+	movdqa	W_TMP1, W
+	paddd	K_XMM(K_BASE), W_TMP1
+	movdqa	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+.endm		// W_PRECALC_SSSE3
+
+
+#define K1	0x5a827999
+#define K2	0x6ed9eba1
+#define K3	0x8f1bbcdc
+#define K4	0xca62c1d6
+
+.section .rodata
+.align 16
+
+K_XMM_AR:
+	.long K1, K1, K1, K1
+	.long K2, K2, K2, K2
+	.long K3, K3, K3, K3
+	.long K4, K4, K4, K4
+
+BSWAP_SHUFB_CTL:
+	.long 0x00010203
+	.long 0x04050607
+	.long 0x08090a0b
+	.long 0x0c0d0e0f
+
+
+.section .text
+
+W_PRECALC_SSSE3
+.macro xmm_mov a, b
+	movdqu	\a,\b
+.endm
+
+/*
+ * SSSE3 optimized implementation:
+ *
+ * void sha1_transform_ssse3(struct sha1_block_state *state,
+ *			     const u8 *data, size_t nblocks);
+ */
+SHA1_VECTOR_ASM     sha1_transform_ssse3
+
+.macro W_PRECALC_AVX
+
+.purgem W_PRECALC_00_15
+.macro  W_PRECALC_00_15
+    W_PRECALC_00_15_AVX
+.endm
+.purgem W_PRECALC_16_31
+.macro  W_PRECALC_16_31
+    W_PRECALC_16_31_AVX
+.endm
+.purgem W_PRECALC_32_79
+.macro  W_PRECALC_32_79
+    W_PRECALC_32_79_AVX
+.endm
+
+.macro W_PRECALC_00_15_AVX
+  .if ((i & 3) == 0)
+	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
+  .elseif ((i & 3) == 1)
+	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
+  .elseif ((i & 3) == 2)
+	vpaddd	(K_BASE), W, W_TMP1
+  .elseif ((i & 3) == 3)
+	vmovdqa	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+.macro W_PRECALC_16_31_AVX
+  .if ((i & 3) == 0)
+	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
+	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
+	vpxor	W_minus_08, W, W
+	vpxor	W_minus_16, W_TMP1, W_TMP1
+  .elseif ((i & 3) == 1)
+	vpxor	W_TMP1, W, W
+	vpslldq	$12, W, W_TMP2
+	vpslld	$1, W, W_TMP1
+  .elseif ((i & 3) == 2)
+	vpsrld	$31, W, W
+	vpor	W, W_TMP1, W_TMP1
+	vpslld	$2, W_TMP2, W
+	vpsrld	$30, W_TMP2, W_TMP2
+  .elseif ((i & 3) == 3)
+	vpxor	W, W_TMP1, W_TMP1
+	vpxor	W_TMP2, W_TMP1, W
+	vpaddd	K_XMM(K_BASE), W, W_TMP1
+	vmovdqu	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+.macro W_PRECALC_32_79_AVX
+  .if ((i & 3) == 0)
+	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
+	vpxor	W_minus_28, W, W		# W is W_minus_32 before xor
+  .elseif ((i & 3) == 1)
+	vpxor	W_minus_16, W_TMP1, W_TMP1
+	vpxor	W_TMP1, W, W
+  .elseif ((i & 3) == 2)
+	vpslld	$2, W, W_TMP1
+	vpsrld	$30, W, W
+	vpor	W, W_TMP1, W
+  .elseif ((i & 3) == 3)
+	vpaddd	K_XMM(K_BASE), W, W_TMP1
+	vmovdqu	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+.endm    // W_PRECALC_AVX
+
+W_PRECALC_AVX
+.purgem xmm_mov
+.macro xmm_mov a, b
+	vmovdqu	\a,\b
+.endm
+
+
+/* AVX optimized implementation:
+ * void sha1_transform_avx(struct sha1_block_state *state,
+ *			   const u8 *data, size_t nblocks);
+ */
+SHA1_VECTOR_ASM     sha1_transform_avx
diff --git a/lib/crypto/x86/sha1.h b/lib/crypto/x86/sha1.h
new file mode 100644
index 000000000000..c48a0131fd12
--- /dev/null
+++ b/lib/crypto/x86/sha1.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-1 optimized for x86_64
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <linux/static_call.h>
+
+DEFINE_STATIC_CALL(sha1_blocks_x86, sha1_blocks_generic);
+
+#define DEFINE_X86_SHA1_FN(c_fn, asm_fn)                           \
+	asmlinkage void asm_fn(struct sha1_block_state *state,     \
+			       const u8 *data, size_t nblocks);    \
+	static void c_fn(struct sha1_block_state *state,           \
+			 const u8 *data, size_t nblocks)           \
+	{                                                          \
+		if (likely(irq_fpu_usable())) {                    \
+			kernel_fpu_begin();                        \
+			asm_fn(state, data, nblocks);              \
+			kernel_fpu_end();                          \
+		} else {                                           \
+			sha1_blocks_generic(state, data, nblocks); \
+		}                                                  \
+	}
+
+DEFINE_X86_SHA1_FN(sha1_blocks_ssse3, sha1_transform_ssse3);
+DEFINE_X86_SHA1_FN(sha1_blocks_avx, sha1_transform_avx);
+DEFINE_X86_SHA1_FN(sha1_blocks_ni, sha1_ni_transform);
+
+#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
+
+asmlinkage void sha1_transform_avx2(struct sha1_block_state *state,
+				    const u8 *data, size_t nblocks);
+static void sha1_blocks_avx2(struct sha1_block_state *state,
+			     const u8 *data, size_t nblocks)
+{
+	if (likely(irq_fpu_usable())) {
+		kernel_fpu_begin();
+		/* Select the optimal transform based on the number of blocks */
+		if (nblocks >= SHA1_AVX2_BLOCK_OPTSIZE)
+			sha1_transform_avx2(state, data, nblocks);
+		else
+			sha1_transform_avx(state, data, nblocks);
+		kernel_fpu_end();
+	} else {
+		sha1_blocks_generic(state, data, nblocks);
+	}
+}
+
+static void sha1_blocks(struct sha1_block_state *state,
+			const u8 *data, size_t nblocks)
+{
+	static_call(sha1_blocks_x86)(state, data, nblocks);
+}
+
+#define sha1_mod_init_arch sha1_mod_init_arch
+static void sha1_mod_init_arch(void)
+{
+	if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
+		static_call_update(sha1_blocks_x86, sha1_blocks_ni);
+	} else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+				     NULL) &&
+		   boot_cpu_has(X86_FEATURE_AVX)) {
+		if (boot_cpu_has(X86_FEATURE_AVX2) &&
+		    boot_cpu_has(X86_FEATURE_BMI1) &&
+		    boot_cpu_has(X86_FEATURE_BMI2))
+			static_call_update(sha1_blocks_x86, sha1_blocks_avx2);
+		else
+			static_call_update(sha1_blocks_x86, sha1_blocks_avx);
+	} else if (boot_cpu_has(X86_FEATURE_SSSE3)) {
+		static_call_update(sha1_blocks_x86, sha1_blocks_ssse3);
+	}
+}
diff --git a/lib/crypto/x86/sha256-avx-asm.S b/lib/crypto/x86/sha256-avx-asm.S
new file mode 100644
index 000000000000..c1aceb3ba3a3
--- /dev/null
+++ b/lib/crypto/x86/sha256-avx-asm.S
@@ -0,0 +1,493 @@
+########################################################################
+# Implement fast SHA-256 with AVX1 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 1 block at a time, with 4 lanes per block
+########################################################################
+
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define    VMOVDQ vmovdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+	add     \p1, \p2
+	mov     \p2, \p1
+.endm
+
+
+.macro MY_ROR p1 p2
+	shld    $(32-(\p1)), \p2, \p2
+.endm
+
+################################
+
+# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+# Load xmm with mem and byte swap each dword
+.macro COPY_XMM_AND_BSWAP p1 p2 p3
+	VMOVDQ \p2, \p1
+	vpshufb \p3, \p1, \p1
+.endm
+
+################################
+
+X0 = %xmm4
+X1 = %xmm5
+X2 = %xmm6
+X3 = %xmm7
+
+XTMP0 = %xmm0
+XTMP1 = %xmm1
+XTMP2 = %xmm2
+XTMP3 = %xmm3
+XTMP4 = %xmm8
+XFER = %xmm9
+XTMP5 = %xmm11
+
+SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
+SHUF_DC00 = %xmm12      # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %xmm13
+
+NUM_BLKS = %rdx   # 3rd arg
+INP = %rsi        # 2nd arg
+CTX = %rdi        # 1st arg
+
+SRND = %rsi       # clobbers INP
+c = %ecx
+d = %r8d
+e = %edx
+TBL = %r12
+a = %eax
+b = %ebx
+
+f = %r9d
+g = %r10d
+h = %r11d
+
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_XFER_SIZE = 16
+_XMM_SAVE_SIZE = 0
+
+_INP_END = 0
+_INP            = _INP_END  + _INP_END_SIZE
+_XFER           = _INP      + _INP_SIZE
+_XMM_SAVE       = _XFER     + _XFER_SIZE
+STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+X_ = X0
+X0 = X1
+X1 = X2
+X2 = X3
+X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+	## compute s0 four at a time and s1 two at a time
+	## compute W[-16] + W[-7] 4 at a time
+
+	mov     e, y0			# y0 = e
+	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+	mov     a, y1                   # y1 = a
+	vpalignr $4, X2, X3, XTMP0      # XTMP0 = W[-7]
+	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor     a, y1                   # y1 = a ^ (a >> (22-13)
+	xor     g, y2                   # y2 = f^g
+	vpaddd  X0, XTMP0, XTMP0        # XTMP0 = W[-7] + W[-16]
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	## compute s0
+	vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR  6, y0                   # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     y0, y2                  # y2 = S1 + CH
+	add     _XFER(%rsp), y2         # y2 = k + w + S1 + CH
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	vpsrld  $7, XTMP1, XTMP2
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	vpslld  $(32-7), XTMP1, XTMP3
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	vpor    XTMP2, XTMP3, XTMP3     # XTMP1 = W[-15] MY_ROR 7
+	or      y2, y0                  # y0 = MAJ = (a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+	mov     e, y0                   # y0 = e
+	mov     a, y1                   # y1 = a
+	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+	vpsrld  $18, XTMP1, XTMP2       #
+	xor     a, y1                   # y1 = a ^ (a >> (22-13)
+	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor     g, y2                   # y2 = f^g
+	vpsrld  $3, XTMP1, XTMP4        # XTMP4 = W[-15] >> 3
+	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	MY_ROR  6, y0                   # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+	vpslld  $(32-18), XTMP1, XTMP1
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	vpxor   XTMP1, XTMP3, XTMP3     #
+	add     y0, y2                  # y2 = S1 + CH
+	add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	vpxor   XTMP2, XTMP3, XTMP3     # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	## compute low s1
+	vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
+	or      y2, y0                  # y0 = MAJ = (a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+	mov     e, y0                   # y0 = e
+	mov     a, y1                   # y1 = a
+	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+	mov     f, y2                   # y2 = f
+	xor     a, y1                   # y1 = a ^ (a >> (22-13)
+	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	vpsrld  $10, XTMP2, XTMP4       # XTMP4 = W[-2] >> 10 {BBAA}
+	xor     g, y2                   # y2 = f^g
+	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xBxA}
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xBxA}
+	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	MY_ROR  6, y0                   # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+	vpxor   XTMP3, XTMP2, XTMP2     #
+	add     y0, y2                  # y2 = S1 + CH
+	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	## compute high s1
+	vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+	or      y2, y0                  # y0 = MAJ = (a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+	mov     e, y0                   # y0 = e
+	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+	mov     a, y1                   # y1 = a
+	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
+	xor     a, y1                   # y1 = a ^ (a >> (22-13)
+	xor     g, y2                   # y2 = f^g
+	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xDxC}
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xDxC}
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR  6, y0                   # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	vpxor   XTMP3, XTMP2, XTMP2
+	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     y0, y2                  # y2 = S1 + CH
+	add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	or      y2, y0                  # y0 = MAJ = (a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+	rotate_Xs
+.endm
+
+## input is [rsp + _XFER + %1 * 4]
+.macro DO_ROUND round
+	mov	e, y0			# y0 = e
+        MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+        mov     a, y1                   # y1 = a
+        xor     e, y0                   # y0 = e ^ (e >> (25-11))
+        MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+        mov     f, y2                   # y2 = f
+        xor     a, y1                   # y1 = a ^ (a >> (22-13)
+        MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+        xor     g, y2                   # y2 = f^g
+        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+        MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+        and     e, y2                   # y2 = (f^g)&e
+        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+        MY_ROR  6, y0                   # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+        add     y0, y2                  # y2 = S1 + CH
+        MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+        offset = \round * 4 + _XFER     #
+        add     offset(%rsp), y2	# y2 = k + w + S1 + CH
+        mov     a, y0			# y0 = a
+        add     y2, h                   # h = h + S1 + CH + k + w
+        mov     a, y2                   # y2 = a
+        or      c, y0                   # y0 = a|c
+        add     h, d                    # d = d + h + S1 + CH + k + w
+        and     c, y2                   # y2 = a&c
+        and     b, y0                   # y0 = (a|c)&b
+        add     y1, h                   # h = h + S1 + CH + k + w + S0
+        or      y2, y0                  # y0 = MAJ = (a|c)&b)|(a&c)
+        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+        ROTATE_ARGS
+.endm
+
+########################################################################
+## void sha256_transform_avx(struct sha256_block_state *state,
+##			     const u8 *data, size_t nblocks);
+########################################################################
+.text
+SYM_FUNC_START(sha256_transform_avx)
+	pushq   %rbx
+	pushq   %r12
+	pushq   %r13
+	pushq   %r14
+	pushq   %r15
+	pushq	%rbp
+	movq	%rsp, %rbp
+
+	subq    $STACK_SIZE, %rsp	# allocate stack space
+	and	$~15, %rsp		# align stack pointer
+
+	shl     $6, NUM_BLKS		# convert to bytes
+	add     INP, NUM_BLKS		# pointer to end of data
+	mov     NUM_BLKS, _INP_END(%rsp)
+
+	## load initial digest
+	mov     4*0(CTX), a
+	mov     4*1(CTX), b
+	mov     4*2(CTX), c
+	mov     4*3(CTX), d
+	mov     4*4(CTX), e
+	mov     4*5(CTX), f
+	mov     4*6(CTX), g
+	mov     4*7(CTX), h
+
+	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
+	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00
+.Lloop0:
+	lea     K256(%rip), TBL
+
+	## byte swap first 16 dwords
+	COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK
+
+	mov     INP, _INP(%rsp)
+
+	## schedule 48 input dwords, by doing 3 rounds of 16 each
+	mov     $3, SRND
+.align 16
+.Lloop1:
+	vpaddd  (TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddd  1*16(TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddd  2*16(TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddd  3*16(TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	add	$4*16, TBL
+	FOUR_ROUNDS_AND_SCHED
+
+	sub     $1, SRND
+	jne     .Lloop1
+
+	mov     $2, SRND
+.Lloop2:
+	vpaddd  (TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	DO_ROUND        0
+	DO_ROUND        1
+	DO_ROUND        2
+	DO_ROUND        3
+
+	vpaddd  1*16(TBL), X1, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	add     $2*16, TBL
+	DO_ROUND        0
+	DO_ROUND        1
+	DO_ROUND        2
+	DO_ROUND        3
+
+	vmovdqa X2, X0
+	vmovdqa X3, X1
+
+	sub     $1, SRND
+	jne     .Lloop2
+
+	addm    (4*0)(CTX),a
+	addm    (4*1)(CTX),b
+	addm    (4*2)(CTX),c
+	addm    (4*3)(CTX),d
+	addm    (4*4)(CTX),e
+	addm    (4*5)(CTX),f
+	addm    (4*6)(CTX),g
+	addm    (4*7)(CTX),h
+
+	mov     _INP(%rsp), INP
+	add     $64, INP
+	cmp     _INP_END(%rsp), INP
+	jne     .Lloop0
+
+	mov	%rbp, %rsp
+	popq	%rbp
+	popq    %r15
+	popq    %r14
+	popq    %r13
+	popq	%r12
+	popq    %rbx
+	RET
+SYM_FUNC_END(sha256_transform_avx)
+
+.section	.rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+K256:
+	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x0c0d0e0f08090a0b0405060700010203
+
+.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
+.align 16
+# shuffle xBxA -> 00BA
+_SHUF_00BA:
+	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
+.align 16
+# shuffle xDxC -> DC00
+_SHUF_DC00:
+	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/lib/crypto/x86/sha256-avx2-asm.S b/lib/crypto/x86/sha256-avx2-asm.S
new file mode 100644
index 000000000000..eb8836fb9695
--- /dev/null
+++ b/lib/crypto/x86/sha256-avx2-asm.S
@@ -0,0 +1,770 @@
+########################################################################
+# Implement fast SHA-256 with AVX2 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 2 blocks at a time, with 4 lanes per block
+########################################################################
+
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define	VMOVDQ vmovdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+	add	\p1, \p2
+	mov	\p2, \p1
+.endm
+
+################################
+
+X0 = %ymm4
+X1 = %ymm5
+X2 = %ymm6
+X3 = %ymm7
+
+# XMM versions of above
+XWORD0 = %xmm4
+XWORD1 = %xmm5
+XWORD2 = %xmm6
+XWORD3 = %xmm7
+
+XTMP0 = %ymm0
+XTMP1 = %ymm1
+XTMP2 = %ymm2
+XTMP3 = %ymm3
+XTMP4 = %ymm8
+XFER  = %ymm9
+XTMP5 = %ymm11
+
+SHUF_00BA =	%ymm10 # shuffle xBxA -> 00BA
+SHUF_DC00 =	%ymm12 # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %ymm13
+
+X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
+
+NUM_BLKS = %rdx	# 3rd arg
+INP	= %rsi  # 2nd arg
+CTX	= %rdi	# 1st arg
+c	= %ecx
+d	= %r8d
+e       = %edx	# clobbers NUM_BLKS
+y3	= %esi	# clobbers INP
+
+SRND	= CTX	# SRND is same register as CTX
+
+a = %eax
+b = %ebx
+f = %r9d
+g = %r10d
+h = %r11d
+old_h = %r11d
+
+T1 = %r12d
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
+_XMM_SAVE_SIZE	= 0
+_INP_END_SIZE	= 8
+_INP_SIZE	= 8
+_CTX_SIZE	= 8
+
+_XFER		= 0
+_XMM_SAVE	= _XFER     + _XFER_SIZE
+_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
+_INP		= _INP_END  + _INP_END_SIZE
+_CTX		= _INP      + _INP_SIZE
+STACK_SIZE	= _CTX      + _CTX_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+	X_ = X0
+	X0 = X1
+	X1 = X2
+	X2 = X3
+	X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+	old_h = h
+	TMP_ = h
+	h = g
+	g = f
+	f = e
+	e = d
+	d = c
+	c = b
+	b = a
+	a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED disp
+################################### RND N + 0 ############################
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+
+	addl	\disp(%rsp, SRND), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+	vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	xor	g, y2		# y2 = f^g                              # CH
+	vpaddd	X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	add	h, d		# d = k + w + h + d                     # --
+
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	vpsrld	$7, XTMP1, XTMP2
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+
+	add	y0, y2		# y2 = S1 + CH                          # --
+	vpslld	$(32-7), XTMP1, XTMP3
+	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7
+
+	vpsrld	$18, XTMP1, XTMP2
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+
+	ROTATE_ARGS
+
+################################### RND N + 1 ############################
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	offset = \disp + 1*4
+	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+
+	vpsrld	$3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	xor	g, y2		# y2 = f^g                              # CH
+
+
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	h, d		# d = k + w + h + d                     # --
+
+	vpslld	$(32-18), XTMP1, XTMP1
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+
+	vpxor	XTMP1, XTMP3, XTMP3
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+
+	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
+	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
+	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	vpsrld	$10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
+
+
+	ROTATE_ARGS
+
+################################### RND N + 2 ############################
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	offset = \disp + 2*4
+	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
+
+	vpsrlq	$19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	or	c, y3		# y3 = a|c                              # MAJA
+	mov	f, y2		# y2 = f                                # CH
+	xor	g, y2		# y2 = f^g                              # CH
+
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
+	and	e, y2		# y2 = (f^g)&e                          # CH
+
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	vpxor	XTMP3, XTMP2, XTMP2
+	add	h, d		# d = k + w + h + d                     # --
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+
+	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	rorx	$2, a ,T1	# T1 = (a >> 2)				# S0
+	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}
+
+	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
+	add	y1,h		# h = k + w + h + S0                    # --
+	add	y2,d		# d = k + w + h + d + S1 + CH = d + t1  # --
+	add	y2,h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+	add	y3,h		# h = t1 + S0 + MAJ                     # --
+
+
+	ROTATE_ARGS
+
+################################### RND N + 3 ############################
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	offset = \disp + 3*4
+	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+
+	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	xor	g, y2		# y2 = f^g                              # CH
+
+
+	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	h, d		# d = k + w + h + d                     # --
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+
+	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+
+	vpxor	XTMP3, XTMP2, XTMP2
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}
+
+	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
+
+	add	y1, h		# h = k + w + h + S0                    # --
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	ROTATE_ARGS
+	rotate_Xs
+.endm
+
+.macro DO_4ROUNDS disp
+################################### RND N + 0 ###########################
+
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	addl	\disp(%rsp, SRND), h		# h = k + w + h # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	ROTATE_ARGS
+
+################################### RND N + 1 ###########################
+
+	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	offset = 4*1 + \disp
+	addl	offset(%rsp, SRND), h		# h = k + w + h # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	ROTATE_ARGS
+
+################################### RND N + 2 ##############################
+
+	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	offset = 4*2 + \disp
+	addl	offset(%rsp, SRND), h		# h = k + w + h # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	ROTATE_ARGS
+
+################################### RND N + 3 ###########################
+
+	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	offset = 4*3 + \disp
+	addl	offset(%rsp, SRND), h		# h = k + w + h # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	ROTATE_ARGS
+
+.endm
+
+########################################################################
+## void sha256_transform_rorx(struct sha256_block_state *state,
+##			      const u8 *data, size_t nblocks);
+########################################################################
+.text
+SYM_FUNC_START(sha256_transform_rorx)
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+	push	%rbp
+	mov	%rsp, %rbp
+
+	subq	$STACK_SIZE, %rsp
+	and	$-32, %rsp	# align rsp to 32 byte boundary
+
+	shl	$6, NUM_BLKS	# convert to bytes
+	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
+	mov	NUM_BLKS, _INP_END(%rsp)
+
+	cmp	NUM_BLKS, INP
+	je	.Lonly_one_block
+
+	## load initial digest
+	mov	(CTX), a
+	mov	4*1(CTX), b
+	mov	4*2(CTX), c
+	mov	4*3(CTX), d
+	mov	4*4(CTX), e
+	mov	4*5(CTX), f
+	mov	4*6(CTX), g
+	mov	4*7(CTX), h
+
+	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
+	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00
+
+	mov	CTX, _CTX(%rsp)
+
+.Lloop0:
+	## Load first 16 dwords from two blocks
+	VMOVDQ	0*32(INP),XTMP0
+	VMOVDQ	1*32(INP),XTMP1
+	VMOVDQ	2*32(INP),XTMP2
+	VMOVDQ	3*32(INP),XTMP3
+
+	## byte swap data
+	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
+	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
+	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
+	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3
+
+	## transpose data into high/low halves
+	vperm2i128	$0x20, XTMP2, XTMP0, X0
+	vperm2i128	$0x31, XTMP2, XTMP0, X1
+	vperm2i128	$0x20, XTMP3, XTMP1, X2
+	vperm2i128	$0x31, XTMP3, XTMP1, X3
+
+.Llast_block_enter:
+	add	$64, INP
+	mov	INP, _INP(%rsp)
+
+	## schedule 48 input dwords, by doing 3 rounds of 12 each
+	xor	SRND, SRND
+
+.align 16
+.Lloop1:
+	leaq	K256+0*32(%rip), INP		## reuse INP as scratch reg
+	vpaddd	(INP, SRND), X0, XFER
+	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
+	FOUR_ROUNDS_AND_SCHED	(_XFER + 0*32)
+
+	leaq	K256+1*32(%rip), INP
+	vpaddd	(INP, SRND), X0, XFER
+	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
+	FOUR_ROUNDS_AND_SCHED	(_XFER + 1*32)
+
+	leaq	K256+2*32(%rip), INP
+	vpaddd	(INP, SRND), X0, XFER
+	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
+	FOUR_ROUNDS_AND_SCHED	(_XFER + 2*32)
+
+	leaq	K256+3*32(%rip), INP
+	vpaddd	(INP, SRND), X0, XFER
+	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
+	FOUR_ROUNDS_AND_SCHED	(_XFER + 3*32)
+
+	add	$4*32, SRND
+	cmp	$3*4*32, SRND
+	jb	.Lloop1
+
+.Lloop2:
+	## Do last 16 rounds with no scheduling
+	leaq	K256+0*32(%rip), INP
+	vpaddd	(INP, SRND), X0, XFER
+	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
+	DO_4ROUNDS	(_XFER + 0*32)
+
+	leaq	K256+1*32(%rip), INP
+	vpaddd	(INP, SRND), X1, XFER
+	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
+	DO_4ROUNDS	(_XFER + 1*32)
+	add	$2*32, SRND
+
+	vmovdqa	X2, X0
+	vmovdqa	X3, X1
+
+	cmp	$4*4*32, SRND
+	jb	.Lloop2
+
+	mov	_CTX(%rsp), CTX
+	mov	_INP(%rsp), INP
+
+	addm    (4*0)(CTX),a
+	addm    (4*1)(CTX),b
+	addm    (4*2)(CTX),c
+	addm    (4*3)(CTX),d
+	addm    (4*4)(CTX),e
+	addm    (4*5)(CTX),f
+	addm    (4*6)(CTX),g
+	addm    (4*7)(CTX),h
+
+	cmp	_INP_END(%rsp), INP
+	ja	.Ldone_hash
+
+	#### Do second block using previously scheduled results
+	xor	SRND, SRND
+.align 16
+.Lloop3:
+	DO_4ROUNDS	(_XFER + 0*32 + 16)
+	DO_4ROUNDS	(_XFER + 1*32 + 16)
+	add	$2*32, SRND
+	cmp	$4*4*32, SRND
+	jb	.Lloop3
+
+	mov	_CTX(%rsp), CTX
+	mov	_INP(%rsp), INP
+	add	$64, INP
+
+	addm    (4*0)(CTX),a
+	addm    (4*1)(CTX),b
+	addm    (4*2)(CTX),c
+	addm    (4*3)(CTX),d
+	addm    (4*4)(CTX),e
+	addm    (4*5)(CTX),f
+	addm    (4*6)(CTX),g
+	addm    (4*7)(CTX),h
+
+	cmp	_INP_END(%rsp), INP
+	jb	.Lloop0
+	ja	.Ldone_hash
+
+.Ldo_last_block:
+	VMOVDQ	0*16(INP),XWORD0
+	VMOVDQ	1*16(INP),XWORD1
+	VMOVDQ	2*16(INP),XWORD2
+	VMOVDQ	3*16(INP),XWORD3
+
+	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
+	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
+	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
+	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3
+
+	jmp	.Llast_block_enter
+
+.Lonly_one_block:
+
+	## load initial digest
+	mov	(4*0)(CTX),a
+	mov	(4*1)(CTX),b
+	mov	(4*2)(CTX),c
+	mov	(4*3)(CTX),d
+	mov	(4*4)(CTX),e
+	mov	(4*5)(CTX),f
+	mov	(4*6)(CTX),g
+	mov	(4*7)(CTX),h
+
+	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
+	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00
+
+	mov	CTX, _CTX(%rsp)
+	jmp	.Ldo_last_block
+
+.Ldone_hash:
+
+	mov	%rbp, %rsp
+	pop	%rbp
+
+	popq	%r15
+	popq	%r14
+	popq	%r13
+	popq	%r12
+	popq	%rbx
+	vzeroupper
+	RET
+SYM_FUNC_END(sha256_transform_rorx)
+
+.section	.rodata.cst512.K256, "aM", @progbits, 512
+.align 64
+K256:
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
+
+# shuffle xBxA -> 00BA
+.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
+.align 32
+_SHUF_00BA:
+	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+# shuffle xDxC -> DC00
+.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
+.align 32
+_SHUF_DC00:
+	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/lib/crypto/x86/sha256-ni-asm.S b/lib/crypto/x86/sha256-ni-asm.S
new file mode 100644
index 000000000000..de5f707e7ef7
--- /dev/null
+++ b/lib/crypto/x86/sha256-ni-asm.S
@@ -0,0 +1,559 @@
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 update function
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * 	Sean Gulley <sean.m.gulley@intel.com>
+ * 	Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 	* Redistributions of source code must retain the above copyright
+ * 	  notice, this list of conditions and the following disclaimer.
+ * 	* Redistributions in binary form must reproduce the above copyright
+ * 	  notice, this list of conditions and the following disclaimer in
+ * 	  the documentation and/or other materials provided with the
+ * 	  distribution.
+ * 	* Neither the name of Intel Corporation nor the names of its
+ * 	  contributors may be used to endorse or promote products derived
+ * 	  from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define STATE_PTR	%rdi	/* 1st arg */
+#define DATA_PTR	%rsi	/* 2nd arg */
+#define NUM_BLKS	%rdx	/* 3rd arg */
+
+#define SHA256CONSTANTS	%rax
+
+#define MSG		%xmm0  /* sha256rnds2 implicit operand */
+#define STATE0		%xmm1
+#define STATE1		%xmm2
+#define MSG0		%xmm3
+#define MSG1		%xmm4
+#define MSG2		%xmm5
+#define MSG3		%xmm6
+#define TMP		%xmm7
+
+#define SHUF_MASK	%xmm8
+
+#define ABEF_SAVE	%xmm9
+#define CDGH_SAVE	%xmm10
+
+.macro do_4rounds	i, m0, m1, m2, m3
+.if \i < 16
+	movdqu		\i*4(DATA_PTR), \m0
+	pshufb		SHUF_MASK, \m0
+.endif
+	movdqa		(\i-32)*4(SHA256CONSTANTS), MSG
+	paddd		\m0, MSG
+	sha256rnds2	STATE0, STATE1
+.if \i >= 12 && \i < 60
+	movdqa		\m0, TMP
+	palignr		$4, \m3, TMP
+	paddd		TMP, \m1
+	sha256msg2	\m0, \m1
+.endif
+	punpckhqdq	MSG, MSG
+	sha256rnds2	STATE1, STATE0
+.if \i >= 4 && \i < 52
+	sha256msg1	\m0, \m3
+.endif
+.endm
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 block function
+ *
+ * This function takes a pointer to the current SHA-256 state, a pointer to the
+ * input data, and the number of 64-byte blocks to process.  Once all blocks
+ * have been processed, the state is updated with the new state.  This function
+ * only processes complete blocks.  State initialization, buffering of partial
+ * blocks, and digest finalization is expected to be handled elsewhere.
+ *
+ * void sha256_ni_transform(struct sha256_block_state *state,
+ *			    const u8 *data, size_t nblocks);
+ */
+.text
+SYM_FUNC_START(sha256_ni_transform)
+
+	shl		$6, NUM_BLKS		/*  convert to bytes */
+	add		DATA_PTR, NUM_BLKS	/* pointer to end of data */
+
+	/*
+	 * load initial hash values
+	 * Need to reorder these appropriately
+	 * DCBA, HGFE -> ABEF, CDGH
+	 */
+	movdqu		0*16(STATE_PTR), STATE0		/* DCBA */
+	movdqu		1*16(STATE_PTR), STATE1		/* HGFE */
+
+	movdqa		STATE0, TMP
+	punpcklqdq	STATE1, STATE0			/* FEBA */
+	punpckhqdq	TMP, STATE1			/* DCHG */
+	pshufd		$0x1B, STATE0, STATE0		/* ABEF */
+	pshufd		$0xB1, STATE1, STATE1		/* CDGH */
+
+	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+	lea		K256+32*4(%rip), SHA256CONSTANTS
+
+.Lloop0:
+	/* Save hash values for addition after rounds */
+	movdqa		STATE0, ABEF_SAVE
+	movdqa		STATE1, CDGH_SAVE
+
+.irp i, 0, 16, 32, 48
+	do_4rounds	(\i + 0),  MSG0, MSG1, MSG2, MSG3
+	do_4rounds	(\i + 4),  MSG1, MSG2, MSG3, MSG0
+	do_4rounds	(\i + 8),  MSG2, MSG3, MSG0, MSG1
+	do_4rounds	(\i + 12), MSG3, MSG0, MSG1, MSG2
+.endr
+
+	/* Add current hash values with previously saved */
+	paddd		ABEF_SAVE, STATE0
+	paddd		CDGH_SAVE, STATE1
+
+	/* Increment data pointer and loop if more to process */
+	add		$64, DATA_PTR
+	cmp		NUM_BLKS, DATA_PTR
+	jne		.Lloop0
+
+	/* Write hash values back in the correct order */
+	movdqa		STATE0, TMP
+	punpcklqdq	STATE1, STATE0			/* GHEF */
+	punpckhqdq	TMP, STATE1			/* ABCD */
+	pshufd		$0xB1, STATE0, STATE0		/* HGFE */
+	pshufd		$0x1B, STATE1, STATE1		/* DCBA */
+
+	movdqu		STATE1, 0*16(STATE_PTR)
+	movdqu		STATE0, 1*16(STATE_PTR)
+
+	RET
+SYM_FUNC_END(sha256_ni_transform)
+
+#undef DIGEST_PTR
+#undef DATA_PTR
+#undef NUM_BLKS
+#undef SHA256CONSTANTS
+#undef MSG
+#undef STATE0
+#undef STATE1
+#undef MSG0
+#undef MSG1
+#undef MSG2
+#undef MSG3
+#undef TMP
+#undef SHUF_MASK
+#undef ABEF_SAVE
+#undef CDGH_SAVE
+
+// parameters for sha256_ni_finup2x()
+#define CTX		%rdi
+#define DATA1		%rsi
+#define DATA2		%rdx
+#define LEN		%ecx
+#define LEN8		%cl
+#define LEN64		%rcx
+#define OUT1		%r8
+#define OUT2		%r9
+
+// other scalar variables
+#define SHA256CONSTANTS	%rax
+#define COUNT		%r10
+#define COUNT32		%r10d
+#define FINAL_STEP	%r11d
+
+// rbx is used as a temporary.
+
+#define MSG		%xmm0	// sha256rnds2 implicit operand
+#define STATE0_A	%xmm1
+#define STATE1_A	%xmm2
+#define STATE0_B	%xmm3
+#define STATE1_B	%xmm4
+#define TMP_A		%xmm5
+#define TMP_B		%xmm6
+#define MSG0_A		%xmm7
+#define MSG1_A		%xmm8
+#define MSG2_A		%xmm9
+#define MSG3_A		%xmm10
+#define MSG0_B		%xmm11
+#define MSG1_B		%xmm12
+#define MSG2_B		%xmm13
+#define MSG3_B		%xmm14
+#define SHUF_MASK	%xmm15
+
+#define OFFSETOF_STATE		0  // offsetof(struct __sha256_ctx, state)
+#define OFFSETOF_BYTECOUNT	32 // offsetof(struct __sha256_ctx, bytecount)
+#define OFFSETOF_BUF		40 // offsetof(struct __sha256_ctx, buf)
+
+// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a and m0_b
+// contain the current 4 message schedule words for the first and second message
+// respectively.
+//
+// If not all the message schedule words have been computed yet, then this also
+// computes 4 more message schedule words for each message.  m1_a-m3_a contain
+// the next 3 groups of 4 message schedule words for the first message, and
+// likewise m1_b-m3_b for the second.  After consuming the current value of
+// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
+// likewise for *_b.  This means that the next (m0_a, m1_a, m2_a, m3_a) is the
+// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
+// cycle through the registers accordingly.
+.macro	do_4rounds_2x	i, m0_a, m1_a, m2_a, m3_a,  m0_b, m1_b, m2_b, m3_b
+	movdqa		(\i-32)*4(SHA256CONSTANTS), TMP_A
+	movdqa		TMP_A, TMP_B
+	paddd		\m0_a, TMP_A
+	paddd		\m0_b, TMP_B
+.if \i < 48
+	sha256msg1	\m1_a, \m0_a
+	sha256msg1	\m1_b, \m0_b
+.endif
+	movdqa		TMP_A, MSG
+	sha256rnds2	STATE0_A, STATE1_A
+	movdqa		TMP_B, MSG
+	sha256rnds2	STATE0_B, STATE1_B
+	pshufd 		$0x0E, TMP_A, MSG
+	sha256rnds2	STATE1_A, STATE0_A
+	pshufd 		$0x0E, TMP_B, MSG
+	sha256rnds2	STATE1_B, STATE0_B
+.if \i < 48
+	movdqa		\m3_a, TMP_A
+	movdqa		\m3_b, TMP_B
+	palignr		$4, \m2_a, TMP_A
+	palignr		$4, \m2_b, TMP_B
+	paddd		TMP_A, \m0_a
+	paddd		TMP_B, \m0_b
+	sha256msg2	\m3_a, \m0_a
+	sha256msg2	\m3_b, \m0_b
+.endif
+.endm
+
+//
+// void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
+//			  const u8 *data1, const u8 *data2, int len,
+//			  u8 out1[SHA256_DIGEST_SIZE],
+//			  u8 out2[SHA256_DIGEST_SIZE]);
+//
+// This function computes the SHA-256 digests of two messages |data1| and
+// |data2| that are both |len| bytes long, starting from the initial context
+// |ctx|.  |len| must be at least SHA256_BLOCK_SIZE.
+//
+// The instructions for the two SHA-256 operations are interleaved.  On many
+// CPUs, this is almost twice as fast as hashing each message individually due
+// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
+//
+SYM_FUNC_START(sha256_ni_finup2x)
+	// Allocate 128 bytes of stack space, 16-byte aligned.
+	push		%rbx
+	push		%rbp
+	mov		%rsp, %rbp
+	sub		$128, %rsp
+	and		$~15, %rsp
+
+	// Load the shuffle mask for swapping the endianness of 32-bit words.
+	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+
+	// Set up pointer to the round constants.
+	lea		K256+32*4(%rip), SHA256CONSTANTS
+
+	// Initially we're not processing the final blocks.
+	xor		FINAL_STEP, FINAL_STEP
+
+	// Load the initial state from ctx->state.
+	movdqu		OFFSETOF_STATE+0*16(CTX), STATE0_A	// DCBA
+	movdqu		OFFSETOF_STATE+1*16(CTX), STATE1_A	// HGFE
+	movdqa		STATE0_A, TMP_A
+	punpcklqdq	STATE1_A, STATE0_A			// FEBA
+	punpckhqdq	TMP_A, STATE1_A				// DCHG
+	pshufd		$0x1B, STATE0_A, STATE0_A		// ABEF
+	pshufd		$0xB1, STATE1_A, STATE1_A		// CDGH
+
+	// Load ctx->bytecount.  Take the mod 64 of it to get the number of
+	// bytes that are buffered in ctx->buf.  Also save it in a register with
+	// LEN added to it.
+	mov		LEN, LEN
+	mov		OFFSETOF_BYTECOUNT(CTX), %rbx
+	lea		(%rbx, LEN64, 1), COUNT
+	and		$63, %ebx
+	jz		.Lfinup2x_enter_loop	// No bytes buffered?
+
+	// %ebx bytes (1 to 63) are currently buffered in ctx->buf.  Load them
+	// followed by the first 64 - %ebx bytes of data.  Since LEN >= 64, we
+	// just load 64 bytes from each of ctx->buf, DATA1, and DATA2
+	// unconditionally and rearrange the data as needed.
+
+	movdqu		OFFSETOF_BUF+0*16(CTX), MSG0_A
+	movdqu		OFFSETOF_BUF+1*16(CTX), MSG1_A
+	movdqu		OFFSETOF_BUF+2*16(CTX), MSG2_A
+	movdqu		OFFSETOF_BUF+3*16(CTX), MSG3_A
+	movdqa		MSG0_A, 0*16(%rsp)
+	movdqa		MSG1_A, 1*16(%rsp)
+	movdqa		MSG2_A, 2*16(%rsp)
+	movdqa		MSG3_A, 3*16(%rsp)
+
+	movdqu		0*16(DATA1), MSG0_A
+	movdqu		1*16(DATA1), MSG1_A
+	movdqu		2*16(DATA1), MSG2_A
+	movdqu		3*16(DATA1), MSG3_A
+	movdqu		MSG0_A, 0*16(%rsp,%rbx)
+	movdqu		MSG1_A, 1*16(%rsp,%rbx)
+	movdqu		MSG2_A, 2*16(%rsp,%rbx)
+	movdqu		MSG3_A, 3*16(%rsp,%rbx)
+	movdqa		0*16(%rsp), MSG0_A
+	movdqa		1*16(%rsp), MSG1_A
+	movdqa		2*16(%rsp), MSG2_A
+	movdqa		3*16(%rsp), MSG3_A
+
+	movdqu		0*16(DATA2), MSG0_B
+	movdqu		1*16(DATA2), MSG1_B
+	movdqu		2*16(DATA2), MSG2_B
+	movdqu		3*16(DATA2), MSG3_B
+	movdqu		MSG0_B, 0*16(%rsp,%rbx)
+	movdqu		MSG1_B, 1*16(%rsp,%rbx)
+	movdqu		MSG2_B, 2*16(%rsp,%rbx)
+	movdqu		MSG3_B, 3*16(%rsp,%rbx)
+	movdqa		0*16(%rsp), MSG0_B
+	movdqa		1*16(%rsp), MSG1_B
+	movdqa		2*16(%rsp), MSG2_B
+	movdqa		3*16(%rsp), MSG3_B
+
+	sub		$64, %rbx 	// rbx = buffered - 64
+	sub		%rbx, DATA1	// DATA1 += 64 - buffered
+	sub		%rbx, DATA2	// DATA2 += 64 - buffered
+	add		%ebx, LEN	// LEN += buffered - 64
+	movdqa		STATE0_A, STATE0_B
+	movdqa		STATE1_A, STATE1_B
+	jmp		.Lfinup2x_loop_have_data
+
+.Lfinup2x_enter_loop:
+	sub		$64, LEN
+	movdqa		STATE0_A, STATE0_B
+	movdqa		STATE1_A, STATE1_B
+.Lfinup2x_loop:
+	// Load the next two data blocks.
+	movdqu		0*16(DATA1), MSG0_A
+	movdqu		0*16(DATA2), MSG0_B
+	movdqu		1*16(DATA1), MSG1_A
+	movdqu		1*16(DATA2), MSG1_B
+	movdqu		2*16(DATA1), MSG2_A
+	movdqu		2*16(DATA2), MSG2_B
+	movdqu		3*16(DATA1), MSG3_A
+	movdqu		3*16(DATA2), MSG3_B
+	add		$64, DATA1
+	add		$64, DATA2
+.Lfinup2x_loop_have_data:
+	// Convert the words of the data blocks from big endian.
+	pshufb		SHUF_MASK, MSG0_A
+	pshufb		SHUF_MASK, MSG0_B
+	pshufb		SHUF_MASK, MSG1_A
+	pshufb		SHUF_MASK, MSG1_B
+	pshufb		SHUF_MASK, MSG2_A
+	pshufb		SHUF_MASK, MSG2_B
+	pshufb		SHUF_MASK, MSG3_A
+	pshufb		SHUF_MASK, MSG3_B
+.Lfinup2x_loop_have_bswapped_data:
+
+	// Save the original state for each block.
+	movdqa		STATE0_A, 0*16(%rsp)
+	movdqa		STATE0_B, 1*16(%rsp)
+	movdqa		STATE1_A, 2*16(%rsp)
+	movdqa		STATE1_B, 3*16(%rsp)
+
+	// Do the SHA-256 rounds on each block.
+.irp i, 0, 16, 32, 48
+	do_4rounds_2x	(\i + 0),  MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
+				   MSG0_B, MSG1_B, MSG2_B, MSG3_B
+	do_4rounds_2x	(\i + 4),  MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
+				   MSG1_B, MSG2_B, MSG3_B, MSG0_B
+	do_4rounds_2x	(\i + 8),  MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
+				   MSG2_B, MSG3_B, MSG0_B, MSG1_B
+	do_4rounds_2x	(\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
+				   MSG3_B, MSG0_B, MSG1_B, MSG2_B
+.endr
+
+	// Add the original state for each block.
+	paddd		0*16(%rsp), STATE0_A
+	paddd		1*16(%rsp), STATE0_B
+	paddd		2*16(%rsp), STATE1_A
+	paddd		3*16(%rsp), STATE1_B
+
+	// Update LEN and loop back if more blocks remain.
+	sub		$64, LEN
+	jge		.Lfinup2x_loop
+
+	// Check if any final blocks need to be handled.
+	// FINAL_STEP = 2: all done
+	// FINAL_STEP = 1: need to do count-only padding block
+	// FINAL_STEP = 0: need to do the block with 0x80 padding byte
+	cmp		$1, FINAL_STEP
+	jg		.Lfinup2x_done
+	je		.Lfinup2x_finalize_countonly
+	add		$64, LEN
+	jz		.Lfinup2x_finalize_blockaligned
+
+	// Not block-aligned; 1 <= LEN <= 63 data bytes remain.  Pad the block.
+	// To do this, write the padding starting with the 0x80 byte to
+	// &sp[64].  Then for each message, copy the last 64 data bytes to sp
+	// and load from &sp[64 - LEN] to get the needed padding block.  This
+	// code relies on the data buffers being >= 64 bytes in length.
+	mov		$64, %ebx
+	sub		LEN, %ebx		// ebx = 64 - LEN
+	sub		%rbx, DATA1		// DATA1 -= 64 - LEN
+	sub		%rbx, DATA2		// DATA2 -= 64 - LEN
+	mov		$0x80, FINAL_STEP   // using FINAL_STEP as a temporary
+	movd		FINAL_STEP, MSG0_A
+	pxor		MSG1_A, MSG1_A
+	movdqa		MSG0_A, 4*16(%rsp)
+	movdqa		MSG1_A, 5*16(%rsp)
+	movdqa		MSG1_A, 6*16(%rsp)
+	movdqa		MSG1_A, 7*16(%rsp)
+	cmp		$56, LEN
+	jge		1f	// will COUNT spill into its own block?
+	shl		$3, COUNT
+	bswap		COUNT
+	mov		COUNT, 56(%rsp,%rbx)
+	mov		$2, FINAL_STEP	// won't need count-only block
+	jmp		2f
+1:
+	mov		$1, FINAL_STEP	// will need count-only block
+2:
+	movdqu		0*16(DATA1), MSG0_A
+	movdqu		1*16(DATA1), MSG1_A
+	movdqu		2*16(DATA1), MSG2_A
+	movdqu		3*16(DATA1), MSG3_A
+	movdqa		MSG0_A, 0*16(%rsp)
+	movdqa		MSG1_A, 1*16(%rsp)
+	movdqa		MSG2_A, 2*16(%rsp)
+	movdqa		MSG3_A, 3*16(%rsp)
+	movdqu		0*16(%rsp,%rbx), MSG0_A
+	movdqu		1*16(%rsp,%rbx), MSG1_A
+	movdqu		2*16(%rsp,%rbx), MSG2_A
+	movdqu		3*16(%rsp,%rbx), MSG3_A
+
+	movdqu		0*16(DATA2), MSG0_B
+	movdqu		1*16(DATA2), MSG1_B
+	movdqu		2*16(DATA2), MSG2_B
+	movdqu		3*16(DATA2), MSG3_B
+	movdqa		MSG0_B, 0*16(%rsp)
+	movdqa		MSG1_B, 1*16(%rsp)
+	movdqa		MSG2_B, 2*16(%rsp)
+	movdqa		MSG3_B, 3*16(%rsp)
+	movdqu		0*16(%rsp,%rbx), MSG0_B
+	movdqu		1*16(%rsp,%rbx), MSG1_B
+	movdqu		2*16(%rsp,%rbx), MSG2_B
+	movdqu		3*16(%rsp,%rbx), MSG3_B
+	jmp		.Lfinup2x_loop_have_data
+
+	// Prepare a padding block, either:
+	//
+	//	{0x80, 0, 0, 0, ..., count (as __be64)}
+	//	This is for a block aligned message.
+	//
+	//	{   0, 0, 0, 0, ..., count (as __be64)}
+	//	This is for a message whose length mod 64 is >= 56.
+	//
+	// Pre-swap the endianness of the words.
+.Lfinup2x_finalize_countonly:
+	pxor		MSG0_A, MSG0_A
+	jmp		1f
+
+.Lfinup2x_finalize_blockaligned:
+	mov		$0x80000000, %ebx
+	movd		%ebx, MSG0_A
+1:
+	pxor		MSG1_A, MSG1_A
+	pxor		MSG2_A, MSG2_A
+	ror		$29, COUNT
+	movq		COUNT, MSG3_A
+	pslldq		$8, MSG3_A
+	movdqa		MSG0_A, MSG0_B
+	pxor		MSG1_B, MSG1_B
+	pxor		MSG2_B, MSG2_B
+	movdqa		MSG3_A, MSG3_B
+	mov		$2, FINAL_STEP
+	jmp		.Lfinup2x_loop_have_bswapped_data
+
+.Lfinup2x_done:
+	// Write the two digests with all bytes in the correct order.
+	movdqa		STATE0_A, TMP_A
+	movdqa		STATE0_B, TMP_B
+	punpcklqdq	STATE1_A, STATE0_A		// GHEF
+	punpcklqdq	STATE1_B, STATE0_B
+	punpckhqdq	TMP_A, STATE1_A			// ABCD
+	punpckhqdq	TMP_B, STATE1_B
+	pshufd		$0xB1, STATE0_A, STATE0_A	// HGFE
+	pshufd		$0xB1, STATE0_B, STATE0_B
+	pshufd		$0x1B, STATE1_A, STATE1_A	// DCBA
+	pshufd		$0x1B, STATE1_B, STATE1_B
+	pshufb		SHUF_MASK, STATE0_A
+	pshufb		SHUF_MASK, STATE0_B
+	pshufb		SHUF_MASK, STATE1_A
+	pshufb		SHUF_MASK, STATE1_B
+	movdqu		STATE0_A, 1*16(OUT1)
+	movdqu		STATE0_B, 1*16(OUT2)
+	movdqu		STATE1_A, 0*16(OUT1)
+	movdqu		STATE1_B, 0*16(OUT2)
+
+	mov		%rbp, %rsp
+	pop		%rbp
+	pop		%rbx
+	RET
+SYM_FUNC_END(sha256_ni_finup2x)
+
+.section	.rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+K256:
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x0c0d0e0f08090a0b0405060700010203
diff --git a/lib/crypto/x86/sha256-ssse3-asm.S b/lib/crypto/x86/sha256-ssse3-asm.S
new file mode 100644
index 000000000000..383b8eec7ebe
--- /dev/null
+++ b/lib/crypto/x86/sha256-ssse3-asm.S
@@ -0,0 +1,505 @@
+########################################################################
+# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define    MOVDQ movdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+        add     \p1, \p2
+        mov     \p2, \p1
+.endm
+
+################################
+
+# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+# Load xmm with mem and byte swap each dword
+.macro COPY_XMM_AND_BSWAP p1 p2 p3
+        MOVDQ \p2, \p1
+        pshufb \p3, \p1
+.endm
+
+################################
+
+X0 = %xmm4
+X1 = %xmm5
+X2 = %xmm6
+X3 = %xmm7
+
+XTMP0 = %xmm0
+XTMP1 = %xmm1
+XTMP2 = %xmm2
+XTMP3 = %xmm3
+XTMP4 = %xmm8
+XFER = %xmm9
+
+SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
+SHUF_DC00 = %xmm11      # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %xmm12
+
+NUM_BLKS = %rdx   # 3rd arg
+INP = %rsi        # 2nd arg
+CTX = %rdi        # 1st arg
+
+SRND = %rsi       # clobbers INP
+c = %ecx
+d = %r8d
+e = %edx
+TBL = %r12
+a = %eax
+b = %ebx
+
+f = %r9d
+g = %r10d
+h = %r11d
+
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_XFER_SIZE = 16
+_XMM_SAVE_SIZE = 0
+
+_INP_END = 0
+_INP            = _INP_END  + _INP_END_SIZE
+_XFER           = _INP      + _INP_SIZE
+_XMM_SAVE       = _XFER     + _XFER_SIZE
+STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+X_ = X0
+X0 = X1
+X1 = X2
+X2 = X3
+X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+	## compute s0 four at a time and s1 two at a time
+	## compute W[-16] + W[-7] 4 at a time
+	movdqa  X3, XTMP0
+	mov     e, y0			# y0 = e
+	ror     $(25-11), y0            # y0 = e >> (25-11)
+	mov     a, y1                   # y1 = a
+	palignr $4, X2, XTMP0           # XTMP0 = W[-7]
+	ror     $(22-13), y1            # y1 = a >> (22-13)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	movdqa  X1, XTMP1
+	xor     a, y1                   # y1 = a ^ (a >> (22-13)
+	xor     g, y2                   # y2 = f^g
+	paddd   X0, XTMP0               # XTMP0 = W[-7] + W[-16]
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	## compute s0
+	palignr $4, X0, XTMP1           # XTMP1 = W[-15]
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror     $6, y0                  # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	movdqa  XTMP1, XTMP2            # XTMP2 = W[-15]
+	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     y0, y2                  # y2 = S1 + CH
+	add     _XFER(%rsp) , y2        # y2 = k + w + S1 + CH
+	movdqa  XTMP1, XTMP3            # XTMP3 = W[-15]
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	pslld   $(32-7), XTMP1          #
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	psrld   $7, XTMP2               #
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	por     XTMP2, XTMP1            # XTMP1 = W[-15] ror 7
+	or      y2, y0                  # y0 = MAJ = (a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+					#
+	ROTATE_ARGS                     #
+	movdqa  XTMP3, XTMP2            # XTMP2 = W[-15]
+	mov     e, y0                   # y0 = e
+	mov     a, y1                   # y1 = a
+	movdqa  XTMP3, XTMP4            # XTMP4 = W[-15]
+	ror     $(25-11), y0            # y0 = e >> (25-11)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	ror     $(22-13), y1            # y1 = a >> (22-13)
+	pslld   $(32-18), XTMP3         #
+	xor     a, y1                   # y1 = a ^ (a >> (22-13)
+	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor     g, y2                   # y2 = f^g
+	psrld   $18, XTMP2              #
+	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	ror     $6, y0                  # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+	pxor    XTMP3, XTMP1
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	psrld   $3, XTMP4               # XTMP4 = W[-15] >> 3
+	add     y0, y2                  # y2 = S1 + CH
+	add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	pxor    XTMP2, XTMP1            # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	pxor    XTMP4, XTMP1            # XTMP1 = s0
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	## compute low s1
+	pshufd  $0b11111010, X3, XTMP2   # XTMP2 = W[-2] {BBAA}
+	and     b, y0			# y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	paddd   XTMP1, XTMP0            # XTMP0 = W[-16] + W[-7] + s0
+	or      y2, y0                  # y0 = MAJ = (a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+
+	ROTATE_ARGS
+	movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {BBAA}
+	mov     e, y0                   # y0 = e
+	mov     a, y1                   # y1 = a
+	ror     $(25-11), y0            # y0 = e >> (25-11)
+	movdqa  XTMP2, XTMP4            # XTMP4 = W[-2] {BBAA}
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	ror     $(22-13), y1            # y1 = a >> (22-13)
+	mov     f, y2                   # y2 = f
+	xor     a, y1                   # y1 = a ^ (a >> (22-13)
+	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xBxA}
+	xor     g, y2                   # y2 = f^g
+	psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xBxA}
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	psrld   $10, XTMP4              # XTMP4 = W[-2] >> 10 {BBAA}
+	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	ror     $6, y0                  # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+	pxor    XTMP3, XTMP2
+	add     y0, y2                  # y2 = S1 + CH
+	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	pxor    XTMP2, XTMP4            # XTMP4 = s1 {xBxA}
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	pshufb  SHUF_00BA, XTMP4        # XTMP4 = s1 {00BA}
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	paddd   XTMP4, XTMP0            # XTMP0 = {..., ..., W[1], W[0]}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	## compute high s1
+	pshufd  $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA}
+	or      y2, y0                  # y0 = MAJ = (a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+					#
+	ROTATE_ARGS                     #
+	movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {DDCC}
+	mov     e, y0                   # y0 = e
+	ror     $(25-11), y0            # y0 = e >> (25-11)
+	mov     a, y1                   # y1 = a
+	movdqa  XTMP2, X0               # X0    = W[-2] {DDCC}
+	ror     $(22-13), y1            # y1 = a >> (22-13)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xDxC}
+	xor     a, y1                   # y1 = a ^ (a >> (22-13)
+	xor     g, y2                   # y2 = f^g
+	psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xDxC}
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25
+	and     e, y2                   # y2 = (f^g)&e
+	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	psrld   $10, X0                 # X0 = W[-2] >> 10 {DDCC}
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22
+	ror     $6, y0                  # y0 = S1 = (e>>6) & (e>>11) ^ (e>>2
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	pxor    XTMP3, XTMP2            #
+	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2
+	add     y0, y2                  # y2 = S1 + CH
+	add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	pxor    XTMP2, X0               # X0 = s1 {xDxC}
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	pshufb  SHUF_DC00, X0           # X0 = s1 {DC00}
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	paddd   XTMP0, X0               # X0 = {W[3], W[2], W[1], W[0]}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	or      y2, y0                  # y0 = MAJ = (a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+
+	ROTATE_ARGS
+	rotate_Xs
+.endm
+
+## input is [rsp + _XFER + %1 * 4]
+.macro DO_ROUND round
+	mov     e, y0                 # y0 = e
+	ror     $(25-11), y0          # y0 = e >> (25-11)
+	mov     a, y1                 # y1 = a
+	xor     e, y0                 # y0 = e ^ (e >> (25-11))
+	ror     $(22-13), y1          # y1 = a >> (22-13)
+	mov     f, y2                 # y2 = f
+	xor     a, y1                 # y1 = a ^ (a >> (22-13)
+	ror     $(11-6), y0           # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor     g, y2                 # y2 = f^g
+	xor     e, y0                 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	ror     $(13-2), y1           # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	and     e, y2                 # y2 = (f^g)&e
+	xor     a, y1                 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror     $6, y0                # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+	xor     g, y2                 # y2 = CH = ((f^g)&e)^g
+	add     y0, y2                # y2 = S1 + CH
+	ror     $2, y1                # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	offset = \round * 4 + _XFER
+	add     offset(%rsp), y2      # y2 = k + w + S1 + CH
+	mov     a, y0                 # y0 = a
+	add     y2, h                 # h = h + S1 + CH + k + w
+	mov     a, y2                 # y2 = a
+	or      c, y0                 # y0 = a|c
+	add     h, d                  # d = d + h + S1 + CH + k + w
+	and     c, y2                 # y2 = a&c
+	and     b, y0                 # y0 = (a|c)&b
+	add     y1, h                 # h = h + S1 + CH + k + w + S0
+	or      y2, y0		      # y0 = MAJ = (a|c)&b)|(a&c)
+	add     y0, h		      # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+.endm
+
+########################################################################
+## void sha256_transform_ssse3(struct sha256_block_state *state,
+##			       const u8 *data, size_t nblocks);
+########################################################################
+.text
+SYM_FUNC_START(sha256_transform_ssse3)
+	pushq   %rbx
+	pushq   %r12
+	pushq   %r13
+	pushq   %r14
+	pushq   %r15
+	pushq   %rbp
+	mov	%rsp, %rbp
+
+	subq    $STACK_SIZE, %rsp
+	and	$~15, %rsp
+
+	shl     $6, NUM_BLKS		 # convert to bytes
+	add     INP, NUM_BLKS
+	mov     NUM_BLKS, _INP_END(%rsp) # pointer to end of data
+
+	## load initial digest
+	mov     4*0(CTX), a
+	mov     4*1(CTX), b
+	mov     4*2(CTX), c
+	mov     4*3(CTX), d
+	mov     4*4(CTX), e
+	mov     4*5(CTX), f
+	mov     4*6(CTX), g
+	mov     4*7(CTX), h
+
+	movdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+	movdqa  _SHUF_00BA(%rip), SHUF_00BA
+	movdqa  _SHUF_DC00(%rip), SHUF_DC00
+
+.Lloop0:
+	lea     K256(%rip), TBL
+
+	## byte swap first 16 dwords
+	COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK
+
+	mov     INP, _INP(%rsp)
+
+	## schedule 48 input dwords, by doing 3 rounds of 16 each
+	mov     $3, SRND
+.align 16
+.Lloop1:
+	movdqa  (TBL), XFER
+	paddd   X0, XFER
+	movdqa  XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa  1*16(TBL), XFER
+	paddd   X0, XFER
+	movdqa  XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa  2*16(TBL), XFER
+	paddd   X0, XFER
+	movdqa  XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa  3*16(TBL), XFER
+	paddd   X0, XFER
+	movdqa  XFER, _XFER(%rsp)
+	add     $4*16, TBL
+	FOUR_ROUNDS_AND_SCHED
+
+	sub     $1, SRND
+	jne     .Lloop1
+
+	mov     $2, SRND
+.Lloop2:
+	paddd   (TBL), X0
+	movdqa  X0, _XFER(%rsp)
+	DO_ROUND        0
+	DO_ROUND        1
+	DO_ROUND        2
+	DO_ROUND        3
+	paddd   1*16(TBL), X1
+	movdqa  X1, _XFER(%rsp)
+	add     $2*16, TBL
+	DO_ROUND        0
+	DO_ROUND        1
+	DO_ROUND        2
+	DO_ROUND        3
+
+	movdqa  X2, X0
+	movdqa  X3, X1
+
+	sub     $1, SRND
+	jne     .Lloop2
+
+	addm    (4*0)(CTX),a
+	addm    (4*1)(CTX),b
+	addm    (4*2)(CTX),c
+	addm    (4*3)(CTX),d
+	addm    (4*4)(CTX),e
+	addm    (4*5)(CTX),f
+	addm    (4*6)(CTX),g
+	addm    (4*7)(CTX),h
+
+	mov     _INP(%rsp), INP
+	add     $64, INP
+	cmp     _INP_END(%rsp), INP
+	jne     .Lloop0
+
+	mov	%rbp, %rsp
+	popq	%rbp
+	popq    %r15
+	popq    %r14
+	popq    %r13
+	popq    %r12
+	popq    %rbx
+
+	RET
+SYM_FUNC_END(sha256_transform_ssse3)
+
+.section	.rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+K256:
+        .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+        .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+        .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+        .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+        .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+        .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+        .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+        .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+        .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+        .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+        .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+        .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+        .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+        .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+        .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+        .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x0c0d0e0f08090a0b0405060700010203
+
+.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
+.align 16
+# shuffle xBxA -> 00BA
+_SHUF_00BA:
+	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
+.align 16
+# shuffle xDxC -> DC00
+_SHUF_DC00:
+	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/lib/crypto/x86/sha256.h b/lib/crypto/x86/sha256.h
new file mode 100644
index 000000000000..38e33b22a092
--- /dev/null
+++ b/lib/crypto/x86/sha256.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 optimized for x86_64
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <linux/static_call.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni);
+
+DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic);
+
+#define DEFINE_X86_SHA256_FN(c_fn, asm_fn)                                 \
+	asmlinkage void asm_fn(struct sha256_block_state *state,           \
+			       const u8 *data, size_t nblocks);            \
+	static void c_fn(struct sha256_block_state *state, const u8 *data, \
+			 size_t nblocks)                                   \
+	{                                                                  \
+		if (likely(irq_fpu_usable())) {                            \
+			kernel_fpu_begin();                                \
+			asm_fn(state, data, nblocks);                      \
+			kernel_fpu_end();                                  \
+		} else {                                                   \
+			sha256_blocks_generic(state, data, nblocks);       \
+		}                                                          \
+	}
+
+DEFINE_X86_SHA256_FN(sha256_blocks_ssse3, sha256_transform_ssse3);
+DEFINE_X86_SHA256_FN(sha256_blocks_avx, sha256_transform_avx);
+DEFINE_X86_SHA256_FN(sha256_blocks_avx2, sha256_transform_rorx);
+DEFINE_X86_SHA256_FN(sha256_blocks_ni, sha256_ni_transform);
+
+static void sha256_blocks(struct sha256_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	static_call(sha256_blocks_x86)(state, data, nblocks);
+}
+
+static_assert(offsetof(struct __sha256_ctx, state) == 0);
+static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
+static_assert(offsetof(struct __sha256_ctx, buf) == 40);
+asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
+				  const u8 *data1, const u8 *data2, int len,
+				  u8 out1[SHA256_DIGEST_SIZE],
+				  u8 out2[SHA256_DIGEST_SIZE]);
+
+#define sha256_finup_2x_arch sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+				 const u8 *data1, const u8 *data2, size_t len,
+				 u8 out1[SHA256_DIGEST_SIZE],
+				 u8 out2[SHA256_DIGEST_SIZE])
+{
+	/*
+	 * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
+	 * Further limit len to 65536 to avoid spending too long with preemption
+	 * disabled.  (Of course, in practice len is nearly always 4096 anyway.)
+	 */
+	if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE &&
+	    len <= 65536 && likely(irq_fpu_usable())) {
+		kernel_fpu_begin();
+		sha256_ni_finup2x(ctx, data1, data2, len, out1, out2);
+		kernel_fpu_end();
+		kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
+		kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
+		return true;
+	}
+	return false;
+}
+
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+	return static_key_enabled(&have_sha_ni);
+}
+
+#define sha256_mod_init_arch sha256_mod_init_arch
+static void sha256_mod_init_arch(void)
+{
+	if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
+		static_call_update(sha256_blocks_x86, sha256_blocks_ni);
+		static_branch_enable(&have_sha_ni);
+	} else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+				     NULL) &&
+		   boot_cpu_has(X86_FEATURE_AVX)) {
+		if (boot_cpu_has(X86_FEATURE_AVX2) &&
+		    boot_cpu_has(X86_FEATURE_BMI2))
+			static_call_update(sha256_blocks_x86,
+					   sha256_blocks_avx2);
+		else
+			static_call_update(sha256_blocks_x86,
+					   sha256_blocks_avx);
+	} else if (boot_cpu_has(X86_FEATURE_SSSE3)) {
+		static_call_update(sha256_blocks_x86, sha256_blocks_ssse3);
+	}
+}
diff --git a/lib/crypto/x86/sha512-avx-asm.S b/lib/crypto/x86/sha512-avx-asm.S
new file mode 100644
index 000000000000..7732aa8fd850
--- /dev/null
+++ b/lib/crypto/x86/sha512-avx-asm.S
@@ -0,0 +1,420 @@
+########################################################################
+# Implement fast SHA-512 with AVX instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     David Cote <david.m.cote@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+# ARG1
+digest	= %rdi
+# ARG2
+msg	= %rsi
+# ARG3
+msglen	= %rdx
+T1	= %rcx
+T2	= %r8
+a_64	= %r9
+b_64	= %r10
+c_64	= %r11
+d_64	= %r12
+e_64	= %r13
+f_64	= %r14
+g_64	= %r15
+h_64	= %rbx
+tmp0	= %rax
+
+# Local variables (stack frame)
+
+# Message Schedule
+W_SIZE = 80*8
+# W[t] + K[t] | W[t+1] + K[t+1]
+WK_SIZE = 2*8
+
+frame_W = 0
+frame_WK = frame_W + W_SIZE
+frame_size = frame_WK + WK_SIZE
+
+# Useful QWORD "arrays" for simpler memory references
+# MSG, DIGEST, K_t, W_t are arrays
+# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
+
+# Input message (arg1)
+#define MSG(i)    8*i(msg)
+
+# Output Digest (arg2)
+#define DIGEST(i) 8*i(digest)
+
+# SHA Constants (static mem)
+#define K_t(i)    8*i+K512(%rip)
+
+# Message Schedule (stack frame)
+#define W_t(i)    8*i+frame_W(%rsp)
+
+# W[t]+K[t] (stack frame)
+#define WK_2(i)   8*((i%2))+frame_WK(%rsp)
+
+.macro RotateState
+	# Rotate symbols a..h right
+	TMP   = h_64
+	h_64  = g_64
+	g_64  = f_64
+	f_64  = e_64
+	e_64  = d_64
+	d_64  = c_64
+	c_64  = b_64
+	b_64  = a_64
+	a_64  = TMP
+.endm
+
+.macro RORQ p1 p2
+	# shld is faster than ror on Sandybridge
+	shld	$(64-\p2), \p1, \p1
+.endm
+
+.macro SHA512_Round rnd
+	# Compute Round %%t
+	mov     f_64, T1          # T1 = f
+	mov     e_64, tmp0        # tmp = e
+	xor     g_64, T1          # T1 = f ^ g
+	RORQ    tmp0, 23   # 41    # tmp = e ror 23
+	and     e_64, T1          # T1 = (f ^ g) & e
+	xor     e_64, tmp0        # tmp = (e ror 23) ^ e
+	xor     g_64, T1          # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+	idx = \rnd
+	add     WK_2(idx), T1     # W[t] + K[t] from message scheduler
+	RORQ    tmp0, 4   # 18    # tmp = ((e ror 23) ^ e) ror 4
+	xor     e_64, tmp0        # tmp = (((e ror 23) ^ e) ror 4) ^ e
+	mov     a_64, T2          # T2 = a
+	add     h_64, T1          # T1 = CH(e,f,g) + W[t] + K[t] + h
+	RORQ    tmp0, 14  # 14    # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+	add     tmp0, T1          # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+	mov     a_64, tmp0        # tmp = a
+	xor     c_64, T2          # T2 = a ^ c
+	and     c_64, tmp0        # tmp = a & c
+	and     b_64, T2          # T2 = (a ^ c) & b
+	xor     tmp0, T2          # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+	mov     a_64, tmp0        # tmp = a
+	RORQ    tmp0, 5  # 39     # tmp = a ror 5
+	xor     a_64, tmp0        # tmp = (a ror 5) ^ a
+	add     T1, d_64          # e(next_state) = d + T1
+	RORQ    tmp0, 6  # 34     # tmp = ((a ror 5) ^ a) ror 6
+	xor     a_64, tmp0        # tmp = (((a ror 5) ^ a) ror 6) ^ a
+	lea     (T1, T2), h_64    # a(next_state) = T1 + Maj(a,b,c)
+	RORQ    tmp0, 28  # 28    # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+	add     tmp0, h_64        # a(next_state) = T1 + Maj(a,b,c) S0(a)
+	RotateState
+.endm
+
+.macro SHA512_2Sched_2Round_avx rnd
+	# Compute rounds t-2 and t-1
+	# Compute message schedule QWORDS t and t+1
+
+	#   Two rounds are computed based on the values for K[t-2]+W[t-2] and
+	# K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+	# scheduler.
+	#   The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
+	# They are then added to their respective SHA512 constants at
+	# [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)]
+	#   For brievity, the comments following vectored instructions only refer to
+	# the first of a pair of QWORDS.
+	# Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
+	#   The computation of the message schedule and the rounds are tightly
+	# stitched to take advantage of instruction-level parallelism.
+
+	idx = \rnd - 2
+	vmovdqa	W_t(idx), %xmm4		# XMM4 = W[t-2]
+	idx = \rnd - 15
+	vmovdqu	W_t(idx), %xmm5		# XMM5 = W[t-15]
+	mov	f_64, T1
+	vpsrlq	$61, %xmm4, %xmm0	# XMM0 = W[t-2]>>61
+	mov	e_64, tmp0
+	vpsrlq	$1, %xmm5, %xmm6	# XMM6 = W[t-15]>>1
+	xor	g_64, T1
+	RORQ	tmp0, 23 # 41
+	vpsrlq	$19, %xmm4, %xmm1	# XMM1 = W[t-2]>>19
+	and	e_64, T1
+	xor	e_64, tmp0
+	vpxor	%xmm1, %xmm0, %xmm0	# XMM0 = W[t-2]>>61 ^ W[t-2]>>19
+	xor	g_64, T1
+	idx = \rnd
+	add	WK_2(idx), T1#
+	vpsrlq	$8, %xmm5, %xmm7	# XMM7 = W[t-15]>>8
+	RORQ	tmp0, 4 # 18
+	vpsrlq	$6, %xmm4, %xmm2	# XMM2 = W[t-2]>>6
+	xor	e_64, tmp0
+	mov	a_64, T2
+	add	h_64, T1
+	vpxor	%xmm7, %xmm6, %xmm6	# XMM6 = W[t-15]>>1 ^ W[t-15]>>8
+	RORQ	tmp0, 14 # 14
+	add	tmp0, T1
+	vpsrlq	$7, %xmm5, %xmm8	# XMM8 = W[t-15]>>7
+	mov	a_64, tmp0
+	xor	c_64, T2
+	vpsllq	$(64-61), %xmm4, %xmm3  # XMM3 = W[t-2]<<3
+	and	c_64, tmp0
+	and	b_64, T2
+	vpxor	%xmm3, %xmm2, %xmm2	# XMM2 = W[t-2]>>6 ^ W[t-2]<<3
+	xor	tmp0, T2
+	mov	a_64, tmp0
+	vpsllq	$(64-1), %xmm5, %xmm9	# XMM9 = W[t-15]<<63
+	RORQ	tmp0, 5 # 39
+	vpxor	%xmm9, %xmm8, %xmm8	# XMM8 = W[t-15]>>7 ^ W[t-15]<<63
+	xor	a_64, tmp0
+	add	T1, d_64
+	RORQ	tmp0, 6 # 34
+	xor	a_64, tmp0
+	vpxor	%xmm8, %xmm6, %xmm6	# XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^
+					#  W[t-15]>>7 ^ W[t-15]<<63
+	lea	(T1, T2), h_64
+	RORQ	tmp0, 28 # 28
+	vpsllq	$(64-19), %xmm4, %xmm4  # XMM4 = W[t-2]<<25
+	add	tmp0, h_64
+	RotateState
+	vpxor	%xmm4, %xmm0, %xmm0     # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^
+					#        W[t-2]<<25
+	mov	f_64, T1
+	vpxor	%xmm2, %xmm0, %xmm0     # XMM0 = s1(W[t-2])
+	mov	e_64, tmp0
+	xor	g_64, T1
+	idx = \rnd - 16
+	vpaddq	W_t(idx), %xmm0, %xmm0  # XMM0 = s1(W[t-2]) + W[t-16]
+	idx = \rnd - 7
+	vmovdqu	W_t(idx), %xmm1		# XMM1 = W[t-7]
+	RORQ	tmp0, 23 # 41
+	and	e_64, T1
+	xor	e_64, tmp0
+	xor	g_64, T1
+	vpsllq	$(64-8), %xmm5, %xmm5   # XMM5 = W[t-15]<<56
+	idx = \rnd + 1
+	add	WK_2(idx), T1
+	vpxor	%xmm5, %xmm6, %xmm6     # XMM6 = s0(W[t-15])
+	RORQ	tmp0, 4 # 18
+	vpaddq	%xmm6, %xmm0, %xmm0     # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
+	xor	e_64, tmp0
+	vpaddq	%xmm1, %xmm0, %xmm0     # XMM0 = W[t] = s1(W[t-2]) + W[t-7] +
+					#               s0(W[t-15]) + W[t-16]
+	mov	a_64, T2
+	add	h_64, T1
+	RORQ	tmp0, 14 # 14
+	add	tmp0, T1
+	idx = \rnd
+	vmovdqa	%xmm0, W_t(idx)		# Store W[t]
+	vpaddq	K_t(idx), %xmm0, %xmm0  # Compute W[t]+K[t]
+	vmovdqa	%xmm0, WK_2(idx)	# Store W[t]+K[t] for next rounds
+	mov	a_64, tmp0
+	xor	c_64, T2
+	and	c_64, tmp0
+	and	b_64, T2
+	xor	tmp0, T2
+	mov	a_64, tmp0
+	RORQ	tmp0, 5 # 39
+	xor	a_64, tmp0
+	add	T1, d_64
+	RORQ	tmp0, 6 # 34
+	xor	a_64, tmp0
+	lea	(T1, T2), h_64
+	RORQ	tmp0, 28 # 28
+	add	tmp0, h_64
+	RotateState
+.endm
+
+########################################################################
+# void sha512_transform_avx(struct sha512_block_state *state,
+#			    const u8 *data, size_t nblocks);
+# Purpose: Updates the SHA512 digest stored at "state" with the message
+# stored in "data".
+# The size of the message pointed to by "data" must be an integer multiple
+# of SHA512 message blocks.
+# "nblocks" is the message length in SHA512 blocks.  Must be >= 1.
+########################################################################
+SYM_FUNC_START(sha512_transform_avx)
+
+	# Save GPRs
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	# Allocate Stack Space
+	push	%rbp
+	mov	%rsp, %rbp
+	sub     $frame_size, %rsp
+	and	$~(0x20 - 1), %rsp
+
+.Lupdateblock:
+
+	# Load state variables
+	mov     DIGEST(0), a_64
+	mov     DIGEST(1), b_64
+	mov     DIGEST(2), c_64
+	mov     DIGEST(3), d_64
+	mov     DIGEST(4), e_64
+	mov     DIGEST(5), f_64
+	mov     DIGEST(6), g_64
+	mov     DIGEST(7), h_64
+
+	t = 0
+	.rept 80/2 + 1
+	# (80 rounds) / (2 rounds/iteration) + (1 iteration)
+	# +1 iteration because the scheduler leads hashing by 1 iteration
+		.if t < 2
+			# BSWAP 2 QWORDS
+			vmovdqa  XMM_QWORD_BSWAP(%rip), %xmm1
+			vmovdqu  MSG(t), %xmm0
+			vpshufb  %xmm1, %xmm0, %xmm0    # BSWAP
+			vmovdqa  %xmm0, W_t(t) # Store Scheduled Pair
+			vpaddq   K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
+			vmovdqa  %xmm0, WK_2(t) # Store into WK for rounds
+		.elseif t < 16
+			# BSWAP 2 QWORDS# Compute 2 Rounds
+			vmovdqu  MSG(t), %xmm0
+			vpshufb  %xmm1, %xmm0, %xmm0    # BSWAP
+			SHA512_Round t-2    # Round t-2
+			vmovdqa  %xmm0, W_t(t) # Store Scheduled Pair
+			vpaddq   K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
+			SHA512_Round t-1    # Round t-1
+			vmovdqa  %xmm0, WK_2(t)# Store W[t]+K[t] into WK
+		.elseif t < 79
+			# Schedule 2 QWORDS# Compute 2 Rounds
+			SHA512_2Sched_2Round_avx t
+		.else
+			# Compute 2 Rounds
+			SHA512_Round t-2
+			SHA512_Round t-1
+		.endif
+		t = t+2
+	.endr
+
+	# Update digest
+	add     a_64, DIGEST(0)
+	add     b_64, DIGEST(1)
+	add     c_64, DIGEST(2)
+	add     d_64, DIGEST(3)
+	add     e_64, DIGEST(4)
+	add     f_64, DIGEST(5)
+	add     g_64, DIGEST(6)
+	add     h_64, DIGEST(7)
+
+	# Advance to next message block
+	add     $16*8, msg
+	dec     msglen
+	jnz     .Lupdateblock
+
+	# Restore Stack Pointer
+	mov	%rbp, %rsp
+	pop	%rbp
+
+	# Restore GPRs
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbx
+
+	RET
+SYM_FUNC_END(sha512_transform_avx)
+
+########################################################################
+### Binary Data
+
+.section	.rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
+.align 16
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+	.octa 0x08090a0b0c0d0e0f0001020304050607
+
+# Mergeable 640-byte rodata section. This allows linker to merge the table
+# with other, exactly the same 640-byte fragment of another rodata section
+# (if such section exists).
+.section	.rodata.cst640.K512, "aM", @progbits, 640
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+	.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad 0x3956c25bf348b538,0x59f111f1b605d019
+	.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad 0xd807aa98a3030242,0x12835b0145706fbe
+	.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad 0x06ca6351e003826f,0x142929670a0e6e70
+	.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad 0x81c2c92e47edaee6,0x92722c851482353b
+	.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad 0xd192e819d6ef5218,0xd69906245565a910
+	.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad 0x90befffa23631e28,0xa4506cebde82bde9
+	.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad 0xca273eceea26619c,0xd186b8c721c0c207
+	.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad 0x113f9804bef90dae,0x1b710b35131c471b
+	.quad 0x28db77f523047d84,0x32caab7b40c72493
+	.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
diff --git a/lib/crypto/x86/sha512-avx2-asm.S b/lib/crypto/x86/sha512-avx2-asm.S
new file mode 100644
index 000000000000..22bdbfd899d0
--- /dev/null
+++ b/lib/crypto/x86/sha512-avx2-asm.S
@@ -0,0 +1,748 @@
+########################################################################
+# Implement fast SHA-512 with AVX2 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     David Cote <david.m.cote@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 1 blocks at a time, with 4 lanes per block
+########################################################################
+
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+Y_0 = %ymm4
+Y_1 = %ymm5
+Y_2 = %ymm6
+Y_3 = %ymm7
+
+YTMP0 = %ymm0
+YTMP1 = %ymm1
+YTMP2 = %ymm2
+YTMP3 = %ymm3
+YTMP4 = %ymm8
+XFER  = YTMP0
+
+BYTE_FLIP_MASK  = %ymm9
+
+# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
+CTX1        = %rdi
+CTX2        = %r12
+# 2nd arg
+INP         = %rsi
+# 3rd arg
+NUM_BLKS    = %rdx
+
+c           = %rcx
+d           = %r8
+e           = %rdx
+y3          = %rsi
+
+TBL   = %rdi # clobbers CTX1
+
+a     = %rax
+b     = %rbx
+
+f     = %r9
+g     = %r10
+h     = %r11
+old_h = %r11
+
+T1    = %r12 # clobbers CTX2
+y0    = %r13
+y1    = %r14
+y2    = %r15
+
+# Local variables (stack frame)
+XFER_SIZE = 4*8
+SRND_SIZE = 1*8
+INP_SIZE = 1*8
+INPEND_SIZE = 1*8
+CTX_SIZE = 1*8
+
+frame_XFER = 0
+frame_SRND = frame_XFER + XFER_SIZE
+frame_INP = frame_SRND + SRND_SIZE
+frame_INPEND = frame_INP + INP_SIZE
+frame_CTX = frame_INPEND + INPEND_SIZE
+frame_size = frame_CTX + CTX_SIZE
+
+## assume buffers not aligned
+#define	VMOVDQ vmovdqu
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+	add	\p1, \p2
+	mov	\p2, \p1
+.endm
+
+
+# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
+# Load ymm with mem and byte swap each dword
+.macro COPY_YMM_AND_BSWAP p1 p2 p3
+	VMOVDQ \p2, \p1
+	vpshufb \p3, \p1, \p1
+.endm
+# rotate_Ys
+# Rotate values of symbols Y0...Y3
+.macro rotate_Ys
+	Y_ = Y_0
+	Y_0 = Y_1
+	Y_1 = Y_2
+	Y_2 = Y_3
+	Y_3 = Y_
+.endm
+
+# RotateState
+.macro RotateState
+	# Rotate symbols a..h right
+	old_h  = h
+	TMP_   = h
+	h      = g
+	g      = f
+	f      = e
+	e      = d
+	d      = c
+	c      = b
+	b      = a
+	a      = TMP_
+.endm
+
+# macro MY_VPALIGNR	YDST, YSRC1, YSRC2, RVAL
+# YDST = {YSRC1, YSRC2} >> RVAL*8
+.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
+	vperm2f128      $0x3, \YSRC2, \YSRC1, \YDST     # YDST = {YS1_LO, YS2_HI}
+	vpalignr        $\RVAL, \YSRC2, \YDST, \YDST    # YDST = {YDS1, YS2} >> RVAL*8
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+################################### RND N + 0 #########################################
+
+	# Extract w[t-7]
+	MY_VPALIGNR	YTMP0, Y_3, Y_2, 8		# YTMP0 = W[-7]
+	# Calculate w[t-16] + w[t-7]
+	vpaddq		Y_0, YTMP0, YTMP0		# YTMP0 = W[-7] + W[-16]
+	# Extract w[t-15]
+	MY_VPALIGNR	YTMP1, Y_1, Y_0, 8		# YTMP1 = W[-15]
+
+	# Calculate sigma0
+
+	# Calculate w[t-15] ror 1
+	vpsrlq		$1, YTMP1, YTMP2
+	vpsllq		$(64-1), YTMP1, YTMP3
+	vpor		YTMP2, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1
+	# Calculate w[t-15] shr 7
+	vpsrlq		$7, YTMP1, YTMP4		# YTMP4 = W[-15] >> 7
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	add	frame_XFER(%rsp),h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	xor	g, y2		# y2 = f^g                              # CH
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	add	h, d		# d = k + w + h + d                     # --
+
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+
+	add	y0, y2		# y2 = S1 + CH                          # --
+	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	RotateState
+
+################################### RND N + 1 #########################################
+
+	# Calculate w[t-15] ror 8
+	vpsrlq		$8, YTMP1, YTMP2
+	vpsllq		$(64-8), YTMP1, YTMP1
+	vpor		YTMP2, YTMP1, YTMP1		# YTMP1 = W[-15] ror 8
+	# XOR the three components
+	vpxor		YTMP4, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
+	vpxor		YTMP1, YTMP3, YTMP1		# YTMP1 = s0
+
+
+	# Add three components, w[t-16], w[t-7] and sigma0
+	vpaddq		YTMP1, YTMP0, YTMP0		# YTMP0 = W[-16] + W[-7] + s0
+	# Move to appropriate lanes for calculating w[16] and w[17]
+	vperm2f128	$0x0, YTMP0, YTMP0, Y_0		# Y_0 = W[-16] + W[-7] + s0 {BABA}
+	# Move to appropriate lanes for calculating w[18] and w[19]
+	vpand		MASK_YMM_LO(%rip), YTMP0, YTMP0	# YTMP0 = W[-16] + W[-7] + s0 {DC00}
+
+	# Calculate w[16] and w[17] in both 128 bit lanes
+
+	# Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
+	vperm2f128	$0x11, Y_3, Y_3, YTMP2		# YTMP2 = W[-2] {BABA}
+	vpsrlq		$6, YTMP2, YTMP4		# YTMP4 = W[-2] >> 6 {BABA}
+
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	add	1*8+frame_XFER(%rsp), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	xor	g, y2		# y2 = f^g                              # CH
+
+
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	h, d		# d = k + w + h + d                     # --
+
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	RotateState
+
+
+################################### RND N + 2 #########################################
+
+	vpsrlq		$19, YTMP2, YTMP3		# YTMP3 = W[-2] >> 19 {BABA}
+	vpsllq		$(64-19), YTMP2, YTMP1		# YTMP1 = W[-2] << 19 {BABA}
+	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {BABA}
+	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
+	vpsrlq		$61, YTMP2, YTMP3		# YTMP3 = W[-2] >> 61 {BABA}
+	vpsllq		$(64-61), YTMP2, YTMP1		# YTMP1 = W[-2] << 61 {BABA}
+	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {BABA}
+	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
+							#  (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
+
+	# Add sigma1 to the other compunents to get w[16] and w[17]
+	vpaddq		YTMP4, Y_0, Y_0			# Y_0 = {W[1], W[0], W[1], W[0]}
+
+	# Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
+	vpsrlq		$6, Y_0, YTMP4			# YTMP4 = W[-2] >> 6 {DC--}
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	add	2*8+frame_XFER(%rsp), h		# h = k + w + h         # --
+
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	or	c, y3		# y3 = a|c                              # MAJA
+	mov	f, y2		# y2 = f                                # CH
+	xor	g, y2		# y2 = f^g                              # CH
+
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	add	h, d		# d = k + w + h + d                     # --
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	RotateState
+
+################################### RND N + 3 #########################################
+
+	vpsrlq		$19, Y_0, YTMP3			# YTMP3 = W[-2] >> 19 {DC--}
+	vpsllq		$(64-19), Y_0, YTMP1		# YTMP1 = W[-2] << 19 {DC--}
+	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {DC--}
+	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
+	vpsrlq		$61, Y_0, YTMP3			# YTMP3 = W[-2] >> 61 {DC--}
+	vpsllq		$(64-61), Y_0, YTMP1		# YTMP1 = W[-2] << 61 {DC--}
+	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {DC--}
+	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
+							#  (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
+
+	# Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
+	# to newly calculated sigma1 to get w[18] and w[19]
+	vpaddq		YTMP4, YTMP0, YTMP2		# YTMP2 = {W[3], W[2], --, --}
+
+	# Form w[19, w[18], w17], w[16]
+	vpblendd		$0xF0, YTMP2, Y_0, Y_0		# Y_0 = {W[3], W[2], W[1], W[0]}
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	add	3*8+frame_XFER(%rsp), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	xor	g, y2		# y2 = f^g                              # CH
+
+
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	h, d		# d = k + w + h + d                     # --
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
+
+	add	y1, h		# h = k + w + h + S0                    # --
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	RotateState
+
+	rotate_Ys
+.endm
+
+.macro DO_4ROUNDS
+
+################################### RND N + 0 #########################################
+
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+	add	frame_XFER(%rsp), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	RotateState
+
+################################### RND N + 1 #########################################
+
+	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+	add	8*1+frame_XFER(%rsp), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	RotateState
+
+################################### RND N + 2 #########################################
+
+	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+	add	8*2+frame_XFER(%rsp), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	RotateState
+
+################################### RND N + 3 #########################################
+
+	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+	add	8*3+frame_XFER(%rsp), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = (a|c)&b)|(a&c)             # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	RotateState
+
+.endm
+
+########################################################################
+# void sha512_transform_rorx(struct sha512_block_state *state,
+#			     const u8 *data, size_t nblocks);
+# Purpose: Updates the SHA512 digest stored at "state" with the message
+# stored in "data".
+# The size of the message pointed to by "data" must be an integer multiple
+# of SHA512 message blocks.
+# "nblocks" is the message length in SHA512 blocks.  Must be >= 1.
+########################################################################
+SYM_FUNC_START(sha512_transform_rorx)
+
+	# Save GPRs
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	# Allocate Stack Space
+	push	%rbp
+	mov	%rsp, %rbp
+	sub	$frame_size, %rsp
+	and	$~(0x20 - 1), %rsp
+
+	shl	$7, NUM_BLKS	# convert to bytes
+	add	INP, NUM_BLKS	# pointer to end of data
+	mov	NUM_BLKS, frame_INPEND(%rsp)
+
+	## load initial digest
+	mov	8*0(CTX1), a
+	mov	8*1(CTX1), b
+	mov	8*2(CTX1), c
+	mov	8*3(CTX1), d
+	mov	8*4(CTX1), e
+	mov	8*5(CTX1), f
+	mov	8*6(CTX1), g
+	mov	8*7(CTX1), h
+
+	# save %rdi (CTX) before it gets clobbered
+	mov	%rdi, frame_CTX(%rsp)
+
+	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+
+.Lloop0:
+	lea	K512(%rip), TBL
+
+	## byte swap first 16 dwords
+	COPY_YMM_AND_BSWAP	Y_0, (INP), BYTE_FLIP_MASK
+	COPY_YMM_AND_BSWAP	Y_1, 1*32(INP), BYTE_FLIP_MASK
+	COPY_YMM_AND_BSWAP	Y_2, 2*32(INP), BYTE_FLIP_MASK
+	COPY_YMM_AND_BSWAP	Y_3, 3*32(INP), BYTE_FLIP_MASK
+
+	mov	INP, frame_INP(%rsp)
+
+	## schedule 64 input dwords, by doing 12 rounds of 4 each
+	movq	$4, frame_SRND(%rsp)
+
+.align 16
+.Lloop1:
+	vpaddq	(TBL), Y_0, XFER
+	vmovdqa XFER, frame_XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddq	1*32(TBL), Y_0, XFER
+	vmovdqa XFER, frame_XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddq	2*32(TBL), Y_0, XFER
+	vmovdqa XFER, frame_XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddq	3*32(TBL), Y_0, XFER
+	vmovdqa XFER, frame_XFER(%rsp)
+	add	$(4*32), TBL
+	FOUR_ROUNDS_AND_SCHED
+
+	subq	$1, frame_SRND(%rsp)
+	jne	.Lloop1
+
+	movq	$2, frame_SRND(%rsp)
+.Lloop2:
+	vpaddq	(TBL), Y_0, XFER
+	vmovdqa XFER, frame_XFER(%rsp)
+	DO_4ROUNDS
+	vpaddq	1*32(TBL), Y_1, XFER
+	vmovdqa XFER, frame_XFER(%rsp)
+	add	$(2*32), TBL
+	DO_4ROUNDS
+
+	vmovdqa	Y_2, Y_0
+	vmovdqa	Y_3, Y_1
+
+	subq	$1, frame_SRND(%rsp)
+	jne	.Lloop2
+
+	mov	frame_CTX(%rsp), CTX2
+	addm	8*0(CTX2), a
+	addm	8*1(CTX2), b
+	addm	8*2(CTX2), c
+	addm	8*3(CTX2), d
+	addm	8*4(CTX2), e
+	addm	8*5(CTX2), f
+	addm	8*6(CTX2), g
+	addm	8*7(CTX2), h
+
+	mov	frame_INP(%rsp), INP
+	add	$128, INP
+	cmp	frame_INPEND(%rsp), INP
+	jne	.Lloop0
+
+	# Restore Stack Pointer
+	mov	%rbp, %rsp
+	pop	%rbp
+
+	# Restore GPRs
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbx
+
+	vzeroupper
+	RET
+SYM_FUNC_END(sha512_transform_rorx)
+
+########################################################################
+### Binary Data
+
+
+# Mergeable 640-byte rodata section. This allows linker to merge the table
+# with other, exactly the same 640-byte fragment of another rodata section
+# (if such section exists).
+.section	.rodata.cst640.K512, "aM", @progbits, 640
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad	0x3956c25bf348b538,0x59f111f1b605d019
+	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad	0xd807aa98a3030242,0x12835b0145706fbe
+	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad	0x06ca6351e003826f,0x142929670a0e6e70
+	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad	0x81c2c92e47edaee6,0x92722c851482353b
+	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad	0xd192e819d6ef5218,0xd69906245565a910
+	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad	0x90befffa23631e28,0xa4506cebde82bde9
+	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad	0xca273eceea26619c,0xd186b8c721c0c207
+	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad	0x113f9804bef90dae,0x1b710b35131c471b
+	.quad	0x28db77f523047d84,0x32caab7b40c72493
+	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x08090a0b0c0d0e0f0001020304050607
+	.octa 0x18191a1b1c1d1e1f1011121314151617
+
+.section	.rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
+.align 32
+MASK_YMM_LO:
+	.octa 0x00000000000000000000000000000000
+	.octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
diff --git a/lib/crypto/x86/sha512-ssse3-asm.S b/lib/crypto/x86/sha512-ssse3-asm.S
new file mode 100644
index 000000000000..4cae7445b2a8
--- /dev/null
+++ b/lib/crypto/x86/sha512-ssse3-asm.S
@@ -0,0 +1,419 @@
+########################################################################
+# Implement fast SHA-512 with SSSE3 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     David Cote <david.m.cote@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+# ARG1
+digest =	%rdi
+# ARG2
+msg =		%rsi
+# ARG3
+msglen =	%rdx
+T1 =		%rcx
+T2 =		%r8
+a_64 =		%r9
+b_64 =		%r10
+c_64 =		%r11
+d_64 =		%r12
+e_64 =		%r13
+f_64 =		%r14
+g_64 =		%r15
+h_64 =		%rbx
+tmp0 =		%rax
+
+# Local variables (stack frame)
+
+W_SIZE = 80*8
+WK_SIZE = 2*8
+
+frame_W = 0
+frame_WK = frame_W + W_SIZE
+frame_size = frame_WK + WK_SIZE
+
+# Useful QWORD "arrays" for simpler memory references
+# MSG, DIGEST, K_t, W_t are arrays
+# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
+
+# Input message (arg1)
+#define MSG(i)    8*i(msg)
+
+# Output Digest (arg2)
+#define DIGEST(i) 8*i(digest)
+
+# SHA Constants (static mem)
+#define K_t(i)    8*i+K512(%rip)
+
+# Message Schedule (stack frame)
+#define W_t(i)    8*i+frame_W(%rsp)
+
+# W[t]+K[t] (stack frame)
+#define WK_2(i)   8*((i%2))+frame_WK(%rsp)
+
+.macro RotateState
+	# Rotate symbols a..h right
+	TMP   = h_64
+	h_64  = g_64
+	g_64  = f_64
+	f_64  = e_64
+	e_64  = d_64
+	d_64  = c_64
+	c_64  = b_64
+	b_64  = a_64
+	a_64  = TMP
+.endm
+
+.macro SHA512_Round rnd
+
+	# Compute Round %%t
+	mov	f_64, T1          # T1 = f
+	mov	e_64, tmp0        # tmp = e
+	xor	g_64, T1          # T1 = f ^ g
+	ror	$23, tmp0 # 41    # tmp = e ror 23
+	and	e_64, T1          # T1 = (f ^ g) & e
+	xor	e_64, tmp0        # tmp = (e ror 23) ^ e
+	xor	g_64, T1          # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+	idx = \rnd
+	add	WK_2(idx), T1     # W[t] + K[t] from message scheduler
+	ror	$4, tmp0  # 18    # tmp = ((e ror 23) ^ e) ror 4
+	xor	e_64, tmp0        # tmp = (((e ror 23) ^ e) ror 4) ^ e
+	mov	a_64, T2          # T2 = a
+	add	h_64, T1          # T1 = CH(e,f,g) + W[t] + K[t] + h
+	ror	$14, tmp0 # 14    # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+	add	tmp0, T1          # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+	mov	a_64, tmp0        # tmp = a
+	xor	c_64, T2          # T2 = a ^ c
+	and	c_64, tmp0        # tmp = a & c
+	and	b_64, T2          # T2 = (a ^ c) & b
+	xor	tmp0, T2          # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+	mov	a_64, tmp0        # tmp = a
+	ror	$5, tmp0 # 39     # tmp = a ror 5
+	xor	a_64, tmp0        # tmp = (a ror 5) ^ a
+	add	T1, d_64          # e(next_state) = d + T1
+	ror	$6, tmp0 # 34     # tmp = ((a ror 5) ^ a) ror 6
+	xor	a_64, tmp0        # tmp = (((a ror 5) ^ a) ror 6) ^ a
+	lea	(T1, T2), h_64    # a(next_state) = T1 + Maj(a,b,c)
+	ror	$28, tmp0 # 28    # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+	add	tmp0, h_64        # a(next_state) = T1 + Maj(a,b,c) S0(a)
+	RotateState
+.endm
+
+.macro SHA512_2Sched_2Round_sse rnd
+
+	# Compute rounds t-2 and t-1
+	# Compute message schedule QWORDS t and t+1
+
+	#   Two rounds are computed based on the values for K[t-2]+W[t-2] and
+	# K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+	# scheduler.
+	#   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
+	# They are then added to their respective SHA512 constants at
+	# [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
+	#   For brievity, the comments following vectored instructions only refer to
+	# the first of a pair of QWORDS.
+	# Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
+	#   The computation of the message schedule and the rounds are tightly
+	# stitched to take advantage of instruction-level parallelism.
+	# For clarity, integer instructions (for the rounds calculation) are indented
+	# by one tab. Vectored instructions (for the message scheduler) are indented
+	# by two tabs.
+
+	mov	f_64, T1
+	idx = \rnd -2
+	movdqa	W_t(idx), %xmm2		    # XMM2 = W[t-2]
+	xor	g_64, T1
+	and	e_64, T1
+	movdqa	%xmm2, %xmm0	            # XMM0 = W[t-2]
+	xor	g_64, T1
+	idx = \rnd
+	add	WK_2(idx), T1
+	idx = \rnd - 15
+	movdqu	W_t(idx), %xmm5		    # XMM5 = W[t-15]
+	mov	e_64, tmp0
+	ror	$23, tmp0 # 41
+	movdqa	%xmm5, %xmm3	            # XMM3 = W[t-15]
+	xor	e_64, tmp0
+	ror	$4, tmp0 # 18
+	psrlq	$61-19, %xmm0		    # XMM0 = W[t-2] >> 42
+	xor	e_64, tmp0
+	ror	$14, tmp0 # 14
+	psrlq	$(8-7), %xmm3		    # XMM3 = W[t-15] >> 1
+	add	tmp0, T1
+	add	h_64, T1
+	pxor	%xmm2, %xmm0                # XMM0 = (W[t-2] >> 42) ^ W[t-2]
+	mov	a_64, T2
+	xor	c_64, T2
+	pxor	%xmm5, %xmm3                # XMM3 = (W[t-15] >> 1) ^ W[t-15]
+	and	b_64, T2
+	mov	a_64, tmp0
+	psrlq	$(19-6), %xmm0		    # XMM0 = ((W[t-2]>>42)^W[t-2])>>13
+	and	c_64, tmp0
+	xor	tmp0, T2
+	psrlq	$(7-1), %xmm3		    # XMM3 = ((W[t-15]>>1)^W[t-15])>>6
+	mov	a_64, tmp0
+	ror	$5, tmp0 # 39
+	pxor	%xmm2, %xmm0	            # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
+	xor	a_64, tmp0
+	ror	$6, tmp0 # 34
+	pxor	%xmm5, %xmm3                # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
+	xor	a_64, tmp0
+	ror	$28, tmp0 # 28
+	psrlq	$6, %xmm0                   # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
+	add	tmp0, T2
+	add	T1, d_64
+	psrlq	$1, %xmm3                   # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
+	lea	(T1, T2), h_64
+	RotateState
+	movdqa	%xmm2, %xmm1	            # XMM1 = W[t-2]
+	mov	f_64, T1
+	xor	g_64, T1
+	movdqa	%xmm5, %xmm4		    # XMM4 = W[t-15]
+	and	e_64, T1
+	xor	g_64, T1
+	psllq	$(64-19)-(64-61) , %xmm1    # XMM1 = W[t-2] << 42
+	idx = \rnd + 1
+	add	WK_2(idx), T1
+	mov	e_64, tmp0
+	psllq	$(64-1)-(64-8), %xmm4	    # XMM4 = W[t-15] << 7
+	ror	$23, tmp0 # 41
+	xor	e_64, tmp0
+	pxor	%xmm2, %xmm1		    # XMM1 = (W[t-2] << 42)^W[t-2]
+	ror	$4, tmp0 # 18
+	xor	e_64, tmp0
+	pxor	%xmm5, %xmm4		    # XMM4 = (W[t-15]<<7)^W[t-15]
+	ror	$14, tmp0 # 14
+	add	tmp0, T1
+	psllq	$(64-61), %xmm1		    # XMM1 = ((W[t-2] << 42)^W[t-2])<<3
+	add	h_64, T1
+	mov	a_64, T2
+	psllq	$(64-8), %xmm4		    # XMM4 = ((W[t-15]<<7)^W[t-15])<<56
+	xor	c_64, T2
+	and	b_64, T2
+	pxor	%xmm1, %xmm0		    # XMM0 = s1(W[t-2])
+	mov	a_64, tmp0
+	and	c_64, tmp0
+	idx = \rnd - 7
+	movdqu	W_t(idx), %xmm1		    # XMM1 = W[t-7]
+	xor	tmp0, T2
+	pxor	%xmm4, %xmm3                # XMM3 = s0(W[t-15])
+	mov	a_64, tmp0
+	paddq	%xmm3, %xmm0		    # XMM0 = s1(W[t-2]) + s0(W[t-15])
+	ror	$5, tmp0 # 39
+	idx =\rnd-16
+	paddq	W_t(idx), %xmm0		    # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
+	xor	a_64, tmp0
+	paddq	%xmm1, %xmm0	            # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+	ror	$6, tmp0 # 34
+	movdqa	%xmm0, W_t(\rnd)	    # Store scheduled qwords
+	xor	a_64, tmp0
+	paddq	K_t(\rnd), %xmm0	    # Compute W[t]+K[t]
+	ror	$28, tmp0 # 28
+	idx = \rnd
+	movdqa	%xmm0, WK_2(idx)	    # Store W[t]+K[t] for next rounds
+	add	tmp0, T2
+	add	T1, d_64
+	lea	(T1, T2), h_64
+	RotateState
+.endm
+
+########################################################################
+# void sha512_transform_ssse3(struct sha512_block_state *state,
+#			      const u8 *data, size_t nblocks);
+# Purpose: Updates the SHA512 digest stored at "state" with the message
+# stored in "data".
+# The size of the message pointed to by "data" must be an integer multiple
+# of SHA512 message blocks.
+# "nblocks" is the message length in SHA512 blocks.  Must be >= 1.
+########################################################################
+SYM_FUNC_START(sha512_transform_ssse3)
+
+	# Save GPRs
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	# Allocate Stack Space
+	push	%rbp
+	mov	%rsp, %rbp
+	sub	$frame_size, %rsp
+	and	$~(0x20 - 1), %rsp
+
+.Lupdateblock:
+
+# Load state variables
+	mov	DIGEST(0), a_64
+	mov	DIGEST(1), b_64
+	mov	DIGEST(2), c_64
+	mov	DIGEST(3), d_64
+	mov	DIGEST(4), e_64
+	mov	DIGEST(5), f_64
+	mov	DIGEST(6), g_64
+	mov	DIGEST(7), h_64
+
+	t = 0
+	.rept 80/2 + 1
+	# (80 rounds) / (2 rounds/iteration) + (1 iteration)
+	# +1 iteration because the scheduler leads hashing by 1 iteration
+		.if t < 2
+			# BSWAP 2 QWORDS
+			movdqa	XMM_QWORD_BSWAP(%rip), %xmm1
+			movdqu	MSG(t), %xmm0
+			pshufb	%xmm1, %xmm0	# BSWAP
+			movdqa	%xmm0, W_t(t)	# Store Scheduled Pair
+			paddq	K_t(t), %xmm0	# Compute W[t]+K[t]
+			movdqa	%xmm0, WK_2(t)	# Store into WK for rounds
+		.elseif t < 16
+			# BSWAP 2 QWORDS# Compute 2 Rounds
+			movdqu	MSG(t), %xmm0
+			pshufb	%xmm1, %xmm0	# BSWAP
+			SHA512_Round t-2	# Round t-2
+			movdqa	%xmm0, W_t(t)	# Store Scheduled Pair
+			paddq	K_t(t), %xmm0	# Compute W[t]+K[t]
+			SHA512_Round t-1	# Round t-1
+			movdqa	%xmm0, WK_2(t)	# Store W[t]+K[t] into WK
+		.elseif t < 79
+			# Schedule 2 QWORDS# Compute 2 Rounds
+			SHA512_2Sched_2Round_sse t
+		.else
+			# Compute 2 Rounds
+			SHA512_Round t-2
+			SHA512_Round t-1
+		.endif
+		t = t+2
+	.endr
+
+	# Update digest
+	add	a_64, DIGEST(0)
+	add	b_64, DIGEST(1)
+	add	c_64, DIGEST(2)
+	add	d_64, DIGEST(3)
+	add	e_64, DIGEST(4)
+	add	f_64, DIGEST(5)
+	add	g_64, DIGEST(6)
+	add	h_64, DIGEST(7)
+
+	# Advance to next message block
+	add	$16*8, msg
+	dec	msglen
+	jnz	.Lupdateblock
+
+	# Restore Stack Pointer
+	mov	%rbp, %rsp
+	pop	%rbp
+
+	# Restore GPRs
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbx
+
+	RET
+SYM_FUNC_END(sha512_transform_ssse3)
+
+########################################################################
+### Binary Data
+
+.section	.rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
+.align 16
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+	.octa 0x08090a0b0c0d0e0f0001020304050607
+
+# Mergeable 640-byte rodata section. This allows linker to merge the table
+# with other, exactly the same 640-byte fragment of another rodata section
+# (if such section exists).
+.section	.rodata.cst640.K512, "aM", @progbits, 640
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+	.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad 0x3956c25bf348b538,0x59f111f1b605d019
+	.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad 0xd807aa98a3030242,0x12835b0145706fbe
+	.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad 0x06ca6351e003826f,0x142929670a0e6e70
+	.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad 0x81c2c92e47edaee6,0x92722c851482353b
+	.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad 0xd192e819d6ef5218,0xd69906245565a910
+	.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad 0x90befffa23631e28,0xa4506cebde82bde9
+	.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad 0xca273eceea26619c,0xd186b8c721c0c207
+	.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad 0x113f9804bef90dae,0x1b710b35131c471b
+	.quad 0x28db77f523047d84,0x32caab7b40c72493
+	.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
diff --git a/lib/crypto/x86/sha512.h b/lib/crypto/x86/sha512.h
new file mode 100644
index 000000000000..0213c70cedd0
--- /dev/null
+++ b/lib/crypto/x86/sha512.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * x86-optimized SHA-512 block function
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <linux/static_call.h>
+
+DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic);
+
+#define DEFINE_X86_SHA512_FN(c_fn, asm_fn)                                 \
+	asmlinkage void asm_fn(struct sha512_block_state *state,           \
+			       const u8 *data, size_t nblocks);            \
+	static void c_fn(struct sha512_block_state *state, const u8 *data, \
+			 size_t nblocks)                                   \
+	{                                                                  \
+		if (likely(irq_fpu_usable())) {                            \
+			kernel_fpu_begin();                                \
+			asm_fn(state, data, nblocks);                      \
+			kernel_fpu_end();                                  \
+		} else {                                                   \
+			sha512_blocks_generic(state, data, nblocks);       \
+		}                                                          \
+	}
+
+DEFINE_X86_SHA512_FN(sha512_blocks_ssse3, sha512_transform_ssse3);
+DEFINE_X86_SHA512_FN(sha512_blocks_avx, sha512_transform_avx);
+DEFINE_X86_SHA512_FN(sha512_blocks_avx2, sha512_transform_rorx);
+
+static void sha512_blocks(struct sha512_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	static_call(sha512_blocks_x86)(state, data, nblocks);
+}
+
+#define sha512_mod_init_arch sha512_mod_init_arch
+static void sha512_mod_init_arch(void)
+{
+	if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) &&
+	    boot_cpu_has(X86_FEATURE_AVX)) {
+		if (boot_cpu_has(X86_FEATURE_AVX2) &&
+		    boot_cpu_has(X86_FEATURE_BMI2))
+			static_call_update(sha512_blocks_x86,
+					   sha512_blocks_avx2);
+		else
+			static_call_update(sha512_blocks_x86,
+					   sha512_blocks_avx);
+	} else if (boot_cpu_has(X86_FEATURE_SSSE3)) {
+		static_call_update(sha512_blocks_x86, sha512_blocks_ssse3);
+	}
+}