Diffstat (limited to 'lib/crypto')
-rw-r--r--  lib/crypto/Kconfig | 245
-rw-r--r--  lib/crypto/Makefile | 305
-rw-r--r--  lib/crypto/aes.c | 3
-rw-r--r--  lib/crypto/aescfb.c | 256
-rw-r--r--  lib/crypto/aesgcm.c | 68
-rw-r--r--  lib/crypto/arc4.c | 2
-rw-r--r--  lib/crypto/arm/.gitignore | 4
-rw-r--r--  lib/crypto/arm/blake2b-neon-core.S | 350
-rw-r--r--  lib/crypto/arm/blake2b.h | 40
-rw-r--r--  lib/crypto/arm/blake2s-core.S | 309
-rw-r--r--  lib/crypto/arm/blake2s.h | 5
-rw-r--r--  lib/crypto/arm/chacha-neon-core.S | 643
-rw-r--r--  lib/crypto/arm/chacha-scalar-core.S | 444
-rw-r--r--  lib/crypto/arm/chacha.h | 114
-rw-r--r--  lib/crypto/arm/curve25519-core.S | 2062
-rw-r--r--  lib/crypto/arm/curve25519.h | 46
-rw-r--r--  lib/crypto/arm/poly1305-armv4.pl | 1235
-rw-r--r--  lib/crypto/arm/poly1305.h | 51
-rw-r--r--  lib/crypto/arm/sha1-armv4-large.S | 507
-rw-r--r--  lib/crypto/arm/sha1-armv7-neon.S | 633
-rw-r--r--  lib/crypto/arm/sha1-ce-core.S | 123
-rw-r--r--  lib/crypto/arm/sha1.h | 45
-rw-r--r--  lib/crypto/arm/sha256-armv4.pl | 724
-rw-r--r--  lib/crypto/arm/sha256-ce.S | 123
-rw-r--r--  lib/crypto/arm/sha256.h | 46
-rw-r--r--  lib/crypto/arm/sha512-armv4.pl | 657
-rw-r--r--  lib/crypto/arm/sha512.h | 36
-rw-r--r--  lib/crypto/arm64/.gitignore | 4
-rw-r--r--  lib/crypto/arm64/chacha-neon-core.S | 805
-rw-r--r--  lib/crypto/arm64/chacha.h | 96
-rw-r--r--  lib/crypto/arm64/poly1305-armv8.pl | 920
-rw-r--r--  lib/crypto/arm64/poly1305.h | 48
-rw-r--r--  lib/crypto/arm64/polyval-ce-core.S | 359
-rw-r--r--  lib/crypto/arm64/polyval.h | 80
-rw-r--r--  lib/crypto/arm64/sha1-ce-core.S | 130
-rw-r--r--  lib/crypto/arm64/sha1.h | 38
-rw-r--r--  lib/crypto/arm64/sha2-armv8.pl | 786
-rw-r--r--  lib/crypto/arm64/sha256-ce.S | 408
-rw-r--r--  lib/crypto/arm64/sha256.h | 91
-rw-r--r--  lib/crypto/arm64/sha3-ce-core.S | 213
-rw-r--r--  lib/crypto/arm64/sha3.h | 59
-rw-r--r--  lib/crypto/arm64/sha512-ce-core.S | 197
-rw-r--r--  lib/crypto/arm64/sha512.h | 45
-rw-r--r--  lib/crypto/blake2b.c | 174
-rw-r--r--  lib/crypto/blake2s-generic.c | 115
-rw-r--r--  lib/crypto/blake2s-selftest.c | 632
-rw-r--r--  lib/crypto/blake2s.c | 149
-rw-r--r--  lib/crypto/chacha-block-generic.c | 114
-rw-r--r--  lib/crypto/chacha.c | 146
-rw-r--r--  lib/crypto/chacha20poly1305-selftest.c | 10
-rw-r--r--  lib/crypto/chacha20poly1305.c | 88
-rw-r--r--  lib/crypto/curve25519-fiat32.c | 2
-rw-r--r--  lib/crypto/curve25519-generic.c | 24
-rw-r--r--  lib/crypto/curve25519-hacl64.c | 4
-rw-r--r--  lib/crypto/curve25519.c | 69
-rw-r--r--  lib/crypto/des.c | 10
-rw-r--r--  lib/crypto/fips.h | 45
-rw-r--r--  lib/crypto/gf128mul.c | 76
-rw-r--r--  lib/crypto/hash_info.c | 63
-rw-r--r--  lib/crypto/libchacha.c | 35
-rw-r--r--  lib/crypto/md5.c | 322
-rw-r--r--  lib/crypto/memneq.c | 3
-rw-r--r--  lib/crypto/mips/.gitignore | 2
-rw-r--r--  lib/crypto/mips/chacha-core.S | 491
-rw-r--r--  lib/crypto/mips/chacha.h | 14
-rw-r--r--  lib/crypto/mips/md5.h | 65
-rw-r--r--  lib/crypto/mips/poly1305-mips.pl | 1269
-rw-r--r--  lib/crypto/mips/poly1305.h | 14
-rw-r--r--  lib/crypto/mips/sha1.h | 81
-rw-r--r--  lib/crypto/mips/sha256.h | 58
-rw-r--r--  lib/crypto/mips/sha512.h | 74
-rw-r--r--  lib/crypto/mpi/Makefile | 28
-rw-r--r--  lib/crypto/mpi/generic_mpih-add1.c | 48
-rw-r--r--  lib/crypto/mpi/generic_mpih-lshift.c | 50
-rw-r--r--  lib/crypto/mpi/generic_mpih-mul1.c | 44
-rw-r--r--  lib/crypto/mpi/generic_mpih-mul2.c | 47
-rw-r--r--  lib/crypto/mpi/generic_mpih-mul3.c | 48
-rw-r--r--  lib/crypto/mpi/generic_mpih-rshift.c | 50
-rw-r--r--  lib/crypto/mpi/generic_mpih-sub1.c | 47
-rw-r--r--  lib/crypto/mpi/longlong.h | 1361
-rw-r--r--  lib/crypto/mpi/mpi-add.c | 120
-rw-r--r--  lib/crypto/mpi/mpi-bit.c | 177
-rw-r--r--  lib/crypto/mpi/mpi-cmp.c | 74
-rw-r--r--  lib/crypto/mpi/mpi-div.c | 230
-rw-r--r--  lib/crypto/mpi/mpi-inline.h | 109
-rw-r--r--  lib/crypto/mpi/mpi-internal.h | 223
-rw-r--r--  lib/crypto/mpi/mpi-mod.c | 13
-rw-r--r--  lib/crypto/mpi/mpi-mul.c | 111
-rw-r--r--  lib/crypto/mpi/mpi-pow.c | 311
-rw-r--r--  lib/crypto/mpi/mpi-sub-ui.c | 80
-rw-r--r--  lib/crypto/mpi/mpicoder.c | 417
-rw-r--r--  lib/crypto/mpi/mpih-cmp.c | 43
-rw-r--r--  lib/crypto/mpi/mpih-div.c | 517
-rw-r--r--  lib/crypto/mpi/mpih-mul.c | 484
-rw-r--r--  lib/crypto/mpi/mpiutil.c | 152
-rw-r--r--  lib/crypto/poly1305-donna32.c | 5
-rw-r--r--  lib/crypto/poly1305-donna64.c | 7
-rw-r--r--  lib/crypto/poly1305.c | 92
-rw-r--r--  lib/crypto/polyval.c | 307
-rw-r--r--  lib/crypto/powerpc/chacha-p10le-8x.S | 840
-rw-r--r--  lib/crypto/powerpc/chacha.h | 76
-rw-r--r--  lib/crypto/powerpc/curve25519-ppc64le_asm.S | 671
-rw-r--r--  lib/crypto/powerpc/curve25519.h | 186
-rw-r--r--  lib/crypto/powerpc/md5-asm.S | 235
-rw-r--r--  lib/crypto/powerpc/md5.h | 12
-rw-r--r--  lib/crypto/powerpc/poly1305-p10le_64.S | 1075
-rw-r--r--  lib/crypto/powerpc/poly1305.h | 74
-rw-r--r--  lib/crypto/powerpc/sha1-powerpc-asm.S | 188
-rw-r--r--  lib/crypto/powerpc/sha1-spe-asm.S | 294
-rw-r--r--  lib/crypto/powerpc/sha1.h | 67
-rw-r--r--  lib/crypto/powerpc/sha256-spe-asm.S | 318
-rw-r--r--  lib/crypto/powerpc/sha256.h | 58
-rw-r--r--  lib/crypto/riscv/chacha-riscv64-zvkb.S | 297
-rw-r--r--  lib/crypto/riscv/chacha.h | 51
-rw-r--r--  lib/crypto/riscv/poly1305-riscv.pl | 847
-rw-r--r--  lib/crypto/riscv/poly1305.h | 14
-rw-r--r--  lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S | 225
-rw-r--r--  lib/crypto/riscv/sha256.h | 42
-rw-r--r--  lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S | 203
-rw-r--r--  lib/crypto/riscv/sha512.h | 39
-rw-r--r--  lib/crypto/s390/chacha-s390.S | 908
-rw-r--r--  lib/crypto/s390/chacha-s390.h | 14
-rw-r--r--  lib/crypto/s390/chacha.h | 36
-rw-r--r--  lib/crypto/s390/sha1.h | 28
-rw-r--r--  lib/crypto/s390/sha256.h | 28
-rw-r--r--  lib/crypto/s390/sha3.h | 151
-rw-r--r--  lib/crypto/s390/sha512.h | 28
-rw-r--r--  lib/crypto/sha1.c | 247
-rw-r--r--  lib/crypto/sha256.c | 514
-rw-r--r--  lib/crypto/sha3.c | 411
-rw-r--r--  lib/crypto/sha512.c | 440
-rw-r--r--  lib/crypto/simd.c | 11
-rw-r--r--  lib/crypto/sm3.c | 186
-rw-r--r--  lib/crypto/sparc/md5.h | 48
-rw-r--r--  lib/crypto/sparc/md5_asm.S | 70
-rw-r--r--  lib/crypto/sparc/sha1.h | 43
-rw-r--r--  lib/crypto/sparc/sha1_asm.S | 72
-rw-r--r--  lib/crypto/sparc/sha256.h | 43
-rw-r--r--  lib/crypto/sparc/sha256_asm.S | 78
-rw-r--r--  lib/crypto/sparc/sha512.h | 42
-rw-r--r--  lib/crypto/sparc/sha512_asm.S | 102
-rw-r--r--  lib/crypto/tests/Kconfig | 118
-rw-r--r--  lib/crypto/tests/Makefile | 12
-rw-r--r--  lib/crypto/tests/blake2b-testvecs.h | 342
-rw-r--r--  lib/crypto/tests/blake2b_kunit.c | 133
-rw-r--r--  lib/crypto/tests/blake2s-testvecs.h | 238
-rw-r--r--  lib/crypto/tests/blake2s_kunit.c | 133
-rw-r--r--  lib/crypto/tests/curve25519_kunit.c (renamed from lib/crypto/curve25519-selftest.c) | 102
-rw-r--r--  lib/crypto/tests/hash-test-template.h | 568
-rw-r--r--  lib/crypto/tests/md5-testvecs.h | 186
-rw-r--r--  lib/crypto/tests/md5_kunit.c | 39
-rw-r--r--  lib/crypto/tests/poly1305-testvecs.h | 186
-rw-r--r--  lib/crypto/tests/poly1305_kunit.c | 165
-rw-r--r--  lib/crypto/tests/polyval-testvecs.h | 186
-rw-r--r--  lib/crypto/tests/polyval_kunit.c | 223
-rw-r--r--  lib/crypto/tests/sha1-testvecs.h | 212
-rw-r--r--  lib/crypto/tests/sha1_kunit.c | 39
-rw-r--r--  lib/crypto/tests/sha224-testvecs.h | 238
-rw-r--r--  lib/crypto/tests/sha224_kunit.c | 39
-rw-r--r--  lib/crypto/tests/sha256-testvecs.h | 238
-rw-r--r--  lib/crypto/tests/sha256_kunit.c | 224
-rw-r--r--  lib/crypto/tests/sha3-testvecs.h | 249
-rw-r--r--  lib/crypto/tests/sha384-testvecs.h | 290
-rw-r--r--  lib/crypto/tests/sha384_kunit.c | 39
-rw-r--r--  lib/crypto/tests/sha3_kunit.c | 422
-rw-r--r--  lib/crypto/tests/sha512-testvecs.h | 342
-rw-r--r--  lib/crypto/tests/sha512_kunit.c | 39
-rw-r--r--  lib/crypto/utils.c | 6
-rw-r--r--  lib/crypto/x86/.gitignore | 2
-rw-r--r--  lib/crypto/x86/blake2s-core.S | 291
-rw-r--r--  lib/crypto/x86/blake2s.h | 62
-rw-r--r--  lib/crypto/x86/chacha-avx2-x86_64.S | 1021
-rw-r--r--  lib/crypto/x86/chacha-avx512vl-x86_64.S | 836
-rw-r--r--  lib/crypto/x86/chacha-ssse3-x86_64.S | 791
-rw-r--r--  lib/crypto/x86/chacha.h | 176
-rw-r--r--  lib/crypto/x86/curve25519.h | 1613
-rw-r--r--  lib/crypto/x86/poly1305-x86_64-cryptogams.pl | 4240
-rw-r--r--  lib/crypto/x86/poly1305.h | 158
-rw-r--r--  lib/crypto/x86/polyval-pclmul-avx.S | 319
-rw-r--r--  lib/crypto/x86/polyval.h | 83
-rw-r--r--  lib/crypto/x86/sha1-avx2-asm.S | 697
-rw-r--r--  lib/crypto/x86/sha1-ni-asm.S | 152
-rw-r--r--  lib/crypto/x86/sha1-ssse3-and-avx.S | 551
-rw-r--r--  lib/crypto/x86/sha1.h | 74
-rw-r--r--  lib/crypto/x86/sha256-avx-asm.S | 493
-rw-r--r--  lib/crypto/x86/sha256-avx2-asm.S | 770
-rw-r--r--  lib/crypto/x86/sha256-ni-asm.S | 559
-rw-r--r--  lib/crypto/x86/sha256-ssse3-asm.S | 505
-rw-r--r--  lib/crypto/x86/sha256.h | 95
-rw-r--r--  lib/crypto/x86/sha512-avx-asm.S | 420
-rw-r--r--  lib/crypto/x86/sha512-avx2-asm.S | 748
-rw-r--r--  lib/crypto/x86/sha512-ssse3-asm.S | 419
-rw-r--r--  lib/crypto/x86/sha512.h | 52
193 files changed, 50751 insertions, 1395 deletions
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 45436bfc6dff..a3647352bff6 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -2,12 +2,20 @@
menu "Crypto library routines"
+config CRYPTO_HASH_INFO
+ bool
+
config CRYPTO_LIB_UTILS
tristate
config CRYPTO_LIB_AES
tristate
+config CRYPTO_LIB_AESCFB
+ tristate
+ select CRYPTO_LIB_AES
+ select CRYPTO_LIB_UTILS
+
config CRYPTO_LIB_AESGCM
tristate
select CRYPTO_LIB_AES
@@ -20,122 +28,201 @@ config CRYPTO_LIB_ARC4
config CRYPTO_LIB_GF128MUL
tristate
-config CRYPTO_ARCH_HAVE_LIB_BLAKE2S
- bool
+config CRYPTO_LIB_BLAKE2B
+ tristate
help
- Declares whether the architecture provides an arch-specific
- accelerated implementation of the Blake2s library interface,
- either builtin or as a module.
+ The BLAKE2b library functions. Select this if your module uses any of
+ the functions from <crypto/blake2b.h>.
-config CRYPTO_LIB_BLAKE2S_GENERIC
- def_bool !CRYPTO_ARCH_HAVE_LIB_BLAKE2S
- help
- This symbol can be depended upon by arch implementations of the
- Blake2s library interface that require the generic code as a
- fallback, e.g., for SIMD implementations. If no arch specific
- implementation is enabled, this implementation serves the users
- of CRYPTO_LIB_BLAKE2S.
+config CRYPTO_LIB_BLAKE2B_ARCH
+ bool
+ depends on CRYPTO_LIB_BLAKE2B && !UML
+ default y if ARM && KERNEL_MODE_NEON
-config CRYPTO_ARCH_HAVE_LIB_CHACHA
- tristate
- help
- Declares whether the architecture provides an arch-specific
- accelerated implementation of the ChaCha library interface,
- either builtin or as a module.
+# BLAKE2s support is always built-in, so there's no CRYPTO_LIB_BLAKE2S option.
+
+config CRYPTO_LIB_BLAKE2S_ARCH
+ bool
+ depends on !UML
+ default y if ARM
+ default y if X86_64
-config CRYPTO_LIB_CHACHA_GENERIC
+config CRYPTO_LIB_CHACHA
tristate
select CRYPTO_LIB_UTILS
help
- This symbol can be depended upon by arch implementations of the
- ChaCha library interface that require the generic code as a
- fallback, e.g., for SIMD implementations. If no arch specific
- implementation is enabled, this implementation serves the users
- of CRYPTO_LIB_CHACHA.
+ Enable the ChaCha library interface. Select this if your module uses
+ chacha_crypt() or hchacha_block().
-config CRYPTO_LIB_CHACHA
- tristate "ChaCha library interface"
- depends on CRYPTO_ARCH_HAVE_LIB_CHACHA || !CRYPTO_ARCH_HAVE_LIB_CHACHA
- select CRYPTO_LIB_CHACHA_GENERIC if CRYPTO_ARCH_HAVE_LIB_CHACHA=n
- help
- Enable the ChaCha library interface. This interface may be fulfilled
- by either the generic implementation or an arch-specific one, if one
- is available and enabled.
+config CRYPTO_LIB_CHACHA_ARCH
+ bool
+ depends on CRYPTO_LIB_CHACHA && !UML && !KMSAN
+ default y if ARM
+ default y if ARM64 && KERNEL_MODE_NEON
+ default y if MIPS && CPU_MIPS32_R2
+ default y if PPC64 && CPU_LITTLE_ENDIAN && VSX
+ default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ default y if S390
+ default y if X86_64
-config CRYPTO_ARCH_HAVE_LIB_CURVE25519
+config CRYPTO_LIB_CURVE25519
tristate
+ select CRYPTO_LIB_UTILS
help
- Declares whether the architecture provides an arch-specific
- accelerated implementation of the Curve25519 library interface,
- either builtin or as a module.
+ The Curve25519 library functions. Select this if your module uses any
+ of the functions from <crypto/curve25519.h>.
+
+config CRYPTO_LIB_CURVE25519_ARCH
+ bool
+ depends on CRYPTO_LIB_CURVE25519 && !UML && !KMSAN
+ default y if ARM && KERNEL_MODE_NEON && !CPU_BIG_ENDIAN
+ default y if PPC64 && CPU_LITTLE_ENDIAN
+ default y if X86_64
config CRYPTO_LIB_CURVE25519_GENERIC
+ bool
+ depends on CRYPTO_LIB_CURVE25519
+ default y if !CRYPTO_LIB_CURVE25519_ARCH || ARM || X86_64
+
+config CRYPTO_LIB_DES
tristate
- help
- This symbol can be depended upon by arch implementations of the
- Curve25519 library interface that require the generic code as a
- fallback, e.g., for SIMD implementations. If no arch specific
- implementation is enabled, this implementation serves the users
- of CRYPTO_LIB_CURVE25519.
-config CRYPTO_LIB_CURVE25519
- tristate "Curve25519 scalar multiplication library"
- depends on CRYPTO_ARCH_HAVE_LIB_CURVE25519 || !CRYPTO_ARCH_HAVE_LIB_CURVE25519
- select CRYPTO_LIB_CURVE25519_GENERIC if CRYPTO_ARCH_HAVE_LIB_CURVE25519=n
- select CRYPTO_LIB_UTILS
+config CRYPTO_LIB_MD5
+ tristate
help
- Enable the Curve25519 library interface. This interface may be
- fulfilled by either the generic implementation or an arch-specific
- one, if one is available and enabled.
+ The MD5 and HMAC-MD5 library functions. Select this if your module
+ uses any of the functions from <crypto/md5.h>.
-config CRYPTO_LIB_DES
+config CRYPTO_LIB_MD5_ARCH
+ bool
+ depends on CRYPTO_LIB_MD5 && !UML
+ default y if MIPS && CPU_CAVIUM_OCTEON
+ default y if PPC
+ default y if SPARC64
+
+config CRYPTO_LIB_POLY1305
tristate
+ help
+ The Poly1305 library functions. Select this if your module uses any
+ of the functions from <crypto/poly1305.h>.
+
+config CRYPTO_LIB_POLY1305_ARCH
+ bool
+ depends on CRYPTO_LIB_POLY1305 && !UML && !KMSAN
+ default y if ARM
+ default y if ARM64 && KERNEL_MODE_NEON
+ default y if MIPS
+ # The PPC64 code needs to be fixed to work in softirq context.
+ default y if PPC64 && CPU_LITTLE_ENDIAN && VSX && BROKEN
+ default y if RISCV
+ default y if X86_64
+
+# This symbol controls the inclusion of the Poly1305 generic code. This differs
+# from most of the other algorithms, which handle the generic code
+# "automatically" via __maybe_unused. This is needed so that the Adiantum code,
+# which calls the poly1305_core_*() functions directly, can enable them.
+config CRYPTO_LIB_POLY1305_GENERIC
+ bool
+ depends on CRYPTO_LIB_POLY1305
+ # Enable if there's no arch impl or the arch impl requires the generic
+ # impl as a fallback. (Or if selected explicitly.)
+ default y if !CRYPTO_LIB_POLY1305_ARCH || PPC64
config CRYPTO_LIB_POLY1305_RSIZE
int
- default 2 if MIPS
+ default 2 if MIPS || RISCV
default 11 if X86_64
default 9 if ARM || ARM64
default 1
-config CRYPTO_ARCH_HAVE_LIB_POLY1305
- tristate
- help
- Declares whether the architecture provides an arch-specific
- accelerated implementation of the Poly1305 library interface,
- either builtin or as a module.
-
-config CRYPTO_LIB_POLY1305_GENERIC
+config CRYPTO_LIB_POLYVAL
tristate
help
- This symbol can be depended upon by arch implementations of the
- Poly1305 library interface that require the generic code as a
- fallback, e.g., for SIMD implementations. If no arch specific
- implementation is enabled, this implementation serves the users
- of CRYPTO_LIB_POLY1305.
+ The POLYVAL library functions. Select this if your module uses any of
+ the functions from <crypto/polyval.h>.
-config CRYPTO_LIB_POLY1305
- tristate "Poly1305 library interface"
- depends on CRYPTO_ARCH_HAVE_LIB_POLY1305 || !CRYPTO_ARCH_HAVE_LIB_POLY1305
- select CRYPTO_LIB_POLY1305_GENERIC if CRYPTO_ARCH_HAVE_LIB_POLY1305=n
- help
- Enable the Poly1305 library interface. This interface may be fulfilled
- by either the generic implementation or an arch-specific one, if one
- is available and enabled.
+config CRYPTO_LIB_POLYVAL_ARCH
+ bool
+ depends on CRYPTO_LIB_POLYVAL && !UML
+ default y if ARM64 && KERNEL_MODE_NEON
+ default y if X86_64
config CRYPTO_LIB_CHACHA20POLY1305
- tristate "ChaCha20-Poly1305 AEAD support (8-byte nonce library version)"
- depends on CRYPTO_ARCH_HAVE_LIB_CHACHA || !CRYPTO_ARCH_HAVE_LIB_CHACHA
- depends on CRYPTO_ARCH_HAVE_LIB_POLY1305 || !CRYPTO_ARCH_HAVE_LIB_POLY1305
- depends on CRYPTO
+ tristate
select CRYPTO_LIB_CHACHA
select CRYPTO_LIB_POLY1305
- select CRYPTO_ALGAPI
+ select CRYPTO_LIB_UTILS
config CRYPTO_LIB_SHA1
tristate
+ help
+ The SHA-1 and HMAC-SHA1 library functions. Select this if your module
+ uses any of the functions from <crypto/sha1.h>.
+
+config CRYPTO_LIB_SHA1_ARCH
+ bool
+ depends on CRYPTO_LIB_SHA1 && !UML
+ default y if ARM
+ default y if ARM64 && KERNEL_MODE_NEON
+ default y if MIPS && CPU_CAVIUM_OCTEON
+ default y if PPC
+ default y if S390
+ default y if SPARC64
+ default y if X86_64
config CRYPTO_LIB_SHA256
tristate
+ help
+ The SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions.
+ Select this if your module uses any of these functions from
+ <crypto/sha2.h>.
+
+config CRYPTO_LIB_SHA256_ARCH
+ bool
+ depends on CRYPTO_LIB_SHA256 && !UML
+ default y if ARM && !CPU_V7M
+ default y if ARM64
+ default y if MIPS && CPU_CAVIUM_OCTEON
+ default y if PPC && SPE
+ default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ default y if S390
+ default y if SPARC64
+ default y if X86_64
+
+config CRYPTO_LIB_SHA512
+ tristate
+ help
+ The SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions.
+ Select this if your module uses any of these functions from
+ <crypto/sha2.h>.
+
+config CRYPTO_LIB_SHA512_ARCH
+ bool
+ depends on CRYPTO_LIB_SHA512 && !UML
+ default y if ARM && !CPU_V7M
+ default y if ARM64
+ default y if MIPS && CPU_CAVIUM_OCTEON
+ default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+ default y if S390
+ default y if SPARC64
+ default y if X86_64
+
+config CRYPTO_LIB_SHA3
+ tristate
+ select CRYPTO_LIB_UTILS
+ help
+ The SHA3 library functions. Select this if your module uses any of
+ the functions from <crypto/sha3.h>.
+
+config CRYPTO_LIB_SHA3_ARCH
+ bool
+ depends on CRYPTO_LIB_SHA3 && !UML
+ default y if ARM64 && KERNEL_MODE_NEON
+ default y if S390
+
+config CRYPTO_LIB_SM3
+ tristate
+
+source "lib/crypto/tests/Kconfig"
endmenu
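
The help texts above describe the new consumption model: a module no longer depends on user-visible "library interface" prompts, it simply selects the tristate symbol matching the header it includes (for example "select CRYPTO_LIB_SHA256" for <crypto/sha2.h>). As a hedged illustration only, not part of this commit, a consumer could then call the one-shot library helper directly; the function and file names below are hypothetical, and only sha256() and SHA256_DIGEST_SIZE are assumed to come from <crypto/sha2.h>:

	#include <crypto/sha2.h>
	#include <linux/printk.h>
	#include <linux/types.h>

	/* Hypothetical consumer of CRYPTO_LIB_SHA256: hash a buffer with the
	 * one-shot library helper, no crypto_shash transform required. */
	static void example_hash_buffer(const u8 *buf, size_t len)
	{
		u8 digest[SHA256_DIGEST_SIZE];

		sha256(buf, len, digest);
		print_hex_dump_bytes("sha256: ", DUMP_PREFIX_NONE, digest,
				     sizeof(digest));
	}
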
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 6ec2d4543d9c..b5346cebbb55 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -1,15 +1,26 @@
# SPDX-License-Identifier: GPL-2.0
+aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1
+
+quiet_cmd_perlasm = PERLASM $@
+ cmd_perlasm = $(PERL) $(<) > $(@)
+
+quiet_cmd_perlasm_with_args = PERLASM $@
+ cmd_perlasm_with_args = $(PERL) $(<) void $(@)
+
+obj-$(CONFIG_KUNIT) += tests/
+
+obj-$(CONFIG_CRYPTO_HASH_INFO) += hash_info.o
+
obj-$(CONFIG_CRYPTO_LIB_UTILS) += libcryptoutils.o
libcryptoutils-y := memneq.o utils.o
-# chacha is used by the /dev/random driver which is always builtin
-obj-y += chacha.o
-obj-$(CONFIG_CRYPTO_LIB_CHACHA_GENERIC) += libchacha.o
-
obj-$(CONFIG_CRYPTO_LIB_AES) += libaes.o
libaes-y := aes.o
+obj-$(CONFIG_CRYPTO_LIB_AESCFB) += libaescfb.o
+libaescfb-y := aescfb.o
+
obj-$(CONFIG_CRYPTO_LIB_AESGCM) += libaesgcm.o
libaesgcm-y := aesgcm.o
@@ -18,38 +29,282 @@ libarc4-y := arc4.o
obj-$(CONFIG_CRYPTO_LIB_GF128MUL) += gf128mul.o
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_BLAKE2B) += libblake2b.o
+libblake2b-y := blake2b.o
+CFLAGS_blake2b.o := -Wframe-larger-than=4096 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105930
+ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2B_ARCH),y)
+CFLAGS_blake2b.o += -I$(src)/$(SRCARCH)
+libblake2b-$(CONFIG_ARM) += arm/blake2b-neon-core.o
+endif # CONFIG_CRYPTO_LIB_BLAKE2B_ARCH
+
+################################################################################
+
# blake2s is used by the /dev/random driver which is always builtin
-obj-y += libblake2s.o
-libblake2s-y := blake2s.o
-libblake2s-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) += blake2s-generic.o
+obj-y += blake2s.o
+ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2S_ARCH),y)
+CFLAGS_blake2s.o += -I$(src)/$(SRCARCH)
+obj-$(CONFIG_ARM) += arm/blake2s-core.o
+obj-$(CONFIG_X86) += x86/blake2s-core.o
+endif
+
+################################################################################
+
+# chacha20_block() is used by the /dev/random driver which is always builtin
+obj-y += chacha-block-generic.o
+
+obj-$(CONFIG_CRYPTO_LIB_CHACHA) += libchacha.o
+libchacha-y := chacha.o
+
+ifeq ($(CONFIG_CRYPTO_LIB_CHACHA_ARCH),y)
+CFLAGS_chacha.o += -I$(src)/$(SRCARCH)
+
+ifeq ($(CONFIG_ARM),y)
+libchacha-y += arm/chacha-scalar-core.o
+libchacha-$(CONFIG_KERNEL_MODE_NEON) += arm/chacha-neon-core.o
+endif
+
+libchacha-$(CONFIG_ARM64) += arm64/chacha-neon-core.o
+
+ifeq ($(CONFIG_MIPS),y)
+libchacha-y += mips/chacha-core.o
+AFLAGS_mips/chacha-core.o += -O2 # needed to fill branch delay slots
+endif
+
+libchacha-$(CONFIG_PPC) += powerpc/chacha-p10le-8x.o
+libchacha-$(CONFIG_RISCV) += riscv/chacha-riscv64-zvkb.o
+libchacha-$(CONFIG_S390) += s390/chacha-s390.o
+libchacha-$(CONFIG_X86) += x86/chacha-ssse3-x86_64.o \
+ x86/chacha-avx2-x86_64.o \
+ x86/chacha-avx512vl-x86_64.o
+endif # CONFIG_CRYPTO_LIB_CHACHA_ARCH
+
+################################################################################
obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305) += libchacha20poly1305.o
libchacha20poly1305-y += chacha20poly1305.o
+libchacha20poly1305-$(CONFIG_CRYPTO_SELFTESTS) += chacha20poly1305-selftest.o
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_CURVE25519) += libcurve25519.o
+libcurve25519-y := curve25519.o
+
+# Disable GCOV in odd or sensitive code
+GCOV_PROFILE_curve25519.o := n
+
+ifeq ($(CONFIG_ARCH_SUPPORTS_INT128),y)
+libcurve25519-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += curve25519-hacl64.o
+else
+libcurve25519-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += curve25519-fiat32.o
+endif
+# clang versions prior to 18 may blow out the stack with KASAN
+ifeq ($(CONFIG_CC_IS_CLANG)_$(call clang-min-version, 180000),y_)
+KASAN_SANITIZE_curve25519-hacl64.o := n
+endif
-obj-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += libcurve25519-generic.o
-libcurve25519-generic-y := curve25519-fiat32.o
-libcurve25519-generic-$(CONFIG_ARCH_SUPPORTS_INT128) := curve25519-hacl64.o
-libcurve25519-generic-y += curve25519-generic.o
+ifeq ($(CONFIG_CRYPTO_LIB_CURVE25519_ARCH),y)
+CFLAGS_curve25519.o += -I$(src)/$(SRCARCH)
+libcurve25519-$(CONFIG_ARM) += arm/curve25519-core.o
+libcurve25519-$(CONFIG_PPC) += powerpc/curve25519-ppc64le_asm.o
+endif
-obj-$(CONFIG_CRYPTO_LIB_CURVE25519) += libcurve25519.o
-libcurve25519-y += curve25519.o
+################################################################################
obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o
libdes-y := des.o
-obj-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += libpoly1305.o
-libpoly1305-y := poly1305-donna32.o
-libpoly1305-$(CONFIG_ARCH_SUPPORTS_INT128) := poly1305-donna64.o
-libpoly1305-y += poly1305.o
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_MD5) += libmd5.o
+libmd5-y := md5.o
+ifeq ($(CONFIG_CRYPTO_LIB_MD5_ARCH),y)
+CFLAGS_md5.o += -I$(src)/$(SRCARCH)
+libmd5-$(CONFIG_PPC) += powerpc/md5-asm.o
+libmd5-$(CONFIG_SPARC) += sparc/md5_asm.o
+endif # CONFIG_CRYPTO_LIB_MD5_ARCH
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_POLY1305) += libpoly1305.o
+libpoly1305-y := poly1305.o
+ifeq ($(CONFIG_ARCH_SUPPORTS_INT128),y)
+libpoly1305-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += poly1305-donna64.o
+else
+libpoly1305-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += poly1305-donna32.o
+endif
+
+ifeq ($(CONFIG_CRYPTO_LIB_POLY1305_ARCH),y)
+CFLAGS_poly1305.o += -I$(src)/$(SRCARCH)
+
+ifeq ($(CONFIG_ARM),y)
+libpoly1305-y += arm/poly1305-core.o
+$(obj)/arm/poly1305-core.S: $(src)/arm/poly1305-armv4.pl
+ $(call cmd,perlasm)
+# massage the perlasm code a bit so we only get the NEON routine if we need it
+poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
+poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
+AFLAGS_arm/poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y)
+endif
+
+ifeq ($(CONFIG_ARM64),y)
+libpoly1305-y += arm64/poly1305-core.o
+$(obj)/arm64/poly1305-core.S: $(src)/arm64/poly1305-armv8.pl
+ $(call cmd,perlasm_with_args)
+endif
+
+ifeq ($(CONFIG_MIPS),y)
+libpoly1305-y += mips/poly1305-core.o
+poly1305-perlasm-flavour-$(CONFIG_32BIT) := o32
+poly1305-perlasm-flavour-$(CONFIG_64BIT) := 64
+quiet_cmd_perlasm_poly1305 = PERLASM $@
+ cmd_perlasm_poly1305 = $(PERL) $< $(poly1305-perlasm-flavour-y) $@
+# Use if_changed instead of cmd, in case the flavour changed.
+$(obj)/mips/poly1305-core.S: $(src)/mips/poly1305-mips.pl FORCE
+ $(call if_changed,perlasm_poly1305)
+targets += mips/poly1305-core.S
+endif
+
+libpoly1305-$(CONFIG_PPC) += powerpc/poly1305-p10le_64.o
+
+ifeq ($(CONFIG_RISCV),y)
+libpoly1305-y += riscv/poly1305-core.o
+poly1305-perlasm-flavour-$(CONFIG_32BIT) := 32
+poly1305-perlasm-flavour-$(CONFIG_64BIT) := 64
+quiet_cmd_perlasm_poly1305 = PERLASM $@
+ cmd_perlasm_poly1305 = $(PERL) $< $(poly1305-perlasm-flavour-y) $@
+# Use if_changed instead of cmd, in case the flavour changed.
+$(obj)/riscv/poly1305-core.S: $(src)/riscv/poly1305-riscv.pl FORCE
+ $(call if_changed,perlasm_poly1305)
+targets += riscv/poly1305-core.S
+AFLAGS_riscv/poly1305-core.o += -Dpoly1305_init=poly1305_block_init
+endif
+
+ifeq ($(CONFIG_X86),y)
+libpoly1305-y += x86/poly1305-x86_64-cryptogams.o
+$(obj)/x86/poly1305-x86_64-cryptogams.S: $(src)/x86/poly1305-x86_64-cryptogams.pl
+ $(call cmd,perlasm)
+endif
+
+endif # CONFIG_CRYPTO_LIB_POLY1305_ARCH
+
+# clean-files must be defined unconditionally
+clean-files += arm/poly1305-core.S \
+ arm64/poly1305-core.S \
+ mips/poly1305-core.S \
+ riscv/poly1305-core.S \
+ x86/poly1305-x86_64-cryptogams.S
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_POLYVAL) += libpolyval.o
+libpolyval-y := polyval.o
+ifeq ($(CONFIG_CRYPTO_LIB_POLYVAL_ARCH),y)
+CFLAGS_polyval.o += -I$(src)/$(SRCARCH)
+libpolyval-$(CONFIG_ARM64) += arm64/polyval-ce-core.o
+libpolyval-$(CONFIG_X86) += x86/polyval-pclmul-avx.o
+endif
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o
+libsha1-y := sha1.o
+ifeq ($(CONFIG_CRYPTO_LIB_SHA1_ARCH),y)
+CFLAGS_sha1.o += -I$(src)/$(SRCARCH)
+ifeq ($(CONFIG_ARM),y)
+libsha1-y += arm/sha1-armv4-large.o
+libsha1-$(CONFIG_KERNEL_MODE_NEON) += arm/sha1-armv7-neon.o \
+ arm/sha1-ce-core.o
+endif
+libsha1-$(CONFIG_ARM64) += arm64/sha1-ce-core.o
+ifeq ($(CONFIG_PPC),y)
+libsha1-y += powerpc/sha1-powerpc-asm.o
+libsha1-$(CONFIG_SPE) += powerpc/sha1-spe-asm.o
+endif
+libsha1-$(CONFIG_SPARC) += sparc/sha1_asm.o
+libsha1-$(CONFIG_X86) += x86/sha1-ssse3-and-avx.o \
+ x86/sha1-avx2-asm.o \
+ x86/sha1-ni-asm.o
+endif # CONFIG_CRYPTO_LIB_SHA1_ARCH
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o
+libsha256-y := sha256.o
+ifeq ($(CONFIG_CRYPTO_LIB_SHA256_ARCH),y)
+CFLAGS_sha256.o += -I$(src)/$(SRCARCH)
-obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o
-libsha1-y := sha1.o
+ifeq ($(CONFIG_ARM),y)
+libsha256-y += arm/sha256-ce.o arm/sha256-core.o
+$(obj)/arm/sha256-core.S: $(src)/arm/sha256-armv4.pl
+ $(call cmd,perlasm)
+AFLAGS_arm/sha256-core.o += $(aflags-thumb2-y)
+endif
+
+ifeq ($(CONFIG_ARM64),y)
+libsha256-y += arm64/sha256-core.o
+$(obj)/arm64/sha256-core.S: $(src)/arm64/sha2-armv8.pl
+ $(call cmd,perlasm_with_args)
+libsha256-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha256-ce.o
+endif
-obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o
-libsha256-y := sha256.o
+libsha256-$(CONFIG_PPC) += powerpc/sha256-spe-asm.o
+libsha256-$(CONFIG_RISCV) += riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.o
+libsha256-$(CONFIG_SPARC) += sparc/sha256_asm.o
+libsha256-$(CONFIG_X86) += x86/sha256-ssse3-asm.o \
+ x86/sha256-avx-asm.o \
+ x86/sha256-avx2-asm.o \
+ x86/sha256-ni-asm.o
+endif # CONFIG_CRYPTO_LIB_SHA256_ARCH
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_SHA512) += libsha512.o
+libsha512-y := sha512.o
+ifeq ($(CONFIG_CRYPTO_LIB_SHA512_ARCH),y)
+CFLAGS_sha512.o += -I$(src)/$(SRCARCH)
+
+ifeq ($(CONFIG_ARM),y)
+libsha512-y += arm/sha512-core.o
+$(obj)/arm/sha512-core.S: $(src)/arm/sha512-armv4.pl
+ $(call cmd,perlasm)
+AFLAGS_arm/sha512-core.o += $(aflags-thumb2-y)
+endif
-ifneq ($(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS),y)
-libblake2s-y += blake2s-selftest.o
-libchacha20poly1305-y += chacha20poly1305-selftest.o
-libcurve25519-y += curve25519-selftest.o
+ifeq ($(CONFIG_ARM64),y)
+libsha512-y += arm64/sha512-core.o
+$(obj)/arm64/sha512-core.S: $(src)/arm64/sha2-armv8.pl
+ $(call cmd,perlasm_with_args)
+libsha512-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha512-ce-core.o
endif
+
+libsha512-$(CONFIG_RISCV) += riscv/sha512-riscv64-zvknhb-zvkb.o
+libsha512-$(CONFIG_SPARC) += sparc/sha512_asm.o
+libsha512-$(CONFIG_X86) += x86/sha512-ssse3-asm.o \
+ x86/sha512-avx-asm.o \
+ x86/sha512-avx2-asm.o
+endif # CONFIG_CRYPTO_LIB_SHA512_ARCH
+
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_SHA3) += libsha3.o
+libsha3-y := sha3.o
+
+ifeq ($(CONFIG_CRYPTO_LIB_SHA3_ARCH),y)
+CFLAGS_sha3.o += -I$(src)/$(SRCARCH)
+libsha3-$(CONFIG_ARM64) += arm64/sha3-ce-core.o
+endif # CONFIG_CRYPTO_LIB_SHA3_ARCH
+
+################################################################################
+
+obj-$(CONFIG_MPILIB) += mpi/
+
+obj-$(CONFIG_CRYPTO_SELFTESTS_FULL) += simd.o
+
+obj-$(CONFIG_CRYPTO_LIB_SM3) += libsm3.o
+libsm3-y := sm3.o
+
+# clean-files must be defined unconditionally
+clean-files += arm/sha256-core.S arm/sha512-core.S
+clean-files += arm64/sha256-core.S arm64/sha512-core.S
diff --git a/lib/crypto/aes.c b/lib/crypto/aes.c
index 827fe89922ff..b57fda3460f1 100644
--- a/lib/crypto/aes.c
+++ b/lib/crypto/aes.c
@@ -5,8 +5,9 @@
#include <crypto/aes.h>
#include <linux/crypto.h>
+#include <linux/export.h>
#include <linux/module.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
/*
* Emit the sbox as volatile const to prevent the compiler from doing
diff --git a/lib/crypto/aescfb.c b/lib/crypto/aescfb.c
new file mode 100644
index 000000000000..0f294c8cbf3c
--- /dev/null
+++ b/lib/crypto/aescfb.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Minimal library implementation of AES in CFB mode
+ *
+ * Copyright 2023 Google LLC
+ */
+
+#include <crypto/aes.h>
+#include <crypto/algapi.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <asm/irqflags.h>
+
+static void aescfb_encrypt_block(const struct crypto_aes_ctx *ctx, void *dst,
+ const void *src)
+{
+ unsigned long flags;
+
+ /*
+ * In AES-CFB, the AES encryption operates on known 'plaintext' (the IV
+ * and ciphertext), making it susceptible to timing attacks on the
+ * encryption key. The AES library already mitigates this risk to some
+ * extent by pulling the entire S-box into the caches before doing any
+ * substitutions, but this strategy is more effective when running with
+ * interrupts disabled.
+ */
+ local_irq_save(flags);
+ aes_encrypt(ctx, dst, src);
+ local_irq_restore(flags);
+}
+
+/**
+ * aescfb_encrypt - Perform AES-CFB encryption on a block of data
+ *
+ * @ctx: The AES-CFB key schedule
+ * @dst: Pointer to the ciphertext output buffer
+ * @src: Pointer to the plaintext (may equal @dst for encryption in place)
+ * @len: The size in bytes of the plaintext and ciphertext.
+ * @iv: The initialization vector (IV) to use for this block of data
+ */
+void aescfb_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
+ int len, const u8 iv[AES_BLOCK_SIZE])
+{
+ u8 ks[AES_BLOCK_SIZE];
+ const u8 *v = iv;
+
+ while (len > 0) {
+ aescfb_encrypt_block(ctx, ks, v);
+ crypto_xor_cpy(dst, src, ks, min(len, AES_BLOCK_SIZE));
+ v = dst;
+
+ dst += AES_BLOCK_SIZE;
+ src += AES_BLOCK_SIZE;
+ len -= AES_BLOCK_SIZE;
+ }
+
+ memzero_explicit(ks, sizeof(ks));
+}
+EXPORT_SYMBOL(aescfb_encrypt);
+
+/**
+ * aescfb_decrypt - Perform AES-CFB decryption on a block of data
+ *
+ * @ctx: The AES-CFB key schedule
+ * @dst: Pointer to the plaintext output buffer
+ * @src: Pointer to the ciphertext (may equal @dst for decryption in place)
+ * @len: The size in bytes of the plaintext and ciphertext.
+ * @iv: The initialization vector (IV) to use for this block of data
+ */
+void aescfb_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
+ int len, const u8 iv[AES_BLOCK_SIZE])
+{
+ u8 ks[2][AES_BLOCK_SIZE];
+
+ aescfb_encrypt_block(ctx, ks[0], iv);
+
+ for (int i = 0; len > 0; i ^= 1) {
+ if (len > AES_BLOCK_SIZE)
+ /*
+ * Generate the keystream for the next block before
+ * performing the XOR, as that may update in place and
+ * overwrite the ciphertext.
+ */
+ aescfb_encrypt_block(ctx, ks[!i], src);
+
+ crypto_xor_cpy(dst, src, ks[i], min(len, AES_BLOCK_SIZE));
+
+ dst += AES_BLOCK_SIZE;
+ src += AES_BLOCK_SIZE;
+ len -= AES_BLOCK_SIZE;
+ }
+
+ memzero_explicit(ks, sizeof(ks));
+}
+EXPORT_SYMBOL(aescfb_decrypt);
+
+MODULE_DESCRIPTION("Generic AES-CFB library");
+MODULE_AUTHOR("Ard Biesheuvel <ardb@kernel.org>");
+MODULE_LICENSE("GPL");
+
+#ifdef CONFIG_CRYPTO_SELFTESTS
+
+/*
+ * Test code below. Vectors taken from crypto/testmgr.h
+ */
+
+static struct {
+ u8 ptext[64] __nonstring;
+ u8 ctext[64] __nonstring;
+
+ u8 key[AES_MAX_KEY_SIZE] __nonstring;
+ u8 iv[AES_BLOCK_SIZE] __nonstring;
+
+ int klen;
+ int len;
+} const aescfb_tv[] __initconst = {
+ { /* From NIST SP800-38A */
+ .key = "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+ "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+ .klen = 16,
+ .iv = "\x00\x01\x02\x03\x04\x05\x06\x07"
+ "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+ .ptext = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+ "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+ "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+ "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+ "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+ "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+ "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+ "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+ .ctext = "\x3b\x3f\xd9\x2e\xb7\x2d\xad\x20"
+ "\x33\x34\x49\xf8\xe8\x3c\xfb\x4a"
+ "\xc8\xa6\x45\x37\xa0\xb3\xa9\x3f"
+ "\xcd\xe3\xcd\xad\x9f\x1c\xe5\x8b"
+ "\x26\x75\x1f\x67\xa3\xcb\xb1\x40"
+ "\xb1\x80\x8c\xf1\x87\xa4\xf4\xdf"
+ "\xc0\x4b\x05\x35\x7c\x5d\x1c\x0e"
+ "\xea\xc4\xc6\x6f\x9f\xf7\xf2\xe6",
+ .len = 64,
+ }, {
+ .key = "\x8e\x73\xb0\xf7\xda\x0e\x64\x52"
+ "\xc8\x10\xf3\x2b\x80\x90\x79\xe5"
+ "\x62\xf8\xea\xd2\x52\x2c\x6b\x7b",
+ .klen = 24,
+ .iv = "\x00\x01\x02\x03\x04\x05\x06\x07"
+ "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+ .ptext = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+ "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+ "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+ "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+ "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+ "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+ "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+ "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+ .ctext = "\xcd\xc8\x0d\x6f\xdd\xf1\x8c\xab"
+ "\x34\xc2\x59\x09\xc9\x9a\x41\x74"
+ "\x67\xce\x7f\x7f\x81\x17\x36\x21"
+ "\x96\x1a\x2b\x70\x17\x1d\x3d\x7a"
+ "\x2e\x1e\x8a\x1d\xd5\x9b\x88\xb1"
+ "\xc8\xe6\x0f\xed\x1e\xfa\xc4\xc9"
+ "\xc0\x5f\x9f\x9c\xa9\x83\x4f\xa0"
+ "\x42\xae\x8f\xba\x58\x4b\x09\xff",
+ .len = 64,
+ }, {
+ .key = "\x60\x3d\xeb\x10\x15\xca\x71\xbe"
+ "\x2b\x73\xae\xf0\x85\x7d\x77\x81"
+ "\x1f\x35\x2c\x07\x3b\x61\x08\xd7"
+ "\x2d\x98\x10\xa3\x09\x14\xdf\xf4",
+ .klen = 32,
+ .iv = "\x00\x01\x02\x03\x04\x05\x06\x07"
+ "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+ .ptext = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+ "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+ "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+ "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+ "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+ "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+ "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+ "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+ .ctext = "\xdc\x7e\x84\xbf\xda\x79\x16\x4b"
+ "\x7e\xcd\x84\x86\x98\x5d\x38\x60"
+ "\x39\xff\xed\x14\x3b\x28\xb1\xc8"
+ "\x32\x11\x3c\x63\x31\xe5\x40\x7b"
+ "\xdf\x10\x13\x24\x15\xe5\x4b\x92"
+ "\xa1\x3e\xd0\xa8\x26\x7a\xe2\xf9"
+ "\x75\xa3\x85\x74\x1a\xb9\xce\xf8"
+ "\x20\x31\x62\x3d\x55\xb1\xe4\x71",
+ .len = 64,
+ }, { /* > 16 bytes, not a multiple of 16 bytes */
+ .key = "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+ "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+ .klen = 16,
+ .iv = "\x00\x01\x02\x03\x04\x05\x06\x07"
+ "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+ .ptext = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+ "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+ "\xae",
+ .ctext = "\x3b\x3f\xd9\x2e\xb7\x2d\xad\x20"
+ "\x33\x34\x49\xf8\xe8\x3c\xfb\x4a"
+ "\xc8",
+ .len = 17,
+ }, { /* < 16 bytes */
+ .key = "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+ "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+ .klen = 16,
+ .iv = "\x00\x01\x02\x03\x04\x05\x06\x07"
+ "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+ .ptext = "\x6b\xc1\xbe\xe2\x2e\x40\x9f",
+ .ctext = "\x3b\x3f\xd9\x2e\xb7\x2d\xad",
+ .len = 7,
+ },
+};
+
+static int __init libaescfb_init(void)
+{
+ for (int i = 0; i < ARRAY_SIZE(aescfb_tv); i++) {
+ struct crypto_aes_ctx ctx;
+ u8 buf[64];
+
+ if (aes_expandkey(&ctx, aescfb_tv[i].key, aescfb_tv[i].klen)) {
+ pr_err("aes_expandkey() failed on vector %d\n", i);
+ return -ENODEV;
+ }
+
+ aescfb_encrypt(&ctx, buf, aescfb_tv[i].ptext, aescfb_tv[i].len,
+ aescfb_tv[i].iv);
+ if (memcmp(buf, aescfb_tv[i].ctext, aescfb_tv[i].len)) {
+ pr_err("aescfb_encrypt() #1 failed on vector %d\n", i);
+ return -ENODEV;
+ }
+
+ /* decrypt in place */
+ aescfb_decrypt(&ctx, buf, buf, aescfb_tv[i].len, aescfb_tv[i].iv);
+ if (memcmp(buf, aescfb_tv[i].ptext, aescfb_tv[i].len)) {
+ pr_err("aescfb_decrypt() failed on vector %d\n", i);
+ return -ENODEV;
+ }
+
+ /* encrypt in place */
+ aescfb_encrypt(&ctx, buf, buf, aescfb_tv[i].len, aescfb_tv[i].iv);
+ if (memcmp(buf, aescfb_tv[i].ctext, aescfb_tv[i].len)) {
+ pr_err("aescfb_encrypt() #2 failed on vector %d\n", i);
+
+ return -ENODEV;
+ }
+
+ }
+ return 0;
+}
+module_init(libaescfb_init);
+
+static void __exit libaescfb_exit(void)
+{
+}
+module_exit(libaescfb_exit);
+#endif
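
A minimal sketch of a caller of the two functions added above follows. It uses only aes_expandkey() from <crypto/aes.h> (also exercised by the selftest in this file) together with aescfb_encrypt()/aescfb_decrypt(); the wrapper itself is hypothetical and assumes the aescfb prototypes are visible to the caller:

	#include <crypto/aes.h>
	#include <linux/string.h>
	#include <linux/types.h>

	/* Hypothetical helper: encrypt, then decrypt, a buffer in place and
	 * wipe the expanded key afterwards. */
	static int example_aescfb_roundtrip(const u8 *key, unsigned int keylen,
					    const u8 iv[AES_BLOCK_SIZE],
					    u8 *buf, int len)
	{
		struct crypto_aes_ctx ctx;
		int err;

		err = aes_expandkey(&ctx, key, keylen);
		if (err)
			return err;

		aescfb_encrypt(&ctx, buf, buf, len, iv);	/* in-place encrypt */
		aescfb_decrypt(&ctx, buf, buf, len, iv);	/* in-place decrypt */

		memzero_explicit(&ctx, sizeof(ctx));
		return 0;
	}
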
diff --git a/lib/crypto/aesgcm.c b/lib/crypto/aesgcm.c
index c632d6e17af8..ac0b2fcfd606 100644
--- a/lib/crypto/aesgcm.c
+++ b/lib/crypto/aesgcm.c
@@ -5,12 +5,11 @@
* Copyright 2022 Google LLC
*/
-#include <linux/module.h>
-
#include <crypto/algapi.h>
#include <crypto/gcm.h>
#include <crypto/ghash.h>
-
+#include <linux/export.h>
+#include <linux/module.h>
#include <asm/irqflags.h>
static void aesgcm_encrypt_block(const struct crypto_aes_ctx *ctx, void *dst,
@@ -73,6 +72,19 @@ static void aesgcm_ghash(be128 *ghash, const be128 *key, const void *src,
}
}
+/**
+ * aesgcm_mac - Generates the authentication tag using the AES-GCM algorithm.
+ * @ctx: The data structure that will hold the AES-GCM key schedule
+ * @src: The input source data.
+ * @src_len: Length of the source data.
+ * @assoc: Points to the associated data.
+ * @assoc_len: Length of the associated data values.
+ * @ctr: Points to the counter value.
+ * @authtag: The output buffer for the authentication tag.
+ *
+ * It takes in the AES-GCM context, source data, associated data, counter value,
+ * and an output buffer for the authentication tag.
+ */
static void aesgcm_mac(const struct aesgcm_ctx *ctx, const u8 *src, int src_len,
const u8 *assoc, int assoc_len, __be32 *ctr, u8 *authtag)
{
@@ -186,25 +198,25 @@ MODULE_DESCRIPTION("Generic AES-GCM library");
MODULE_AUTHOR("Ard Biesheuvel <ardb@kernel.org>");
MODULE_LICENSE("GPL");
-#ifndef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS
+#ifdef CONFIG_CRYPTO_SELFTESTS
/*
* Test code below. Vectors taken from crypto/testmgr.h
*/
-static const u8 __initconst ctext0[16] =
+static const u8 __initconst ctext0[16] __nonstring =
"\x58\xe2\xfc\xce\xfa\x7e\x30\x61"
"\x36\x7f\x1d\x57\xa4\xe7\x45\x5a";
static const u8 __initconst ptext1[16];
-static const u8 __initconst ctext1[32] =
+static const u8 __initconst ctext1[32] __nonstring =
"\x03\x88\xda\xce\x60\xb6\xa3\x92"
"\xf3\x28\xc2\xb9\x71\xb2\xfe\x78"
"\xab\x6e\x47\xd4\x2c\xec\x13\xbd"
"\xf5\x3a\x67\xb2\x12\x57\xbd\xdf";
-static const u8 __initconst ptext2[64] =
+static const u8 __initconst ptext2[64] __nonstring =
"\xd9\x31\x32\x25\xf8\x84\x06\xe5"
"\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
"\x86\xa7\xa9\x53\x15\x34\xf7\xda"
@@ -214,7 +226,7 @@ static const u8 __initconst ptext2[64] =
"\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57"
"\xba\x63\x7b\x39\x1a\xaf\xd2\x55";
-static const u8 __initconst ctext2[80] =
+static const u8 __initconst ctext2[80] __nonstring =
"\x42\x83\x1e\xc2\x21\x77\x74\x24"
"\x4b\x72\x21\xb7\x84\xd0\xd4\x9c"
"\xe3\xaa\x21\x2f\x2c\x02\xa4\xe0"
@@ -226,7 +238,7 @@ static const u8 __initconst ctext2[80] =
"\x4d\x5c\x2a\xf3\x27\xcd\x64\xa6"
"\x2c\xf3\x5a\xbd\x2b\xa6\xfa\xb4";
-static const u8 __initconst ptext3[60] =
+static const u8 __initconst ptext3[60] __nonstring =
"\xd9\x31\x32\x25\xf8\x84\x06\xe5"
"\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
"\x86\xa7\xa9\x53\x15\x34\xf7\xda"
@@ -236,7 +248,7 @@ static const u8 __initconst ptext3[60] =
"\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57"
"\xba\x63\x7b\x39";
-static const u8 __initconst ctext3[76] =
+static const u8 __initconst ctext3[76] __nonstring =
"\x42\x83\x1e\xc2\x21\x77\x74\x24"
"\x4b\x72\x21\xb7\x84\xd0\xd4\x9c"
"\xe3\xaa\x21\x2f\x2c\x02\xa4\xe0"
@@ -248,17 +260,17 @@ static const u8 __initconst ctext3[76] =
"\x5b\xc9\x4f\xbc\x32\x21\xa5\xdb"
"\x94\xfa\xe9\x5a\xe7\x12\x1a\x47";
-static const u8 __initconst ctext4[16] =
+static const u8 __initconst ctext4[16] __nonstring =
"\xcd\x33\xb2\x8a\xc7\x73\xf7\x4b"
"\xa0\x0e\xd1\xf3\x12\x57\x24\x35";
-static const u8 __initconst ctext5[32] =
+static const u8 __initconst ctext5[32] __nonstring =
"\x98\xe7\x24\x7c\x07\xf0\xfe\x41"
"\x1c\x26\x7e\x43\x84\xb0\xf6\x00"
"\x2f\xf5\x8d\x80\x03\x39\x27\xab"
"\x8e\xf4\xd4\x58\x75\x14\xf0\xfb";
-static const u8 __initconst ptext6[64] =
+static const u8 __initconst ptext6[64] __nonstring =
"\xd9\x31\x32\x25\xf8\x84\x06\xe5"
"\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
"\x86\xa7\xa9\x53\x15\x34\xf7\xda"
@@ -268,7 +280,7 @@ static const u8 __initconst ptext6[64] =
"\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57"
"\xba\x63\x7b\x39\x1a\xaf\xd2\x55";
-static const u8 __initconst ctext6[80] =
+static const u8 __initconst ctext6[80] __nonstring =
"\x39\x80\xca\x0b\x3c\x00\xe8\x41"
"\xeb\x06\xfa\xc4\x87\x2a\x27\x57"
"\x85\x9e\x1c\xea\xa6\xef\xd9\x84"
@@ -280,17 +292,17 @@ static const u8 __initconst ctext6[80] =
"\x99\x24\xa7\xc8\x58\x73\x36\xbf"
"\xb1\x18\x02\x4d\xb8\x67\x4a\x14";
-static const u8 __initconst ctext7[16] =
+static const u8 __initconst ctext7[16] __nonstring =
"\x53\x0f\x8a\xfb\xc7\x45\x36\xb9"
"\xa9\x63\xb4\xf1\xc4\xcb\x73\x8b";
-static const u8 __initconst ctext8[32] =
+static const u8 __initconst ctext8[32] __nonstring =
"\xce\xa7\x40\x3d\x4d\x60\x6b\x6e"
"\x07\x4e\xc5\xd3\xba\xf3\x9d\x18"
"\xd0\xd1\xc8\xa7\x99\x99\x6b\xf0"
"\x26\x5b\x98\xb5\xd4\x8a\xb9\x19";
-static const u8 __initconst ptext9[64] =
+static const u8 __initconst ptext9[64] __nonstring =
"\xd9\x31\x32\x25\xf8\x84\x06\xe5"
"\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
"\x86\xa7\xa9\x53\x15\x34\xf7\xda"
@@ -300,7 +312,7 @@ static const u8 __initconst ptext9[64] =
"\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57"
"\xba\x63\x7b\x39\x1a\xaf\xd2\x55";
-static const u8 __initconst ctext9[80] =
+static const u8 __initconst ctext9[80] __nonstring =
"\x52\x2d\xc1\xf0\x99\x56\x7d\x07"
"\xf4\x7f\x37\xa3\x2a\x84\x42\x7d"
"\x64\x3a\x8c\xdc\xbf\xe5\xc0\xc9"
@@ -312,7 +324,7 @@ static const u8 __initconst ctext9[80] =
"\xb0\x94\xda\xc5\xd9\x34\x71\xbd"
"\xec\x1a\x50\x22\x70\xe3\xcc\x6c";
-static const u8 __initconst ptext10[60] =
+static const u8 __initconst ptext10[60] __nonstring =
"\xd9\x31\x32\x25\xf8\x84\x06\xe5"
"\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
"\x86\xa7\xa9\x53\x15\x34\xf7\xda"
@@ -322,7 +334,7 @@ static const u8 __initconst ptext10[60] =
"\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57"
"\xba\x63\x7b\x39";
-static const u8 __initconst ctext10[76] =
+static const u8 __initconst ctext10[76] __nonstring =
"\x52\x2d\xc1\xf0\x99\x56\x7d\x07"
"\xf4\x7f\x37\xa3\x2a\x84\x42\x7d"
"\x64\x3a\x8c\xdc\xbf\xe5\xc0\xc9"
@@ -334,7 +346,7 @@ static const u8 __initconst ctext10[76] =
"\x76\xfc\x6e\xce\x0f\x4e\x17\x68"
"\xcd\xdf\x88\x53\xbb\x2d\x55\x1b";
-static const u8 __initconst ptext11[60] =
+static const u8 __initconst ptext11[60] __nonstring =
"\xd9\x31\x32\x25\xf8\x84\x06\xe5"
"\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
"\x86\xa7\xa9\x53\x15\x34\xf7\xda"
@@ -344,7 +356,7 @@ static const u8 __initconst ptext11[60] =
"\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57"
"\xba\x63\x7b\x39";
-static const u8 __initconst ctext11[76] =
+static const u8 __initconst ctext11[76] __nonstring =
"\x39\x80\xca\x0b\x3c\x00\xe8\x41"
"\xeb\x06\xfa\xc4\x87\x2a\x27\x57"
"\x85\x9e\x1c\xea\xa6\xef\xd9\x84"
@@ -356,7 +368,7 @@ static const u8 __initconst ctext11[76] =
"\x25\x19\x49\x8e\x80\xf1\x47\x8f"
"\x37\xba\x55\xbd\x6d\x27\x61\x8c";
-static const u8 __initconst ptext12[719] =
+static const u8 __initconst ptext12[719] __nonstring =
"\x42\xc1\xcc\x08\x48\x6f\x41\x3f"
"\x2f\x11\x66\x8b\x2a\x16\xf0\xe0"
"\x58\x83\xf0\xc3\x70\x14\xc0\x5b"
@@ -448,7 +460,7 @@ static const u8 __initconst ptext12[719] =
"\x59\xfa\xfa\xaa\x44\x04\x01\xa7"
"\xa4\x78\xdb\x74\x3d\x8b\xb5";
-static const u8 __initconst ctext12[735] =
+static const u8 __initconst ctext12[735] __nonstring =
"\x84\x0b\xdb\xd5\xb7\xa8\xfe\x20"
"\xbb\xb1\x12\x7f\x41\xea\xb3\xc0"
"\xa2\xb4\x37\x19\x11\x58\xb6\x0b"
@@ -546,9 +558,9 @@ static struct {
const u8 *ptext;
const u8 *ctext;
- u8 key[AES_MAX_KEY_SIZE];
- u8 iv[GCM_AES_IV_SIZE];
- u8 assoc[20];
+ u8 key[AES_MAX_KEY_SIZE] __nonstring;
+ u8 iv[GCM_AES_IV_SIZE] __nonstring;
+ u8 assoc[20] __nonstring;
int klen;
int clen;
@@ -684,7 +696,7 @@ static int __init libaesgcm_init(void)
u8 tagbuf[AES_BLOCK_SIZE];
int plen = aesgcm_tv[i].plen;
struct aesgcm_ctx ctx;
- u8 buf[sizeof(ptext12)];
+ static u8 buf[sizeof(ptext12)];
if (aesgcm_expandkey(&ctx, aesgcm_tv[i].key, aesgcm_tv[i].klen,
aesgcm_tv[i].clen - plen)) {
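
For context, the library's public entry points are typically used as sketched below. aesgcm_expandkey() appears in the hunk context above; the aesgcm_encrypt()/aesgcm_decrypt() prototypes are assumed from the existing AES-GCM library and are not shown in this diff, so treat the exact signatures as an assumption:

	#include <crypto/gcm.h>
	#include <linux/types.h>

	/* Hypothetical caller: seal a message in place, then authenticate and
	 * unseal it again.  Signatures assumed, not taken from this diff. */
	static bool example_aesgcm_roundtrip(const u8 *key, unsigned int keylen,
					     const u8 iv[GCM_AES_IV_SIZE],
					     u8 *msg, int len,
					     const u8 *aad, int aad_len)
	{
		struct aesgcm_ctx ctx;
		u8 tag[16];

		if (aesgcm_expandkey(&ctx, key, keylen, sizeof(tag)))
			return false;

		aesgcm_encrypt(&ctx, msg, msg, len, aad, aad_len, iv, tag);
		return aesgcm_decrypt(&ctx, msg, msg, len, aad, aad_len, iv, tag);
	}
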
diff --git a/lib/crypto/arc4.c b/lib/crypto/arc4.c
index c2020f19c652..4e950e1e66d0 100644
--- a/lib/crypto/arc4.c
+++ b/lib/crypto/arc4.c
@@ -8,6 +8,7 @@
*/
#include <crypto/arc4.h>
+#include <linux/export.h>
#include <linux/module.h>
int arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len)
@@ -71,4 +72,5 @@ void arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len)
}
EXPORT_SYMBOL(arc4_crypt);
+MODULE_DESCRIPTION("ARC4 Cipher Algorithm");
MODULE_LICENSE("GPL");
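
The hunk context above shows the library's entire API, arc4_setkey() and arc4_crypt(). A minimal hypothetical caller is sketched below; note that ARC4 is cryptographically broken and this library exists only for legacy protocols such as WEP/TKIP:

	#include <crypto/arc4.h>
	#include <linux/string.h>
	#include <linux/types.h>

	/* Hypothetical caller; the same arc4_crypt() call performs both
	 * encryption and decryption of a stream. */
	static int example_arc4(const u8 *key, unsigned int keylen,
				u8 *out, const u8 *in, unsigned int len)
	{
		struct arc4_ctx ctx;
		int err;

		err = arc4_setkey(&ctx, key, keylen);
		if (err)
			return err;

		arc4_crypt(&ctx, out, in, len);
		memzero_explicit(&ctx, sizeof(ctx));
		return 0;
	}
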
diff --git a/lib/crypto/arm/.gitignore b/lib/crypto/arm/.gitignore
new file mode 100644
index 000000000000..f6c4e8ef80da
--- /dev/null
+++ b/lib/crypto/arm/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+poly1305-core.S
+sha256-core.S
+sha512-core.S
diff --git a/lib/crypto/arm/blake2b-neon-core.S b/lib/crypto/arm/blake2b-neon-core.S
new file mode 100644
index 000000000000..b55c37f0b88f
--- /dev/null
+++ b/lib/crypto/arm/blake2b-neon-core.S
@@ -0,0 +1,350 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * BLAKE2b digest algorithm optimized with ARM NEON instructions. On ARM
+ * processors that have NEON support but not the ARMv8 Crypto Extensions,
+ * typically this BLAKE2b implementation is much faster than the SHA-2 family
+ * and slightly faster than SHA-1.
+ *
+ * Copyright 2020 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+ .text
+ .fpu neon
+
+ // The arguments to blake2b_compress_neon()
+ CTX .req r0
+ DATA .req r1
+ NBLOCKS .req r2
+ INC .req r3
+
+ // Pointers to the rotation tables
+ ROR24_TABLE .req r4
+ ROR16_TABLE .req r5
+
+ // The original stack pointer
+ ORIG_SP .req r6
+
+ // NEON registers which contain the message words of the current block.
+ // M_0-M_3 are occasionally used for other purposes too.
+ M_0 .req d16
+ M_1 .req d17
+ M_2 .req d18
+ M_3 .req d19
+ M_4 .req d20
+ M_5 .req d21
+ M_6 .req d22
+ M_7 .req d23
+ M_8 .req d24
+ M_9 .req d25
+ M_10 .req d26
+ M_11 .req d27
+ M_12 .req d28
+ M_13 .req d29
+ M_14 .req d30
+ M_15 .req d31
+
+ .align 4
+ // Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
+ // instruction. This is the most efficient way to implement these
+ // rotation amounts with NEON. (On Cortex-A53 it's the same speed as
+ // vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
+.Lror24_table:
+ .byte 3, 4, 5, 6, 7, 0, 1, 2
+.Lror16_table:
+ .byte 2, 3, 4, 5, 6, 7, 0, 1
+ // The BLAKE2b initialization vector
+.Lblake2b_IV:
+ .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
+ .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
+ .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f
+ .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
+
+// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
+// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack
+// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
+// (M_0-M_3), so that they can be reloaded if they are used as temporary
+// registers. The macro arguments s0-s15 give the order in which the message
+// words are used in this round. 'final' is 1 if this is the final round.
+.macro _blake2b_round s0, s1, s2, s3, s4, s5, s6, s7, \
+ s8, s9, s10, s11, s12, s13, s14, s15, final=0
+
+ // Mix the columns:
+ // (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
+ // (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
+
+ // a += b + m[blake2b_sigma[r][2*i + 0]];
+ vadd.u64 q0, q0, q2
+ vadd.u64 q1, q1, q3
+ vadd.u64 d0, d0, M_\s0
+ vadd.u64 d1, d1, M_\s2
+ vadd.u64 d2, d2, M_\s4
+ vadd.u64 d3, d3, M_\s6
+
+ // d = ror64(d ^ a, 32);
+ veor q6, q6, q0
+ veor q7, q7, q1
+ vrev64.32 q6, q6
+ vrev64.32 q7, q7
+
+ // c += d;
+ vadd.u64 q4, q4, q6
+ vadd.u64 q5, q5, q7
+
+ // b = ror64(b ^ c, 24);
+ vld1.8 {M_0}, [ROR24_TABLE, :64]
+ veor q2, q2, q4
+ veor q3, q3, q5
+ vtbl.8 d4, {d4}, M_0
+ vtbl.8 d5, {d5}, M_0
+ vtbl.8 d6, {d6}, M_0
+ vtbl.8 d7, {d7}, M_0
+
+ // a += b + m[blake2b_sigma[r][2*i + 1]];
+ //
+ // M_0 got clobbered above, so we have to reload it if any of the four
+ // message words this step needs happens to be M_0. Otherwise we don't
+ // need to reload it here, as it will just get clobbered again below.
+.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
+ vld1.8 {M_0}, [sp, :64]
+.endif
+ vadd.u64 q0, q0, q2
+ vadd.u64 q1, q1, q3
+ vadd.u64 d0, d0, M_\s1
+ vadd.u64 d1, d1, M_\s3
+ vadd.u64 d2, d2, M_\s5
+ vadd.u64 d3, d3, M_\s7
+
+ // d = ror64(d ^ a, 16);
+ vld1.8 {M_0}, [ROR16_TABLE, :64]
+ veor q6, q6, q0
+ veor q7, q7, q1
+ vtbl.8 d12, {d12}, M_0
+ vtbl.8 d13, {d13}, M_0
+ vtbl.8 d14, {d14}, M_0
+ vtbl.8 d15, {d15}, M_0
+
+ // c += d;
+ vadd.u64 q4, q4, q6
+ vadd.u64 q5, q5, q7
+
+ // b = ror64(b ^ c, 63);
+ //
+ // This rotation amount isn't a multiple of 8, so it has to be
+ // implemented using a pair of shifts, which requires temporary
+ // registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
+ veor q8, q2, q4
+ veor q9, q3, q5
+ vshr.u64 q2, q8, #63
+ vshr.u64 q3, q9, #63
+ vsli.u64 q2, q8, #1
+ vsli.u64 q3, q9, #1
+ vld1.8 {q8-q9}, [sp, :256]
+
+ // Mix the diagonals:
+ // (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
+ // (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
+ //
+ // There are two possible ways to do this: use 'vext' instructions to
+ // shift the rows of the matrix so that the diagonals become columns,
+ // and undo it afterwards; or just use 64-bit operations on 'd'
+ // registers instead of 128-bit operations on 'q' registers. We use the
+ // latter approach, as it performs much better on Cortex-A7.
+
+ // a += b + m[blake2b_sigma[r][2*i + 0]];
+ vadd.u64 d0, d0, d5
+ vadd.u64 d1, d1, d6
+ vadd.u64 d2, d2, d7
+ vadd.u64 d3, d3, d4
+ vadd.u64 d0, d0, M_\s8
+ vadd.u64 d1, d1, M_\s10
+ vadd.u64 d2, d2, M_\s12
+ vadd.u64 d3, d3, M_\s14
+
+ // d = ror64(d ^ a, 32);
+ veor d15, d15, d0
+ veor d12, d12, d1
+ veor d13, d13, d2
+ veor d14, d14, d3
+ vrev64.32 d15, d15
+ vrev64.32 d12, d12
+ vrev64.32 d13, d13
+ vrev64.32 d14, d14
+
+ // c += d;
+ vadd.u64 d10, d10, d15
+ vadd.u64 d11, d11, d12
+ vadd.u64 d8, d8, d13
+ vadd.u64 d9, d9, d14
+
+ // b = ror64(b ^ c, 24);
+ vld1.8 {M_0}, [ROR24_TABLE, :64]
+ veor d5, d5, d10
+ veor d6, d6, d11
+ veor d7, d7, d8
+ veor d4, d4, d9
+ vtbl.8 d5, {d5}, M_0
+ vtbl.8 d6, {d6}, M_0
+ vtbl.8 d7, {d7}, M_0
+ vtbl.8 d4, {d4}, M_0
+
+ // a += b + m[blake2b_sigma[r][2*i + 1]];
+.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
+ vld1.8 {M_0}, [sp, :64]
+.endif
+ vadd.u64 d0, d0, d5
+ vadd.u64 d1, d1, d6
+ vadd.u64 d2, d2, d7
+ vadd.u64 d3, d3, d4
+ vadd.u64 d0, d0, M_\s9
+ vadd.u64 d1, d1, M_\s11
+ vadd.u64 d2, d2, M_\s13
+ vadd.u64 d3, d3, M_\s15
+
+ // d = ror64(d ^ a, 16);
+ vld1.8 {M_0}, [ROR16_TABLE, :64]
+ veor d15, d15, d0
+ veor d12, d12, d1
+ veor d13, d13, d2
+ veor d14, d14, d3
+ vtbl.8 d12, {d12}, M_0
+ vtbl.8 d13, {d13}, M_0
+ vtbl.8 d14, {d14}, M_0
+ vtbl.8 d15, {d15}, M_0
+
+ // c += d;
+ vadd.u64 d10, d10, d15
+ vadd.u64 d11, d11, d12
+ vadd.u64 d8, d8, d13
+ vadd.u64 d9, d9, d14
+
+ // b = ror64(b ^ c, 63);
+ veor d16, d4, d9
+ veor d17, d5, d10
+ veor d18, d6, d11
+ veor d19, d7, d8
+ vshr.u64 q2, q8, #63
+ vshr.u64 q3, q9, #63
+ vsli.u64 q2, q8, #1
+ vsli.u64 q3, q9, #1
+ // Reloading q8-q9 can be skipped on the final round.
+.if ! \final
+ vld1.8 {q8-q9}, [sp, :256]
+.endif
+.endm
+
+//
+// void blake2b_compress_neon(struct blake2b_ctx *ctx,
+// const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2b_ctx are used:
+// u64 h[8]; (inout)
+// u64 t[2]; (inout)
+// u64 f[2]; (in)
+//
+ .align 5
+ENTRY(blake2b_compress_neon)
+ push {r4-r10}
+
+ // Allocate a 32-byte stack buffer that is 32-byte aligned.
+ mov ORIG_SP, sp
+ sub ip, sp, #32
+ bic ip, ip, #31
+ mov sp, ip
+
+ adr ROR24_TABLE, .Lror24_table
+ adr ROR16_TABLE, .Lror16_table
+
+ mov ip, CTX
+ vld1.64 {q0-q1}, [ip]! // Load h[0..3]
+ vld1.64 {q2-q3}, [ip]! // Load h[4..7]
+.Lnext_block:
+ adr r10, .Lblake2b_IV
+ vld1.64 {q14-q15}, [ip] // Load t[0..1] and f[0..1]
+ vld1.64 {q4-q5}, [r10]! // Load IV[0..3]
+ vmov r7, r8, d28 // Copy t[0] to (r7, r8)
+ vld1.64 {q6-q7}, [r10] // Load IV[4..7]
+ adds r7, r7, INC // Increment counter
+ bcs .Lslow_inc_ctr
+ vmov.i32 d28[0], r7
+ vst1.64 {d28}, [ip] // Update t[0]
+.Linc_ctr_done:
+
+ // Load the next message block and finish initializing the state matrix
+ // 'v'. Fortunately, there are exactly enough NEON registers to fit the
+ // entire state matrix in q0-q7 and the entire message block in q8-15.
+ //
+ // However, _blake2b_round also needs some extra registers for rotates,
+ // so we have to spill some registers. It's better to spill the message
+ // registers than the state registers, as the message doesn't change.
+ // Therefore we store a copy of the first 32 bytes of the message block
+ // (q8-q9) in an aligned buffer on the stack so that they can be
+ // reloaded when needed. (We could just reload directly from the
+ // message buffer, but it's faster to use aligned loads.)
+ vld1.8 {q8-q9}, [DATA]!
+ veor q6, q6, q14 // v[12..13] = IV[4..5] ^ t[0..1]
+ vld1.8 {q10-q11}, [DATA]!
+ veor q7, q7, q15 // v[14..15] = IV[6..7] ^ f[0..1]
+ vld1.8 {q12-q13}, [DATA]!
+ vst1.8 {q8-q9}, [sp, :256]
+ mov ip, CTX
+ vld1.8 {q14-q15}, [DATA]!
+
+ // Execute the rounds. Each round is provided the order in which it
+ // needs to use the message words.
+ _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
+ _blake2b_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
+ _blake2b_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
+ _blake2b_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
+ _blake2b_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
+ _blake2b_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
+ _blake2b_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
+ _blake2b_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
+ _blake2b_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
+ _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
+ final=1
+
+ // Fold the final state matrix into the hash chaining value:
+ //
+ // for (i = 0; i < 8; i++)
+ // h[i] ^= v[i] ^ v[i + 8];
+ //
+ vld1.64 {q8-q9}, [ip]! // Load old h[0..3]
+ veor q0, q0, q4 // v[0..1] ^= v[8..9]
+ veor q1, q1, q5 // v[2..3] ^= v[10..11]
+ vld1.64 {q10-q11}, [ip] // Load old h[4..7]
+ veor q2, q2, q6 // v[4..5] ^= v[12..13]
+ veor q3, q3, q7 // v[6..7] ^= v[14..15]
+ veor q0, q0, q8 // v[0..1] ^= h[0..1]
+ veor q1, q1, q9 // v[2..3] ^= h[2..3]
+ mov ip, CTX
+ subs NBLOCKS, NBLOCKS, #1 // nblocks--
+ vst1.64 {q0-q1}, [ip]! // Store new h[0..3]
+ veor q2, q2, q10 // v[4..5] ^= h[4..5]
+ veor q3, q3, q11 // v[6..7] ^= h[6..7]
+ vst1.64 {q2-q3}, [ip]! // Store new h[4..7]
+
+ // Advance to the next block, if there is one.
+ bne .Lnext_block // nblocks != 0?
+
+ mov sp, ORIG_SP
+ pop {r4-r10}
+ mov pc, lr
+
+.Lslow_inc_ctr:
+ // Handle the case where the counter overflowed its low 32 bits, by
+ // carrying the overflow bit into the full 128-bit counter.
+ vmov r9, r10, d29
+ adcs r8, r8, #0
+ adcs r9, r9, #0
+ adc r10, r10, #0
+ vmov d28, r7, r8
+ vmov d29, r9, r10
+ vst1.64 {q14}, [ip] // Update t[0] and t[1]
+ b .Linc_ctr_done
+ENDPROC(blake2b_compress_neon)
diff --git a/lib/crypto/arm/blake2b.h b/lib/crypto/arm/blake2b.h
new file mode 100644
index 000000000000..5c76498521e6
--- /dev/null
+++ b/lib/crypto/arm/blake2b.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * BLAKE2b digest algorithm, NEON accelerated
+ *
+ * Copyright 2020 Google LLC
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+asmlinkage void blake2b_compress_neon(struct blake2b_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc);
+
+static void blake2b_compress(struct blake2b_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc)
+{
+ if (!static_branch_likely(&have_neon) || !may_use_simd()) {
+ blake2b_compress_generic(ctx, data, nblocks, inc);
+ return;
+ }
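+	/*
+	 * Note: the loop below hands at most SZ_4K bytes to each
+	 * scoped_ksimd() section, presumably so that the kernel-mode NEON
+	 * region stays short even for large inputs.
+	 */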
+ do {
+ const size_t blocks = min_t(size_t, nblocks,
+ SZ_4K / BLAKE2B_BLOCK_SIZE);
+
+ scoped_ksimd()
+ blake2b_compress_neon(ctx, data, blocks, inc);
+
+ data += blocks * BLAKE2B_BLOCK_SIZE;
+ nblocks -= blocks;
+ } while (nblocks);
+}
+
+#define blake2b_mod_init_arch blake2b_mod_init_arch
+static void blake2b_mod_init_arch(void)
+{
+ if (elf_hwcap & HWCAP_NEON)
+ static_branch_enable(&have_neon);
+}
diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S
new file mode 100644
index 000000000000..933f0558b7cd
--- /dev/null
+++ b/lib/crypto/arm/blake2s-core.S
@@ -0,0 +1,309 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * BLAKE2s digest algorithm, ARM scalar implementation. This is faster
+ * than the generic implementations of BLAKE2s and BLAKE2b, but slower
+ * than the NEON implementation of BLAKE2b. There is no NEON
+ * implementation of BLAKE2s, since NEON doesn't really help with it.
+ *
+ * Copyright 2020 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ // Registers used to hold message words temporarily. There aren't
+ // enough ARM registers to hold the whole message block, so we have to
+ // load the words on-demand.
+ M_0 .req r12
+ M_1 .req r14
+
+// The BLAKE2s initialization vector
+.Lblake2s_IV:
+ .word 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+ .word 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+
+.macro __ldrd a, b, src, offset
+#if __LINUX_ARM_ARCH__ >= 6
+ ldrd \a, \b, [\src, #\offset]
+#else
+ ldr \a, [\src, #\offset]
+ ldr \b, [\src, #\offset + 4]
+#endif
+.endm
+
+.macro __strd a, b, dst, offset
+#if __LINUX_ARM_ARCH__ >= 6
+ strd \a, \b, [\dst, #\offset]
+#else
+ str \a, [\dst, #\offset]
+ str \b, [\dst, #\offset + 4]
+#endif
+.endm
+
+.macro _le32_bswap a, tmp
+#ifdef __ARMEB__
+ rev_l \a, \tmp
+#endif
+.endm
+
+.macro _le32_bswap_8x a, b, c, d, e, f, g, h, tmp
+ _le32_bswap \a, \tmp
+ _le32_bswap \b, \tmp
+ _le32_bswap \c, \tmp
+ _le32_bswap \d, \tmp
+ _le32_bswap \e, \tmp
+ _le32_bswap \f, \tmp
+ _le32_bswap \g, \tmp
+ _le32_bswap \h, \tmp
+.endm
+
+// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
+// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
+// columns/diagonals. s0-s1 are the word offsets to the message words the first
+// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
+// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
+//
+// Note that to save instructions, the rotations don't happen when the
+// pseudocode says they should, but rather they are delayed until the values are
+// used. See the comment above _blake2s_round().
+.macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3
+
+ ldr M_0, [sp, #32 + 4 * \s0]
+ ldr M_1, [sp, #32 + 4 * \s2]
+
+ // a += b + m[blake2s_sigma[r][2*i + 0]];
+ add \a0, \a0, \b0, ror #brot
+ add \a1, \a1, \b1, ror #brot
+ add \a0, \a0, M_0
+ add \a1, \a1, M_1
+
+ // d = ror32(d ^ a, 16);
+ eor \d0, \a0, \d0, ror #drot
+ eor \d1, \a1, \d1, ror #drot
+
+ // c += d;
+ add \c0, \c0, \d0, ror #16
+ add \c1, \c1, \d1, ror #16
+
+ // b = ror32(b ^ c, 12);
+ eor \b0, \c0, \b0, ror #brot
+ eor \b1, \c1, \b1, ror #brot
+
+ ldr M_0, [sp, #32 + 4 * \s1]
+ ldr M_1, [sp, #32 + 4 * \s3]
+
+ // a += b + m[blake2s_sigma[r][2*i + 1]];
+ add \a0, \a0, \b0, ror #12
+ add \a1, \a1, \b1, ror #12
+ add \a0, \a0, M_0
+ add \a1, \a1, M_1
+
+ // d = ror32(d ^ a, 8);
+ eor \d0, \a0, \d0, ror#16
+ eor \d1, \a1, \d1, ror#16
+
+ // c += d;
+ add \c0, \c0, \d0, ror#8
+ add \c1, \c1, \d1, ror#8
+
+ // b = ror32(b ^ c, 7);
+ eor \b0, \c0, \b0, ror#12
+ eor \b1, \c1, \b1, ror#12
+.endm
+
+// Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9]
+// are in r0..r9. The stack pointer points to 8 bytes of scratch space for
+// spilling v[8..9], then to v[10..15], then to the message block. r10-r12 and
+// r14 are free to use. The macro arguments s0-s15 give the order in which the
+// message words are used in this round.
+//
+// All rotates are performed using the implicit rotate operand accepted by the
+// 'add' and 'eor' instructions. This is faster than using explicit rotate
+// instructions. To make this work, we allow the values in the second and last
+// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
+// wrong rotation amount. The rotation amount is then fixed up just in time
+// when the values are used. 'brot' is the number of bits the values in row 'b'
+// need to be rotated right to arrive at the correct values, and 'drot'
+// similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
+// that they end up as (7, 8) after every round.
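+//
+// For example, the pseudocode "b = ror32(b ^ c, 12)" is implemented as
+// "eor b, c, b, ror #brot", which leaves the new value of b unrotated in its
+// register; the pending 12-bit rotation is then applied for free by the
+// shifted operand of the next instruction that reads b, such as
+// "add a, a, b, ror #12" in the following quarterround step.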
+.macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \
+ s8, s9, s10, s11, s12, s13, s14, s15
+
+ // Mix first two columns:
+ // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
+ __ldrd r10, r11, sp, 16 // load v[12] and v[13]
+ _blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \
+ \s0, \s1, \s2, \s3
+ __strd r8, r9, sp, 0
+ __strd r10, r11, sp, 16
+
+ // Mix second two columns:
+ // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
+ __ldrd r8, r9, sp, 8 // load v[10] and v[11]
+ __ldrd r10, r11, sp, 24 // load v[14] and v[15]
+ _blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \
+ \s4, \s5, \s6, \s7
+ str r10, [sp, #24] // store v[14]
+ // v[10], v[11], and v[15] are used below, so no need to store them yet.
+
+ .set brot, 7
+ .set drot, 8
+
+ // Mix first two diagonals:
+ // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
+ ldr r10, [sp, #16] // load v[12]
+ _blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \
+ \s8, \s9, \s10, \s11
+ __strd r8, r9, sp, 8
+ str r11, [sp, #28]
+ str r10, [sp, #16]
+
+ // Mix second two diagonals:
+ // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
+ __ldrd r8, r9, sp, 0 // load v[8] and v[9]
+ __ldrd r10, r11, sp, 20 // load v[13] and v[14]
+ _blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \
+ \s12, \s13, \s14, \s15
+ __strd r10, r11, sp, 20
+.endm
+
+//
+// void blake2s_compress(struct blake2s_ctx *ctx,
+// const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_ctx are used:
+// u32 h[8]; (inout)
+// u32 t[2]; (inout)
+// u32 f[2]; (in)
+//
+ .align 5
+ENTRY(blake2s_compress)
+ push {r0-r2,r4-r11,lr} // keep this an even number
+
+.Lnext_block:
+ // r0 is 'ctx'
+ // r1 is 'data'
+ // r3 is 'inc'
+
+ // Load and increment the counter t[0..1].
+ __ldrd r10, r11, r0, 32
+ adds r10, r10, r3
+ adc r11, r11, #0
+ __strd r10, r11, r0, 32
+
+ // _blake2s_round is very short on registers, so copy the message block
+ // to the stack to save a register during the rounds. This also has the
+ // advantage that misalignment only needs to be dealt with in one place.
+ sub sp, sp, #64
+ mov r12, sp
+ tst r1, #3
+ bne .Lcopy_block_misaligned
+ ldmia r1!, {r2-r9}
+ _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14
+ stmia r12!, {r2-r9}
+ ldmia r1!, {r2-r9}
+ _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14
+ stmia r12, {r2-r9}
+.Lcopy_block_done:
+ str r1, [sp, #68] // Update message pointer
+
+ // Calculate v[8..15]. Push v[10..15] onto the stack, and leave space
+ // for spilling v[8..9]. Leave v[8..9] in r8-r9.
+ mov r14, r0 // r14 = ctx
+ adr r12, .Lblake2s_IV
+ ldmia r12!, {r8-r9} // load IV[0..1]
+ __ldrd r0, r1, r14, 40 // load f[0..1]
+ ldm r12, {r2-r7} // load IV[2..7]
+ eor r4, r4, r10 // v[12] = IV[4] ^ t[0]
+ eor r5, r5, r11 // v[13] = IV[5] ^ t[1]
+ eor r6, r6, r0 // v[14] = IV[6] ^ f[0]
+ eor r7, r7, r1 // v[15] = IV[7] ^ f[1]
+ push {r2-r7} // push v[10..15]
+ sub sp, sp, #8 // leave space for v[8..9]
+
+ // Load h[0..7] == v[0..7].
+ ldm r14, {r0-r7}
+
+ // Execute the rounds. Each round is provided the order in which it
+ // needs to use the message words.
+ .set brot, 0
+ .set drot, 0
+ _blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ _blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
+ _blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
+ _blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
+ _blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
+ _blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
+ _blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
+ _blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
+ _blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
+ _blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
+
+ // Fold the final state matrix into the hash chaining value:
+ //
+ // for (i = 0; i < 8; i++)
+ // h[i] ^= v[i] ^ v[i + 8];
+ //
+ ldr r14, [sp, #96] // r14 = &h[0]
+ add sp, sp, #8 // v[8..9] are already loaded.
+ pop {r10-r11} // load v[10..11]
+ eor r0, r0, r8
+ eor r1, r1, r9
+ eor r2, r2, r10
+ eor r3, r3, r11
+ ldm r14, {r8-r11} // load h[0..3]
+ eor r0, r0, r8
+ eor r1, r1, r9
+ eor r2, r2, r10
+ eor r3, r3, r11
+ stmia r14!, {r0-r3} // store new h[0..3]
+ ldm r14, {r0-r3} // load old h[4..7]
+ pop {r8-r11} // load v[12..15]
+ eor r0, r0, r4, ror #brot
+ eor r1, r1, r5, ror #brot
+ eor r2, r2, r6, ror #brot
+ eor r3, r3, r7, ror #brot
+ eor r0, r0, r8, ror #drot
+ eor r1, r1, r9, ror #drot
+ eor r2, r2, r10, ror #drot
+ eor r3, r3, r11, ror #drot
+ add sp, sp, #64 // skip copy of message block
+ stm r14, {r0-r3} // store new h[4..7]
+
+ // Advance to the next block, if there is one. Note that if there are
+ // multiple blocks, then 'inc' (the counter increment amount) must be
+ // 64. So we can simply set it to 64 without re-loading it.
+ ldm sp, {r0, r1, r2} // load (ctx, data, nblocks)
+ mov r3, #64 // set 'inc'
+ subs r2, r2, #1 // nblocks--
+ str r2, [sp, #8]
+ bne .Lnext_block // nblocks != 0?
+
+ pop {r0-r2,r4-r11,pc}
+
+ // The next message block (pointed to by r1) isn't 4-byte aligned, so it
+ // can't be loaded using ldmia. Copy it to the stack buffer (pointed to
+ // by r12) using an alternative method. r2-r9 are free to use.
+.Lcopy_block_misaligned:
+ mov r2, #64
+1:
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ ldr r3, [r1], #4
+ _le32_bswap r3, r4
+#else
+ ldrb r3, [r1, #0]
+ ldrb r4, [r1, #1]
+ ldrb r5, [r1, #2]
+ ldrb r6, [r1, #3]
+ add r1, r1, #4
+ orr r3, r3, r4, lsl #8
+ orr r3, r3, r5, lsl #16
+ orr r3, r3, r6, lsl #24
+#endif
+ subs r2, r2, #4
+ str r3, [r12], #4
+ bne 1b
+ b .Lcopy_block_done
+ENDPROC(blake2s_compress)
diff --git a/lib/crypto/arm/blake2s.h b/lib/crypto/arm/blake2s.h
new file mode 100644
index 000000000000..42c04440c191
--- /dev/null
+++ b/lib/crypto/arm/blake2s.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+/* defined in blake2s-core.S */
+void blake2s_compress(struct blake2s_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc);
diff --git a/lib/crypto/arm/chacha-neon-core.S b/lib/crypto/arm/chacha-neon-core.S
new file mode 100644
index 000000000000..ddd62b6294a5
--- /dev/null
+++ b/lib/crypto/arm/chacha-neon-core.S
@@ -0,0 +1,643 @@
+/*
+ * ChaCha/HChaCha NEON helper functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+ /*
+ * NEON doesn't have a rotate instruction. The alternatives are, more or less:
+ *
+ * (a) vshl.u32 + vsri.u32 (needs temporary register)
+ * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register)
+ * (c) vrev32.16 (16-bit rotations only)
+ * (d)  vtbl.8 + vtbl.8		(rotations by multiples of 8 bits only,
+ *					 needs index vector)
+ *
+ * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations,
+ * the only choices are (a) and (b). We use (a) since it takes two-thirds the
+ * cycles of (b) on both Cortex-A7 and Cortex-A53.
+ *
+ * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
+ * and doesn't need a temporary register.
+ *
+ * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence
+ * is twice as fast as (a), even when doing (a) on multiple registers
+ * simultaneously to eliminate the stall between vshl and vsri. Also, it
+ * parallelizes better when temporary registers are scarce.
+ *
+ * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
+ * (a), so the need to load the rotation table actually makes the vtbl method
+ * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it
+ * seems to be a good compromise to get a more significant speed boost on some
+ * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
+ */
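+
+/*
+ * For reference, the rotate idioms used below look like (for a left-rotate
+ * of 'src' by 12 into 'dst', and for the table-based rotate by 8):
+ *
+ *	vshl.u32	dst, src, #12		// dst = src << 12
+ *	vsri.u32	dst, src, #20		// dst |= src >> 20, i.e. rol32(src, 12)
+ *
+ *	vtbl.8		d_lo, {d_lo}, d_tbl	// byte-permute each half of the
+ *	vtbl.8		d_hi, {d_hi}, d_tbl	// q register using .Lrol8_table
+ */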
+
+#include <linux/linkage.h>
+#include <asm/cache.h>
+
+ .text
+ .fpu neon
+ .align 5
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers q0-q3. It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * The round count is given in r3.
+ *
+ * Clobbers: r3, ip, q4-q5
+ */
+chacha_permute:
+
+ adr ip, .Lrol8_table
+ vld1.8 {d10}, [ip, :64]
+
+.Ldoubleround:
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vadd.i32 q0, q0, q1
+ veor q3, q3, q0
+ vrev32.16 q3, q3
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #12
+ vsri.u32 q1, q4, #20
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vadd.i32 q0, q0, q1
+ veor q3, q3, q0
+ vtbl.8 d6, {d6}, d10
+ vtbl.8 d7, {d7}, d10
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #7
+ vsri.u32 q1, q4, #25
+
+ // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vext.8 q1, q1, q1, #4
+ // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vext.8 q2, q2, q2, #8
+ // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vext.8 q3, q3, q3, #12
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vadd.i32 q0, q0, q1
+ veor q3, q3, q0
+ vrev32.16 q3, q3
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #12
+ vsri.u32 q1, q4, #20
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vadd.i32 q0, q0, q1
+ veor q3, q3, q0
+ vtbl.8 d6, {d6}, d10
+ vtbl.8 d7, {d7}, d10
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #7
+ vsri.u32 q1, q4, #25
+
+ // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vext.8 q1, q1, q1, #12
+ // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vext.8 q2, q2, q2, #8
+ // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vext.8 q3, q3, q3, #4
+
+ subs r3, r3, #2
+ bne .Ldoubleround
+
+ bx lr
+ENDPROC(chacha_permute)
+
+ENTRY(chacha_block_xor_neon)
+ // r0: Input state matrix, s
+ // r1: 1 data block output, o
+ // r2: 1 data block input, i
+ // r3: nrounds
+ push {lr}
+
+ // x0..3 = s0..3
+ add ip, r0, #0x20
+ vld1.32 {q0-q1}, [r0]
+ vld1.32 {q2-q3}, [ip]
+
+ vmov q8, q0
+ vmov q9, q1
+ vmov q10, q2
+ vmov q11, q3
+
+ bl chacha_permute
+
+ add ip, r2, #0x20
+ vld1.8 {q4-q5}, [r2]
+ vld1.8 {q6-q7}, [ip]
+
+ // o0 = i0 ^ (x0 + s0)
+ vadd.i32 q0, q0, q8
+ veor q0, q0, q4
+
+ // o1 = i1 ^ (x1 + s1)
+ vadd.i32 q1, q1, q9
+ veor q1, q1, q5
+
+ // o2 = i2 ^ (x2 + s2)
+ vadd.i32 q2, q2, q10
+ veor q2, q2, q6
+
+ // o3 = i3 ^ (x3 + s3)
+ vadd.i32 q3, q3, q11
+ veor q3, q3, q7
+
+ add ip, r1, #0x20
+ vst1.8 {q0-q1}, [r1]
+ vst1.8 {q2-q3}, [ip]
+
+ pop {pc}
+ENDPROC(chacha_block_xor_neon)
+
+ENTRY(hchacha_block_neon)
+ // r0: Input state matrix, s
+ // r1: output (8 32-bit words)
+ // r2: nrounds
+ push {lr}
+
+ vld1.32 {q0-q1}, [r0]!
+ vld1.32 {q2-q3}, [r0]
+
+ mov r3, r2
+ bl chacha_permute
+
+ vst1.32 {q0}, [r1]!
+ vst1.32 {q3}, [r1]
+
+ pop {pc}
+ENDPROC(hchacha_block_neon)
+
+ .align 4
+.Lctrinc: .word 0, 1, 2, 3
+.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6
+
+ .align 5
+ENTRY(chacha_4block_xor_neon)
+ push {r4, lr}
+ mov r4, sp // preserve the stack pointer
+ sub ip, sp, #0x20 // allocate a 32 byte buffer
+ bic ip, ip, #0x1f // aligned to 32 bytes
+ mov sp, ip
+
+ // r0: Input state matrix, s
+ // r1: 4 data blocks output, o
+ // r2: 4 data blocks input, i
+ // r3: nrounds
+
+ //
+ // This function encrypts four consecutive ChaCha blocks by loading
+ // the state matrix in NEON registers four times. The algorithm performs
+ // each operation on the corresponding word of each state matrix, hence
+ // requires no word shuffling. The words are re-interleaved before the
+ // final addition of the original state and the XORing step.
+ //
+
+ // x0..15[0-3] = s0..15[0-3]
+ add ip, r0, #0x20
+ vld1.32 {q0-q1}, [r0]
+ vld1.32 {q2-q3}, [ip]
+
+ adr lr, .Lctrinc
+ vdup.32 q15, d7[1]
+ vdup.32 q14, d7[0]
+ vld1.32 {q4}, [lr, :128]
+ vdup.32 q13, d6[1]
+ vdup.32 q12, d6[0]
+ vdup.32 q11, d5[1]
+ vdup.32 q10, d5[0]
+ vadd.u32 q12, q12, q4 // x12 += counter values 0-3
+ vdup.32 q9, d4[1]
+ vdup.32 q8, d4[0]
+ vdup.32 q7, d3[1]
+ vdup.32 q6, d3[0]
+ vdup.32 q5, d2[1]
+ vdup.32 q4, d2[0]
+ vdup.32 q3, d1[1]
+ vdup.32 q2, d1[0]
+ vdup.32 q1, d0[1]
+ vdup.32 q0, d0[0]
+
+ adr ip, .Lrol8_table
+ b 1f
+
+.Ldoubleround4:
+ vld1.32 {q8-q9}, [sp, :256]
+1:
+ // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ vadd.i32 q0, q0, q4
+ vadd.i32 q1, q1, q5
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+
+ veor q12, q12, q0
+ veor q13, q13, q1
+ veor q14, q14, q2
+ veor q15, q15, q3
+
+ vrev32.16 q12, q12
+ vrev32.16 q13, q13
+ vrev32.16 q14, q14
+ vrev32.16 q15, q15
+
+ // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vadd.i32 q10, q10, q14
+ vadd.i32 q11, q11, q15
+
+ vst1.32 {q8-q9}, [sp, :256]
+
+ veor q8, q4, q8
+ veor q9, q5, q9
+ vshl.u32 q4, q8, #12
+ vshl.u32 q5, q9, #12
+ vsri.u32 q4, q8, #20
+ vsri.u32 q5, q9, #20
+
+ veor q8, q6, q10
+ veor q9, q7, q11
+ vshl.u32 q6, q8, #12
+ vshl.u32 q7, q9, #12
+ vsri.u32 q6, q8, #20
+ vsri.u32 q7, q9, #20
+
+ // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ vld1.8 {d16}, [ip, :64]
+ vadd.i32 q0, q0, q4
+ vadd.i32 q1, q1, q5
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+
+ veor q12, q12, q0
+ veor q13, q13, q1
+ veor q14, q14, q2
+ veor q15, q15, q3
+
+ vtbl.8 d24, {d24}, d16
+ vtbl.8 d25, {d25}, d16
+ vtbl.8 d26, {d26}, d16
+ vtbl.8 d27, {d27}, d16
+ vtbl.8 d28, {d28}, d16
+ vtbl.8 d29, {d29}, d16
+ vtbl.8 d30, {d30}, d16
+ vtbl.8 d31, {d31}, d16
+
+ vld1.32 {q8-q9}, [sp, :256]
+
+ // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vadd.i32 q10, q10, q14
+ vadd.i32 q11, q11, q15
+
+ vst1.32 {q8-q9}, [sp, :256]
+
+ veor q8, q4, q8
+ veor q9, q5, q9
+ vshl.u32 q4, q8, #7
+ vshl.u32 q5, q9, #7
+ vsri.u32 q4, q8, #25
+ vsri.u32 q5, q9, #25
+
+ veor q8, q6, q10
+ veor q9, q7, q11
+ vshl.u32 q6, q8, #7
+ vshl.u32 q7, q9, #7
+ vsri.u32 q6, q8, #25
+ vsri.u32 q7, q9, #25
+
+ vld1.32 {q8-q9}, [sp, :256]
+
+ // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ vadd.i32 q0, q0, q5
+ vadd.i32 q1, q1, q6
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q4
+
+ veor q15, q15, q0
+ veor q12, q12, q1
+ veor q13, q13, q2
+ veor q14, q14, q3
+
+ vrev32.16 q15, q15
+ vrev32.16 q12, q12
+ vrev32.16 q13, q13
+ vrev32.16 q14, q14
+
+ // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ vadd.i32 q10, q10, q15
+ vadd.i32 q11, q11, q12
+ vadd.i32 q8, q8, q13
+ vadd.i32 q9, q9, q14
+
+ vst1.32 {q8-q9}, [sp, :256]
+
+ veor q8, q7, q8
+ veor q9, q4, q9
+ vshl.u32 q7, q8, #12
+ vshl.u32 q4, q9, #12
+ vsri.u32 q7, q8, #20
+ vsri.u32 q4, q9, #20
+
+ veor q8, q5, q10
+ veor q9, q6, q11
+ vshl.u32 q5, q8, #12
+ vshl.u32 q6, q9, #12
+ vsri.u32 q5, q8, #20
+ vsri.u32 q6, q9, #20
+
+ // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ vld1.8 {d16}, [ip, :64]
+ vadd.i32 q0, q0, q5
+ vadd.i32 q1, q1, q6
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q4
+
+ veor q15, q15, q0
+ veor q12, q12, q1
+ veor q13, q13, q2
+ veor q14, q14, q3
+
+ vtbl.8 d30, {d30}, d16
+ vtbl.8 d31, {d31}, d16
+ vtbl.8 d24, {d24}, d16
+ vtbl.8 d25, {d25}, d16
+ vtbl.8 d26, {d26}, d16
+ vtbl.8 d27, {d27}, d16
+ vtbl.8 d28, {d28}, d16
+ vtbl.8 d29, {d29}, d16
+
+ vld1.32 {q8-q9}, [sp, :256]
+
+ // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ vadd.i32 q10, q10, q15
+ vadd.i32 q11, q11, q12
+ vadd.i32 q8, q8, q13
+ vadd.i32 q9, q9, q14
+
+ vst1.32 {q8-q9}, [sp, :256]
+
+ veor q8, q7, q8
+ veor q9, q4, q9
+ vshl.u32 q7, q8, #7
+ vshl.u32 q4, q9, #7
+ vsri.u32 q7, q8, #25
+ vsri.u32 q4, q9, #25
+
+ veor q8, q5, q10
+ veor q9, q6, q11
+ vshl.u32 q5, q8, #7
+ vshl.u32 q6, q9, #7
+ vsri.u32 q5, q8, #25
+ vsri.u32 q6, q9, #25
+
+ subs r3, r3, #2
+ bne .Ldoubleround4
+
+ // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
+ // x8..9[0-3] are on the stack.
+
+ // Re-interleave the words in the first two rows of each block (x0..7).
+ // Also add the counter values 0-3 to x12[0-3].
+ vld1.32 {q8}, [lr, :128] // load counter values 0-3
+ vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1)
+ vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3)
+ vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5)
+ vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7)
+ vadd.u32 q12, q8 // x12 += counter values 0-3
+ vswp d1, d4
+ vswp d3, d6
+ vld1.32 {q8-q9}, [r0]! // load s0..7
+ vswp d9, d12
+ vswp d11, d14
+
+ // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
+ // after XORing the first 32 bytes.
+ vswp q1, q4
+
+ // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
+
+ // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block)
+ vadd.u32 q0, q0, q8
+ vadd.u32 q2, q2, q8
+ vadd.u32 q4, q4, q8
+ vadd.u32 q3, q3, q8
+
+ // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block)
+ vadd.u32 q1, q1, q9
+ vadd.u32 q6, q6, q9
+ vadd.u32 q5, q5, q9
+ vadd.u32 q7, q7, q9
+
+ // XOR first 32 bytes using keystream from first two rows of first block
+ vld1.8 {q8-q9}, [r2]!
+ veor q8, q8, q0
+ veor q9, q9, q1
+ vst1.8 {q8-q9}, [r1]!
+
+ // Re-interleave the words in the last two rows of each block (x8..15).
+ vld1.32 {q8-q9}, [sp, :256]
+ mov sp, r4 // restore original stack pointer
+ ldr r4, [r4, #8] // load number of bytes
+ vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13)
+ vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15)
+ vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9)
+ vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11)
+ vld1.32 {q0-q1}, [r0] // load s8..15
+ vswp d25, d28
+ vswp d27, d30
+ vswp d17, d20
+ vswp d19, d22
+
+ // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
+
+ // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block)
+ vadd.u32 q8, q8, q0
+ vadd.u32 q10, q10, q0
+ vadd.u32 q9, q9, q0
+ vadd.u32 q11, q11, q0
+
+ // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
+ vadd.u32 q12, q12, q1
+ vadd.u32 q14, q14, q1
+ vadd.u32 q13, q13, q1
+ vadd.u32 q15, q15, q1
+
+ // XOR the rest of the data with the keystream
+
+ vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #96
+ veor q0, q0, q8
+ veor q1, q1, q12
+ ble .Lle96
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
+ veor q0, q0, q2
+ veor q1, q1, q6
+ ble .Lle128
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
+ veor q0, q0, q10
+ veor q1, q1, q14
+ ble .Lle160
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
+ veor q0, q0, q4
+ veor q1, q1, q5
+ ble .Lle192
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
+ veor q0, q0, q9
+ veor q1, q1, q13
+ ble .Lle224
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
+ veor q0, q0, q3
+ veor q1, q1, q7
+ blt .Llt256
+.Lout:
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]
+ veor q0, q0, q11
+ veor q1, q1, q15
+ vst1.8 {q0-q1}, [r1]
+
+ pop {r4, pc}
+
+.Lle192:
+ vmov q4, q9
+ vmov q5, q13
+
+.Lle160:
+ // nothing to do
+
+.Lfinalblock:
+	// Process the final block when handling fewer than 4 full blocks.
+ // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
+ // previous 32 byte output block that still needs to be written at
+ // [r1] in q0-q1.
+ beq .Lfullblock
+
+.Lpartialblock:
+ adr lr, .Lpermute + 32
+ add r2, r2, r4
+ add lr, lr, r4
+ add r4, r4, r1
+
+ vld1.8 {q2-q3}, [lr]
+ vld1.8 {q6-q7}, [r2]
+
+ add r4, r4, #32
+
+ vtbl.8 d4, {q4-q5}, d4
+ vtbl.8 d5, {q4-q5}, d5
+ vtbl.8 d6, {q4-q5}, d6
+ vtbl.8 d7, {q4-q5}, d7
+
+ veor q6, q6, q2
+ veor q7, q7, q3
+
+ vst1.8 {q6-q7}, [r4] // overlapping stores
+ vst1.8 {q0-q1}, [r1]
+ pop {r4, pc}
+
+.Lfullblock:
+ vmov q11, q4
+ vmov q15, q5
+ b .Lout
+.Lle96:
+ vmov q4, q2
+ vmov q5, q6
+ b .Lfinalblock
+.Lle128:
+ vmov q4, q10
+ vmov q5, q14
+ b .Lfinalblock
+.Lle224:
+ vmov q4, q3
+ vmov q5, q7
+ b .Lfinalblock
+.Llt256:
+ vmov q4, q11
+ vmov q5, q15
+ b .Lpartialblock
+ENDPROC(chacha_4block_xor_neon)
+
+ .align L1_CACHE_SHIFT
+.Lpermute:
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
diff --git a/lib/crypto/arm/chacha-scalar-core.S b/lib/crypto/arm/chacha-scalar-core.S
new file mode 100644
index 000000000000..4951df05c158
--- /dev/null
+++ b/lib/crypto/arm/chacha-scalar-core.S
@@ -0,0 +1,444 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Google, Inc.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * Design notes:
+ *
+ * 16 registers would be needed to hold the state matrix, but only 14 are
+ * available because 'sp' and 'pc' cannot be used. So we spill the elements
+ * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
+ * 'ldrd' and one 'strd' instruction per round.
+ *
+ * All rotates are performed using the implicit rotate operand accepted by the
+ * 'add' and 'eor' instructions. This is faster than using explicit rotate
+ * instructions. To make this work, we allow the values in the second and last
+ * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
+ * wrong rotation amount. The rotation amount is then fixed up just in time
+ * when the values are used. 'brot' is the number of bits the values in row 'b'
+ * need to be rotated right to arrive at the correct values, and 'drot'
+ * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
+ * that they end up as (25, 24) after every round.
+ */
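+
+/*
+ * Concretely, the column round mixes (x0, x4, x8, x12) and (x1, x5, x9, x13)
+ * before (x2, x6, x10, x14) and (x3, x7, x11, x15), and the diagonal round is
+ * split the same way, so x8/x9 and x10/x11 are never needed at the same time;
+ * _doubleround swaps them through the two 8-byte stack slots with one __strd
+ * and one __ldrd per round.
+ */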
+
+ // ChaCha state registers
+ X0 .req r0
+ X1 .req r1
+ X2 .req r2
+ X3 .req r3
+ X4 .req r4
+ X5 .req r5
+ X6 .req r6
+ X7 .req r7
+ X8_X10 .req r8 // shared by x8 and x10
+ X9_X11 .req r9 // shared by x9 and x11
+ X12 .req r10
+ X13 .req r11
+ X14 .req r12
+ X15 .req r14
+
+.macro _le32_bswap_4x a, b, c, d, tmp
+#ifdef __ARMEB__
+ rev_l \a, \tmp
+ rev_l \b, \tmp
+ rev_l \c, \tmp
+ rev_l \d, \tmp
+#endif
+.endm
+
+.macro __ldrd a, b, src, offset
+#if __LINUX_ARM_ARCH__ >= 6
+ ldrd \a, \b, [\src, #\offset]
+#else
+ ldr \a, [\src, #\offset]
+ ldr \b, [\src, #\offset + 4]
+#endif
+.endm
+
+.macro __strd a, b, dst, offset
+#if __LINUX_ARM_ARCH__ >= 6
+ strd \a, \b, [\dst, #\offset]
+#else
+ str \a, [\dst, #\offset]
+ str \b, [\dst, #\offset + 4]
+#endif
+.endm
+
+.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2
+
+ // a += b; d ^= a; d = rol(d, 16);
+ add \a1, \a1, \b1, ror #brot
+ add \a2, \a2, \b2, ror #brot
+ eor \d1, \a1, \d1, ror #drot
+ eor \d2, \a2, \d2, ror #drot
+ // drot == 32 - 16 == 16
+
+ // c += d; b ^= c; b = rol(b, 12);
+ add \c1, \c1, \d1, ror #16
+ add \c2, \c2, \d2, ror #16
+ eor \b1, \c1, \b1, ror #brot
+ eor \b2, \c2, \b2, ror #brot
+ // brot == 32 - 12 == 20
+
+ // a += b; d ^= a; d = rol(d, 8);
+ add \a1, \a1, \b1, ror #20
+ add \a2, \a2, \b2, ror #20
+ eor \d1, \a1, \d1, ror #16
+ eor \d2, \a2, \d2, ror #16
+ // drot == 32 - 8 == 24
+
+ // c += d; b ^= c; b = rol(b, 7);
+ add \c1, \c1, \d1, ror #24
+ add \c2, \c2, \d2, ror #24
+ eor \b1, \c1, \b1, ror #20
+ eor \b2, \c2, \b2, ror #20
+ // brot == 32 - 7 == 25
+.endm
+
+.macro _doubleround
+
+ // column round
+
+ // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
+ _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13
+
+ // save (x8, x9); restore (x10, x11)
+ __strd X8_X10, X9_X11, sp, 0
+ __ldrd X8_X10, X9_X11, sp, 8
+
+ // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
+ _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15
+
+ .set brot, 25
+ .set drot, 24
+
+ // diagonal round
+
+ // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
+ _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12
+
+ // save (x10, x11); restore (x8, x9)
+ __strd X8_X10, X9_X11, sp, 8
+ __ldrd X8_X10, X9_X11, sp, 0
+
+ // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
+ _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
+.endm
+
+.macro _chacha_permute nrounds
+ .set brot, 0
+ .set drot, 0
+ .rept \nrounds / 2
+ _doubleround
+ .endr
+.endm
+
+.macro _chacha nrounds
+
+.Lnext_block\@:
+ // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
+ // Registers contain x0-x9,x12-x15.
+
+ // Do the core ChaCha permutation to update x0-x15.
+ _chacha_permute \nrounds
+
+ add sp, #8
+ // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
+ // Registers contain x0-x9,x12-x15.
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+ // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
+ push {X8_X10, X9_X11, X12, X13, X14, X15}
+
+ // Load (OUT, IN, LEN).
+ ldr r14, [sp, #96]
+ ldr r12, [sp, #100]
+ ldr r11, [sp, #104]
+
+ orr r10, r14, r12
+
+ // Use slow path if fewer than 64 bytes remain.
+ cmp r11, #64
+ blt .Lxor_slowpath\@
+
+ // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
+ // ARMv6+, since ldmia and stmia (used below) still require alignment.
+ tst r10, #3
+ bne .Lxor_slowpath\@
+
+ // Fast path: XOR 64 bytes of aligned data.
+
+ // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+ // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+ // x0-x3
+ __ldrd r8, r9, sp, 32
+ __ldrd r10, r11, sp, 40
+ add X0, X0, r8
+ add X1, X1, r9
+ add X2, X2, r10
+ add X3, X3, r11
+ _le32_bswap_4x X0, X1, X2, X3, r8
+ ldmia r12!, {r8-r11}
+ eor X0, X0, r8
+ eor X1, X1, r9
+ eor X2, X2, r10
+ eor X3, X3, r11
+ stmia r14!, {X0-X3}
+
+ // x4-x7
+ __ldrd r8, r9, sp, 48
+ __ldrd r10, r11, sp, 56
+ add X4, r8, X4, ror #brot
+ add X5, r9, X5, ror #brot
+ ldmia r12!, {X0-X3}
+ add X6, r10, X6, ror #brot
+ add X7, r11, X7, ror #brot
+ _le32_bswap_4x X4, X5, X6, X7, r8
+ eor X4, X4, X0
+ eor X5, X5, X1
+ eor X6, X6, X2
+ eor X7, X7, X3
+ stmia r14!, {X4-X7}
+
+ // x8-x15
+ pop {r0-r7} // (x8-x9,x12-x15,x10-x11)
+ __ldrd r8, r9, sp, 32
+ __ldrd r10, r11, sp, 40
+ add r0, r0, r8 // x8
+ add r1, r1, r9 // x9
+ add r6, r6, r10 // x10
+ add r7, r7, r11 // x11
+ _le32_bswap_4x r0, r1, r6, r7, r8
+ ldmia r12!, {r8-r11}
+ eor r0, r0, r8 // x8
+ eor r1, r1, r9 // x9
+ eor r6, r6, r10 // x10
+ eor r7, r7, r11 // x11
+ stmia r14!, {r0,r1,r6,r7}
+ ldmia r12!, {r0,r1,r6,r7}
+ __ldrd r8, r9, sp, 48
+ __ldrd r10, r11, sp, 56
+ add r2, r8, r2, ror #drot // x12
+ add r3, r9, r3, ror #drot // x13
+ add r4, r10, r4, ror #drot // x14
+ add r5, r11, r5, ror #drot // x15
+ _le32_bswap_4x r2, r3, r4, r5, r9
+ ldr r9, [sp, #72] // load LEN
+ eor r2, r2, r0 // x12
+ eor r3, r3, r1 // x13
+ eor r4, r4, r6 // x14
+ eor r5, r5, r7 // x15
+ subs r9, #64 // decrement and check LEN
+ stmia r14!, {r2-r5}
+
+ beq .Ldone\@
+
+.Lprepare_for_next_block\@:
+
+ // Stack: x0-x15 OUT IN LEN
+
+ // Increment block counter (x12)
+ add r8, #1
+
+ // Store updated (OUT, IN, LEN)
+ str r14, [sp, #64]
+ str r12, [sp, #68]
+ str r9, [sp, #72]
+
+ mov r14, sp
+
+ // Store updated block counter (x12)
+ str r8, [sp, #48]
+
+ sub sp, #16
+
+ // Reload state and do next block
+ ldmia r14!, {r0-r11} // load x0-x11
+ __strd r10, r11, sp, 8 // store x10-x11 before state
+ ldmia r14, {r10-r12,r14} // load x12-x15
+ b .Lnext_block\@
+
+.Lxor_slowpath\@:
+ // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
+ // We handle it by storing the 64 bytes of keystream to the stack, then
+ // XOR-ing the needed portion with the data.
+
+ // Allocate keystream buffer
+ sub sp, #64
+ mov r14, sp
+
+ // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+ // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+ // Save keystream for x0-x3
+ __ldrd r8, r9, sp, 96
+ __ldrd r10, r11, sp, 104
+ add X0, X0, r8
+ add X1, X1, r9
+ add X2, X2, r10
+ add X3, X3, r11
+ _le32_bswap_4x X0, X1, X2, X3, r8
+ stmia r14!, {X0-X3}
+
+ // Save keystream for x4-x7
+ __ldrd r8, r9, sp, 112
+ __ldrd r10, r11, sp, 120
+ add X4, r8, X4, ror #brot
+ add X5, r9, X5, ror #brot
+ add X6, r10, X6, ror #brot
+ add X7, r11, X7, ror #brot
+ _le32_bswap_4x X4, X5, X6, X7, r8
+ add r8, sp, #64
+ stmia r14!, {X4-X7}
+
+ // Save keystream for x8-x15
+ ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11)
+ __ldrd r8, r9, sp, 128
+ __ldrd r10, r11, sp, 136
+ add r0, r0, r8 // x8
+ add r1, r1, r9 // x9
+ add r6, r6, r10 // x10
+ add r7, r7, r11 // x11
+ _le32_bswap_4x r0, r1, r6, r7, r8
+ stmia r14!, {r0,r1,r6,r7}
+ __ldrd r8, r9, sp, 144
+ __ldrd r10, r11, sp, 152
+ add r2, r8, r2, ror #drot // x12
+ add r3, r9, r3, ror #drot // x13
+ add r4, r10, r4, ror #drot // x14
+ add r5, r11, r5, ror #drot // x15
+ _le32_bswap_4x r2, r3, r4, r5, r9
+ stmia r14, {r2-r5}
+
+ // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
+ // Registers: r8 is block counter, r12 is IN.
+
+ ldr r9, [sp, #168] // LEN
+ ldr r14, [sp, #160] // OUT
+ cmp r9, #64
+ mov r0, sp
+ movle r1, r9
+ movgt r1, #64
+ // r1 is number of bytes to XOR, in range [1, 64]
+
+.if __LINUX_ARM_ARCH__ < 6
+ orr r2, r12, r14
+ tst r2, #3 // IN or OUT misaligned?
+ bne .Lxor_next_byte\@
+.endif
+
+ // XOR a word at a time
+.rept 16
+ subs r1, #4
+ blt .Lxor_words_done\@
+ ldr r2, [r12], #4
+ ldr r3, [r0], #4
+ eor r2, r2, r3
+ str r2, [r14], #4
+.endr
+ b .Lxor_slowpath_done\@
+.Lxor_words_done\@:
+ ands r1, r1, #3
+ beq .Lxor_slowpath_done\@
+
+ // XOR a byte at a time
+.Lxor_next_byte\@:
+ ldrb r2, [r12], #1
+ ldrb r3, [r0], #1
+ eor r2, r2, r3
+ strb r2, [r14], #1
+ subs r1, #1
+ bne .Lxor_next_byte\@
+
+.Lxor_slowpath_done\@:
+ subs r9, #64
+ add sp, #96
+ bgt .Lprepare_for_next_block\@
+
+.Ldone\@:
+.endm // _chacha
+
+/*
+ * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
+ * const struct chacha_state *state, int nrounds);
+ */
+ENTRY(chacha_doarm)
+ cmp r2, #0 // len == 0?
+ reteq lr
+
+ ldr ip, [sp]
+ cmp ip, #12
+
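+	// ip = nrounds (the fifth argument, passed on the stack); the flags
+	// from this comparison select the 12-round variant at the 'beq 1f'
+	// below.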
+ push {r0-r2,r4-r11,lr}
+
+ // Push state x0-x15 onto stack.
+ // Also store an extra copy of x10-x11 just before the state.
+
+ add X12, r3, #48
+ ldm X12, {X12,X13,X14,X15}
+ push {X12,X13,X14,X15}
+ sub sp, sp, #64
+
+ __ldrd X8_X10, X9_X11, r3, 40
+ __strd X8_X10, X9_X11, sp, 8
+ __strd X8_X10, X9_X11, sp, 56
+ ldm r3, {X0-X9_X11}
+ __strd X0, X1, sp, 16
+ __strd X2, X3, sp, 24
+ __strd X4, X5, sp, 32
+ __strd X6, X7, sp, 40
+ __strd X8_X10, X9_X11, sp, 48
+
+ beq 1f
+ _chacha 20
+
+0: add sp, #76
+ pop {r4-r11, pc}
+
+1: _chacha 12
+ b 0b
+ENDPROC(chacha_doarm)
+
+/*
+ * void hchacha_block_arm(const struct chacha_state *state,
+ * u32 out[HCHACHA_OUT_WORDS], int nrounds);
+ */
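+// Note that, unlike the ChaCha routines above, this returns raw words of the
+// permuted state (x0-x3 and x12-x15) without adding back the input state, as
+// required by the HChaCha construction.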
+ENTRY(hchacha_block_arm)
+ push {r1,r4-r11,lr}
+
+ cmp r2, #12 // ChaCha12 ?
+
+ mov r14, r0
+ ldmia r14!, {r0-r11} // load x0-x11
+ push {r10-r11} // store x10-x11 to stack
+ ldm r14, {r10-r12,r14} // load x12-x15
+ sub sp, #8
+
+ beq 1f
+ _chacha_permute 20
+
+ // Skip over (unused0-unused1, x10-x11)
+0: add sp, #16
+
+ // Fix up rotations of x12-x15
+ ror X12, X12, #drot
+ ror X13, X13, #drot
+ pop {r4} // load 'out'
+ ror X14, X14, #drot
+ ror X15, X15, #drot
+
+ // Store (x0-x3,x12-x15) to 'out'
+ stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
+
+ pop {r4-r11,pc}
+
+1: _chacha_permute 12
+ b 0b
+ENDPROC(hchacha_block_arm)
diff --git a/lib/crypto/arm/chacha.h b/lib/crypto/arm/chacha.h
new file mode 100644
index 000000000000..836e49088e98
--- /dev/null
+++ b/lib/crypto/arm/chacha.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ChaCha and HChaCha functions (ARM optimized)
+ *
+ * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <crypto/internal/simd.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+#include <asm/cputype.h>
+#include <asm/hwcap.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
+ u8 *dst, const u8 *src, int nrounds);
+asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ int nrounds, unsigned int nbytes);
+asmlinkage void hchacha_block_arm(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds);
+asmlinkage void hchacha_block_neon(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds);
+
+asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
+ const struct chacha_state *state, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
+
+static inline bool neon_usable(void)
+{
+ return static_branch_likely(&use_neon) && crypto_simd_usable();
+}
+
+static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ u8 buf[CHACHA_BLOCK_SIZE];
+
+ while (bytes > CHACHA_BLOCK_SIZE) {
+ unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
+
+ chacha_4block_xor_neon(state, dst, src, nrounds, l);
+ bytes -= l;
+ src += l;
+ dst += l;
+ state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
+ }
+ if (bytes) {
+ const u8 *s = src;
+ u8 *d = dst;
+
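+		/*
+		 * A partial final block is bounced through 'buf' so that
+		 * chacha_block_xor_neon(), which always handles a full
+		 * 64-byte block, never reads or writes past the caller's
+		 * buffers.
+		 */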
+ if (bytes != CHACHA_BLOCK_SIZE)
+ s = d = memcpy(buf, src, bytes);
+ chacha_block_xor_neon(state, d, s, nrounds);
+ if (d != dst)
+ memcpy(dst, buf, bytes);
+ state->x[12]++;
+ }
+}
+
+static void hchacha_block_arch(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+ if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
+ hchacha_block_arm(state, out, nrounds);
+ } else {
+ scoped_ksimd()
+ hchacha_block_neon(state, out, nrounds);
+ }
+}
+
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int bytes, int nrounds)
+{
+ if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
+ bytes <= CHACHA_BLOCK_SIZE) {
+ chacha_doarm(dst, src, bytes, state, nrounds);
+ state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
+ return;
+ }
+
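+	/*
+	 * As in the other NEON glue code, each scoped_ksimd() section is
+	 * limited to SZ_4K bytes of input, presumably to bound the time spent
+	 * with NEON enabled in kernel mode.
+	 */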
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ scoped_ksimd()
+ chacha_doneon(state, dst, src, todo, nrounds);
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
+}
+
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
+{
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
+ switch (read_cpuid_part()) {
+ case ARM_CPU_PART_CORTEX_A7:
+ case ARM_CPU_PART_CORTEX_A5:
+ /*
+			 * The Cortex-A7 and Cortex-A5 do not perform well with
+			 * the NEON implementation, but they do remarkably well
+			 * with the scalar one and use less power.
+ */
+ break;
+ default:
+ static_branch_enable(&use_neon);
+ }
+ }
+}
diff --git a/lib/crypto/arm/curve25519-core.S b/lib/crypto/arm/curve25519-core.S
new file mode 100644
index 000000000000..b697fa5d059a
--- /dev/null
+++ b/lib/crypto/arm/curve25519-core.S
@@ -0,0 +1,2062 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
+ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
+ * manually reworked for use in kernel space.
+ */
+
+#include <linux/linkage.h>
+
+.text
+.arch armv7-a
+.fpu neon
+.align 4
+
+ENTRY(curve25519_neon)
+ push {r4-r11, lr}
+ mov ip, sp
+ sub r3, sp, #704
+ and r3, r3, #0xfffffff0
+ mov sp, r3
+ movw r4, #0
+ movw r5, #254
+ vmov.i32 q0, #1
+ vshr.u64 q1, q0, #7
+ vshr.u64 q0, q0, #8
+ vmov.i32 d4, #19
+ vmov.i32 d5, #38
+ add r6, sp, #480
+ vst1.8 {d2-d3}, [r6, : 128]!
+ vst1.8 {d0-d1}, [r6, : 128]!
+ vst1.8 {d4-d5}, [r6, : 128]
+ add r6, r3, #0
+ vmov.i32 q2, #0
+ vst1.8 {d4-d5}, [r6, : 128]!
+ vst1.8 {d4-d5}, [r6, : 128]!
+ vst1.8 d4, [r6, : 64]
+ add r6, r3, #0
+ movw r7, #960
+ sub r7, r7, #2
+ neg r7, r7
+ sub r7, r7, r7, LSL #7
+ str r7, [r6]
+ add r6, sp, #672
+ vld1.8 {d4-d5}, [r1]!
+ vld1.8 {d6-d7}, [r1]
+ vst1.8 {d4-d5}, [r6, : 128]!
+ vst1.8 {d6-d7}, [r6, : 128]
+ sub r1, r6, #16
+ ldrb r6, [r1]
+ and r6, r6, #248
+ strb r6, [r1]
+ ldrb r6, [r1, #31]
+ and r6, r6, #127
+ orr r6, r6, #64
+ strb r6, [r1, #31]
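+	// The byte fixups above are the standard X25519 scalar clamping:
+	// clear the low three bits of byte 0, then clear bit 7 and set bit 6
+	// of byte 31.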
+ vmov.i64 q2, #0xffffffff
+ vshr.u64 q3, q2, #7
+ vshr.u64 q2, q2, #6
+ vld1.8 {d8}, [r2]
+ vld1.8 {d10}, [r2]
+ add r2, r2, #6
+ vld1.8 {d12}, [r2]
+ vld1.8 {d14}, [r2]
+ add r2, r2, #6
+ vld1.8 {d16}, [r2]
+ add r2, r2, #4
+ vld1.8 {d18}, [r2]
+ vld1.8 {d20}, [r2]
+ add r2, r2, #6
+ vld1.8 {d22}, [r2]
+ add r2, r2, #2
+ vld1.8 {d24}, [r2]
+ vld1.8 {d26}, [r2]
+ vshr.u64 q5, q5, #26
+ vshr.u64 q6, q6, #3
+ vshr.u64 q7, q7, #29
+ vshr.u64 q8, q8, #6
+ vshr.u64 q10, q10, #25
+ vshr.u64 q11, q11, #3
+ vshr.u64 q12, q12, #12
+ vshr.u64 q13, q13, #38
+ vand q4, q4, q2
+ vand q6, q6, q2
+ vand q8, q8, q2
+ vand q10, q10, q2
+ vand q2, q12, q2
+ vand q5, q5, q3
+ vand q7, q7, q3
+ vand q9, q9, q3
+ vand q11, q11, q3
+ vand q3, q13, q3
+ add r2, r3, #48
+ vadd.i64 q12, q4, q1
+ vadd.i64 q13, q10, q1
+ vshr.s64 q12, q12, #26
+ vshr.s64 q13, q13, #26
+ vadd.i64 q5, q5, q12
+ vshl.i64 q12, q12, #26
+ vadd.i64 q14, q5, q0
+ vadd.i64 q11, q11, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q15, q11, q0
+ vsub.i64 q4, q4, q12
+ vshr.s64 q12, q14, #25
+ vsub.i64 q10, q10, q13
+ vshr.s64 q13, q15, #25
+ vadd.i64 q6, q6, q12
+ vshl.i64 q12, q12, #25
+ vadd.i64 q14, q6, q1
+ vadd.i64 q2, q2, q13
+ vsub.i64 q5, q5, q12
+ vshr.s64 q12, q14, #26
+ vshl.i64 q13, q13, #25
+ vadd.i64 q14, q2, q1
+ vadd.i64 q7, q7, q12
+ vshl.i64 q12, q12, #26
+ vadd.i64 q15, q7, q0
+ vsub.i64 q11, q11, q13
+ vshr.s64 q13, q14, #26
+ vsub.i64 q6, q6, q12
+ vshr.s64 q12, q15, #25
+ vadd.i64 q3, q3, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q14, q3, q0
+ vadd.i64 q8, q8, q12
+ vshl.i64 q12, q12, #25
+ vadd.i64 q15, q8, q1
+ add r2, r2, #8
+ vsub.i64 q2, q2, q13
+ vshr.s64 q13, q14, #25
+ vsub.i64 q7, q7, q12
+ vshr.s64 q12, q15, #26
+ vadd.i64 q14, q13, q13
+ vadd.i64 q9, q9, q12
+ vtrn.32 d12, d14
+ vshl.i64 q12, q12, #26
+ vtrn.32 d13, d15
+ vadd.i64 q0, q9, q0
+ vadd.i64 q4, q4, q14
+ vst1.8 d12, [r2, : 64]!
+ vshl.i64 q6, q13, #4
+ vsub.i64 q7, q8, q12
+ vshr.s64 q0, q0, #25
+ vadd.i64 q4, q4, q6
+ vadd.i64 q6, q10, q0
+ vshl.i64 q0, q0, #25
+ vadd.i64 q8, q6, q1
+ vadd.i64 q4, q4, q13
+ vshl.i64 q10, q13, #25
+ vadd.i64 q1, q4, q1
+ vsub.i64 q0, q9, q0
+ vshr.s64 q8, q8, #26
+ vsub.i64 q3, q3, q10
+ vtrn.32 d14, d0
+ vshr.s64 q1, q1, #26
+ vtrn.32 d15, d1
+ vadd.i64 q0, q11, q8
+ vst1.8 d14, [r2, : 64]
+ vshl.i64 q7, q8, #26
+ vadd.i64 q5, q5, q1
+ vtrn.32 d4, d6
+ vshl.i64 q1, q1, #26
+ vtrn.32 d5, d7
+ vsub.i64 q3, q6, q7
+ add r2, r2, #16
+ vsub.i64 q1, q4, q1
+ vst1.8 d4, [r2, : 64]
+ vtrn.32 d6, d0
+ vtrn.32 d7, d1
+ sub r2, r2, #8
+ vtrn.32 d2, d10
+ vtrn.32 d3, d11
+ vst1.8 d6, [r2, : 64]
+ sub r2, r2, #24
+ vst1.8 d2, [r2, : 64]
+ add r2, r3, #96
+ vmov.i32 q0, #0
+ vmov.i64 d2, #0xff
+ vmov.i64 d3, #0
+ vshr.u32 q1, q1, #7
+ vst1.8 {d2-d3}, [r2, : 128]!
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 d0, [r2, : 64]
+ add r2, r3, #144
+ vmov.i32 q0, #0
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 d0, [r2, : 64]
+ add r2, r3, #240
+ vmov.i32 q0, #0
+ vmov.i64 d2, #0xff
+ vmov.i64 d3, #0
+ vshr.u32 q1, q1, #7
+ vst1.8 {d2-d3}, [r2, : 128]!
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 d0, [r2, : 64]
+ add r2, r3, #48
+ add r6, r3, #192
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4}, [r2, : 64]
+ vst1.8 {d0-d1}, [r6, : 128]!
+ vst1.8 {d2-d3}, [r6, : 128]!
+ vst1.8 d4, [r6, : 64]
+.Lmainloop:
+ mov r2, r5, LSR #3
+ and r6, r5, #7
+ ldrb r2, [r1, r2]
+ mov r2, r2, LSR r6
+ and r2, r2, #1
+ str r5, [sp, #456]
+ eor r4, r4, r2
+ str r2, [sp, #460]
+ neg r2, r4
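+	// r4 is the conditional-swap flag (current scalar bit XOR the
+	// previous flag) and r2 its negation, an all-zeroes or all-ones mask.
+	// The mask is broadcast into q8 below and appears to drive the usual
+	// constant-time masked swap (veor/vand/veor) of the ladder's two
+	// working points.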
+ add r4, r3, #96
+ add r5, r3, #192
+ add r6, r3, #144
+ vld1.8 {d8-d9}, [r4, : 128]!
+ add r7, r3, #240
+ vld1.8 {d10-d11}, [r5, : 128]!
+ veor q6, q4, q5
+ vld1.8 {d14-d15}, [r6, : 128]!
+ vdup.i32 q8, r2
+ vld1.8 {d18-d19}, [r7, : 128]!
+ veor q10, q7, q9
+ vld1.8 {d22-d23}, [r4, : 128]!
+ vand q6, q6, q8
+ vld1.8 {d24-d25}, [r5, : 128]!
+ vand q10, q10, q8
+ vld1.8 {d26-d27}, [r6, : 128]!
+ veor q4, q4, q6
+ vld1.8 {d28-d29}, [r7, : 128]!
+ veor q5, q5, q6
+ vld1.8 {d0}, [r4, : 64]
+ veor q6, q7, q10
+ vld1.8 {d2}, [r5, : 64]
+ veor q7, q9, q10
+ vld1.8 {d4}, [r6, : 64]
+ veor q9, q11, q12
+ vld1.8 {d6}, [r7, : 64]
+ veor q10, q0, q1
+ sub r2, r4, #32
+ vand q9, q9, q8
+ sub r4, r5, #32
+ vand q10, q10, q8
+ sub r5, r6, #32
+ veor q11, q11, q9
+ sub r6, r7, #32
+ veor q0, q0, q10
+ veor q9, q12, q9
+ veor q1, q1, q10
+ veor q10, q13, q14
+ veor q12, q2, q3
+ vand q10, q10, q8
+ vand q8, q12, q8
+ veor q12, q13, q10
+ veor q2, q2, q8
+ veor q10, q14, q10
+ veor q3, q3, q8
+ vadd.i32 q8, q4, q6
+ vsub.i32 q4, q4, q6
+ vst1.8 {d16-d17}, [r2, : 128]!
+ vadd.i32 q6, q11, q12
+ vst1.8 {d8-d9}, [r5, : 128]!
+ vsub.i32 q4, q11, q12
+ vst1.8 {d12-d13}, [r2, : 128]!
+ vadd.i32 q6, q0, q2
+ vst1.8 {d8-d9}, [r5, : 128]!
+ vsub.i32 q0, q0, q2
+ vst1.8 d12, [r2, : 64]
+ vadd.i32 q2, q5, q7
+ vst1.8 d0, [r5, : 64]
+ vsub.i32 q0, q5, q7
+ vst1.8 {d4-d5}, [r4, : 128]!
+ vadd.i32 q2, q9, q10
+ vst1.8 {d0-d1}, [r6, : 128]!
+ vsub.i32 q0, q9, q10
+ vst1.8 {d4-d5}, [r4, : 128]!
+ vadd.i32 q2, q1, q3
+ vst1.8 {d0-d1}, [r6, : 128]!
+ vsub.i32 q0, q1, q3
+ vst1.8 d4, [r4, : 64]
+ vst1.8 d0, [r6, : 64]
+ add r2, sp, #512
+ add r4, r3, #96
+ add r5, r3, #144
+ vld1.8 {d0-d1}, [r2, : 128]
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4-d5}, [r5, : 128]!
+ vzip.i32 q1, q2
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vld1.8 {d8-d9}, [r5, : 128]!
+ vshl.i32 q5, q1, #1
+ vzip.i32 q3, q4
+ vshl.i32 q6, q2, #1
+ vld1.8 {d14}, [r4, : 64]
+ vshl.i32 q8, q3, #1
+ vld1.8 {d15}, [r5, : 64]
+ vshl.i32 q9, q4, #1
+ vmul.i32 d21, d7, d1
+ vtrn.32 d14, d15
+ vmul.i32 q11, q4, q0
+ vmul.i32 q0, q7, q0
+ vmull.s32 q12, d2, d2
+ vmlal.s32 q12, d11, d1
+ vmlal.s32 q12, d12, d0
+ vmlal.s32 q12, d13, d23
+ vmlal.s32 q12, d16, d22
+ vmlal.s32 q12, d7, d21
+ vmull.s32 q10, d2, d11
+ vmlal.s32 q10, d4, d1
+ vmlal.s32 q10, d13, d0
+ vmlal.s32 q10, d6, d23
+ vmlal.s32 q10, d17, d22
+ vmull.s32 q13, d10, d4
+ vmlal.s32 q13, d11, d3
+ vmlal.s32 q13, d13, d1
+ vmlal.s32 q13, d16, d0
+ vmlal.s32 q13, d17, d23
+ vmlal.s32 q13, d8, d22
+ vmull.s32 q1, d10, d5
+ vmlal.s32 q1, d11, d4
+ vmlal.s32 q1, d6, d1
+ vmlal.s32 q1, d17, d0
+ vmlal.s32 q1, d8, d23
+ vmull.s32 q14, d10, d6
+ vmlal.s32 q14, d11, d13
+ vmlal.s32 q14, d4, d4
+ vmlal.s32 q14, d17, d1
+ vmlal.s32 q14, d18, d0
+ vmlal.s32 q14, d9, d23
+ vmull.s32 q11, d10, d7
+ vmlal.s32 q11, d11, d6
+ vmlal.s32 q11, d12, d5
+ vmlal.s32 q11, d8, d1
+ vmlal.s32 q11, d19, d0
+ vmull.s32 q15, d10, d8
+ vmlal.s32 q15, d11, d17
+ vmlal.s32 q15, d12, d6
+ vmlal.s32 q15, d13, d5
+ vmlal.s32 q15, d19, d1
+ vmlal.s32 q15, d14, d0
+ vmull.s32 q2, d10, d9
+ vmlal.s32 q2, d11, d8
+ vmlal.s32 q2, d12, d7
+ vmlal.s32 q2, d13, d6
+ vmlal.s32 q2, d14, d1
+ vmull.s32 q0, d15, d1
+ vmlal.s32 q0, d10, d14
+ vmlal.s32 q0, d11, d19
+ vmlal.s32 q0, d12, d8
+ vmlal.s32 q0, d13, d17
+ vmlal.s32 q0, d6, d6
+ add r2, sp, #480
+ vld1.8 {d18-d19}, [r2, : 128]!
+ vmull.s32 q3, d16, d7
+ vmlal.s32 q3, d10, d15
+ vmlal.s32 q3, d11, d14
+ vmlal.s32 q3, d12, d9
+ vmlal.s32 q3, d13, d8
+ vld1.8 {d8-d9}, [r2, : 128]
+ vadd.i64 q5, q12, q9
+ vadd.i64 q6, q15, q9
+ vshr.s64 q5, q5, #26
+ vshr.s64 q6, q6, #26
+ vadd.i64 q7, q10, q5
+ vshl.i64 q5, q5, #26
+ vadd.i64 q8, q7, q4
+ vadd.i64 q2, q2, q6
+ vshl.i64 q6, q6, #26
+ vadd.i64 q10, q2, q4
+ vsub.i64 q5, q12, q5
+ vshr.s64 q8, q8, #25
+ vsub.i64 q6, q15, q6
+ vshr.s64 q10, q10, #25
+ vadd.i64 q12, q13, q8
+ vshl.i64 q8, q8, #25
+ vadd.i64 q13, q12, q9
+ vadd.i64 q0, q0, q10
+ vsub.i64 q7, q7, q8
+ vshr.s64 q8, q13, #26
+ vshl.i64 q10, q10, #25
+ vadd.i64 q13, q0, q9
+ vadd.i64 q1, q1, q8
+ vshl.i64 q8, q8, #26
+ vadd.i64 q15, q1, q4
+ vsub.i64 q2, q2, q10
+ vshr.s64 q10, q13, #26
+ vsub.i64 q8, q12, q8
+ vshr.s64 q12, q15, #25
+ vadd.i64 q3, q3, q10
+ vshl.i64 q10, q10, #26
+ vadd.i64 q13, q3, q4
+ vadd.i64 q14, q14, q12
+ add r2, r3, #288
+ vshl.i64 q12, q12, #25
+ add r4, r3, #336
+ vadd.i64 q15, q14, q9
+ add r2, r2, #8
+ vsub.i64 q0, q0, q10
+ add r4, r4, #8
+ vshr.s64 q10, q13, #25
+ vsub.i64 q1, q1, q12
+ vshr.s64 q12, q15, #26
+ vadd.i64 q13, q10, q10
+ vadd.i64 q11, q11, q12
+ vtrn.32 d16, d2
+ vshl.i64 q12, q12, #26
+ vtrn.32 d17, d3
+ vadd.i64 q1, q11, q4
+ vadd.i64 q4, q5, q13
+ vst1.8 d16, [r2, : 64]!
+ vshl.i64 q5, q10, #4
+ vst1.8 d17, [r4, : 64]!
+ vsub.i64 q8, q14, q12
+ vshr.s64 q1, q1, #25
+ vadd.i64 q4, q4, q5
+ vadd.i64 q5, q6, q1
+ vshl.i64 q1, q1, #25
+ vadd.i64 q6, q5, q9
+ vadd.i64 q4, q4, q10
+ vshl.i64 q10, q10, #25
+ vadd.i64 q9, q4, q9
+ vsub.i64 q1, q11, q1
+ vshr.s64 q6, q6, #26
+ vsub.i64 q3, q3, q10
+ vtrn.32 d16, d2
+ vshr.s64 q9, q9, #26
+ vtrn.32 d17, d3
+ vadd.i64 q1, q2, q6
+ vst1.8 d16, [r2, : 64]
+ vshl.i64 q2, q6, #26
+ vst1.8 d17, [r4, : 64]
+ vadd.i64 q6, q7, q9
+ vtrn.32 d0, d6
+ vshl.i64 q7, q9, #26
+ vtrn.32 d1, d7
+ vsub.i64 q2, q5, q2
+ add r2, r2, #16
+ vsub.i64 q3, q4, q7
+ vst1.8 d0, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d1, [r4, : 64]
+ vtrn.32 d4, d2
+ vtrn.32 d5, d3
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d6, d12
+ vtrn.32 d7, d13
+ vst1.8 d4, [r2, : 64]
+ vst1.8 d5, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d6, [r2, : 64]
+ vst1.8 d7, [r4, : 64]
+ add r2, r3, #240
+ add r4, r3, #96
+ vld1.8 {d0-d1}, [r4, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4}, [r4, : 64]
+ add r4, r3, #144
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vtrn.32 q0, q3
+ vld1.8 {d8-d9}, [r4, : 128]!
+ vshl.i32 q5, q0, #4
+ vtrn.32 q1, q4
+ vshl.i32 q6, q3, #4
+ vadd.i32 q5, q5, q0
+ vadd.i32 q6, q6, q3
+ vshl.i32 q7, q1, #4
+ vld1.8 {d5}, [r4, : 64]
+ vshl.i32 q8, q4, #4
+ vtrn.32 d4, d5
+ vadd.i32 q7, q7, q1
+ vadd.i32 q8, q8, q4
+ vld1.8 {d18-d19}, [r2, : 128]!
+ vshl.i32 q10, q2, #4
+ vld1.8 {d22-d23}, [r2, : 128]!
+ vadd.i32 q10, q10, q2
+ vld1.8 {d24}, [r2, : 64]
+ vadd.i32 q5, q5, q0
+ add r2, r3, #192
+ vld1.8 {d26-d27}, [r2, : 128]!
+ vadd.i32 q6, q6, q3
+ vld1.8 {d28-d29}, [r2, : 128]!
+ vadd.i32 q8, q8, q4
+ vld1.8 {d25}, [r2, : 64]
+ vadd.i32 q10, q10, q2
+ vtrn.32 q9, q13
+ vadd.i32 q7, q7, q1
+ vadd.i32 q5, q5, q0
+ vtrn.32 q11, q14
+ vadd.i32 q6, q6, q3
+ add r2, sp, #528
+ vadd.i32 q10, q10, q2
+ vtrn.32 d24, d25
+ vst1.8 {d12-d13}, [r2, : 128]!
+ vshl.i32 q6, q13, #1
+ vst1.8 {d20-d21}, [r2, : 128]!
+ vshl.i32 q10, q14, #1
+ vst1.8 {d12-d13}, [r2, : 128]!
+ vshl.i32 q15, q12, #1
+ vadd.i32 q8, q8, q4
+ vext.32 d10, d31, d30, #0
+ vadd.i32 q7, q7, q1
+ vst1.8 {d16-d17}, [r2, : 128]!
+ vmull.s32 q8, d18, d5
+ vmlal.s32 q8, d26, d4
+ vmlal.s32 q8, d19, d9
+ vmlal.s32 q8, d27, d3
+ vmlal.s32 q8, d22, d8
+ vmlal.s32 q8, d28, d2
+ vmlal.s32 q8, d23, d7
+ vmlal.s32 q8, d29, d1
+ vmlal.s32 q8, d24, d6
+ vmlal.s32 q8, d25, d0
+ vst1.8 {d14-d15}, [r2, : 128]!
+ vmull.s32 q2, d18, d4
+ vmlal.s32 q2, d12, d9
+ vmlal.s32 q2, d13, d8
+ vmlal.s32 q2, d19, d3
+ vmlal.s32 q2, d22, d2
+ vmlal.s32 q2, d23, d1
+ vmlal.s32 q2, d24, d0
+ vst1.8 {d20-d21}, [r2, : 128]!
+ vmull.s32 q7, d18, d9
+ vmlal.s32 q7, d26, d3
+ vmlal.s32 q7, d19, d8
+ vmlal.s32 q7, d27, d2
+ vmlal.s32 q7, d22, d7
+ vmlal.s32 q7, d28, d1
+ vmlal.s32 q7, d23, d6
+ vmlal.s32 q7, d29, d0
+ vst1.8 {d10-d11}, [r2, : 128]!
+ vmull.s32 q5, d18, d3
+ vmlal.s32 q5, d19, d2
+ vmlal.s32 q5, d22, d1
+ vmlal.s32 q5, d23, d0
+ vmlal.s32 q5, d12, d8
+ vst1.8 {d16-d17}, [r2, : 128]
+ vmull.s32 q4, d18, d8
+ vmlal.s32 q4, d26, d2
+ vmlal.s32 q4, d19, d7
+ vmlal.s32 q4, d27, d1
+ vmlal.s32 q4, d22, d6
+ vmlal.s32 q4, d28, d0
+ vmull.s32 q8, d18, d7
+ vmlal.s32 q8, d26, d1
+ vmlal.s32 q8, d19, d6
+ vmlal.s32 q8, d27, d0
+ add r2, sp, #544
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q7, d24, d21
+ vmlal.s32 q7, d25, d20
+ vmlal.s32 q4, d23, d21
+ vmlal.s32 q4, d29, d20
+ vmlal.s32 q8, d22, d21
+ vmlal.s32 q8, d28, d20
+ vmlal.s32 q5, d24, d20
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q7, d18, d6
+ vmlal.s32 q7, d26, d0
+ add r2, sp, #624
+ vld1.8 {d30-d31}, [r2, : 128]
+ vmlal.s32 q2, d30, d21
+ vmlal.s32 q7, d19, d21
+ vmlal.s32 q7, d27, d20
+ add r2, sp, #592
+ vld1.8 {d26-d27}, [r2, : 128]
+ vmlal.s32 q4, d25, d27
+ vmlal.s32 q8, d29, d27
+ vmlal.s32 q8, d25, d26
+ vmlal.s32 q7, d28, d27
+ vmlal.s32 q7, d29, d26
+ add r2, sp, #576
+ vld1.8 {d28-d29}, [r2, : 128]
+ vmlal.s32 q4, d24, d29
+ vmlal.s32 q8, d23, d29
+ vmlal.s32 q8, d24, d28
+ vmlal.s32 q7, d22, d29
+ vmlal.s32 q7, d23, d28
+ vst1.8 {d8-d9}, [r2, : 128]
+ add r2, sp, #528
+ vld1.8 {d8-d9}, [r2, : 128]
+ vmlal.s32 q7, d24, d9
+ vmlal.s32 q7, d25, d31
+ vmull.s32 q1, d18, d2
+ vmlal.s32 q1, d19, d1
+ vmlal.s32 q1, d22, d0
+ vmlal.s32 q1, d24, d27
+ vmlal.s32 q1, d23, d20
+ vmlal.s32 q1, d12, d7
+ vmlal.s32 q1, d13, d6
+ vmull.s32 q6, d18, d1
+ vmlal.s32 q6, d19, d0
+ vmlal.s32 q6, d23, d27
+ vmlal.s32 q6, d22, d20
+ vmlal.s32 q6, d24, d26
+ vmull.s32 q0, d18, d0
+ vmlal.s32 q0, d22, d27
+ vmlal.s32 q0, d23, d26
+ vmlal.s32 q0, d24, d31
+ vmlal.s32 q0, d19, d20
+ add r2, sp, #608
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q2, d18, d7
+ vmlal.s32 q5, d18, d6
+ vmlal.s32 q1, d18, d21
+ vmlal.s32 q0, d18, d28
+ vmlal.s32 q6, d18, d29
+ vmlal.s32 q2, d19, d6
+ vmlal.s32 q5, d19, d21
+ vmlal.s32 q1, d19, d29
+ vmlal.s32 q0, d19, d9
+ vmlal.s32 q6, d19, d28
+ add r2, sp, #560
+ vld1.8 {d18-d19}, [r2, : 128]
+ add r2, sp, #480
+ vld1.8 {d22-d23}, [r2, : 128]
+ vmlal.s32 q5, d19, d7
+ vmlal.s32 q0, d18, d21
+ vmlal.s32 q0, d19, d29
+ vmlal.s32 q6, d18, d6
+ add r2, sp, #496
+ vld1.8 {d6-d7}, [r2, : 128]
+ vmlal.s32 q6, d19, d21
+ add r2, sp, #544
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q0, d30, d8
+ add r2, sp, #640
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q5, d30, d29
+ add r2, sp, #576
+ vld1.8 {d24-d25}, [r2, : 128]
+ vmlal.s32 q1, d30, d28
+ vadd.i64 q13, q0, q11
+ vadd.i64 q14, q5, q11
+ vmlal.s32 q6, d30, d9
+ vshr.s64 q4, q13, #26
+ vshr.s64 q13, q14, #26
+ vadd.i64 q7, q7, q4
+ vshl.i64 q4, q4, #26
+ vadd.i64 q14, q7, q3
+ vadd.i64 q9, q9, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q15, q9, q3
+ vsub.i64 q0, q0, q4
+ vshr.s64 q4, q14, #25
+ vsub.i64 q5, q5, q13
+ vshr.s64 q13, q15, #25
+ vadd.i64 q6, q6, q4
+ vshl.i64 q4, q4, #25
+ vadd.i64 q14, q6, q11
+ vadd.i64 q2, q2, q13
+ vsub.i64 q4, q7, q4
+ vshr.s64 q7, q14, #26
+ vshl.i64 q13, q13, #25
+ vadd.i64 q14, q2, q11
+ vadd.i64 q8, q8, q7
+ vshl.i64 q7, q7, #26
+ vadd.i64 q15, q8, q3
+ vsub.i64 q9, q9, q13
+ vshr.s64 q13, q14, #26
+ vsub.i64 q6, q6, q7
+ vshr.s64 q7, q15, #25
+ vadd.i64 q10, q10, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q14, q10, q3
+ vadd.i64 q1, q1, q7
+ add r2, r3, #144
+ vshl.i64 q7, q7, #25
+ add r4, r3, #96
+ vadd.i64 q15, q1, q11
+ add r2, r2, #8
+ vsub.i64 q2, q2, q13
+ add r4, r4, #8
+ vshr.s64 q13, q14, #25
+ vsub.i64 q7, q8, q7
+ vshr.s64 q8, q15, #26
+ vadd.i64 q14, q13, q13
+ vadd.i64 q12, q12, q8
+ vtrn.32 d12, d14
+ vshl.i64 q8, q8, #26
+ vtrn.32 d13, d15
+ vadd.i64 q3, q12, q3
+ vadd.i64 q0, q0, q14
+ vst1.8 d12, [r2, : 64]!
+ vshl.i64 q7, q13, #4
+ vst1.8 d13, [r4, : 64]!
+ vsub.i64 q1, q1, q8
+ vshr.s64 q3, q3, #25
+ vadd.i64 q0, q0, q7
+ vadd.i64 q5, q5, q3
+ vshl.i64 q3, q3, #25
+ vadd.i64 q6, q5, q11
+ vadd.i64 q0, q0, q13
+ vshl.i64 q7, q13, #25
+ vadd.i64 q8, q0, q11
+ vsub.i64 q3, q12, q3
+ vshr.s64 q6, q6, #26
+ vsub.i64 q7, q10, q7
+ vtrn.32 d2, d6
+ vshr.s64 q8, q8, #26
+ vtrn.32 d3, d7
+ vadd.i64 q3, q9, q6
+ vst1.8 d2, [r2, : 64]
+ vshl.i64 q6, q6, #26
+ vst1.8 d3, [r4, : 64]
+ vadd.i64 q1, q4, q8
+ vtrn.32 d4, d14
+ vshl.i64 q4, q8, #26
+ vtrn.32 d5, d15
+ vsub.i64 q5, q5, q6
+ add r2, r2, #16
+ vsub.i64 q0, q0, q4
+ vst1.8 d4, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d5, [r4, : 64]
+ vtrn.32 d10, d6
+ vtrn.32 d11, d7
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vst1.8 d10, [r2, : 64]
+ vst1.8 d11, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d0, [r2, : 64]
+ vst1.8 d1, [r4, : 64]
+ add r2, r3, #288
+ add r4, r3, #336
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vsub.i32 q0, q0, q1
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4-d5}, [r4, : 128]!
+ vsub.i32 q1, q1, q2
+ add r5, r3, #240
+ vld1.8 {d4}, [r2, : 64]
+ vld1.8 {d6}, [r4, : 64]
+ vsub.i32 q2, q2, q3
+ vst1.8 {d0-d1}, [r5, : 128]!
+ vst1.8 {d2-d3}, [r5, : 128]!
+ vst1.8 d4, [r5, : 64]
+ add r2, r3, #144
+ add r4, r3, #96
+ add r5, r3, #144
+ add r6, r3, #192
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vsub.i32 q2, q0, q1
+ vadd.i32 q0, q0, q1
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vsub.i32 q4, q1, q3
+ vadd.i32 q1, q1, q3
+ vld1.8 {d6}, [r2, : 64]
+ vld1.8 {d10}, [r4, : 64]
+ vsub.i32 q6, q3, q5
+ vadd.i32 q3, q3, q5
+ vst1.8 {d4-d5}, [r5, : 128]!
+ vst1.8 {d0-d1}, [r6, : 128]!
+ vst1.8 {d8-d9}, [r5, : 128]!
+ vst1.8 {d2-d3}, [r6, : 128]!
+ vst1.8 d12, [r5, : 64]
+ vst1.8 d6, [r6, : 64]
+ add r2, r3, #0
+ add r4, r3, #240
+ vld1.8 {d0-d1}, [r4, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4}, [r4, : 64]
+ add r4, r3, #336
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vtrn.32 q0, q3
+ vld1.8 {d8-d9}, [r4, : 128]!
+ vshl.i32 q5, q0, #4
+ vtrn.32 q1, q4
+ vshl.i32 q6, q3, #4
+ vadd.i32 q5, q5, q0
+ vadd.i32 q6, q6, q3
+ vshl.i32 q7, q1, #4
+ vld1.8 {d5}, [r4, : 64]
+ vshl.i32 q8, q4, #4
+ vtrn.32 d4, d5
+ vadd.i32 q7, q7, q1
+ vadd.i32 q8, q8, q4
+ vld1.8 {d18-d19}, [r2, : 128]!
+ vshl.i32 q10, q2, #4
+ vld1.8 {d22-d23}, [r2, : 128]!
+ vadd.i32 q10, q10, q2
+ vld1.8 {d24}, [r2, : 64]
+ vadd.i32 q5, q5, q0
+ add r2, r3, #288
+ vld1.8 {d26-d27}, [r2, : 128]!
+ vadd.i32 q6, q6, q3
+ vld1.8 {d28-d29}, [r2, : 128]!
+ vadd.i32 q8, q8, q4
+ vld1.8 {d25}, [r2, : 64]
+ vadd.i32 q10, q10, q2
+ vtrn.32 q9, q13
+ vadd.i32 q7, q7, q1
+ vadd.i32 q5, q5, q0
+ vtrn.32 q11, q14
+ vadd.i32 q6, q6, q3
+ add r2, sp, #528
+ vadd.i32 q10, q10, q2
+ vtrn.32 d24, d25
+ vst1.8 {d12-d13}, [r2, : 128]!
+ vshl.i32 q6, q13, #1
+ vst1.8 {d20-d21}, [r2, : 128]!
+ vshl.i32 q10, q14, #1
+ vst1.8 {d12-d13}, [r2, : 128]!
+ vshl.i32 q15, q12, #1
+ vadd.i32 q8, q8, q4
+ vext.32 d10, d31, d30, #0
+ vadd.i32 q7, q7, q1
+ vst1.8 {d16-d17}, [r2, : 128]!
+ vmull.s32 q8, d18, d5
+ vmlal.s32 q8, d26, d4
+ vmlal.s32 q8, d19, d9
+ vmlal.s32 q8, d27, d3
+ vmlal.s32 q8, d22, d8
+ vmlal.s32 q8, d28, d2
+ vmlal.s32 q8, d23, d7
+ vmlal.s32 q8, d29, d1
+ vmlal.s32 q8, d24, d6
+ vmlal.s32 q8, d25, d0
+ vst1.8 {d14-d15}, [r2, : 128]!
+ vmull.s32 q2, d18, d4
+ vmlal.s32 q2, d12, d9
+ vmlal.s32 q2, d13, d8
+ vmlal.s32 q2, d19, d3
+ vmlal.s32 q2, d22, d2
+ vmlal.s32 q2, d23, d1
+ vmlal.s32 q2, d24, d0
+ vst1.8 {d20-d21}, [r2, : 128]!
+ vmull.s32 q7, d18, d9
+ vmlal.s32 q7, d26, d3
+ vmlal.s32 q7, d19, d8
+ vmlal.s32 q7, d27, d2
+ vmlal.s32 q7, d22, d7
+ vmlal.s32 q7, d28, d1
+ vmlal.s32 q7, d23, d6
+ vmlal.s32 q7, d29, d0
+ vst1.8 {d10-d11}, [r2, : 128]!
+ vmull.s32 q5, d18, d3
+ vmlal.s32 q5, d19, d2
+ vmlal.s32 q5, d22, d1
+ vmlal.s32 q5, d23, d0
+ vmlal.s32 q5, d12, d8
+ vst1.8 {d16-d17}, [r2, : 128]!
+ vmull.s32 q4, d18, d8
+ vmlal.s32 q4, d26, d2
+ vmlal.s32 q4, d19, d7
+ vmlal.s32 q4, d27, d1
+ vmlal.s32 q4, d22, d6
+ vmlal.s32 q4, d28, d0
+ vmull.s32 q8, d18, d7
+ vmlal.s32 q8, d26, d1
+ vmlal.s32 q8, d19, d6
+ vmlal.s32 q8, d27, d0
+ add r2, sp, #544
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q7, d24, d21
+ vmlal.s32 q7, d25, d20
+ vmlal.s32 q4, d23, d21
+ vmlal.s32 q4, d29, d20
+ vmlal.s32 q8, d22, d21
+ vmlal.s32 q8, d28, d20
+ vmlal.s32 q5, d24, d20
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q7, d18, d6
+ vmlal.s32 q7, d26, d0
+ add r2, sp, #624
+ vld1.8 {d30-d31}, [r2, : 128]
+ vmlal.s32 q2, d30, d21
+ vmlal.s32 q7, d19, d21
+ vmlal.s32 q7, d27, d20
+ add r2, sp, #592
+ vld1.8 {d26-d27}, [r2, : 128]
+ vmlal.s32 q4, d25, d27
+ vmlal.s32 q8, d29, d27
+ vmlal.s32 q8, d25, d26
+ vmlal.s32 q7, d28, d27
+ vmlal.s32 q7, d29, d26
+ add r2, sp, #576
+ vld1.8 {d28-d29}, [r2, : 128]
+ vmlal.s32 q4, d24, d29
+ vmlal.s32 q8, d23, d29
+ vmlal.s32 q8, d24, d28
+ vmlal.s32 q7, d22, d29
+ vmlal.s32 q7, d23, d28
+ vst1.8 {d8-d9}, [r2, : 128]
+ add r2, sp, #528
+ vld1.8 {d8-d9}, [r2, : 128]
+ vmlal.s32 q7, d24, d9
+ vmlal.s32 q7, d25, d31
+ vmull.s32 q1, d18, d2
+ vmlal.s32 q1, d19, d1
+ vmlal.s32 q1, d22, d0
+ vmlal.s32 q1, d24, d27
+ vmlal.s32 q1, d23, d20
+ vmlal.s32 q1, d12, d7
+ vmlal.s32 q1, d13, d6
+ vmull.s32 q6, d18, d1
+ vmlal.s32 q6, d19, d0
+ vmlal.s32 q6, d23, d27
+ vmlal.s32 q6, d22, d20
+ vmlal.s32 q6, d24, d26
+ vmull.s32 q0, d18, d0
+ vmlal.s32 q0, d22, d27
+ vmlal.s32 q0, d23, d26
+ vmlal.s32 q0, d24, d31
+ vmlal.s32 q0, d19, d20
+ add r2, sp, #608
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q2, d18, d7
+ vmlal.s32 q5, d18, d6
+ vmlal.s32 q1, d18, d21
+ vmlal.s32 q0, d18, d28
+ vmlal.s32 q6, d18, d29
+ vmlal.s32 q2, d19, d6
+ vmlal.s32 q5, d19, d21
+ vmlal.s32 q1, d19, d29
+ vmlal.s32 q0, d19, d9
+ vmlal.s32 q6, d19, d28
+ add r2, sp, #560
+ vld1.8 {d18-d19}, [r2, : 128]
+ add r2, sp, #480
+ vld1.8 {d22-d23}, [r2, : 128]
+ vmlal.s32 q5, d19, d7
+ vmlal.s32 q0, d18, d21
+ vmlal.s32 q0, d19, d29
+ vmlal.s32 q6, d18, d6
+ add r2, sp, #496
+ vld1.8 {d6-d7}, [r2, : 128]
+ vmlal.s32 q6, d19, d21
+ add r2, sp, #544
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q0, d30, d8
+ add r2, sp, #640
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q5, d30, d29
+ add r2, sp, #576
+ vld1.8 {d24-d25}, [r2, : 128]
+ vmlal.s32 q1, d30, d28
+ vadd.i64 q13, q0, q11
+ vadd.i64 q14, q5, q11
+ vmlal.s32 q6, d30, d9
+ vshr.s64 q4, q13, #26
+ vshr.s64 q13, q14, #26
+ vadd.i64 q7, q7, q4
+ vshl.i64 q4, q4, #26
+ vadd.i64 q14, q7, q3
+ vadd.i64 q9, q9, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q15, q9, q3
+ vsub.i64 q0, q0, q4
+ vshr.s64 q4, q14, #25
+ vsub.i64 q5, q5, q13
+ vshr.s64 q13, q15, #25
+ vadd.i64 q6, q6, q4
+ vshl.i64 q4, q4, #25
+ vadd.i64 q14, q6, q11
+ vadd.i64 q2, q2, q13
+ vsub.i64 q4, q7, q4
+ vshr.s64 q7, q14, #26
+ vshl.i64 q13, q13, #25
+ vadd.i64 q14, q2, q11
+ vadd.i64 q8, q8, q7
+ vshl.i64 q7, q7, #26
+ vadd.i64 q15, q8, q3
+ vsub.i64 q9, q9, q13
+ vshr.s64 q13, q14, #26
+ vsub.i64 q6, q6, q7
+ vshr.s64 q7, q15, #25
+ vadd.i64 q10, q10, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q14, q10, q3
+ vadd.i64 q1, q1, q7
+ add r2, r3, #288
+ vshl.i64 q7, q7, #25
+ add r4, r3, #96
+ vadd.i64 q15, q1, q11
+ add r2, r2, #8
+ vsub.i64 q2, q2, q13
+ add r4, r4, #8
+ vshr.s64 q13, q14, #25
+ vsub.i64 q7, q8, q7
+ vshr.s64 q8, q15, #26
+ vadd.i64 q14, q13, q13
+ vadd.i64 q12, q12, q8
+ vtrn.32 d12, d14
+ vshl.i64 q8, q8, #26
+ vtrn.32 d13, d15
+ vadd.i64 q3, q12, q3
+ vadd.i64 q0, q0, q14
+ vst1.8 d12, [r2, : 64]!
+ vshl.i64 q7, q13, #4
+ vst1.8 d13, [r4, : 64]!
+ vsub.i64 q1, q1, q8
+ vshr.s64 q3, q3, #25
+ vadd.i64 q0, q0, q7
+ vadd.i64 q5, q5, q3
+ vshl.i64 q3, q3, #25
+ vadd.i64 q6, q5, q11
+ vadd.i64 q0, q0, q13
+ vshl.i64 q7, q13, #25
+ vadd.i64 q8, q0, q11
+ vsub.i64 q3, q12, q3
+ vshr.s64 q6, q6, #26
+ vsub.i64 q7, q10, q7
+ vtrn.32 d2, d6
+ vshr.s64 q8, q8, #26
+ vtrn.32 d3, d7
+ vadd.i64 q3, q9, q6
+ vst1.8 d2, [r2, : 64]
+ vshl.i64 q6, q6, #26
+ vst1.8 d3, [r4, : 64]
+ vadd.i64 q1, q4, q8
+ vtrn.32 d4, d14
+ vshl.i64 q4, q8, #26
+ vtrn.32 d5, d15
+ vsub.i64 q5, q5, q6
+ add r2, r2, #16
+ vsub.i64 q0, q0, q4
+ vst1.8 d4, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d5, [r4, : 64]
+ vtrn.32 d10, d6
+ vtrn.32 d11, d7
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vst1.8 d10, [r2, : 64]
+ vst1.8 d11, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d0, [r2, : 64]
+ vst1.8 d1, [r4, : 64]
+ add r2, sp, #512
+ add r4, r3, #144
+ add r5, r3, #192
+ vld1.8 {d0-d1}, [r2, : 128]
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4-d5}, [r5, : 128]!
+ vzip.i32 q1, q2
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vld1.8 {d8-d9}, [r5, : 128]!
+ vshl.i32 q5, q1, #1
+ vzip.i32 q3, q4
+ vshl.i32 q6, q2, #1
+ vld1.8 {d14}, [r4, : 64]
+ vshl.i32 q8, q3, #1
+ vld1.8 {d15}, [r5, : 64]
+ vshl.i32 q9, q4, #1
+ vmul.i32 d21, d7, d1
+ vtrn.32 d14, d15
+ vmul.i32 q11, q4, q0
+ vmul.i32 q0, q7, q0
+ vmull.s32 q12, d2, d2
+ vmlal.s32 q12, d11, d1
+ vmlal.s32 q12, d12, d0
+ vmlal.s32 q12, d13, d23
+ vmlal.s32 q12, d16, d22
+ vmlal.s32 q12, d7, d21
+ vmull.s32 q10, d2, d11
+ vmlal.s32 q10, d4, d1
+ vmlal.s32 q10, d13, d0
+ vmlal.s32 q10, d6, d23
+ vmlal.s32 q10, d17, d22
+ vmull.s32 q13, d10, d4
+ vmlal.s32 q13, d11, d3
+ vmlal.s32 q13, d13, d1
+ vmlal.s32 q13, d16, d0
+ vmlal.s32 q13, d17, d23
+ vmlal.s32 q13, d8, d22
+ vmull.s32 q1, d10, d5
+ vmlal.s32 q1, d11, d4
+ vmlal.s32 q1, d6, d1
+ vmlal.s32 q1, d17, d0
+ vmlal.s32 q1, d8, d23
+ vmull.s32 q14, d10, d6
+ vmlal.s32 q14, d11, d13
+ vmlal.s32 q14, d4, d4
+ vmlal.s32 q14, d17, d1
+ vmlal.s32 q14, d18, d0
+ vmlal.s32 q14, d9, d23
+ vmull.s32 q11, d10, d7
+ vmlal.s32 q11, d11, d6
+ vmlal.s32 q11, d12, d5
+ vmlal.s32 q11, d8, d1
+ vmlal.s32 q11, d19, d0
+ vmull.s32 q15, d10, d8
+ vmlal.s32 q15, d11, d17
+ vmlal.s32 q15, d12, d6
+ vmlal.s32 q15, d13, d5
+ vmlal.s32 q15, d19, d1
+ vmlal.s32 q15, d14, d0
+ vmull.s32 q2, d10, d9
+ vmlal.s32 q2, d11, d8
+ vmlal.s32 q2, d12, d7
+ vmlal.s32 q2, d13, d6
+ vmlal.s32 q2, d14, d1
+ vmull.s32 q0, d15, d1
+ vmlal.s32 q0, d10, d14
+ vmlal.s32 q0, d11, d19
+ vmlal.s32 q0, d12, d8
+ vmlal.s32 q0, d13, d17
+ vmlal.s32 q0, d6, d6
+ add r2, sp, #480
+ vld1.8 {d18-d19}, [r2, : 128]!
+ vmull.s32 q3, d16, d7
+ vmlal.s32 q3, d10, d15
+ vmlal.s32 q3, d11, d14
+ vmlal.s32 q3, d12, d9
+ vmlal.s32 q3, d13, d8
+ vld1.8 {d8-d9}, [r2, : 128]
+ vadd.i64 q5, q12, q9
+ vadd.i64 q6, q15, q9
+ vshr.s64 q5, q5, #26
+ vshr.s64 q6, q6, #26
+ vadd.i64 q7, q10, q5
+ vshl.i64 q5, q5, #26
+ vadd.i64 q8, q7, q4
+ vadd.i64 q2, q2, q6
+ vshl.i64 q6, q6, #26
+ vadd.i64 q10, q2, q4
+ vsub.i64 q5, q12, q5
+ vshr.s64 q8, q8, #25
+ vsub.i64 q6, q15, q6
+ vshr.s64 q10, q10, #25
+ vadd.i64 q12, q13, q8
+ vshl.i64 q8, q8, #25
+ vadd.i64 q13, q12, q9
+ vadd.i64 q0, q0, q10
+ vsub.i64 q7, q7, q8
+ vshr.s64 q8, q13, #26
+ vshl.i64 q10, q10, #25
+ vadd.i64 q13, q0, q9
+ vadd.i64 q1, q1, q8
+ vshl.i64 q8, q8, #26
+ vadd.i64 q15, q1, q4
+ vsub.i64 q2, q2, q10
+ vshr.s64 q10, q13, #26
+ vsub.i64 q8, q12, q8
+ vshr.s64 q12, q15, #25
+ vadd.i64 q3, q3, q10
+ vshl.i64 q10, q10, #26
+ vadd.i64 q13, q3, q4
+ vadd.i64 q14, q14, q12
+ add r2, r3, #144
+ vshl.i64 q12, q12, #25
+ add r4, r3, #192
+ vadd.i64 q15, q14, q9
+ add r2, r2, #8
+ vsub.i64 q0, q0, q10
+ add r4, r4, #8
+ vshr.s64 q10, q13, #25
+ vsub.i64 q1, q1, q12
+ vshr.s64 q12, q15, #26
+ vadd.i64 q13, q10, q10
+ vadd.i64 q11, q11, q12
+ vtrn.32 d16, d2
+ vshl.i64 q12, q12, #26
+ vtrn.32 d17, d3
+ vadd.i64 q1, q11, q4
+ vadd.i64 q4, q5, q13
+ vst1.8 d16, [r2, : 64]!
+ vshl.i64 q5, q10, #4
+ vst1.8 d17, [r4, : 64]!
+ vsub.i64 q8, q14, q12
+ vshr.s64 q1, q1, #25
+ vadd.i64 q4, q4, q5
+ vadd.i64 q5, q6, q1
+ vshl.i64 q1, q1, #25
+ vadd.i64 q6, q5, q9
+ vadd.i64 q4, q4, q10
+ vshl.i64 q10, q10, #25
+ vadd.i64 q9, q4, q9
+ vsub.i64 q1, q11, q1
+ vshr.s64 q6, q6, #26
+ vsub.i64 q3, q3, q10
+ vtrn.32 d16, d2
+ vshr.s64 q9, q9, #26
+ vtrn.32 d17, d3
+ vadd.i64 q1, q2, q6
+ vst1.8 d16, [r2, : 64]
+ vshl.i64 q2, q6, #26
+ vst1.8 d17, [r4, : 64]
+ vadd.i64 q6, q7, q9
+ vtrn.32 d0, d6
+ vshl.i64 q7, q9, #26
+ vtrn.32 d1, d7
+ vsub.i64 q2, q5, q2
+ add r2, r2, #16
+ vsub.i64 q3, q4, q7
+ vst1.8 d0, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d1, [r4, : 64]
+ vtrn.32 d4, d2
+ vtrn.32 d5, d3
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d6, d12
+ vtrn.32 d7, d13
+ vst1.8 d4, [r2, : 64]
+ vst1.8 d5, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d6, [r2, : 64]
+ vst1.8 d7, [r4, : 64]
+ add r2, r3, #336
+ add r4, r3, #288
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vadd.i32 q0, q0, q1
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4-d5}, [r4, : 128]!
+ vadd.i32 q1, q1, q2
+ add r5, r3, #288
+ vld1.8 {d4}, [r2, : 64]
+ vld1.8 {d6}, [r4, : 64]
+ vadd.i32 q2, q2, q3
+ vst1.8 {d0-d1}, [r5, : 128]!
+ vst1.8 {d2-d3}, [r5, : 128]!
+ vst1.8 d4, [r5, : 64]
+ add r2, r3, #48
+ add r4, r3, #144
+ vld1.8 {d0-d1}, [r4, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4}, [r4, : 64]
+ add r4, r3, #288
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vtrn.32 q0, q3
+ vld1.8 {d8-d9}, [r4, : 128]!
+ vshl.i32 q5, q0, #4
+ vtrn.32 q1, q4
+ vshl.i32 q6, q3, #4
+ vadd.i32 q5, q5, q0
+ vadd.i32 q6, q6, q3
+ vshl.i32 q7, q1, #4
+ vld1.8 {d5}, [r4, : 64]
+ vshl.i32 q8, q4, #4
+ vtrn.32 d4, d5
+ vadd.i32 q7, q7, q1
+ vadd.i32 q8, q8, q4
+ vld1.8 {d18-d19}, [r2, : 128]!
+ vshl.i32 q10, q2, #4
+ vld1.8 {d22-d23}, [r2, : 128]!
+ vadd.i32 q10, q10, q2
+ vld1.8 {d24}, [r2, : 64]
+ vadd.i32 q5, q5, q0
+ add r2, r3, #240
+ vld1.8 {d26-d27}, [r2, : 128]!
+ vadd.i32 q6, q6, q3
+ vld1.8 {d28-d29}, [r2, : 128]!
+ vadd.i32 q8, q8, q4
+ vld1.8 {d25}, [r2, : 64]
+ vadd.i32 q10, q10, q2
+ vtrn.32 q9, q13
+ vadd.i32 q7, q7, q1
+ vadd.i32 q5, q5, q0
+ vtrn.32 q11, q14
+ vadd.i32 q6, q6, q3
+ add r2, sp, #528
+ vadd.i32 q10, q10, q2
+ vtrn.32 d24, d25
+ vst1.8 {d12-d13}, [r2, : 128]!
+ vshl.i32 q6, q13, #1
+ vst1.8 {d20-d21}, [r2, : 128]!
+ vshl.i32 q10, q14, #1
+ vst1.8 {d12-d13}, [r2, : 128]!
+ vshl.i32 q15, q12, #1
+ vadd.i32 q8, q8, q4
+ vext.32 d10, d31, d30, #0
+ vadd.i32 q7, q7, q1
+ vst1.8 {d16-d17}, [r2, : 128]!
+ vmull.s32 q8, d18, d5
+ vmlal.s32 q8, d26, d4
+ vmlal.s32 q8, d19, d9
+ vmlal.s32 q8, d27, d3
+ vmlal.s32 q8, d22, d8
+ vmlal.s32 q8, d28, d2
+ vmlal.s32 q8, d23, d7
+ vmlal.s32 q8, d29, d1
+ vmlal.s32 q8, d24, d6
+ vmlal.s32 q8, d25, d0
+ vst1.8 {d14-d15}, [r2, : 128]!
+ vmull.s32 q2, d18, d4
+ vmlal.s32 q2, d12, d9
+ vmlal.s32 q2, d13, d8
+ vmlal.s32 q2, d19, d3
+ vmlal.s32 q2, d22, d2
+ vmlal.s32 q2, d23, d1
+ vmlal.s32 q2, d24, d0
+ vst1.8 {d20-d21}, [r2, : 128]!
+ vmull.s32 q7, d18, d9
+ vmlal.s32 q7, d26, d3
+ vmlal.s32 q7, d19, d8
+ vmlal.s32 q7, d27, d2
+ vmlal.s32 q7, d22, d7
+ vmlal.s32 q7, d28, d1
+ vmlal.s32 q7, d23, d6
+ vmlal.s32 q7, d29, d0
+ vst1.8 {d10-d11}, [r2, : 128]!
+ vmull.s32 q5, d18, d3
+ vmlal.s32 q5, d19, d2
+ vmlal.s32 q5, d22, d1
+ vmlal.s32 q5, d23, d0
+ vmlal.s32 q5, d12, d8
+ vst1.8 {d16-d17}, [r2, : 128]!
+ vmull.s32 q4, d18, d8
+ vmlal.s32 q4, d26, d2
+ vmlal.s32 q4, d19, d7
+ vmlal.s32 q4, d27, d1
+ vmlal.s32 q4, d22, d6
+ vmlal.s32 q4, d28, d0
+ vmull.s32 q8, d18, d7
+ vmlal.s32 q8, d26, d1
+ vmlal.s32 q8, d19, d6
+ vmlal.s32 q8, d27, d0
+ add r2, sp, #544
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q7, d24, d21
+ vmlal.s32 q7, d25, d20
+ vmlal.s32 q4, d23, d21
+ vmlal.s32 q4, d29, d20
+ vmlal.s32 q8, d22, d21
+ vmlal.s32 q8, d28, d20
+ vmlal.s32 q5, d24, d20
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q7, d18, d6
+ vmlal.s32 q7, d26, d0
+ add r2, sp, #624
+ vld1.8 {d30-d31}, [r2, : 128]
+ vmlal.s32 q2, d30, d21
+ vmlal.s32 q7, d19, d21
+ vmlal.s32 q7, d27, d20
+ add r2, sp, #592
+ vld1.8 {d26-d27}, [r2, : 128]
+ vmlal.s32 q4, d25, d27
+ vmlal.s32 q8, d29, d27
+ vmlal.s32 q8, d25, d26
+ vmlal.s32 q7, d28, d27
+ vmlal.s32 q7, d29, d26
+ add r2, sp, #576
+ vld1.8 {d28-d29}, [r2, : 128]
+ vmlal.s32 q4, d24, d29
+ vmlal.s32 q8, d23, d29
+ vmlal.s32 q8, d24, d28
+ vmlal.s32 q7, d22, d29
+ vmlal.s32 q7, d23, d28
+ vst1.8 {d8-d9}, [r2, : 128]
+ add r2, sp, #528
+ vld1.8 {d8-d9}, [r2, : 128]
+ vmlal.s32 q7, d24, d9
+ vmlal.s32 q7, d25, d31
+ vmull.s32 q1, d18, d2
+ vmlal.s32 q1, d19, d1
+ vmlal.s32 q1, d22, d0
+ vmlal.s32 q1, d24, d27
+ vmlal.s32 q1, d23, d20
+ vmlal.s32 q1, d12, d7
+ vmlal.s32 q1, d13, d6
+ vmull.s32 q6, d18, d1
+ vmlal.s32 q6, d19, d0
+ vmlal.s32 q6, d23, d27
+ vmlal.s32 q6, d22, d20
+ vmlal.s32 q6, d24, d26
+ vmull.s32 q0, d18, d0
+ vmlal.s32 q0, d22, d27
+ vmlal.s32 q0, d23, d26
+ vmlal.s32 q0, d24, d31
+ vmlal.s32 q0, d19, d20
+ add r2, sp, #608
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q2, d18, d7
+ vmlal.s32 q5, d18, d6
+ vmlal.s32 q1, d18, d21
+ vmlal.s32 q0, d18, d28
+ vmlal.s32 q6, d18, d29
+ vmlal.s32 q2, d19, d6
+ vmlal.s32 q5, d19, d21
+ vmlal.s32 q1, d19, d29
+ vmlal.s32 q0, d19, d9
+ vmlal.s32 q6, d19, d28
+ add r2, sp, #560
+ vld1.8 {d18-d19}, [r2, : 128]
+ add r2, sp, #480
+ vld1.8 {d22-d23}, [r2, : 128]
+ vmlal.s32 q5, d19, d7
+ vmlal.s32 q0, d18, d21
+ vmlal.s32 q0, d19, d29
+ vmlal.s32 q6, d18, d6
+ add r2, sp, #496
+ vld1.8 {d6-d7}, [r2, : 128]
+ vmlal.s32 q6, d19, d21
+ add r2, sp, #544
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q0, d30, d8
+ add r2, sp, #640
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q5, d30, d29
+ add r2, sp, #576
+ vld1.8 {d24-d25}, [r2, : 128]
+ vmlal.s32 q1, d30, d28
+ vadd.i64 q13, q0, q11
+ vadd.i64 q14, q5, q11
+ vmlal.s32 q6, d30, d9
+ vshr.s64 q4, q13, #26
+ vshr.s64 q13, q14, #26
+ vadd.i64 q7, q7, q4
+ vshl.i64 q4, q4, #26
+ vadd.i64 q14, q7, q3
+ vadd.i64 q9, q9, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q15, q9, q3
+ vsub.i64 q0, q0, q4
+ vshr.s64 q4, q14, #25
+ vsub.i64 q5, q5, q13
+ vshr.s64 q13, q15, #25
+ vadd.i64 q6, q6, q4
+ vshl.i64 q4, q4, #25
+ vadd.i64 q14, q6, q11
+ vadd.i64 q2, q2, q13
+ vsub.i64 q4, q7, q4
+ vshr.s64 q7, q14, #26
+ vshl.i64 q13, q13, #25
+ vadd.i64 q14, q2, q11
+ vadd.i64 q8, q8, q7
+ vshl.i64 q7, q7, #26
+ vadd.i64 q15, q8, q3
+ vsub.i64 q9, q9, q13
+ vshr.s64 q13, q14, #26
+ vsub.i64 q6, q6, q7
+ vshr.s64 q7, q15, #25
+ vadd.i64 q10, q10, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q14, q10, q3
+ vadd.i64 q1, q1, q7
+ add r2, r3, #240
+ vshl.i64 q7, q7, #25
+ add r4, r3, #144
+ vadd.i64 q15, q1, q11
+ add r2, r2, #8
+ vsub.i64 q2, q2, q13
+ add r4, r4, #8
+ vshr.s64 q13, q14, #25
+ vsub.i64 q7, q8, q7
+ vshr.s64 q8, q15, #26
+ vadd.i64 q14, q13, q13
+ vadd.i64 q12, q12, q8
+ vtrn.32 d12, d14
+ vshl.i64 q8, q8, #26
+ vtrn.32 d13, d15
+ vadd.i64 q3, q12, q3
+ vadd.i64 q0, q0, q14
+ vst1.8 d12, [r2, : 64]!
+ vshl.i64 q7, q13, #4
+ vst1.8 d13, [r4, : 64]!
+ vsub.i64 q1, q1, q8
+ vshr.s64 q3, q3, #25
+ vadd.i64 q0, q0, q7
+ vadd.i64 q5, q5, q3
+ vshl.i64 q3, q3, #25
+ vadd.i64 q6, q5, q11
+ vadd.i64 q0, q0, q13
+ vshl.i64 q7, q13, #25
+ vadd.i64 q8, q0, q11
+ vsub.i64 q3, q12, q3
+ vshr.s64 q6, q6, #26
+ vsub.i64 q7, q10, q7
+ vtrn.32 d2, d6
+ vshr.s64 q8, q8, #26
+ vtrn.32 d3, d7
+ vadd.i64 q3, q9, q6
+ vst1.8 d2, [r2, : 64]
+ vshl.i64 q6, q6, #26
+ vst1.8 d3, [r4, : 64]
+ vadd.i64 q1, q4, q8
+ vtrn.32 d4, d14
+ vshl.i64 q4, q8, #26
+ vtrn.32 d5, d15
+ vsub.i64 q5, q5, q6
+ add r2, r2, #16
+ vsub.i64 q0, q0, q4
+ vst1.8 d4, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d5, [r4, : 64]
+ vtrn.32 d10, d6
+ vtrn.32 d11, d7
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vst1.8 d10, [r2, : 64]
+ vst1.8 d11, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d0, [r2, : 64]
+ vst1.8 d1, [r4, : 64]
+ ldr r2, [sp, #456]
+ ldr r4, [sp, #460]
+ subs r5, r2, #1
+ bge .Lmainloop
+ add r1, r3, #144
+ add r2, r3, #336
+ vld1.8 {d0-d1}, [r1, : 128]!
+ vld1.8 {d2-d3}, [r1, : 128]!
+ vld1.8 {d4}, [r1, : 64]
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 {d2-d3}, [r2, : 128]!
+ vst1.8 d4, [r2, : 64]
+ movw r1, #0
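+	@ Field inversion by Fermat's little theorem: raise to
+	@ p - 2 = 2^255 - 21 with a fixed square-and-multiply chain.
+	@ Each pass sets r5 to the number of squarings for the step,
+	@ r2 to the multiply operand and r4 to an optional save address.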
+.Linvertloop:
+ add r2, r3, #144
+ movw r4, #0
+ movw r5, #2
+ cmp r1, #1
+ moveq r5, #1
+ addeq r2, r3, #336
+ addeq r4, r3, #48
+ cmp r1, #2
+ moveq r5, #1
+ addeq r2, r3, #48
+ cmp r1, #3
+ moveq r5, #5
+ addeq r4, r3, #336
+ cmp r1, #4
+ moveq r5, #10
+ cmp r1, #5
+ moveq r5, #20
+ cmp r1, #6
+ moveq r5, #10
+ addeq r2, r3, #336
+ addeq r4, r3, #336
+ cmp r1, #7
+ moveq r5, #50
+ cmp r1, #8
+ moveq r5, #100
+ cmp r1, #9
+ moveq r5, #50
+ addeq r2, r3, #336
+ cmp r1, #10
+ moveq r5, #5
+ addeq r2, r3, #48
+ cmp r1, #11
+ moveq r5, #0
+ addeq r2, r3, #96
+ add r6, r3, #144
+ add r7, r3, #288
+ vld1.8 {d0-d1}, [r6, : 128]!
+ vld1.8 {d2-d3}, [r6, : 128]!
+ vld1.8 {d4}, [r6, : 64]
+ vst1.8 {d0-d1}, [r7, : 128]!
+ vst1.8 {d2-d3}, [r7, : 128]!
+ vst1.8 d4, [r7, : 64]
+ cmp r5, #0
+ beq .Lskipsquaringloop
+.Lsquaringloop:
+ add r6, r3, #288
+ add r7, r3, #288
+ add r8, r3, #288
+ vmov.i32 q0, #19
+ vmov.i32 q1, #0
+ vmov.i32 q2, #1
+ vzip.i32 q1, q2
+ vld1.8 {d4-d5}, [r7, : 128]!
+ vld1.8 {d6-d7}, [r7, : 128]!
+ vld1.8 {d9}, [r7, : 64]
+ vld1.8 {d10-d11}, [r6, : 128]!
+ add r7, sp, #384
+ vld1.8 {d12-d13}, [r6, : 128]!
+ vmul.i32 q7, q2, q0
+ vld1.8 {d8}, [r6, : 64]
+ vext.32 d17, d11, d10, #1
+ vmul.i32 q9, q3, q0
+ vext.32 d16, d10, d8, #1
+ vshl.u32 q10, q5, q1
+ vext.32 d22, d14, d4, #1
+ vext.32 d24, d18, d6, #1
+ vshl.u32 q13, q6, q1
+ vshl.u32 d28, d8, d2
+ vrev64.i32 d22, d22
+ vmul.i32 d1, d9, d1
+ vrev64.i32 d24, d24
+ vext.32 d29, d8, d13, #1
+ vext.32 d0, d1, d9, #1
+ vrev64.i32 d0, d0
+ vext.32 d2, d9, d1, #1
+ vext.32 d23, d15, d5, #1
+ vmull.s32 q4, d20, d4
+ vrev64.i32 d23, d23
+ vmlal.s32 q4, d21, d1
+ vrev64.i32 d2, d2
+ vmlal.s32 q4, d26, d19
+ vext.32 d3, d5, d15, #1
+ vmlal.s32 q4, d27, d18
+ vrev64.i32 d3, d3
+ vmlal.s32 q4, d28, d15
+ vext.32 d14, d12, d11, #1
+ vmull.s32 q5, d16, d23
+ vext.32 d15, d13, d12, #1
+ vmlal.s32 q5, d17, d4
+ vst1.8 d8, [r7, : 64]!
+ vmlal.s32 q5, d14, d1
+ vext.32 d12, d9, d8, #0
+ vmlal.s32 q5, d15, d19
+ vmov.i64 d13, #0
+ vmlal.s32 q5, d29, d18
+ vext.32 d25, d19, d7, #1
+ vmlal.s32 q6, d20, d5
+ vrev64.i32 d25, d25
+ vmlal.s32 q6, d21, d4
+ vst1.8 d11, [r7, : 64]!
+ vmlal.s32 q6, d26, d1
+ vext.32 d9, d10, d10, #0
+ vmlal.s32 q6, d27, d19
+ vmov.i64 d8, #0
+ vmlal.s32 q6, d28, d18
+ vmlal.s32 q4, d16, d24
+ vmlal.s32 q4, d17, d5
+ vmlal.s32 q4, d14, d4
+ vst1.8 d12, [r7, : 64]!
+ vmlal.s32 q4, d15, d1
+ vext.32 d10, d13, d12, #0
+ vmlal.s32 q4, d29, d19
+ vmov.i64 d11, #0
+ vmlal.s32 q5, d20, d6
+ vmlal.s32 q5, d21, d5
+ vmlal.s32 q5, d26, d4
+ vext.32 d13, d8, d8, #0
+ vmlal.s32 q5, d27, d1
+ vmov.i64 d12, #0
+ vmlal.s32 q5, d28, d19
+ vst1.8 d9, [r7, : 64]!
+ vmlal.s32 q6, d16, d25
+ vmlal.s32 q6, d17, d6
+ vst1.8 d10, [r7, : 64]
+ vmlal.s32 q6, d14, d5
+ vext.32 d8, d11, d10, #0
+ vmlal.s32 q6, d15, d4
+ vmov.i64 d9, #0
+ vmlal.s32 q6, d29, d1
+ vmlal.s32 q4, d20, d7
+ vmlal.s32 q4, d21, d6
+ vmlal.s32 q4, d26, d5
+ vext.32 d11, d12, d12, #0
+ vmlal.s32 q4, d27, d4
+ vmov.i64 d10, #0
+ vmlal.s32 q4, d28, d1
+ vmlal.s32 q5, d16, d0
+ sub r6, r7, #32
+ vmlal.s32 q5, d17, d7
+ vmlal.s32 q5, d14, d6
+ vext.32 d30, d9, d8, #0
+ vmlal.s32 q5, d15, d5
+ vld1.8 {d31}, [r6, : 64]!
+ vmlal.s32 q5, d29, d4
+ vmlal.s32 q15, d20, d0
+ vext.32 d0, d6, d18, #1
+ vmlal.s32 q15, d21, d25
+ vrev64.i32 d0, d0
+ vmlal.s32 q15, d26, d24
+ vext.32 d1, d7, d19, #1
+ vext.32 d7, d10, d10, #0
+ vmlal.s32 q15, d27, d23
+ vrev64.i32 d1, d1
+ vld1.8 {d6}, [r6, : 64]
+ vmlal.s32 q15, d28, d22
+ vmlal.s32 q3, d16, d4
+ add r6, r6, #24
+ vmlal.s32 q3, d17, d2
+ vext.32 d4, d31, d30, #0
+ vmov d17, d11
+ vmlal.s32 q3, d14, d1
+ vext.32 d11, d13, d13, #0
+ vext.32 d13, d30, d30, #0
+ vmlal.s32 q3, d15, d0
+ vext.32 d1, d8, d8, #0
+ vmlal.s32 q3, d29, d3
+ vld1.8 {d5}, [r6, : 64]
+ sub r6, r6, #16
+ vext.32 d10, d6, d6, #0
+ vmov.i32 q1, #0xffffffff
+ vshl.i64 q4, q1, #25
+ add r7, sp, #480
+ vld1.8 {d14-d15}, [r7, : 128]
+ vadd.i64 q9, q2, q7
+ vshl.i64 q1, q1, #26
+ vshr.s64 q10, q9, #26
+ vld1.8 {d0}, [r6, : 64]!
+ vadd.i64 q5, q5, q10
+ vand q9, q9, q1
+ vld1.8 {d16}, [r6, : 64]!
+ add r6, sp, #496
+ vld1.8 {d20-d21}, [r6, : 128]
+ vadd.i64 q11, q5, q10
+ vsub.i64 q2, q2, q9
+ vshr.s64 q9, q11, #25
+ vext.32 d12, d5, d4, #0
+ vand q11, q11, q4
+ vadd.i64 q0, q0, q9
+ vmov d19, d7
+ vadd.i64 q3, q0, q7
+ vsub.i64 q5, q5, q11
+ vshr.s64 q11, q3, #26
+ vext.32 d18, d11, d10, #0
+ vand q3, q3, q1
+ vadd.i64 q8, q8, q11
+ vadd.i64 q11, q8, q10
+ vsub.i64 q0, q0, q3
+ vshr.s64 q3, q11, #25
+ vand q11, q11, q4
+ vadd.i64 q3, q6, q3
+ vadd.i64 q6, q3, q7
+ vsub.i64 q8, q8, q11
+ vshr.s64 q11, q6, #26
+ vand q6, q6, q1
+ vadd.i64 q9, q9, q11
+ vadd.i64 d25, d19, d21
+ vsub.i64 q3, q3, q6
+ vshr.s64 d23, d25, #25
+ vand q4, q12, q4
+ vadd.i64 d21, d23, d23
+ vshl.i64 d25, d23, #4
+ vadd.i64 d21, d21, d23
+ vadd.i64 d25, d25, d21
+ vadd.i64 d4, d4, d25
+ vzip.i32 q0, q8
+ vadd.i64 d12, d4, d14
+ add r6, r8, #8
+ vst1.8 d0, [r6, : 64]
+ vsub.i64 d19, d19, d9
+ add r6, r6, #16
+ vst1.8 d16, [r6, : 64]
+ vshr.s64 d22, d12, #26
+ vand q0, q6, q1
+ vadd.i64 d10, d10, d22
+ vzip.i32 q3, q9
+ vsub.i64 d4, d4, d0
+ sub r6, r6, #8
+ vst1.8 d6, [r6, : 64]
+ add r6, r6, #16
+ vst1.8 d18, [r6, : 64]
+ vzip.i32 q2, q5
+ sub r6, r6, #32
+ vst1.8 d4, [r6, : 64]
+ subs r5, r5, #1
+ bhi .Lsquaringloop
+.Lskipsquaringloop:
+ mov r2, r2
+ add r5, r3, #288
+ add r6, r3, #144
+ vmov.i32 q0, #19
+ vmov.i32 q1, #0
+ vmov.i32 q2, #1
+ vzip.i32 q1, q2
+ vld1.8 {d4-d5}, [r5, : 128]!
+ vld1.8 {d6-d7}, [r5, : 128]!
+ vld1.8 {d9}, [r5, : 64]
+ vld1.8 {d10-d11}, [r2, : 128]!
+ add r5, sp, #384
+ vld1.8 {d12-d13}, [r2, : 128]!
+ vmul.i32 q7, q2, q0
+ vld1.8 {d8}, [r2, : 64]
+ vext.32 d17, d11, d10, #1
+ vmul.i32 q9, q3, q0
+ vext.32 d16, d10, d8, #1
+ vshl.u32 q10, q5, q1
+ vext.32 d22, d14, d4, #1
+ vext.32 d24, d18, d6, #1
+ vshl.u32 q13, q6, q1
+ vshl.u32 d28, d8, d2
+ vrev64.i32 d22, d22
+ vmul.i32 d1, d9, d1
+ vrev64.i32 d24, d24
+ vext.32 d29, d8, d13, #1
+ vext.32 d0, d1, d9, #1
+ vrev64.i32 d0, d0
+ vext.32 d2, d9, d1, #1
+ vext.32 d23, d15, d5, #1
+ vmull.s32 q4, d20, d4
+ vrev64.i32 d23, d23
+ vmlal.s32 q4, d21, d1
+ vrev64.i32 d2, d2
+ vmlal.s32 q4, d26, d19
+ vext.32 d3, d5, d15, #1
+ vmlal.s32 q4, d27, d18
+ vrev64.i32 d3, d3
+ vmlal.s32 q4, d28, d15
+ vext.32 d14, d12, d11, #1
+ vmull.s32 q5, d16, d23
+ vext.32 d15, d13, d12, #1
+ vmlal.s32 q5, d17, d4
+ vst1.8 d8, [r5, : 64]!
+ vmlal.s32 q5, d14, d1
+ vext.32 d12, d9, d8, #0
+ vmlal.s32 q5, d15, d19
+ vmov.i64 d13, #0
+ vmlal.s32 q5, d29, d18
+ vext.32 d25, d19, d7, #1
+ vmlal.s32 q6, d20, d5
+ vrev64.i32 d25, d25
+ vmlal.s32 q6, d21, d4
+ vst1.8 d11, [r5, : 64]!
+ vmlal.s32 q6, d26, d1
+ vext.32 d9, d10, d10, #0
+ vmlal.s32 q6, d27, d19
+ vmov.i64 d8, #0
+ vmlal.s32 q6, d28, d18
+ vmlal.s32 q4, d16, d24
+ vmlal.s32 q4, d17, d5
+ vmlal.s32 q4, d14, d4
+ vst1.8 d12, [r5, : 64]!
+ vmlal.s32 q4, d15, d1
+ vext.32 d10, d13, d12, #0
+ vmlal.s32 q4, d29, d19
+ vmov.i64 d11, #0
+ vmlal.s32 q5, d20, d6
+ vmlal.s32 q5, d21, d5
+ vmlal.s32 q5, d26, d4
+ vext.32 d13, d8, d8, #0
+ vmlal.s32 q5, d27, d1
+ vmov.i64 d12, #0
+ vmlal.s32 q5, d28, d19
+ vst1.8 d9, [r5, : 64]!
+ vmlal.s32 q6, d16, d25
+ vmlal.s32 q6, d17, d6
+ vst1.8 d10, [r5, : 64]
+ vmlal.s32 q6, d14, d5
+ vext.32 d8, d11, d10, #0
+ vmlal.s32 q6, d15, d4
+ vmov.i64 d9, #0
+ vmlal.s32 q6, d29, d1
+ vmlal.s32 q4, d20, d7
+ vmlal.s32 q4, d21, d6
+ vmlal.s32 q4, d26, d5
+ vext.32 d11, d12, d12, #0
+ vmlal.s32 q4, d27, d4
+ vmov.i64 d10, #0
+ vmlal.s32 q4, d28, d1
+ vmlal.s32 q5, d16, d0
+ sub r2, r5, #32
+ vmlal.s32 q5, d17, d7
+ vmlal.s32 q5, d14, d6
+ vext.32 d30, d9, d8, #0
+ vmlal.s32 q5, d15, d5
+ vld1.8 {d31}, [r2, : 64]!
+ vmlal.s32 q5, d29, d4
+ vmlal.s32 q15, d20, d0
+ vext.32 d0, d6, d18, #1
+ vmlal.s32 q15, d21, d25
+ vrev64.i32 d0, d0
+ vmlal.s32 q15, d26, d24
+ vext.32 d1, d7, d19, #1
+ vext.32 d7, d10, d10, #0
+ vmlal.s32 q15, d27, d23
+ vrev64.i32 d1, d1
+ vld1.8 {d6}, [r2, : 64]
+ vmlal.s32 q15, d28, d22
+ vmlal.s32 q3, d16, d4
+ add r2, r2, #24
+ vmlal.s32 q3, d17, d2
+ vext.32 d4, d31, d30, #0
+ vmov d17, d11
+ vmlal.s32 q3, d14, d1
+ vext.32 d11, d13, d13, #0
+ vext.32 d13, d30, d30, #0
+ vmlal.s32 q3, d15, d0
+ vext.32 d1, d8, d8, #0
+ vmlal.s32 q3, d29, d3
+ vld1.8 {d5}, [r2, : 64]
+ sub r2, r2, #16
+ vext.32 d10, d6, d6, #0
+ vmov.i32 q1, #0xffffffff
+ vshl.i64 q4, q1, #25
+ add r5, sp, #480
+ vld1.8 {d14-d15}, [r5, : 128]
+ vadd.i64 q9, q2, q7
+ vshl.i64 q1, q1, #26
+ vshr.s64 q10, q9, #26
+ vld1.8 {d0}, [r2, : 64]!
+ vadd.i64 q5, q5, q10
+ vand q9, q9, q1
+ vld1.8 {d16}, [r2, : 64]!
+ add r2, sp, #496
+ vld1.8 {d20-d21}, [r2, : 128]
+ vadd.i64 q11, q5, q10
+ vsub.i64 q2, q2, q9
+ vshr.s64 q9, q11, #25
+ vext.32 d12, d5, d4, #0
+ vand q11, q11, q4
+ vadd.i64 q0, q0, q9
+ vmov d19, d7
+ vadd.i64 q3, q0, q7
+ vsub.i64 q5, q5, q11
+ vshr.s64 q11, q3, #26
+ vext.32 d18, d11, d10, #0
+ vand q3, q3, q1
+ vadd.i64 q8, q8, q11
+ vadd.i64 q11, q8, q10
+ vsub.i64 q0, q0, q3
+ vshr.s64 q3, q11, #25
+ vand q11, q11, q4
+ vadd.i64 q3, q6, q3
+ vadd.i64 q6, q3, q7
+ vsub.i64 q8, q8, q11
+ vshr.s64 q11, q6, #26
+ vand q6, q6, q1
+ vadd.i64 q9, q9, q11
+ vadd.i64 d25, d19, d21
+ vsub.i64 q3, q3, q6
+ vshr.s64 d23, d25, #25
+ vand q4, q12, q4
+ vadd.i64 d21, d23, d23
+ vshl.i64 d25, d23, #4
+ vadd.i64 d21, d21, d23
+ vadd.i64 d25, d25, d21
+ vadd.i64 d4, d4, d25
+ vzip.i32 q0, q8
+ vadd.i64 d12, d4, d14
+ add r2, r6, #8
+ vst1.8 d0, [r2, : 64]
+ vsub.i64 d19, d19, d9
+ add r2, r2, #16
+ vst1.8 d16, [r2, : 64]
+ vshr.s64 d22, d12, #26
+ vand q0, q6, q1
+ vadd.i64 d10, d10, d22
+ vzip.i32 q3, q9
+ vsub.i64 d4, d4, d0
+ sub r2, r2, #8
+ vst1.8 d6, [r2, : 64]
+ add r2, r2, #16
+ vst1.8 d18, [r2, : 64]
+ vzip.i32 q2, q5
+ sub r2, r2, #32
+ vst1.8 d4, [r2, : 64]
+ cmp r4, #0
+ beq .Lskippostcopy
+ add r2, r3, #144
+ mov r4, r4
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4}, [r2, : 64]
+ vst1.8 {d0-d1}, [r4, : 128]!
+ vst1.8 {d2-d3}, [r4, : 128]!
+ vst1.8 d4, [r4, : 64]
+.Lskippostcopy:
+ cmp r1, #1
+ bne .Lskipfinalcopy
+ add r2, r3, #288
+ add r4, r3, #144
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4}, [r2, : 64]
+ vst1.8 {d0-d1}, [r4, : 128]!
+ vst1.8 {d2-d3}, [r4, : 128]!
+ vst1.8 d4, [r4, : 64]
+.Lskipfinalcopy:
+ add r1, r1, #1
+ cmp r1, #12
+ blo .Linvertloop
+ add r1, r3, #144
+ ldr r2, [r1], #4
+ ldr r3, [r1], #4
+ ldr r4, [r1], #4
+ ldr r5, [r1], #4
+ ldr r6, [r1], #4
+ ldr r7, [r1], #4
+ ldr r8, [r1], #4
+ ldr r9, [r1], #4
+ ldr r10, [r1], #4
+ ldr r1, [r1]
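+	@ Final reduction and packing: fold the carry out of the top limb
+	@ back into the bottom one (times 19, since 2^255 == 19 mod 2^255-19),
+	@ propagate it through the ten 26/25-bit limbs and pack them into
+	@ the eight 32-bit words of the result at [r0].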
+ add r11, r1, r1, LSL #4
+ add r11, r11, r1, LSL #1
+ add r11, r11, #16777216
+ mov r11, r11, ASR #25
+ add r11, r11, r2
+ mov r11, r11, ASR #26
+ add r11, r11, r3
+ mov r11, r11, ASR #25
+ add r11, r11, r4
+ mov r11, r11, ASR #26
+ add r11, r11, r5
+ mov r11, r11, ASR #25
+ add r11, r11, r6
+ mov r11, r11, ASR #26
+ add r11, r11, r7
+ mov r11, r11, ASR #25
+ add r11, r11, r8
+ mov r11, r11, ASR #26
+ add r11, r11, r9
+ mov r11, r11, ASR #25
+ add r11, r11, r10
+ mov r11, r11, ASR #26
+ add r11, r11, r1
+ mov r11, r11, ASR #25
+ add r2, r2, r11
+ add r2, r2, r11, LSL #1
+ add r2, r2, r11, LSL #4
+ mov r11, r2, ASR #26
+ add r3, r3, r11
+ sub r2, r2, r11, LSL #26
+ mov r11, r3, ASR #25
+ add r4, r4, r11
+ sub r3, r3, r11, LSL #25
+ mov r11, r4, ASR #26
+ add r5, r5, r11
+ sub r4, r4, r11, LSL #26
+ mov r11, r5, ASR #25
+ add r6, r6, r11
+ sub r5, r5, r11, LSL #25
+ mov r11, r6, ASR #26
+ add r7, r7, r11
+ sub r6, r6, r11, LSL #26
+ mov r11, r7, ASR #25
+ add r8, r8, r11
+ sub r7, r7, r11, LSL #25
+ mov r11, r8, ASR #26
+ add r9, r9, r11
+ sub r8, r8, r11, LSL #26
+ mov r11, r9, ASR #25
+ add r10, r10, r11
+ sub r9, r9, r11, LSL #25
+ mov r11, r10, ASR #26
+ add r1, r1, r11
+ sub r10, r10, r11, LSL #26
+ mov r11, r1, ASR #25
+ sub r1, r1, r11, LSL #25
+ add r2, r2, r3, LSL #26
+ mov r3, r3, LSR #6
+ add r3, r3, r4, LSL #19
+ mov r4, r4, LSR #13
+ add r4, r4, r5, LSL #13
+ mov r5, r5, LSR #19
+ add r5, r5, r6, LSL #6
+ add r6, r7, r8, LSL #25
+ mov r7, r8, LSR #7
+ add r7, r7, r9, LSL #19
+ mov r8, r9, LSR #13
+ add r8, r8, r10, LSL #12
+ mov r9, r10, LSR #20
+ add r1, r9, r1, LSL #6
+ str r2, [r0]
+ str r3, [r0, #4]
+ str r4, [r0, #8]
+ str r5, [r0, #12]
+ str r6, [r0, #16]
+ str r7, [r0, #20]
+ str r8, [r0, #24]
+ str r1, [r0, #28]
+ movw r0, #0
+ mov sp, ip
+ pop {r4-r11, pc}
+ENDPROC(curve25519_neon)
diff --git a/lib/crypto/arm/curve25519.h b/lib/crypto/arm/curve25519.h
new file mode 100644
index 000000000000..b1a566885e95
--- /dev/null
+++ b/lib/crypto/arm/curve25519.h
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
+ * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
+ * manually reworked for use in kernel space.
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <linux/types.h>
+#include <linux/jump_label.h>
+
+asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+static void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
+ const u8 scalar[CURVE25519_KEY_SIZE],
+ const u8 point[CURVE25519_KEY_SIZE])
+{
+ if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
+ scoped_ksimd()
+ curve25519_neon(out, scalar, point);
+ } else {
+ curve25519_generic(out, scalar, point);
+ }
+}
+
+static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
+{
+ curve25519_arch(pub, secret, curve25519_base_point);
+}
+
+#define curve25519_mod_init_arch curve25519_mod_init_arch
+static void curve25519_mod_init_arch(void)
+{
+ if (elf_hwcap & HWCAP_NEON)
+ static_branch_enable(&have_neon);
+}
diff --git a/lib/crypto/arm/poly1305-armv4.pl b/lib/crypto/arm/poly1305-armv4.pl
new file mode 100644
index 000000000000..34c11b7b44bd
--- /dev/null
+++ b/lib/crypto/arm/poly1305-armv4.pl
@@ -0,0 +1,1235 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
+# project.
+# ====================================================================
+#
+# IALU(*)/gcc-4.4 NEON
+#
+# ARM11xx(ARMv6) 7.78/+100% -
+# Cortex-A5 6.35/+130% 3.00
+# Cortex-A8 6.25/+115% 2.36
+# Cortex-A9 5.10/+95% 2.55
+# Cortex-A15 3.85/+85% 1.25(**)
+# Snapdragon S4 5.70/+100% 1.48(**)
+#
+# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data;
+# (**) these are trade-off results, they can be improved by ~8% but at
+# the cost of 15/12% regression on Cortex-A5/A7, it's even possible
+#	to improve Cortex-A9 result, but then A5/A7 lose more than 20%;
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
+# define poly1305_init poly1305_block_init
+# define poly1305_blocks poly1305_blocks_arm
+#endif
+
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+.text
+
+.globl poly1305_emit
+.globl poly1305_blocks
+.globl poly1305_init
+.type poly1305_init,%function
+.align 5
+poly1305_init:
+.Lpoly1305_init:
+ stmdb sp!,{r4-r11}
+
+ eor r3,r3,r3
+ cmp $inp,#0
+ str r3,[$ctx,#0] @ zero hash value
+ str r3,[$ctx,#4]
+ str r3,[$ctx,#8]
+ str r3,[$ctx,#12]
+ str r3,[$ctx,#16]
+ str r3,[$ctx,#36] @ clear is_base2_26
+ add $ctx,$ctx,#20
+
+#ifdef __thumb2__
+ it eq
+#endif
+ moveq r0,#0
+ beq .Lno_key
+
+#if __ARM_MAX_ARCH__>=7
+ mov r3,#-1
+ str r3,[$ctx,#28] @ impossible key power value
+# ifndef __KERNEL__
+ adr r11,.Lpoly1305_init
+ ldr r12,.LOPENSSL_armcap
+# endif
+#endif
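+	@ load the 16-byte r half of the key and clamp it: the top 4 bits
+	@ of each 32-bit word and the low 2 bits of the upper three words
+	@ are cleared (masks 0x0fffffff and 0x0ffffffc below)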
+ ldrb r4,[$inp,#0]
+ mov r10,#0x0fffffff
+ ldrb r5,[$inp,#1]
+ and r3,r10,#-4 @ 0x0ffffffc
+ ldrb r6,[$inp,#2]
+ ldrb r7,[$inp,#3]
+ orr r4,r4,r5,lsl#8
+ ldrb r5,[$inp,#4]
+ orr r4,r4,r6,lsl#16
+ ldrb r6,[$inp,#5]
+ orr r4,r4,r7,lsl#24
+ ldrb r7,[$inp,#6]
+ and r4,r4,r10
+
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+# if !defined(_WIN32)
+ ldr r12,[r11,r12] @ OPENSSL_armcap_P
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
+ ldr r12,[r12]
+# endif
+#endif
+ ldrb r8,[$inp,#7]
+ orr r5,r5,r6,lsl#8
+ ldrb r6,[$inp,#8]
+ orr r5,r5,r7,lsl#16
+ ldrb r7,[$inp,#9]
+ orr r5,r5,r8,lsl#24
+ ldrb r8,[$inp,#10]
+ and r5,r5,r3
+
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ tst r12,#ARMV7_NEON @ check for NEON
+# ifdef __thumb2__
+ adr r9,.Lpoly1305_blocks_neon
+ adr r11,.Lpoly1305_blocks
+ it ne
+ movne r11,r9
+ adr r12,.Lpoly1305_emit
+ orr r11,r11,#1 @ thumb-ify addresses
+ orr r12,r12,#1
+# else
+ add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
+ ite eq
+ addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
+ addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
+# endif
+#endif
+ ldrb r9,[$inp,#11]
+ orr r6,r6,r7,lsl#8
+ ldrb r7,[$inp,#12]
+ orr r6,r6,r8,lsl#16
+ ldrb r8,[$inp,#13]
+ orr r6,r6,r9,lsl#24
+ ldrb r9,[$inp,#14]
+ and r6,r6,r3
+
+ ldrb r10,[$inp,#15]
+ orr r7,r7,r8,lsl#8
+ str r4,[$ctx,#0]
+ orr r7,r7,r9,lsl#16
+ str r5,[$ctx,#4]
+ orr r7,r7,r10,lsl#24
+ str r6,[$ctx,#8]
+ and r7,r7,r3
+ str r7,[$ctx,#12]
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ stmia r2,{r11,r12} @ fill functions table
+ mov r0,#1
+#else
+ mov r0,#0
+#endif
+.Lno_key:
+ ldmia sp!,{r4-r11}
+#if __ARM_ARCH__>=5
+ ret @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_init,.-poly1305_init
+___
+{
+my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
+my ($s1,$s2,$s3)=($r1,$r2,$r3);
+
+$code.=<<___;
+.type poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+.Lpoly1305_blocks:
+ stmdb sp!,{r3-r11,lr}
+
+ ands $len,$len,#-16
+ beq .Lno_data
+
+ add $len,$len,$inp @ end pointer
+ sub sp,sp,#32
+
+#if __ARM_ARCH__<7
+ ldmia $ctx,{$h0-$r3} @ load context
+ add $ctx,$ctx,#20
+ str $len,[sp,#16] @ offload stuff
+ str $ctx,[sp,#12]
+#else
+ ldr lr,[$ctx,#36] @ is_base2_26
+ ldmia $ctx!,{$h0-$h4} @ load hash value
+ str $len,[sp,#16] @ offload stuff
+ str $ctx,[sp,#12]
+
+ adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
+ mov $r1,$h1,lsr#6
+ adcs $r1,$r1,$h2,lsl#20
+ mov $r2,$h2,lsr#12
+ adcs $r2,$r2,$h3,lsl#14
+ mov $r3,$h3,lsr#18
+ adcs $r3,$r3,$h4,lsl#8
+ mov $len,#0
+ teq lr,#0
+ str $len,[$ctx,#16] @ clear is_base2_26
+ adc $len,$len,$h4,lsr#24
+
+ itttt ne
+ movne $h0,$r0 @ choose between radixes
+ movne $h1,$r1
+ movne $h2,$r2
+ movne $h3,$r3
+ ldmia $ctx,{$r0-$r3} @ load key
+ it ne
+ movne $h4,$len
+#endif
+
+ mov lr,$inp
+ cmp $padbit,#0
+ str $r1,[sp,#20]
+ str $r2,[sp,#24]
+ str $r3,[sp,#28]
+ b .Loop
+
+.align 4
+.Loop:
+#if __ARM_ARCH__<7
+ ldrb r0,[lr],#16 @ load input
+# ifdef __thumb2__
+ it hi
+# endif
+ addhi $h4,$h4,#1 @ 1<<128
+ ldrb r1,[lr,#-15]
+ ldrb r2,[lr,#-14]
+ ldrb r3,[lr,#-13]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-12]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-11]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-10]
+ adds $h0,$h0,r3 @ accumulate input
+
+ ldrb r3,[lr,#-9]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-8]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-7]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-6]
+ adcs $h1,$h1,r3
+
+ ldrb r3,[lr,#-5]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-4]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-3]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-2]
+ adcs $h2,$h2,r3
+
+ ldrb r3,[lr,#-1]
+ orr r1,r0,r1,lsl#8
+ str lr,[sp,#8] @ offload input pointer
+ orr r2,r1,r2,lsl#16
+ add $s1,$r1,$r1,lsr#2
+ orr r3,r2,r3,lsl#24
+#else
+ ldr r0,[lr],#16 @ load input
+ it hi
+ addhi $h4,$h4,#1 @ padbit
+ ldr r1,[lr,#-12]
+ ldr r2,[lr,#-8]
+ ldr r3,[lr,#-4]
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+ adds $h0,$h0,r0 @ accumulate input
+ str lr,[sp,#8] @ offload input pointer
+ adcs $h1,$h1,r1
+ add $s1,$r1,$r1,lsr#2
+ adcs $h2,$h2,r2
+#endif
+ add $s2,$r2,$r2,lsr#2
+ adcs $h3,$h3,r3
+ add $s3,$r3,$r3,lsr#2
+
+ umull r2,r3,$h1,$r0
+ adc $h4,$h4,#0
+ umull r0,r1,$h0,$r0
+ umlal r2,r3,$h4,$s1
+ umlal r0,r1,$h3,$s1
+ ldr $r1,[sp,#20] @ reload $r1
+ umlal r2,r3,$h2,$s3
+ umlal r0,r1,$h1,$s3
+ umlal r2,r3,$h3,$s2
+ umlal r0,r1,$h2,$s2
+ umlal r2,r3,$h0,$r1
+ str r0,[sp,#0] @ future $h0
+ mul r0,$s2,$h4
+ ldr $r2,[sp,#24] @ reload $r2
+ adds r2,r2,r1 @ d1+=d0>>32
+ eor r1,r1,r1
+ adc lr,r3,#0 @ future $h2
+ str r2,[sp,#4] @ future $h1
+
+ mul r2,$s3,$h4
+ eor r3,r3,r3
+ umlal r0,r1,$h3,$s3
+ ldr $r3,[sp,#28] @ reload $r3
+ umlal r2,r3,$h3,$r0
+ umlal r0,r1,$h2,$r0
+ umlal r2,r3,$h2,$r1
+ umlal r0,r1,$h1,$r1
+ umlal r2,r3,$h1,$r2
+ umlal r0,r1,$h0,$r2
+ umlal r2,r3,$h0,$r3
+ ldr $h0,[sp,#0]
+ mul $h4,$r0,$h4
+ ldr $h1,[sp,#4]
+
+ adds $h2,lr,r0 @ d2+=d1>>32
+ ldr lr,[sp,#8] @ reload input pointer
+ adc r1,r1,#0
+ adds $h3,r2,r1 @ d3+=d2>>32
+ ldr r0,[sp,#16] @ reload end pointer
+ adc r3,r3,#0
+ add $h4,$h4,r3 @ h4+=d3>>32
+
+ and r1,$h4,#-4
+ and $h4,$h4,#3
+ add r1,r1,r1,lsr#2 @ *=5
+ adds $h0,$h0,r1
+ adcs $h1,$h1,#0
+ adcs $h2,$h2,#0
+ adcs $h3,$h3,#0
+ adc $h4,$h4,#0
+
+ cmp r0,lr @ done yet?
+ bhi .Loop
+
+ ldr $ctx,[sp,#12]
+ add sp,sp,#32
+ stmdb $ctx,{$h0-$h4} @ store the result
+
+.Lno_data:
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r3-r11,pc}
+#else
+ ldmia sp!,{r3-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_blocks,.-poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce)=map("r$_",(0..2));
+my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
+my $g4=$ctx;
+
+$code.=<<___;
+.type poly1305_emit,%function
+.align 5
+poly1305_emit:
+.Lpoly1305_emit:
+ stmdb sp!,{r4-r11}
+
+ ldmia $ctx,{$h0-$h4}
+
+#if __ARM_ARCH__>=7
+ ldr ip,[$ctx,#36] @ is_base2_26
+
+ adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
+ mov $g1,$h1,lsr#6
+ adcs $g1,$g1,$h2,lsl#20
+ mov $g2,$h2,lsr#12
+ adcs $g2,$g2,$h3,lsl#14
+ mov $g3,$h3,lsr#18
+ adcs $g3,$g3,$h4,lsl#8
+ mov $g4,#0
+ adc $g4,$g4,$h4,lsr#24
+
+ tst ip,ip
+ itttt ne
+ movne $h0,$g0
+ movne $h1,$g1
+ movne $h2,$g2
+ movne $h3,$g3
+ it ne
+ movne $h4,$g4
+#endif
+
+ adds $g0,$h0,#5 @ compare to modulus
+ adcs $g1,$h1,#0
+ adcs $g2,$h2,#0
+ adcs $g3,$h3,#0
+ adc $g4,$h4,#0
+ tst $g4,#4 @ did it carry/borrow?
+
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h0,$g0
+ ldr $g0,[$nonce,#0]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h1,$g1
+ ldr $g1,[$nonce,#4]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h2,$g2
+ ldr $g2,[$nonce,#8]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h3,$g3
+ ldr $g3,[$nonce,#12]
+
+ adds $h0,$h0,$g0
+ adcs $h1,$h1,$g1
+ adcs $h2,$h2,$g2
+ adc $h3,$h3,$g3
+
+#if __ARM_ARCH__>=7
+# ifdef __ARMEB__
+ rev $h0,$h0
+ rev $h1,$h1
+ rev $h2,$h2
+ rev $h3,$h3
+# endif
+ str $h0,[$mac,#0]
+ str $h1,[$mac,#4]
+ str $h2,[$mac,#8]
+ str $h3,[$mac,#12]
+#else
+ strb $h0,[$mac,#0]
+ mov $h0,$h0,lsr#8
+ strb $h1,[$mac,#4]
+ mov $h1,$h1,lsr#8
+ strb $h2,[$mac,#8]
+ mov $h2,$h2,lsr#8
+ strb $h3,[$mac,#12]
+ mov $h3,$h3,lsr#8
+
+ strb $h0,[$mac,#1]
+ mov $h0,$h0,lsr#8
+ strb $h1,[$mac,#5]
+ mov $h1,$h1,lsr#8
+ strb $h2,[$mac,#9]
+ mov $h2,$h2,lsr#8
+ strb $h3,[$mac,#13]
+ mov $h3,$h3,lsr#8
+
+ strb $h0,[$mac,#2]
+ mov $h0,$h0,lsr#8
+ strb $h1,[$mac,#6]
+ mov $h1,$h1,lsr#8
+ strb $h2,[$mac,#10]
+ mov $h2,$h2,lsr#8
+ strb $h3,[$mac,#14]
+ mov $h3,$h3,lsr#8
+
+ strb $h0,[$mac,#3]
+ strb $h1,[$mac,#7]
+ strb $h2,[$mac,#11]
+ strb $h3,[$mac,#15]
+#endif
+ ldmia sp!,{r4-r11}
+#if __ARM_ARCH__>=5
+ ret @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_emit,.-poly1305_emit
+___
+{
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
+my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
+my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
+
+my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.fpu neon
+
+.type poly1305_init_neon,%function
+.align 5
+poly1305_init_neon:
+.Lpoly1305_init_neon:
+ ldr r3,[$ctx,#48] @ first table element
+ cmp r3,#-1 @ is value impossible?
+ bne .Lno_init_neon
+
+ ldr r4,[$ctx,#20] @ load key base 2^32
+ ldr r5,[$ctx,#24]
+ ldr r6,[$ctx,#28]
+ ldr r7,[$ctx,#32]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ and r3,r3,#0x03ffffff
+ and r4,r4,#0x03ffffff
+ and r5,r5,#0x03ffffff
+
+ vdup.32 $R0,r2 @ r^1 in both lanes
+ add r2,r3,r3,lsl#2 @ *5
+ vdup.32 $R1,r3
+ add r3,r4,r4,lsl#2
+ vdup.32 $S1,r2
+ vdup.32 $R2,r4
+ add r4,r5,r5,lsl#2
+ vdup.32 $S2,r3
+ vdup.32 $R3,r5
+ add r5,r6,r6,lsl#2
+ vdup.32 $S3,r4
+ vdup.32 $R4,r6
+ vdup.32 $S4,r5
+
+ mov $zeros,#2 @ counter
+
+.Lsquare_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
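+	@
+	@ The 5* factors appear because the limbs are base 2^26 and
+	@ 2^130 == 5 mod 2^130-5, so any product h_i*r_j with i+j >= 5
+	@ wraps around the prime and picks up a factor of 5.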
+
+ vmull.u32 $D0,$R0,${R0}[1]
+ vmull.u32 $D1,$R1,${R0}[1]
+ vmull.u32 $D2,$R2,${R0}[1]
+ vmull.u32 $D3,$R3,${R0}[1]
+ vmull.u32 $D4,$R4,${R0}[1]
+
+ vmlal.u32 $D0,$R4,${S1}[1]
+ vmlal.u32 $D1,$R0,${R1}[1]
+ vmlal.u32 $D2,$R1,${R1}[1]
+ vmlal.u32 $D3,$R2,${R1}[1]
+ vmlal.u32 $D4,$R3,${R1}[1]
+
+ vmlal.u32 $D0,$R3,${S2}[1]
+ vmlal.u32 $D1,$R4,${S2}[1]
+ vmlal.u32 $D3,$R1,${R2}[1]
+ vmlal.u32 $D2,$R0,${R2}[1]
+ vmlal.u32 $D4,$R2,${R2}[1]
+
+ vmlal.u32 $D0,$R2,${S3}[1]
+ vmlal.u32 $D3,$R0,${R3}[1]
+ vmlal.u32 $D1,$R3,${S3}[1]
+ vmlal.u32 $D2,$R4,${S3}[1]
+ vmlal.u32 $D4,$R1,${R3}[1]
+
+ vmlal.u32 $D3,$R4,${S4}[1]
+ vmlal.u32 $D0,$R1,${S4}[1]
+ vmlal.u32 $D1,$R2,${S4}[1]
+ vmlal.u32 $D2,$R3,${S4}[1]
+ vmlal.u32 $D4,$R0,${R4}[1]
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ @ and P. Schwabe
+ @
+ @ H0>>+H1>>+H2>>+H3>>+H4
+ @ H3>>+H4>>*5+H0>>+H1
+ @
+ @ Trivia.
+ @
+ @ Result of multiplication of n-bit number by m-bit number is
+ @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
+ @ m-bit number multiplied by 2^n is still n+m bits wide.
+ @
+ @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
+ @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
+ @ one is n+1 bits wide.
+ @
+ @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
+ @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
+ @ can be 27. However! In cases when their width exceeds 26 bits
+ @ they are limited by 2^26+2^6. This in turn means that *sum*
+ @ of the products with these values can still be viewed as sum
+ @ of 52-bit numbers as long as the amount of addends is not a
+ @ power of 2. For example,
+ @
+ @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
+ @
+ @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
+ @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
+	@ 8 * (2^52) or 2^55. However, the value is then multiplied
+ @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
+ @ which is less than 32 * (2^52) or 2^57. And when processing
+ @ data we are looking at triple as many addends...
+ @
+ @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
+ @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
+ @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
+ @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
+ @ instruction accepts 2x32-bit input and writes 2x64-bit result.
+	@ This means that the result of reduction has to be compressed upon
+ @ loop wrap-around. This can be done in the process of reduction
+ @ to minimize amount of instructions [as well as amount of
+ @ 128-bit instructions, which benefits low-end processors], but
+ @ one has to watch for H2 (which is narrower than H0) and 5*H4
+ @ not being wider than 58 bits, so that result of right shift
+ @ by 26 bits fits in 32 bits. This is also useful on x86,
+	@ because it allows using paddd in place of paddq, which
+ @ benefits Atom, where paddq is ridiculously slow.
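+	@
+	@ The carry chain below runs h3->h4, h0->h1, h4->h0, h1->h2,
+	@ h2->h3, then h0->h1 and h3->h4 once more; the h4->h0 carry
+	@ is multiplied by 5 as c + (c<<2).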
+
+ vshr.u64 $T0,$D3,#26
+ vmovn.i64 $D3#lo,$D3
+ vshr.u64 $T1,$D0,#26
+ vmovn.i64 $D0#lo,$D0
+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
+ vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
+ vbic.i32 $D0#lo,#0xfc000000
+
+ vshrn.u64 $T0#lo,$D4,#26
+ vmovn.i64 $D4#lo,$D4
+ vshr.u64 $T1,$D1,#26
+ vmovn.i64 $D1#lo,$D1
+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
+ vbic.i32 $D4#lo,#0xfc000000
+ vbic.i32 $D1#lo,#0xfc000000
+
+ vadd.i32 $D0#lo,$D0#lo,$T0#lo
+ vshl.u32 $T0#lo,$T0#lo,#2
+ vshrn.u64 $T1#lo,$D2,#26
+ vmovn.i64 $D2#lo,$D2
+ vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
+ vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
+ vbic.i32 $D2#lo,#0xfc000000
+
+ vshr.u32 $T0#lo,$D0#lo,#26
+ vbic.i32 $D0#lo,#0xfc000000
+ vshr.u32 $T1#lo,$D3#lo,#26
+ vbic.i32 $D3#lo,#0xfc000000
+ vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
+ vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
+
+ subs $zeros,$zeros,#1
+ beq .Lsquare_break_neon
+
+ add $tbl0,$ctx,#(48+0*9*4)
+ add $tbl1,$ctx,#(48+1*9*4)
+
+ vtrn.32 $R0,$D0#lo @ r^2:r^1
+ vtrn.32 $R2,$D2#lo
+ vtrn.32 $R3,$D3#lo
+ vtrn.32 $R1,$D1#lo
+ vtrn.32 $R4,$D4#lo
+
+ vshl.u32 $S2,$R2,#2 @ *5
+ vshl.u32 $S3,$R3,#2
+ vshl.u32 $S1,$R1,#2
+ vshl.u32 $S4,$R4,#2
+ vadd.i32 $S2,$S2,$R2
+ vadd.i32 $S1,$S1,$R1
+ vadd.i32 $S3,$S3,$R3
+ vadd.i32 $S4,$S4,$R4
+
+ vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+ vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+ vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vst1.32 {${S4}[0]},[$tbl0,:32]
+ vst1.32 {${S4}[1]},[$tbl1,:32]
+
+ b .Lsquare_neon
+
+.align 4
+.Lsquare_break_neon:
+ add $tbl0,$ctx,#(48+2*4*9)
+ add $tbl1,$ctx,#(48+3*4*9)
+
+ vmov $R0,$D0#lo @ r^4:r^3
+ vshl.u32 $S1,$D1#lo,#2 @ *5
+ vmov $R1,$D1#lo
+ vshl.u32 $S2,$D2#lo,#2
+ vmov $R2,$D2#lo
+ vshl.u32 $S3,$D3#lo,#2
+ vmov $R3,$D3#lo
+ vshl.u32 $S4,$D4#lo,#2
+ vmov $R4,$D4#lo
+ vadd.i32 $S1,$S1,$D1#lo
+ vadd.i32 $S2,$S2,$D2#lo
+ vadd.i32 $S3,$S3,$D3#lo
+ vadd.i32 $S4,$S4,$D4#lo
+
+ vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+ vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+ vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vst1.32 {${S4}[0]},[$tbl0]
+ vst1.32 {${S4}[1]},[$tbl1]
+
+.Lno_init_neon:
+ ret @ bx lr
+.size poly1305_init_neon,.-poly1305_init_neon
+
+.globl poly1305_blocks_neon
+.type poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
+ ldr ip,[$ctx,#36] @ is_base2_26
+
+ cmp $len,#64
+ blo .Lpoly1305_blocks
+
+ stmdb sp!,{r4-r7}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ tst ip,ip @ is_base2_26?
+ bne .Lbase2_26_neon
+
+ stmdb sp!,{r1-r3,lr}
+ bl .Lpoly1305_init_neon
+
+ ldr r4,[$ctx,#0] @ load hash value base 2^32
+ ldr r5,[$ctx,#4]
+ ldr r6,[$ctx,#8]
+ ldr r7,[$ctx,#12]
+ ldr ip,[$ctx,#16]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ veor $D0#lo,$D0#lo,$D0#lo
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ veor $D1#lo,$D1#lo,$D1#lo
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ veor $D2#lo,$D2#lo,$D2#lo
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ veor $D3#lo,$D3#lo,$D3#lo
+ and r3,r3,#0x03ffffff
+ orr r6,r6,ip,lsl#24
+ veor $D4#lo,$D4#lo,$D4#lo
+ and r4,r4,#0x03ffffff
+ mov r1,#1
+ and r5,r5,#0x03ffffff
+ str r1,[$ctx,#36] @ set is_base2_26
+
+ vmov.32 $D0#lo[0],r2
+ vmov.32 $D1#lo[0],r3
+ vmov.32 $D2#lo[0],r4
+ vmov.32 $D3#lo[0],r5
+ vmov.32 $D4#lo[0],r6
+ adr $zeros,.Lzeros
+
+ ldmia sp!,{r1-r3,lr}
+ b .Lhash_loaded
+
+.align 4
+.Lbase2_26_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ load hash value
+
+ veor $D0#lo,$D0#lo,$D0#lo
+ veor $D1#lo,$D1#lo,$D1#lo
+ veor $D2#lo,$D2#lo,$D2#lo
+ veor $D3#lo,$D3#lo,$D3#lo
+ veor $D4#lo,$D4#lo,$D4#lo
+ vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+ adr $zeros,.Lzeros
+ vld1.32 {$D4#lo[0]},[$ctx]
+ sub $ctx,$ctx,#16 @ rewind
+
+.Lhash_loaded:
+ add $in2,$inp,#32
+ mov $padbit,$padbit,lsl#24
+ tst $len,#31
+ beq .Leven
+
+ vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
+ vmov.32 $H4#lo[0],$padbit
+ sub $len,$len,#16
+ add $in2,$inp,#32
+
+# ifdef __ARMEB__
+ vrev32.8 $H0,$H0
+ vrev32.8 $H3,$H3
+ vrev32.8 $H1,$H1
+ vrev32.8 $H2,$H2
+# endif
+ vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
+ vshl.u32 $H3#lo,$H3#lo,#18
+
+ vsri.u32 $H3#lo,$H2#lo,#14
+ vshl.u32 $H2#lo,$H2#lo,#12
+ vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
+
+ vbic.i32 $H3#lo,#0xfc000000
+ vsri.u32 $H2#lo,$H1#lo,#20
+ vshl.u32 $H1#lo,$H1#lo,#6
+
+ vbic.i32 $H2#lo,#0xfc000000
+ vsri.u32 $H1#lo,$H0#lo,#26
+ vadd.i32 $H3#hi,$H3#lo,$D3#lo
+
+ vbic.i32 $H0#lo,#0xfc000000
+ vbic.i32 $H1#lo,#0xfc000000
+ vadd.i32 $H2#hi,$H2#lo,$D2#lo
+
+ vadd.i32 $H0#hi,$H0#lo,$D0#lo
+ vadd.i32 $H1#hi,$H1#lo,$D1#lo
+
+ mov $tbl1,$zeros
+ add $tbl0,$ctx,#48
+
+ cmp $len,$len
+ b .Long_tail
+
+.align 4
+.Leven:
+ subs $len,$len,#64
+ it lo
+ movlo $in2,$zeros
+
+ vmov.i32 $H4,#1<<24 @ padbit, yes, always
+ vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
+ add $inp,$inp,#64
+ vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
+ add $in2,$in2,#64
+ itt hi
+ addhi $tbl1,$ctx,#(48+1*9*4)
+ addhi $tbl0,$ctx,#(48+3*9*4)
+
+# ifdef __ARMEB__
+ vrev32.8 $H0,$H0
+ vrev32.8 $H3,$H3
+ vrev32.8 $H1,$H1
+ vrev32.8 $H2,$H2
+# endif
+ vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
+ vshl.u32 $H3,$H3,#18
+
+ vsri.u32 $H3,$H2,#14
+ vshl.u32 $H2,$H2,#12
+
+ vbic.i32 $H3,#0xfc000000
+ vsri.u32 $H2,$H1,#20
+ vshl.u32 $H1,$H1,#6
+
+ vbic.i32 $H2,#0xfc000000
+ vsri.u32 $H1,$H0,#26
+
+ vbic.i32 $H0,#0xfc000000
+ vbic.i32 $H1,#0xfc000000
+
+ bls .Lskip_loop
+
+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ b .Loop_neon
+
+.align 5
+.Loop_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ @ \___________________/
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ @ \___________________/ \____________________/
+ @
+ @ Note that we start with inp[2:3]*r^2. This is because it
+ @ doesn't depend on reduction in previous iteration.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ inp[2:3]*r^2
+
+ vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
+ vmull.u32 $D2,$H2#hi,${R0}[1]
+ vadd.i32 $H0#lo,$H0#lo,$D0#lo
+ vmull.u32 $D0,$H0#hi,${R0}[1]
+ vadd.i32 $H3#lo,$H3#lo,$D3#lo
+ vmull.u32 $D3,$H3#hi,${R0}[1]
+ vmlal.u32 $D2,$H1#hi,${R1}[1]
+ vadd.i32 $H1#lo,$H1#lo,$D1#lo
+ vmull.u32 $D1,$H1#hi,${R0}[1]
+
+ vadd.i32 $H4#lo,$H4#lo,$D4#lo
+ vmull.u32 $D4,$H4#hi,${R0}[1]
+ subs $len,$len,#64
+ vmlal.u32 $D0,$H4#hi,${S1}[1]
+ it lo
+ movlo $in2,$zeros
+ vmlal.u32 $D3,$H2#hi,${R1}[1]
+ vld1.32 ${S4}[1],[$tbl1,:32]
+ vmlal.u32 $D1,$H0#hi,${R1}[1]
+ vmlal.u32 $D4,$H3#hi,${R1}[1]
+
+ vmlal.u32 $D0,$H3#hi,${S2}[1]
+ vmlal.u32 $D3,$H1#hi,${R2}[1]
+ vmlal.u32 $D4,$H2#hi,${R2}[1]
+ vmlal.u32 $D1,$H4#hi,${S2}[1]
+ vmlal.u32 $D2,$H0#hi,${R2}[1]
+
+ vmlal.u32 $D3,$H0#hi,${R3}[1]
+ vmlal.u32 $D0,$H2#hi,${S3}[1]
+ vmlal.u32 $D4,$H1#hi,${R3}[1]
+ vmlal.u32 $D1,$H3#hi,${S3}[1]
+ vmlal.u32 $D2,$H4#hi,${S3}[1]
+
+ vmlal.u32 $D3,$H4#hi,${S4}[1]
+ vmlal.u32 $D0,$H1#hi,${S4}[1]
+ vmlal.u32 $D4,$H0#hi,${R4}[1]
+ vmlal.u32 $D1,$H2#hi,${S4}[1]
+ vmlal.u32 $D2,$H3#hi,${S4}[1]
+
+ vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
+ add $in2,$in2,#64
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4 and accumulate
+
+ vmlal.u32 $D3,$H3#lo,${R0}[0]
+ vmlal.u32 $D0,$H0#lo,${R0}[0]
+ vmlal.u32 $D4,$H4#lo,${R0}[0]
+ vmlal.u32 $D1,$H1#lo,${R0}[0]
+ vmlal.u32 $D2,$H2#lo,${R0}[0]
+ vld1.32 ${S4}[0],[$tbl0,:32]
+
+ vmlal.u32 $D3,$H2#lo,${R1}[0]
+ vmlal.u32 $D0,$H4#lo,${S1}[0]
+ vmlal.u32 $D4,$H3#lo,${R1}[0]
+ vmlal.u32 $D1,$H0#lo,${R1}[0]
+ vmlal.u32 $D2,$H1#lo,${R1}[0]
+
+ vmlal.u32 $D3,$H1#lo,${R2}[0]
+ vmlal.u32 $D0,$H3#lo,${S2}[0]
+ vmlal.u32 $D4,$H2#lo,${R2}[0]
+ vmlal.u32 $D1,$H4#lo,${S2}[0]
+ vmlal.u32 $D2,$H0#lo,${R2}[0]
+
+ vmlal.u32 $D3,$H0#lo,${R3}[0]
+ vmlal.u32 $D0,$H2#lo,${S3}[0]
+ vmlal.u32 $D4,$H1#lo,${R3}[0]
+ vmlal.u32 $D1,$H3#lo,${S3}[0]
+ vmlal.u32 $D3,$H4#lo,${S4}[0]
+
+ vmlal.u32 $D2,$H4#lo,${S3}[0]
+ vmlal.u32 $D0,$H1#lo,${S4}[0]
+ vmlal.u32 $D4,$H0#lo,${R4}[0]
+ vmov.i32 $H4,#1<<24 @ padbit, yes, always
+ vmlal.u32 $D1,$H2#lo,${S4}[0]
+ vmlal.u32 $D2,$H3#lo,${S4}[0]
+
+ vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
+ add $inp,$inp,#64
+# ifdef __ARMEB__
+ vrev32.8 $H0,$H0
+ vrev32.8 $H1,$H1
+ vrev32.8 $H2,$H2
+ vrev32.8 $H3,$H3
+# endif
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction interleaved with base 2^32 -> base 2^26 of
+ @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
+
+ vshr.u64 $T0,$D3,#26
+ vmovn.i64 $D3#lo,$D3
+ vshr.u64 $T1,$D0,#26
+ vmovn.i64 $D0#lo,$D0
+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
+ vbic.i32 $D3#lo,#0xfc000000
+ vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
+ vshl.u32 $H3,$H3,#18
+ vbic.i32 $D0#lo,#0xfc000000
+
+ vshrn.u64 $T0#lo,$D4,#26
+ vmovn.i64 $D4#lo,$D4
+ vshr.u64 $T1,$D1,#26
+ vmovn.i64 $D1#lo,$D1
+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
+ vsri.u32 $H3,$H2,#14
+ vbic.i32 $D4#lo,#0xfc000000
+ vshl.u32 $H2,$H2,#12
+ vbic.i32 $D1#lo,#0xfc000000
+
+ vadd.i32 $D0#lo,$D0#lo,$T0#lo
+ vshl.u32 $T0#lo,$T0#lo,#2
+ vbic.i32 $H3,#0xfc000000
+ vshrn.u64 $T1#lo,$D2,#26
+ vmovn.i64 $D2#lo,$D2
+ vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
+ vsri.u32 $H2,$H1,#20
+ vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
+ vshl.u32 $H1,$H1,#6
+ vbic.i32 $D2#lo,#0xfc000000
+ vbic.i32 $H2,#0xfc000000
+
+ vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
+ vmovn.i64 $D0#lo,$D0
+ vsri.u32 $H1,$H0,#26
+ vbic.i32 $H0,#0xfc000000
+ vshr.u32 $T1#lo,$D3#lo,#26
+ vbic.i32 $D3#lo,#0xfc000000
+ vbic.i32 $D0#lo,#0xfc000000
+ vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
+ vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
+ vbic.i32 $H1,#0xfc000000
+
+ bhi .Loop_neon
+
+.Lskip_loop:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ add $tbl1,$ctx,#(48+0*9*4)
+ add $tbl0,$ctx,#(48+1*9*4)
+ adds $len,$len,#32
+ it ne
+ movne $len,#0
+ bne .Long_tail
+
+ vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
+ vadd.i32 $H0#hi,$H0#lo,$D0#lo
+ vadd.i32 $H3#hi,$H3#lo,$D3#lo
+ vadd.i32 $H1#hi,$H1#lo,$D1#lo
+ vadd.i32 $H4#hi,$H4#lo,$D4#lo
+
+.Long_tail:
+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
+
+ vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
+ vmull.u32 $D2,$H2#hi,$R0
+ vadd.i32 $H0#lo,$H0#lo,$D0#lo
+ vmull.u32 $D0,$H0#hi,$R0
+ vadd.i32 $H3#lo,$H3#lo,$D3#lo
+ vmull.u32 $D3,$H3#hi,$R0
+ vadd.i32 $H1#lo,$H1#lo,$D1#lo
+ vmull.u32 $D1,$H1#hi,$R0
+ vadd.i32 $H4#lo,$H4#lo,$D4#lo
+ vmull.u32 $D4,$H4#hi,$R0
+
+ vmlal.u32 $D0,$H4#hi,$S1
+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vmlal.u32 $D3,$H2#hi,$R1
+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vmlal.u32 $D1,$H0#hi,$R1
+ vmlal.u32 $D4,$H3#hi,$R1
+ vmlal.u32 $D2,$H1#hi,$R1
+
+ vmlal.u32 $D3,$H1#hi,$R2
+ vld1.32 ${S4}[1],[$tbl1,:32]
+ vmlal.u32 $D0,$H3#hi,$S2
+ vld1.32 ${S4}[0],[$tbl0,:32]
+ vmlal.u32 $D4,$H2#hi,$R2
+ vmlal.u32 $D1,$H4#hi,$S2
+ vmlal.u32 $D2,$H0#hi,$R2
+
+ vmlal.u32 $D3,$H0#hi,$R3
+ it ne
+ addne $tbl1,$ctx,#(48+2*9*4)
+ vmlal.u32 $D0,$H2#hi,$S3
+ it ne
+ addne $tbl0,$ctx,#(48+3*9*4)
+ vmlal.u32 $D4,$H1#hi,$R3
+ vmlal.u32 $D1,$H3#hi,$S3
+ vmlal.u32 $D2,$H4#hi,$S3
+
+ vmlal.u32 $D3,$H4#hi,$S4
+ vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
+ vmlal.u32 $D0,$H1#hi,$S4
+ vshr.u64 $MASK,$MASK,#38
+ vmlal.u32 $D4,$H0#hi,$R4
+ vmlal.u32 $D1,$H2#hi,$S4
+ vmlal.u32 $D2,$H3#hi,$S4
+
+ beq .Lshort_tail
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
+
+ vmlal.u32 $D2,$H2#lo,$R0
+ vmlal.u32 $D0,$H0#lo,$R0
+ vmlal.u32 $D3,$H3#lo,$R0
+ vmlal.u32 $D1,$H1#lo,$R0
+ vmlal.u32 $D4,$H4#lo,$R0
+
+ vmlal.u32 $D0,$H4#lo,$S1
+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vmlal.u32 $D3,$H2#lo,$R1
+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vmlal.u32 $D1,$H0#lo,$R1
+ vmlal.u32 $D4,$H3#lo,$R1
+ vmlal.u32 $D2,$H1#lo,$R1
+
+ vmlal.u32 $D3,$H1#lo,$R2
+ vld1.32 ${S4}[1],[$tbl1,:32]
+ vmlal.u32 $D0,$H3#lo,$S2
+ vld1.32 ${S4}[0],[$tbl0,:32]
+ vmlal.u32 $D4,$H2#lo,$R2
+ vmlal.u32 $D1,$H4#lo,$S2
+ vmlal.u32 $D2,$H0#lo,$R2
+
+ vmlal.u32 $D3,$H0#lo,$R3
+ vmlal.u32 $D0,$H2#lo,$S3
+ vmlal.u32 $D4,$H1#lo,$R3
+ vmlal.u32 $D1,$H3#lo,$S3
+ vmlal.u32 $D2,$H4#lo,$S3
+
+ vmlal.u32 $D3,$H4#lo,$S4
+ vorn $MASK,$MASK,$MASK @ all-ones
+ vmlal.u32 $D0,$H1#lo,$S4
+ vshr.u64 $MASK,$MASK,#38
+ vmlal.u32 $D4,$H0#lo,$R4
+ vmlal.u32 $D1,$H2#lo,$S4
+ vmlal.u32 $D2,$H3#lo,$S4
+
+.Lshort_tail:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ horizontal addition
+
+ vadd.i64 $D3#lo,$D3#lo,$D3#hi
+ vadd.i64 $D0#lo,$D0#lo,$D0#hi
+ vadd.i64 $D4#lo,$D4#lo,$D4#hi
+ vadd.i64 $D1#lo,$D1#lo,$D1#hi
+ vadd.i64 $D2#lo,$D2#lo,$D2#hi
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction, but without narrowing
+
+ vshr.u64 $T0,$D3,#26
+ vand.i64 $D3,$D3,$MASK
+ vshr.u64 $T1,$D0,#26
+ vand.i64 $D0,$D0,$MASK
+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
+
+ vshr.u64 $T0,$D4,#26
+ vand.i64 $D4,$D4,$MASK
+ vshr.u64 $T1,$D1,#26
+ vand.i64 $D1,$D1,$MASK
+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
+
+ vadd.i64 $D0,$D0,$T0
+ vshl.u64 $T0,$T0,#2
+ vshr.u64 $T1,$D2,#26
+ vand.i64 $D2,$D2,$MASK
+ vadd.i64 $D0,$D0,$T0 @ h4 -> h0
+ vadd.i64 $D3,$D3,$T1 @ h2 -> h3
+
+ vshr.u64 $T0,$D0,#26
+ vand.i64 $D0,$D0,$MASK
+ vshr.u64 $T1,$D3,#26
+ vand.i64 $D3,$D3,$MASK
+ vadd.i64 $D1,$D1,$T0 @ h0 -> h1
+ vadd.i64 $D4,$D4,$T1 @ h3 -> h4
+
+ cmp $len,#0
+ bne .Leven
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ store hash value
+
+ vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+ vst1.32 {$D4#lo[0]},[$ctx]
+
+ vldmia sp!,{d8-d15} @ epilogue
+ ldmia sp!,{r4-r7}
+ ret @ bx lr
+.size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+#ifndef __KERNEL__
+.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
+.word OPENSSL_armcap_P-.Lpoly1305_init
+# endif
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+#endif
+#endif
+___
+} }
+$code.=<<___;
+.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
+.align 2
+___
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+close STDOUT; # enforce flush
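
Reading aid for the arithmetic above (illustrative sketch, not part of this patch): the d0..d4 comment block inside .Loop_neon is the whole algorithm in miniature, one multiply-reduce of the 130-bit accumulator by r in radix 2^26, evaluated for two interleaved message streams at once (the #lo/#hi register halves, fed by inp[0:1] and inp[2:3]). The scalar C sketch below restates those formulas under the same limb layout; the only non-obvious constants are si = 5*ri, which follow from 2^130 == 5 (mod 2^130 - 5), so anything carried out of limb 4 re-enters at the bottom multiplied by 5.

#include <stdint.h>

/*
 * One Poly1305 step h = h * r mod (2^130 - 5), with h and r held in
 * five 26-bit limbs and h assumed to already contain the next message
 * block plus the pad bit.  Scalar sketch only; the NEON code above
 * computes two such products per iteration and carries lazily.
 */
static void poly1305_mul_sketch(uint32_t h[5], const uint32_t r[5])
{
	const uint64_t s1 = 5 * (uint64_t)r[1], s2 = 5 * (uint64_t)r[2];
	const uint64_t s3 = 5 * (uint64_t)r[3], s4 = 5 * (uint64_t)r[4];
	uint64_t d0, d1, d2, d3, d4, c;

	d0 = (uint64_t)h[0]*r[0] + h[1]*s4 + h[2]*s3 + h[3]*s2 + h[4]*s1;
	d1 = (uint64_t)h[0]*r[1] + (uint64_t)h[1]*r[0] + h[2]*s4 +
	     h[3]*s3 + h[4]*s2;
	d2 = (uint64_t)h[0]*r[2] + (uint64_t)h[1]*r[1] +
	     (uint64_t)h[2]*r[0] + h[3]*s4 + h[4]*s3;
	d3 = (uint64_t)h[0]*r[3] + (uint64_t)h[1]*r[2] +
	     (uint64_t)h[2]*r[1] + (uint64_t)h[3]*r[0] + h[4]*s4;
	d4 = (uint64_t)h[0]*r[4] + (uint64_t)h[1]*r[3] +
	     (uint64_t)h[2]*r[2] + (uint64_t)h[3]*r[1] +
	     (uint64_t)h[4]*r[0];

	/* carry chain; the assembly interleaves this with the next loads */
	c = d0 >> 26; d0 &= 0x3ffffff; d1 += c;
	c = d1 >> 26; d1 &= 0x3ffffff; d2 += c;
	c = d2 >> 26; d2 &= 0x3ffffff; d3 += c;
	c = d3 >> 26; d3 &= 0x3ffffff; d4 += c;
	c = d4 >> 26; d4 &= 0x3ffffff; d0 += c * 5;	/* 2^130 == 5 mod p */
	c = d0 >> 26; d0 &= 0x3ffffff; d1 += c;

	h[0] = (uint32_t)d0; h[1] = (uint32_t)d1; h[2] = (uint32_t)d2;
	h[3] = (uint32_t)d3; h[4] = (uint32_t)d4;
}
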
diff --git a/lib/crypto/arm/poly1305.h b/lib/crypto/arm/poly1305.h
new file mode 100644
index 000000000000..0fe903d8de55
--- /dev/null
+++ b/lib/crypto/arm/poly1305.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/hwcap.h>
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+asmlinkage void poly1305_block_init(struct poly1305_block_state *state,
+ const u8 raw_key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state,
+ const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
+ const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit(const struct poly1305_state *state,
+ u8 digest[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src,
+ unsigned int len, u32 padbit)
+{
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ static_branch_likely(&have_neon) && likely(may_use_simd())) {
+ do {
+ unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+ scoped_ksimd()
+ poly1305_blocks_neon(state, src, todo, padbit);
+
+ len -= todo;
+ src += todo;
+ } while (len);
+ } else
+ poly1305_blocks_arm(state, src, len, padbit);
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define poly1305_mod_init_arch poly1305_mod_init_arch
+static void poly1305_mod_init_arch(void)
+{
+ if (elf_hwcap & HWCAP_NEON)
+ static_branch_enable(&have_neon);
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
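
A note on the glue above: NEON is used only when the static key is enabled and may_use_simd() allows it, and the input is handed to poly1305_blocks_neon() in chunks of at most SZ_4K so that each kernel-mode NEON section stays bounded. For orientation, here is a hedged sketch of how a caller would normally reach this code through the generic Poly1305 library interface; the poly1305_init/update/final names and struct poly1305_desc_ctx are assumed to be the ones declared in <crypto/poly1305.h>, and the sketch is illustrative rather than part of the patch.

#include <linux/types.h>
#include <crypto/poly1305.h>

/* Hypothetical caller: MAC one contiguous buffer with a one-time key. */
static void poly1305_mac_buffer(const u8 key[POLY1305_KEY_SIZE],
				const u8 *data, unsigned int len,
				u8 mac[POLY1305_DIGEST_SIZE])
{
	struct poly1305_desc_ctx desc;

	poly1305_init(&desc, key);	   /* clamp r, stash s             */
	poly1305_update(&desc, data, len); /* ends up in poly1305_blocks() */
	poly1305_final(&desc, mac);	   /* add s, emit the 16-byte tag  */
}
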
diff --git a/lib/crypto/arm/sha1-armv4-large.S b/lib/crypto/arm/sha1-armv4-large.S
new file mode 100644
index 000000000000..1c8b685149f2
--- /dev/null
+++ b/lib/crypto/arm/sha1-armv4-large.S
@@ -0,0 +1,507 @@
+#define __ARM_ARCH__ __LINUX_ARM_ARCH__
+@ SPDX-License-Identifier: GPL-2.0
+
+@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
+@ has relicensed it under the GPLv2. Therefore this program is free software;
+@ you can redistribute it and/or modify it under the terms of the GNU General
+@ Public License version 2 as published by the Free Software Foundation.
+@
+@ The original headers, including the original license headers, are
+@ included below for completeness.
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see https://www.openssl.org/~appro/cryptogams/.
+@ ====================================================================
+
+@ sha1_block procedure for ARMv4.
+@
+@ January 2007.
+
+@ Size/performance trade-off
+@ ====================================================================
+@ impl size in bytes comp cycles[*] measured performance
+@ ====================================================================
+@ thumb 304 3212 4420
+@ armv4-small 392/+29% 1958/+64% 2250/+96%
+@ armv4-compact 740/+89% 1552/+26% 1840/+22%
+@ armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
+@ full unroll ~5100/+260% ~1260/+4% ~1300/+5%
+@ ====================================================================
+@ thumb = same as 'small' but in Thumb instructions[**] and
+@ with recurring code in two private functions;
+@ small = detached Xload/update, loops are folded;
+@ compact = detached Xload/update, 5x unroll;
+@ large = interleaved Xload/update, 5x unroll;
+@ full unroll = interleaved Xload/update, full unroll, estimated[!];
+@
+@ [*] Manually counted instructions in "grand" loop body. Measured
+@ performance is affected by prologue and epilogue overhead,
+@ i-cache availability, branch penalties, etc.
+@ [**]	While each Thumb instruction is half the size, they are not as
+@	diverse as ARM ones: e.g., there are only two arithmetic
+@	instructions with 3 arguments, no [fixed] rotate, and the
+@	addressing modes are limited. As a result it takes more
+@	instructions to do the same job in Thumb, so the code is never
+@	half the size and is always slower.
+@ [***]	which is also ~35% better than compiler-generated code. A dual-
+@	issue Cortex A8 core was measured to process an input block in
+@	~990 cycles.
+
+@ August 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 13% improvement on
+@ Cortex A8 core and in absolute terms ~870 cycles per input block
+@ [or 13.6 cycles per byte].
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 10%
+@ improvement on Cortex A8 core and 12.2 cycles per byte.
+
+#include <linux/linkage.h>
+
+.text
+
+.align 2
+ENTRY(sha1_block_data_order)
+ stmdb sp!,{r4-r12,lr}
+ add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
+ ldmia r0,{r3,r4,r5,r6,r7}
+.Lloop:
+ ldr r8,.LK_00_19
+ mov r14,sp
+ sub sp,sp,#15*4
+ mov r5,r5,ror#30
+ mov r6,r6,ror#30
+ mov r7,r7,ror#30 @ [6]
+.L_00_15:
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r5,r6 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ eor r10,r5,r6 @ F_xx_xx
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r4,r10,ror#2
+ add r7,r7,r9 @ E+=X[i]
+ eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r7,r7,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r6,r8,r6,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r4,r5 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r6,r8,r6,ror#2 @ E+=K_00_19
+ eor r10,r4,r5 @ F_xx_xx
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r3,r10,ror#2
+ add r6,r6,r9 @ E+=X[i]
+ eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r6,r6,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r5,r8,r5,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r3,r4 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r5,r8,r5,ror#2 @ E+=K_00_19
+ eor r10,r3,r4 @ F_xx_xx
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r7,r10,ror#2
+ add r5,r5,r9 @ E+=X[i]
+ eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r5,r5,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r4,r8,r4,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r7,r3 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r4,r8,r4,ror#2 @ E+=K_00_19
+ eor r10,r7,r3 @ F_xx_xx
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r6,r10,ror#2
+ add r4,r4,r9 @ E+=X[i]
+ eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r4,r4,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r3,r8,r3,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r6,r7 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r3,r8,r3,ror#2 @ E+=K_00_19
+ eor r10,r6,r7 @ F_xx_xx
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r5,r10,ror#2
+ add r3,r3,r9 @ E+=X[i]
+ eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r3,r3,r10 @ E+=F_00_19(B,C,D)
+ cmp r14,sp
+ bne .L_00_15 @ [((11+4)*5+2)*3]
+ sub sp,sp,#25*4
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r5,r6 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ eor r10,r5,r6 @ F_xx_xx
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r4,r10,ror#2
+ add r7,r7,r9 @ E+=X[i]
+ eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r7,r7,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r6,r8,r6,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r4,r5 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r3,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r6,r6,r9 @ E+=X[i]
+ eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
+ add r6,r6,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r5,r8,r5,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r3,r4 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r7,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r5,r5,r9 @ E+=X[i]
+ eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
+ add r5,r5,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r4,r8,r4,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r7,r3 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r6,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r4,r4,r9 @ E+=X[i]
+ eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
+ add r4,r4,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r3,r8,r3,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r6,r7 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r5,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r3,r3,r9 @ E+=X[i]
+ eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
+ add r3,r3,r10 @ E+=F_00_19(B,C,D)
+
+ ldr r8,.LK_20_39 @ [+15+16*4]
+ cmn sp,#0 @ [+3], clear carry to denote 20_39
+.L_20_39_or_60_79:
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r7,r8,r7,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r5,r6 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r4,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r7,r7,r9 @ E+=X[i]
+ add r7,r7,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r6,r8,r6,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r4,r5 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r3,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r6,r6,r9 @ E+=X[i]
+ add r6,r6,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r5,r8,r5,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r3,r4 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r7,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r5,r5,r9 @ E+=X[i]
+ add r5,r5,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r4,r8,r4,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r7,r3 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r6,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r4,r4,r9 @ E+=X[i]
+ add r4,r4,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r3,r8,r3,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r6,r7 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r5,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r3,r3,r9 @ E+=X[i]
+ add r3,r3,r10 @ E+=F_20_39(B,C,D)
+ ARM( teq r14,sp ) @ preserve carry
+ THUMB( mov r11,sp )
+ THUMB( teq r14,r11 ) @ preserve carry
+ bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
+ bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
+
+ ldr r8,.LK_40_59
+ sub sp,sp,#20*4 @ [+2]
+.L_40_59:
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r7,r8,r7,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r5,r6 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r4,r10,ror#2 @ F_xx_xx
+ and r11,r5,r6 @ F_xx_xx
+ add r7,r7,r9 @ E+=X[i]
+ add r7,r7,r10 @ E+=F_40_59(B,C,D)
+ add r7,r7,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r6,r8,r6,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r4,r5 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r3,r10,ror#2 @ F_xx_xx
+ and r11,r4,r5 @ F_xx_xx
+ add r6,r6,r9 @ E+=X[i]
+ add r6,r6,r10 @ E+=F_40_59(B,C,D)
+ add r6,r6,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r5,r8,r5,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r3,r4 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r7,r10,ror#2 @ F_xx_xx
+ and r11,r3,r4 @ F_xx_xx
+ add r5,r5,r9 @ E+=X[i]
+ add r5,r5,r10 @ E+=F_40_59(B,C,D)
+ add r5,r5,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r4,r8,r4,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r7,r3 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r6,r10,ror#2 @ F_xx_xx
+ and r11,r7,r3 @ F_xx_xx
+ add r4,r4,r9 @ E+=X[i]
+ add r4,r4,r10 @ E+=F_40_59(B,C,D)
+ add r4,r4,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r3,r8,r3,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r6,r7 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r5,r10,ror#2 @ F_xx_xx
+ and r11,r6,r7 @ F_xx_xx
+ add r3,r3,r9 @ E+=X[i]
+ add r3,r3,r10 @ E+=F_40_59(B,C,D)
+ add r3,r3,r11,ror#2
+ cmp r14,sp
+ bne .L_40_59 @ [+((12+5)*5+2)*4]
+
+ ldr r8,.LK_60_79
+ sub sp,sp,#20*4
+ cmp sp,#0 @ set carry to denote 60_79
+ b .L_20_39_or_60_79 @ [+4], spare 300 bytes
+.L_done:
+ add sp,sp,#80*4 @ "deallocate" stack frame
+ ldmia r0,{r8,r9,r10,r11,r12}
+ add r3,r8,r3
+ add r4,r9,r4
+ add r5,r10,r5,ror#2
+ add r6,r11,r6,ror#2
+ add r7,r12,r7,ror#2
+ stmia r0,{r3,r4,r5,r6,r7}
+ teq r1,r2
+ bne .Lloop @ [+18], total 1307
+
+ ldmia sp!,{r4-r12,pc}
+.align 2
+.LK_00_19: .word 0x5a827999
+.LK_20_39: .word 0x6ed9eba1
+.LK_40_59: .word 0x8f1bbcdc
+.LK_60_79: .word 0xca62c1d6
+ENDPROC(sha1_block_data_order)
+.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
+.align 2
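
As a reading aid for the heavily unrolled rounds above (editorial sketch, not part of this patch): each block of instructions is one SHA-1 round, with the comment names F_00_19, F_20_39/F_60_79 and F_40_59 denoting the Ch, Parity and Maj functions of FIPS 180-4, and "E+=ROR(A,27)" being the same operation as E += ROL(A,5). A scalar reference round, under those standard definitions:

#include <stdint.h>

#define ROL32(x, n)	(((x) << (n)) | ((x) >> (32 - (n))))

static uint32_t sha1_f(int t, uint32_t b, uint32_t c, uint32_t d)
{
	if (t < 20)
		return (b & c) | (~b & d);		/* F_00_19: Ch      */
	if (t < 40 || t >= 60)
		return b ^ c ^ d;			/* F_20_39, F_60_79 */
	return (b & c) | (b & d) | (c & d);		/* F_40_59: Maj     */
}

/* One round; k is 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc or 0xca62c1d6
 * (the .LK_* words above), w is the expanded message word X[i]. */
static void sha1_round(int t, uint32_t v[5], uint32_t k, uint32_t w)
{
	uint32_t tmp = ROL32(v[0], 5) + sha1_f(t, v[1], v[2], v[3]) +
		       v[4] + k + w;

	v[4] = v[3];
	v[3] = v[2];
	v[2] = ROL32(v[1], 30);	/* the asm keeps b, c, d pre-rotated by 30 */
	v[1] = v[0];
	v[0] = tmp;
}
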
diff --git a/lib/crypto/arm/sha1-armv7-neon.S b/lib/crypto/arm/sha1-armv7-neon.S
new file mode 100644
index 000000000000..a0323fa5c58a
--- /dev/null
+++ b/lib/crypto/arm/sha1-armv7-neon.S
@@ -0,0 +1,633 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* ARM/NEON accelerated SHA-1 transform function
+ *
+ * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+.syntax unified
+.fpu neon
+
+.text
+
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+
+
+/* Constants */
+
+#define K1 0x5A827999
+#define K2 0x6ED9EBA1
+#define K3 0x8F1BBCDC
+#define K4 0xCA62C1D6
+.align 4
+.LK_VEC:
+.LK1: .long K1, K1, K1, K1
+.LK2: .long K2, K2, K2, K2
+.LK3: .long K3, K3, K3, K3
+.LK4: .long K4, K4, K4, K4
+
+
+/* Register macros */
+
+#define RSTATE r0
+#define RDATA r1
+#define RNBLKS r2
+#define ROLDSTACK r3
+#define RWK lr
+
+#define _a r4
+#define _b r5
+#define _c r6
+#define _d r7
+#define _e r8
+
+#define RT0 r9
+#define RT1 r10
+#define RT2 r11
+#define RT3 r12
+
+#define W0 q0
+#define W1 q7
+#define W2 q2
+#define W3 q3
+#define W4 q4
+#define W5 q6
+#define W6 q5
+#define W7 q1
+
+#define tmp0 q8
+#define tmp1 q9
+#define tmp2 q10
+#define tmp3 q11
+
+#define qK1 q12
+#define qK2 q13
+#define qK3 q14
+#define qK4 q15
+
+#ifdef CONFIG_CPU_BIG_ENDIAN
+#define ARM_LE(code...)
+#else
+#define ARM_LE(code...) code
+#endif
+
+/* Round function macros. */
+
+#define WK_offs(i) (((i) & 15) * 4)
+
+#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ldr RT3, [sp, WK_offs(i)]; \
+ pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ bic RT0, d, b; \
+ add e, e, a, ror #(32 - 5); \
+ and RT1, c, b; \
+ pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add RT0, RT0, RT3; \
+ add e, e, RT1; \
+ ror b, #(32 - 30); \
+ pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT0;
+
+#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ldr RT3, [sp, WK_offs(i)]; \
+ pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ eor RT0, d, b; \
+ add e, e, a, ror #(32 - 5); \
+ eor RT0, RT0, c; \
+ pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT3; \
+ ror b, #(32 - 30); \
+ pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT0; \
+
+#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ldr RT3, [sp, WK_offs(i)]; \
+ pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ eor RT0, b, c; \
+ and RT1, b, c; \
+ add e, e, a, ror #(32 - 5); \
+ pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ and RT0, RT0, d; \
+ add RT1, RT1, RT3; \
+ add e, e, RT0; \
+ ror b, #(32 - 30); \
+ pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+ add e, e, RT1;
+
+#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define R(a,b,c,d,e,f,i) \
+ _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
+ W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define dummy(...)
+
+
+/* Input expansion macros. */
+
+/********* Precalc macros for rounds 0-15 *************************************/
+
+#define W_PRECALC_00_15() \
+ add RWK, sp, #(WK_offs(0)); \
+ \
+ vld1.32 {W0, W7}, [RDATA]!; \
+ ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \
+ vld1.32 {W6, W5}, [RDATA]!; \
+ vadd.u32 tmp0, W0, curK; \
+ ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \
+ ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \
+ vadd.u32 tmp1, W7, curK; \
+ ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \
+ vadd.u32 tmp2, W6, curK; \
+ vst1.32 {tmp0, tmp1}, [RWK]!; \
+ vadd.u32 tmp3, W5, curK; \
+ vst1.32 {tmp2, tmp3}, [RWK]; \
+
+#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vld1.32 {W0, W7}, [RDATA]!; \
+
+#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ add RWK, sp, #(WK_offs(0)); \
+
+#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \
+
+#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vld1.32 {W6, W5}, [RDATA]!; \
+
+#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp0, W0, curK; \
+
+#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \
+
+#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \
+
+#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp1, W7, curK; \
+
+#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \
+
+#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp2, W6, curK; \
+
+#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vst1.32 {tmp0, tmp1}, [RWK]!; \
+
+#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp3, W5, curK; \
+
+#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vst1.32 {tmp2, tmp3}, [RWK]; \
+
+
+/********* Precalc macros for rounds 16-31 ************************************/
+
+#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor tmp0, tmp0; \
+ vext.8 W, W_m16, W_m12, #8; \
+
+#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ add RWK, sp, #(WK_offs(i)); \
+ vext.8 tmp0, W_m04, tmp0, #4; \
+
+#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor tmp0, tmp0, W_m16; \
+ veor.32 W, W, W_m08; \
+
+#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor tmp1, tmp1; \
+ veor W, W, tmp0; \
+
+#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vshl.u32 tmp0, W, #1; \
+
+#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vext.8 tmp1, tmp1, W, #(16-12); \
+ vshr.u32 W, W, #31; \
+
+#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vorr tmp0, tmp0, W; \
+ vshr.u32 W, tmp1, #30; \
+
+#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vshl.u32 tmp1, tmp1, #2; \
+
+#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor tmp0, tmp0, W; \
+
+#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor W, tmp0, tmp1; \
+
+#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp0, W, curK; \
+
+#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vst1.32 {tmp0}, [RWK];
+
+
+/********* Precalc macros for rounds 32-79 ************************************/
+
+#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor W, W_m28; \
+
+#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vext.8 tmp0, W_m08, W_m04, #8; \
+
+#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor W, W_m16; \
+
+#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ veor W, tmp0; \
+
+#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ add RWK, sp, #(WK_offs(i&~3)); \
+
+#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vshl.u32 tmp1, W, #2; \
+
+#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vshr.u32 tmp0, W, #30; \
+
+#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vorr W, tmp0, tmp1; \
+
+#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vadd.u32 tmp0, W, curK; \
+
+#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ vst1.32 {tmp0}, [RWK];
+
+
+/*
+ * Transform nblocks*64 bytes (nblocks*16 32-bit words) at DATA.
+ *
+ * void sha1_transform_neon(struct sha1_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+.align 3
+ENTRY(sha1_transform_neon)
+ /* input:
+ * r0: state
+ * r1: data (64*nblocks bytes)
+ * r2: nblocks
+ */
+
+ cmp RNBLKS, #0;
+ beq .Ldo_nothing;
+
+ push {r4-r12, lr};
+ /*vpush {q4-q7};*/
+
+ adr RT3, .LK_VEC;
+
+ mov ROLDSTACK, sp;
+
+ /* Align stack. */
+ sub RT0, sp, #(16*4);
+ and RT0, #(~(16-1));
+ mov sp, RT0;
+
+ vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */
+
+ /* Get the values of the chaining variables. */
+ ldm RSTATE, {_a-_e};
+
+ vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */
+
+#undef curK
+#define curK qK1
+ /* Precalc 0-15. */
+ W_PRECALC_00_15();
+
+.Loop:
+ /* Transform 0-15 + Precalc 16-31. */
+ _R( _a, _b, _c, _d, _e, F1, 0,
+ WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
+ W4, W5, W6, W7, W0, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F1, 1,
+ WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
+ W4, W5, W6, W7, W0, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F1, 2,
+ WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
+ W4, W5, W6, W7, W0, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F1, 3,
+ WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
+ W4, W5, W6, W7, W0, _, _, _ );
+
+#undef curK
+#define curK qK2
+ _R( _b, _c, _d, _e, _a, F1, 4,
+ WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
+ W3, W4, W5, W6, W7, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F1, 5,
+ WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
+ W3, W4, W5, W6, W7, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F1, 6,
+ WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
+ W3, W4, W5, W6, W7, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F1, 7,
+ WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
+ W3, W4, W5, W6, W7, _, _, _ );
+
+ _R( _c, _d, _e, _a, _b, F1, 8,
+ WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
+ W2, W3, W4, W5, W6, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F1, 9,
+ WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
+ W2, W3, W4, W5, W6, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F1, 10,
+ WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
+ W2, W3, W4, W5, W6, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F1, 11,
+ WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
+ W2, W3, W4, W5, W6, _, _, _ );
+
+ _R( _d, _e, _a, _b, _c, F1, 12,
+ WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
+ W1, W2, W3, W4, W5, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F1, 13,
+ WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
+ W1, W2, W3, W4, W5, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F1, 14,
+ WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
+ W1, W2, W3, W4, W5, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F1, 15,
+ WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
+ W1, W2, W3, W4, W5, _, _, _ );
+
+ /* Transform 16-63 + Precalc 32-79. */
+ _R( _e, _a, _b, _c, _d, F1, 16,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _d, _e, _a, _b, _c, F1, 17,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _c, _d, _e, _a, _b, F1, 18,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _b, _c, _d, _e, _a, F1, 19,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+
+ _R( _a, _b, _c, _d, _e, F2, 20,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _e, _a, _b, _c, _d, F2, 21,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _d, _e, _a, _b, _c, F2, 22,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _c, _d, _e, _a, _b, F2, 23,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+
+#undef curK
+#define curK qK3
+ _R( _b, _c, _d, _e, _a, F2, 24,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _a, _b, _c, _d, _e, F2, 25,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _e, _a, _b, _c, _d, F2, 26,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _d, _e, _a, _b, _c, F2, 27,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+
+ _R( _c, _d, _e, _a, _b, F2, 28,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _b, _c, _d, _e, _a, F2, 29,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _a, _b, _c, _d, _e, F2, 30,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _e, _a, _b, _c, _d, F2, 31,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+
+ _R( _d, _e, _a, _b, _c, F2, 32,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
+ W4, W5, W6, W7, W0, W1, W2, W3);
+ _R( _c, _d, _e, _a, _b, F2, 33,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
+ W4, W5, W6, W7, W0, W1, W2, W3);
+ _R( _b, _c, _d, _e, _a, F2, 34,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48,
+ W4, W5, W6, W7, W0, W1, W2, W3);
+ _R( _a, _b, _c, _d, _e, F2, 35,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48,
+ W4, W5, W6, W7, W0, W1, W2, W3);
+
+ _R( _e, _a, _b, _c, _d, F2, 36,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
+ W3, W4, W5, W6, W7, W0, W1, W2);
+ _R( _d, _e, _a, _b, _c, F2, 37,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
+ W3, W4, W5, W6, W7, W0, W1, W2);
+ _R( _c, _d, _e, _a, _b, F2, 38,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52,
+ W3, W4, W5, W6, W7, W0, W1, W2);
+ _R( _b, _c, _d, _e, _a, F2, 39,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52,
+ W3, W4, W5, W6, W7, W0, W1, W2);
+
+ _R( _a, _b, _c, _d, _e, F3, 40,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
+ W2, W3, W4, W5, W6, W7, W0, W1);
+ _R( _e, _a, _b, _c, _d, F3, 41,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
+ W2, W3, W4, W5, W6, W7, W0, W1);
+ _R( _d, _e, _a, _b, _c, F3, 42,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56,
+ W2, W3, W4, W5, W6, W7, W0, W1);
+ _R( _c, _d, _e, _a, _b, F3, 43,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56,
+ W2, W3, W4, W5, W6, W7, W0, W1);
+
+#undef curK
+#define curK qK4
+ _R( _b, _c, _d, _e, _a, F3, 44,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
+ W1, W2, W3, W4, W5, W6, W7, W0);
+ _R( _a, _b, _c, _d, _e, F3, 45,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
+ W1, W2, W3, W4, W5, W6, W7, W0);
+ _R( _e, _a, _b, _c, _d, F3, 46,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60,
+ W1, W2, W3, W4, W5, W6, W7, W0);
+ _R( _d, _e, _a, _b, _c, F3, 47,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60,
+ W1, W2, W3, W4, W5, W6, W7, W0);
+
+ _R( _c, _d, _e, _a, _b, F3, 48,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _b, _c, _d, _e, _a, F3, 49,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _a, _b, _c, _d, _e, F3, 50,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+ _R( _e, _a, _b, _c, _d, F3, 51,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64,
+ W0, W1, W2, W3, W4, W5, W6, W7);
+
+ _R( _d, _e, _a, _b, _c, F3, 52,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _c, _d, _e, _a, _b, F3, 53,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _b, _c, _d, _e, _a, F3, 54,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+ _R( _a, _b, _c, _d, _e, F3, 55,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68,
+ W7, W0, W1, W2, W3, W4, W5, W6);
+
+ _R( _e, _a, _b, _c, _d, F3, 56,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _d, _e, _a, _b, _c, F3, 57,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _c, _d, _e, _a, _b, F3, 58,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+ _R( _b, _c, _d, _e, _a, F3, 59,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72,
+ W6, W7, W0, W1, W2, W3, W4, W5);
+
+ subs RNBLKS, #1;
+
+ _R( _a, _b, _c, _d, _e, F4, 60,
+ WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _e, _a, _b, _c, _d, F4, 61,
+ WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _d, _e, _a, _b, _c, F4, 62,
+ WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+ _R( _c, _d, _e, _a, _b, F4, 63,
+ WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76,
+ W5, W6, W7, W0, W1, W2, W3, W4);
+
+ beq .Lend;
+
+ /* Transform 64-79 + Precalc 0-15 of next block. */
+#undef curK
+#define curK qK1
+ _R( _b, _c, _d, _e, _a, F4, 64,
+ WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F4, 65,
+ WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F4, 66,
+ WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F4, 67,
+ WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+ _R( _c, _d, _e, _a, _b, F4, 68,
+ dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F4, 69,
+ dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F4, 70,
+ WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _e, _a, _b, _c, _d, F4, 71,
+ WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+ _R( _d, _e, _a, _b, _c, F4, 72,
+ dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F4, 73,
+ dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F4, 74,
+ WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _a, _b, _c, _d, _e, F4, 75,
+ WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+ _R( _e, _a, _b, _c, _d, F4, 76,
+ WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _d, _e, _a, _b, _c, F4, 77,
+ WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _c, _d, _e, _a, _b, F4, 78,
+ WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+ _R( _b, _c, _d, _e, _a, F4, 79,
+ WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );
+
+ /* Update the chaining variables. */
+ ldm RSTATE, {RT0-RT3};
+ add _a, RT0;
+ ldr RT0, [RSTATE, #state_h4];
+ add _b, RT1;
+ add _c, RT2;
+ add _d, RT3;
+ add _e, RT0;
+ stm RSTATE, {_a-_e};
+
+ b .Loop;
+
+.Lend:
+ /* Transform 64-79 */
+ R( _b, _c, _d, _e, _a, F4, 64 );
+ R( _a, _b, _c, _d, _e, F4, 65 );
+ R( _e, _a, _b, _c, _d, F4, 66 );
+ R( _d, _e, _a, _b, _c, F4, 67 );
+ R( _c, _d, _e, _a, _b, F4, 68 );
+ R( _b, _c, _d, _e, _a, F4, 69 );
+ R( _a, _b, _c, _d, _e, F4, 70 );
+ R( _e, _a, _b, _c, _d, F4, 71 );
+ R( _d, _e, _a, _b, _c, F4, 72 );
+ R( _c, _d, _e, _a, _b, F4, 73 );
+ R( _b, _c, _d, _e, _a, F4, 74 );
+ R( _a, _b, _c, _d, _e, F4, 75 );
+ R( _e, _a, _b, _c, _d, F4, 76 );
+ R( _d, _e, _a, _b, _c, F4, 77 );
+ R( _c, _d, _e, _a, _b, F4, 78 );
+ R( _b, _c, _d, _e, _a, F4, 79 );
+
+ mov sp, ROLDSTACK;
+
+ /* Update the chaining variables. */
+ ldm RSTATE, {RT0-RT3};
+ add _a, RT0;
+ ldr RT0, [RSTATE, #state_h4];
+ add _b, RT1;
+ add _c, RT2;
+ add _d, RT3;
+ /*vpop {q4-q7};*/
+ add _e, RT0;
+ stm RSTATE, {_a-_e};
+
+ pop {r4-r12, pc};
+
+.Ldo_nothing:
+ bx lr
+ENDPROC(sha1_transform_neon)
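
Editorial note on the W-precalc macros above (not part of this patch): they compute the SHA-1 message schedule four words at a time, one quadword ahead of the rounds that consume it, and store W[t]+K into the stack slots the round macros read back. The scalar recurrence they implement is sketched below; for t >= 32 the code uses the algebraically equivalent form W[t] = ROL(W[t-6] ^ W[t-16] ^ W[t-28] ^ W[t-32], 2), whose nearest dependency is six words back, so four consecutive schedule words fit in one vector operation.

#include <stdint.h>

#define ROL32(x, n)	(((x) << (n)) | ((x) >> (32 - (n))))

/* Scalar reference for the schedule the NEON macros vectorize. */
static void sha1_schedule(uint32_t W[80])
{
	int t;

	for (t = 16; t < 80; t++)
		W[t] = ROL32(W[t - 3] ^ W[t - 8] ^ W[t - 14] ^ W[t - 16], 1);
}
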
diff --git a/lib/crypto/arm/sha1-ce-core.S b/lib/crypto/arm/sha1-ce-core.S
new file mode 100644
index 000000000000..7d6b2631ca8d
--- /dev/null
+++ b/lib/crypto/arm/sha1-ce-core.S
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SHA-1 secure hash using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2015 Linaro Ltd.
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+ .arch armv8-a
+ .fpu crypto-neon-fp-armv8
+
+ k0 .req q0
+ k1 .req q1
+ k2 .req q2
+ k3 .req q3
+
+ ta0 .req q4
+ ta1 .req q5
+ tb0 .req q5
+ tb1 .req q4
+
+ dga .req q6
+ dgb .req q7
+ dgbs .req s28
+
+ dg0 .req q12
+ dg1a0 .req q13
+ dg1a1 .req q14
+ dg1b0 .req q14
+ dg1b1 .req q13
+
+ .macro add_only, op, ev, rc, s0, dg1
+ .ifnb \s0
+ vadd.u32 tb\ev, q\s0, \rc
+ .endif
+ sha1h.32 dg1b\ev, dg0
+ .ifb \dg1
+ sha1\op\().32 dg0, dg1a\ev, ta\ev
+ .else
+ sha1\op\().32 dg0, \dg1, ta\ev
+ .endif
+ .endm
+
+ .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1
+ sha1su0.32 q\s0, q\s1, q\s2
+ add_only \op, \ev, \rc, \s1, \dg1
+ sha1su1.32 q\s0, q\s3
+ .endm
+
+ .align 6
+.Lsha1_rcon:
+ .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999
+ .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1
+ .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc
+ .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
+
+ /*
+ * void sha1_ce_transform(struct sha1_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+ENTRY(sha1_ce_transform)
+ /* load round constants */
+ adr ip, .Lsha1_rcon
+ vld1.32 {k0-k1}, [ip, :128]!
+ vld1.32 {k2-k3}, [ip, :128]
+
+ /* load state */
+ vld1.32 {dga}, [r0]
+ vldr dgbs, [r0, #16]
+
+ /* load input */
+0: vld1.32 {q8-q9}, [r1]!
+ vld1.32 {q10-q11}, [r1]!
+ subs r2, r2, #1
+
+#ifndef CONFIG_CPU_BIG_ENDIAN
+ vrev32.8 q8, q8
+ vrev32.8 q9, q9
+ vrev32.8 q10, q10
+ vrev32.8 q11, q11
+#endif
+
+ vadd.u32 ta0, q8, k0
+ vmov dg0, dga
+
+ add_update c, 0, k0, 8, 9, 10, 11, dgb
+ add_update c, 1, k0, 9, 10, 11, 8
+ add_update c, 0, k0, 10, 11, 8, 9
+ add_update c, 1, k0, 11, 8, 9, 10
+ add_update c, 0, k1, 8, 9, 10, 11
+
+ add_update p, 1, k1, 9, 10, 11, 8
+ add_update p, 0, k1, 10, 11, 8, 9
+ add_update p, 1, k1, 11, 8, 9, 10
+ add_update p, 0, k1, 8, 9, 10, 11
+ add_update p, 1, k2, 9, 10, 11, 8
+
+ add_update m, 0, k2, 10, 11, 8, 9
+ add_update m, 1, k2, 11, 8, 9, 10
+ add_update m, 0, k2, 8, 9, 10, 11
+ add_update m, 1, k2, 9, 10, 11, 8
+ add_update m, 0, k3, 10, 11, 8, 9
+
+ add_update p, 1, k3, 11, 8, 9, 10
+ add_only p, 0, k3, 9
+ add_only p, 1, k3, 10
+ add_only p, 0, k3, 11
+ add_only p, 1
+
+ /* update state */
+ vadd.u32 dga, dga, dg0
+ vadd.u32 dgb, dgb, dg1a0
+ bne 0b
+
+ /* store new state */
+ vst1.32 {dga}, [r0]
+ vstr dgbs, [r0, #16]
+ bx lr
+ENDPROC(sha1_ce_transform)
diff --git a/lib/crypto/arm/sha1.h b/lib/crypto/arm/sha1.h
new file mode 100644
index 000000000000..3e2d8c7cab9f
--- /dev/null
+++ b/lib/crypto/arm/sha1.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-1 optimized for ARM
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+asmlinkage void sha1_block_data_order(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks);
+asmlinkage void sha1_transform_neon(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks);
+asmlinkage void sha1_ce_transform(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha1_blocks(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ static_branch_likely(&have_neon) && likely(may_use_simd())) {
+ scoped_ksimd() {
+ if (static_branch_likely(&have_ce))
+ sha1_ce_transform(state, data, nblocks);
+ else
+ sha1_transform_neon(state, data, nblocks);
+ }
+ } else {
+ sha1_block_data_order(state, data, nblocks);
+ }
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define sha1_mod_init_arch sha1_mod_init_arch
+static void sha1_mod_init_arch(void)
+{
+ if (elf_hwcap & HWCAP_NEON) {
+ static_branch_enable(&have_neon);
+ if (elf_hwcap2 & HWCAP2_SHA1)
+ static_branch_enable(&have_ce);
+ }
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
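
One practical note on the dispatch helper above: sha1_blocks() consumes whole 64-byte blocks only, so the generic SHA-1 library code that calls it is responsible for buffering any partial block. A hypothetical caller (illustrative only; the helper name below is not from this patch) would look roughly like:

#include <crypto/sha1.h>

/* Feed as many whole blocks as possible; return the leftover byte count. */
static size_t sha1_feed(struct sha1_block_state *state,
			const u8 *data, size_t len)
{
	size_t nblocks = len / SHA1_BLOCK_SIZE;

	if (nblocks)
		sha1_blocks(state, data, nblocks);
	return len % SHA1_BLOCK_SIZE;
}
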
diff --git a/lib/crypto/arm/sha256-armv4.pl b/lib/crypto/arm/sha256-armv4.pl
new file mode 100644
index 000000000000..f3a2b54efd4e
--- /dev/null
+++ b/lib/crypto/arm/sha256-armv4.pl
@@ -0,0 +1,724 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from the OpenSSL project but the author (Andy Polyakov)
+# has relicensed it under the GPLv2. Therefore this program is free software;
+# you can redistribute it and/or modify it under the terms of the GNU General
+# Public License version 2 as published by the Free Software Foundation.
+#
+# The original headers, including the original license headers, are
+# included below for completeness.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA256 block procedure for ARMv4. May 2007.
+
+# Performance is ~2x better than gcc 3.4 generated code and in
+# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+# byte [on a single-issue Xscale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning that the latter performs sub-optimally; nothing was done
+# about it).
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0"; $t0="r0";
+$inp="r1"; $t4="r1";
+$len="r2"; $t1="r2";
+$T1="r3"; $t3="r3";
+$A="r4";
+$B="r5";
+$C="r6";
+$D="r7";
+$E="r8";
+$F="r9";
+$G="r10";
+$H="r11";
+@V=($A,$B,$C,$D,$E,$F,$G,$H);
+$t2="r12";
+$Ktbl="r14";
+
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+
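
Editorial gloss (not part of this patch): the four rotation sets above are the FIPS 180-4 SHA-256 parameters. BODY_00_15 below builds the big-sigma functions out of chained eor/ror on the live registers, and BODY_16_XX does the same for the small-sigma message-schedule functions; in plain C they are:

#include <stdint.h>

#define ROR32(x, n)	(((x) >> (n)) | ((x) << (32 - (n))))

static inline uint32_t Sigma0(uint32_t a)	/* @Sigma0 = (2, 13, 22)  */
{
	return ROR32(a, 2) ^ ROR32(a, 13) ^ ROR32(a, 22);
}

static inline uint32_t Sigma1(uint32_t e)	/* @Sigma1 = (6, 11, 25)  */
{
	return ROR32(e, 6) ^ ROR32(e, 11) ^ ROR32(e, 25);
}

static inline uint32_t sigma0(uint32_t x)	/* @sigma0 = (7, 18, 3)   */
{
	return ROR32(x, 7) ^ ROR32(x, 18) ^ (x >> 3);
}

static inline uint32_t sigma1(uint32_t x)	/* @sigma1 = (17, 19, 10) */
{
	return ROR32(x, 17) ^ ROR32(x, 19) ^ (x >> 10);
}
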
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+ @ ldr $t1,[$inp],#4 @ $i
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+# ifndef __ARMEB__
+ rev $t1,$t1
+# endif
+#else
+ @ ldrb $t1,[$inp,#3] @ $i
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ ldrb $t2,[$inp,#2]
+ ldrb $t0,[$inp,#1]
+ orr $t1,$t1,$t2,lsl#8
+ ldrb $t2,[$inp],#4
+ orr $t1,$t1,$t0,lsl#16
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ orr $t1,$t1,$t2,lsl#24
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+#endif
+___
+$code.=<<___;
+ ldr $t2,[$Ktbl],#4 @ *K256++
+ add $h,$h,$t1 @ h+=X[i]
+ str $t1,[sp,#`$i%16`*4]
+ eor $t1,$f,$g
+ add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
+ and $t1,$t1,$e
+ add $h,$h,$t2 @ h+=K256[i]
+ eor $t1,$t1,$g @ Ch(e,f,g)
+ eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
+ add $h,$h,$t1 @ h+=Ch(e,f,g)
+#if $i==31
+ and $t2,$t2,#0xff
+ cmp $t2,#0xf2 @ done?
+#endif
+#if $i<15
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4 @ prefetch
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t2,$a,$b @ a^b, b^c in next round
+#else
+ ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
+ eor $t2,$a,$b @ a^b, b^c in next round
+ ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
+#endif
+ eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
+ and $t3,$t3,$t2 @ (b^c)&=(a^b)
+ add $d,$d,$h @ d+=h
+ eor $t3,$t3,$b @ Maj(a,b,c)
+ add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
+ @ add $h,$h,$t3 @ h+=Maj(a,b,c)
+___
+ ($t2,$t3)=($t3,$t2);
+}
+
+sub BODY_16_XX {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___;
+ @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
+ @ ldr $t4,[sp,#`($i+14)%16`*4]
+ mov $t0,$t1,ror#$sigma0[0]
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ mov $t2,$t4,ror#$sigma1[0]
+ eor $t0,$t0,$t1,ror#$sigma0[1]
+ eor $t2,$t2,$t4,ror#$sigma1[1]
+ eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
+ ldr $t1,[sp,#`($i+0)%16`*4]
+ eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
+ ldr $t4,[sp,#`($i+9)%16`*4]
+
+ add $t2,$t2,$t0
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
+ add $t1,$t1,$t2
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+ add $t1,$t1,$t4 @ X[i]
+___
+ &BODY_00_15(@_);
+}
+
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code 32
+# endif
+#endif
+
+.type K256,%object
+.align 5
+K256:
+.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size K256,.-K256
+.word 0 @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align 5
+
+.global sha256_block_data_order
+.type sha256_block_data_order,%function
+sha256_block_data_order:
+.Lsha256_block_data_order:
+#if __ARM_ARCH__<7
+ sub r3,pc,#8 @ sha256_block_data_order
+#else
+ adr r3,.Lsha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#ARMV8_SHA256
+ bne .LARMv8
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+ stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
+ ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
+ sub $Ktbl,r3,#256+32 @ K256
+ sub sp,sp,#16*4 @ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t3,$B,$C @ magic
+ eor $t2,$t2,$t2
+___
+for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=".Lrounds_16_xx:\n";
+for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq $t3,[sp,#16*4] @ pull ctx
+ bne .Lrounds_16_xx
+
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t0,[$t3,#0]
+ ldr $t1,[$t3,#4]
+ ldr $t2,[$t3,#8]
+ add $A,$A,$t0
+ ldr $t0,[$t3,#12]
+ add $B,$B,$t1
+ ldr $t1,[$t3,#16]
+ add $C,$C,$t2
+ ldr $t2,[$t3,#20]
+ add $D,$D,$t0
+ ldr $t0,[$t3,#24]
+ add $E,$E,$t1
+ ldr $t1,[$t3,#28]
+ add $F,$F,$t2
+ ldr $inp,[sp,#17*4] @ pull inp
+ ldr $t2,[sp,#18*4] @ pull inp+len
+ add $G,$G,$t0
+ add $H,$H,$t1
+ stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
+ cmp $inp,$t2
+ sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
+ bne .Loop
+
+ add sp,sp,#`16+3`*4 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size sha256_block_data_order,.-sha256_block_data_order
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+ my $arg = pop;
+ $arg = "#$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Xupdate()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T2,$T0,$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T1,$T0,$sigma0[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T2,$T0,32-$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T3,$T0,$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T3,$T0,32-$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T3); # sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ while($#insns>=2) { eval(shift(@insns)); }
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vrev32_8 (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ foreach (@insns) { eval; } # remaining instructions
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub body_00_15 () {
+ (
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+ '&add ($h,$h,$t1)', # h+=X[i]+K[i]
+ '&eor ($t1,$f,$g)',
+ '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+ '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
+ '&and ($t1,$t1,$e)',
+ '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
+ '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+ '&eor ($t1,$t1,$g)', # Ch(e,f,g)
+ '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
+ '&eor ($t2,$a,$b)', # a^b, b^c in next round
+ '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
+ '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
+ '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
+ '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
+ '&ldr ($t1,"[sp,#64]") if ($j==31)',
+ '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
+ '&add ($d,$d,$h)', # d+=h
+ '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
+ '&eor ($t3,$t3,$b)', # Maj(a,b,c)
+ '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+ )
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.global sha256_block_data_order_neon
+.type sha256_block_data_order_neon,%function
+.align 4
+sha256_block_data_order_neon:
+.LNEON:
+ stmdb sp!,{r4-r12,lr}
+
+ sub $H,sp,#16*4+16
+ adr $Ktbl,.Lsha256_block_data_order
+ sub $Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
+ bic $H,$H,#15 @ align for 128-bit stores
+ mov $t2,sp
+ mov sp,$H @ alloca
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+
+ vld1.8 {@X[0]},[$inp]!
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ vld1.32 {$T0},[$Ktbl,:128]!
+ vld1.32 {$T1},[$Ktbl,:128]!
+ vld1.32 {$T2},[$Ktbl,:128]!
+ vld1.32 {$T3},[$Ktbl,:128]!
+ vrev32.8 @X[0],@X[0] @ yes, even on
+ str $ctx,[sp,#64]
+ vrev32.8 @X[1],@X[1] @ big-endian
+ str $inp,[sp,#68]
+ mov $Xfer,sp
+ vrev32.8 @X[2],@X[2]
+ str $len,[sp,#72]
+ vrev32.8 @X[3],@X[3]
+ str $t2,[sp,#76] @ save original sp
+ vadd.i32 $T0,$T0,@X[0]
+ vadd.i32 $T1,$T1,@X[1]
+ vst1.32 {$T0},[$Xfer,:128]!
+ vadd.i32 $T2,$T2,@X[2]
+ vst1.32 {$T1},[$Xfer,:128]!
+ vadd.i32 $T3,$T3,@X[3]
+ vst1.32 {$T2},[$Xfer,:128]!
+ vst1.32 {$T3},[$Xfer,:128]!
+
+ ldmia $ctx,{$A-$H}
+ sub $Xfer,$Xfer,#64
+ ldr $t1,[sp,#0]
+ eor $t2,$t2,$t2
+ eor $t3,$B,$C
+ b .L_00_48
+
+.align 4
+.L_00_48:
+___
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+$code.=<<___;
+ teq $t1,#0 @ check for K256 terminator
+ ldr $t1,[sp,#0]
+ sub $Xfer,$Xfer,#64
+ bne .L_00_48
+
+ ldr $inp,[sp,#68]
+ ldr $t0,[sp,#72]
+ sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
+ teq $inp,$t0
+ it eq
+ subeq $inp,$inp,#64 @ avoid SEGV
+ vld1.8 {@X[0]},[$inp]! @ load next input block
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ it ne
+ strne $inp,[sp,#68]
+ mov $Xfer,sp
+___
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+$code.=<<___;
+ ldr $t0,[$t1,#0]
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t2,[$t1,#4]
+ ldr $t3,[$t1,#8]
+ ldr $t4,[$t1,#12]
+ add $A,$A,$t0 @ accumulate
+ ldr $t0,[$t1,#16]
+ add $B,$B,$t2
+ ldr $t2,[$t1,#20]
+ add $C,$C,$t3
+ ldr $t3,[$t1,#24]
+ add $D,$D,$t4
+ ldr $t4,[$t1,#28]
+ add $E,$E,$t0
+ str $A,[$t1],#4
+ add $F,$F,$t2
+ str $B,[$t1],#4
+ add $G,$G,$t3
+ str $C,[$t1],#4
+ add $H,$H,$t4
+ str $D,[$t1],#4
+ stmia $t1,{$E-$H}
+
+ ittte ne
+ movne $Xfer,sp
+ ldrne $t1,[sp,#0]
+ eorne $t2,$t2,$t2
+ ldreq sp,[sp,#76] @ restore original sp
+ itt ne
+ eorne $t3,$B,$C
+ bne .L_00_48
+
+ ldmia sp!,{r4-r12,pc}
+.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+___
+}}}
+######################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
+my @MSG=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
+my $Ktbl="r3";
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
+.type sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+ vld1.32 {$ABCD,$EFGH},[$ctx]
+# ifdef __thumb2__
+ adr $Ktbl,.LARMv8
+ sub $Ktbl,$Ktbl,#.LARMv8-K256
+# else
+ adrl $Ktbl,K256
+# endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+
+.Loop_v8:
+ vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
+ vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
+ vld1.32 {$W0},[$Ktbl]!
+ vrev32.8 @MSG[0],@MSG[0]
+ vrev32.8 @MSG[1],@MSG[1]
+ vrev32.8 @MSG[2],@MSG[2]
+ vrev32.8 @MSG[3],@MSG[3]
+ vmov $ABCD_SAVE,$ABCD @ offload
+ vmov $EFGH_SAVE,$EFGH
+ teq $inp,$len
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ sha256su0 @MSG[0],@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+ sha256su1 @MSG[0],@MSG[2],@MSG[3]
+___
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vld1.32 {$W0},[$Ktbl]!
+ vadd.i32 $W1,$W1,@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vld1.32 {$W1},[$Ktbl]
+ vadd.i32 $W0,$W0,@MSG[2]
+ sub $Ktbl,$Ktbl,#256-16 @ rewind
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vadd.i32 $W1,$W1,@MSG[3]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
+ vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
+ it ne
+ bne .Loop_v8
+
+ vst1.32 {$ABCD,$EFGH},[$ctx]
+
+ ret @ bx lr
+.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+{ my %opcode = (
+ "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
+ "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
+
+ sub unsha256 {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<17)|(($2&8)<<4)
+ |(($3&7)<<1) |(($3&8)<<2);
+	# ARMv7 instructions are always encoded little-endian, so emit the
+	# opcode as raw bytes. The correct solution is to use the .inst
+	# directive, but older assemblers don't implement it :-(
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ }
+}
+
+foreach (split($/,$code)) {
+
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
+
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/lib/crypto/arm/sha256-ce.S b/lib/crypto/arm/sha256-ce.S
new file mode 100644
index 000000000000..144ee805f64a
--- /dev/null
+++ b/lib/crypto/arm/sha256-ce.S
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SHA-224/256 secure hash using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2015 Linaro Ltd.
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+ .arch armv8-a
+ .fpu crypto-neon-fp-armv8
+
+ k0 .req q7
+ k1 .req q8
+ rk .req r3
+
+ ta0 .req q9
+ ta1 .req q10
+ tb0 .req q10
+ tb1 .req q9
+
+ dga .req q11
+ dgb .req q12
+
+ dg0 .req q13
+ dg1 .req q14
+ dg2 .req q15
+
+ .macro add_only, ev, s0
+ vmov dg2, dg0
+ .ifnb \s0
+ vld1.32 {k\ev}, [rk, :128]!
+ .endif
+ sha256h.32 dg0, dg1, tb\ev
+ sha256h2.32 dg1, dg2, tb\ev
+ .ifnb \s0
+ vadd.u32 ta\ev, q\s0, k\ev
+ .endif
+ .endm
+
+ .macro add_update, ev, s0, s1, s2, s3
+ sha256su0.32 q\s0, q\s1
+ add_only \ev, \s1
+ sha256su1.32 q\s0, q\s2, q\s3
+ .endm
+
+ .align 6
+.Lsha256_rcon:
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+ /*
+ * void sha256_ce_transform(struct sha256_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+ENTRY(sha256_ce_transform)
+ /* load state */
+ vld1.32 {dga-dgb}, [r0]
+
+ /* load input */
+0: vld1.32 {q0-q1}, [r1]!
+ vld1.32 {q2-q3}, [r1]!
+ subs r2, r2, #1
+
+#ifndef CONFIG_CPU_BIG_ENDIAN
+ vrev32.8 q0, q0
+ vrev32.8 q1, q1
+ vrev32.8 q2, q2
+ vrev32.8 q3, q3
+#endif
+
+ /* load first round constant */
+ adr rk, .Lsha256_rcon
+ vld1.32 {k0}, [rk, :128]!
+
+ vadd.u32 ta0, q0, k0
+ vmov dg0, dga
+ vmov dg1, dgb
+
+ add_update 1, 0, 1, 2, 3
+ add_update 0, 1, 2, 3, 0
+ add_update 1, 2, 3, 0, 1
+ add_update 0, 3, 0, 1, 2
+ add_update 1, 0, 1, 2, 3
+ add_update 0, 1, 2, 3, 0
+ add_update 1, 2, 3, 0, 1
+ add_update 0, 3, 0, 1, 2
+ add_update 1, 0, 1, 2, 3
+ add_update 0, 1, 2, 3, 0
+ add_update 1, 2, 3, 0, 1
+ add_update 0, 3, 0, 1, 2
+
+ add_only 1, 1
+ add_only 0, 2
+ add_only 1, 3
+ add_only 0
+
+ /* update state */
+ vadd.u32 dga, dga, dg0
+ vadd.u32 dgb, dgb, dg1
+ bne 0b
+
+ /* store new state */
+ vst1.32 {dga-dgb}, [r0]
+ bx lr
+ENDPROC(sha256_ce_transform)
diff --git a/lib/crypto/arm/sha256.h b/lib/crypto/arm/sha256.h
new file mode 100644
index 000000000000..ae7e52dd6e3b
--- /dev/null
+++ b/lib/crypto/arm/sha256.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 optimized for ARM
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+asmlinkage void sha256_block_data_order(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks);
+asmlinkage void sha256_block_data_order_neon(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks);
+asmlinkage void sha256_ce_transform(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha256_blocks(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ static_branch_likely(&have_neon) && likely(may_use_simd())) {
+ scoped_ksimd() {
+ if (static_branch_likely(&have_ce))
+ sha256_ce_transform(state, data, nblocks);
+ else
+ sha256_block_data_order_neon(state, data, nblocks);
+ }
+ } else {
+ sha256_block_data_order(state, data, nblocks);
+ }
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define sha256_mod_init_arch sha256_mod_init_arch
+static void sha256_mod_init_arch(void)
+{
+ if (elf_hwcap & HWCAP_NEON) {
+ static_branch_enable(&have_neon);
+ if (elf_hwcap2 & HWCAP2_SHA2)
+ static_branch_enable(&have_ce);
+ }
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
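The glue above supplies only the block-compression step; whoever calls sha256_blocks() is expected to hand it whole 64-byte blocks, with buffering of partial input done at a higher layer. As a rough, hedged sketch of that contract (the context struct and helper below are illustrative, not part of this patch, and assume the usual kernel headers for memcpy()/min()):

struct sha256_sketch_ctx {
	struct sha256_block_state state;	/* eight 32-bit working words */
	u8 buf[64];				/* partial-block buffer */
	unsigned int buflen;
};

static void sha256_sketch_update(struct sha256_sketch_ctx *ctx,
				 const u8 *data, size_t len)
{
	/* Top up a previously buffered partial block first. */
	if (ctx->buflen) {
		size_t n = min(len, (size_t)(64 - ctx->buflen));

		memcpy(&ctx->buf[ctx->buflen], data, n);
		ctx->buflen += n;
		data += n;
		len -= n;
		if (ctx->buflen == 64) {
			sha256_blocks(&ctx->state, ctx->buf, 1);
			ctx->buflen = 0;
		}
	}
	/* Feed all remaining whole blocks straight to the arch routine. */
	if (len >= 64) {
		sha256_blocks(&ctx->state, data, len / 64);
		data += len & ~(size_t)63;
		len &= 63;
	}
	/* Stash the tail until more data (or finalization) arrives. */
	memcpy(&ctx->buf[ctx->buflen], data, len);
	ctx->buflen += len;
}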
diff --git a/lib/crypto/arm/sha512-armv4.pl b/lib/crypto/arm/sha512-armv4.pl
new file mode 100644
index 000000000000..2fc3516912fa
--- /dev/null
+++ b/lib/crypto/arm/sha512-armv4.pl
@@ -0,0 +1,657 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from the OpenSSL project but the author (Andy Polyakov)
+# has relicensed it under the GPLv2. Therefore this program is free software;
+# you can redistribute it and/or modify it under the terms of the GNU General
+# Public License version 2 as published by the Free Software Foundation.
+#
+# The original headers, including the original license headers, are
+# included below for completeness.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA512 block procedure for ARMv4. September 2007.
+
+# This code is ~4.5 (four and a half) times faster than code generated
+# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+# Xscale PXA250 core].
+#
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 6% improvement on
+# Cortex A8 core and ~40 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Cortex A8 core and ~38 cycles per byte.
+
+# March 2011.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process
+# one byte in 23.3 cycles or ~60% faster than integer-only code.
+
+# August 2012.
+#
+# Improve NEON performance by 12% on Snapdragon S4. In absolute
+# terms it's 22.6 cycles per byte, which is a disappointing result.
+# Technical writers asserted that the 3-way S4 pipeline can sustain
+# multiple NEON instructions per cycle, but dual NEON issue could
+# not be observed; see https://www.openssl.org/~appro/Snapdragon-S4.html
+# for further details. As a side note, Cortex-A15 processes one byte
+# in 16 cycles.
+
+# Byte order [in]dependence. =========================================
+#
+# Originally the caller was expected to maintain a specific *dword* order in
+# h[0-7], namely with the most significant dword at the *lower* address, which
+# was reflected in the two parameters below as 0 and 4. Now the caller is
+# expected to maintain native byte order for whole 64-bit values.
+$hi="HI";
+$lo="LO";
+# ====================================================================
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0"; # parameter block
+$inp="r1";
+$len="r2";
+
+$Tlo="r3";
+$Thi="r4";
+$Alo="r5";
+$Ahi="r6";
+$Elo="r7";
+$Ehi="r8";
+$t0="r9";
+$t1="r10";
+$t2="r11";
+$t3="r12";
+############ r13 is stack pointer
+$Ktbl="r14";
+############ r15 is program counter
+
+$Aoff=8*0;
+$Boff=8*1;
+$Coff=8*2;
+$Doff=8*3;
+$Eoff=8*4;
+$Foff=8*5;
+$Goff=8*6;
+$Hoff=8*7;
+$Xoff=8*8;
+
+sub BODY_00_15() {
+my $magic = shift;
+$code.=<<___;
+ @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
+ @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+ @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+ mov $t0,$Elo,lsr#14
+ str $Tlo,[sp,#$Xoff+0]
+ mov $t1,$Ehi,lsr#14
+ str $Thi,[sp,#$Xoff+4]
+ eor $t0,$t0,$Ehi,lsl#18
+ ldr $t2,[sp,#$Hoff+0] @ h.lo
+ eor $t1,$t1,$Elo,lsl#18
+ ldr $t3,[sp,#$Hoff+4] @ h.hi
+ eor $t0,$t0,$Elo,lsr#18
+ eor $t1,$t1,$Ehi,lsr#18
+ eor $t0,$t0,$Ehi,lsl#14
+ eor $t1,$t1,$Elo,lsl#14
+ eor $t0,$t0,$Ehi,lsr#9
+ eor $t1,$t1,$Elo,lsr#9
+ eor $t0,$t0,$Elo,lsl#23
+ eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
+ adds $Tlo,$Tlo,$t0
+ ldr $t0,[sp,#$Foff+0] @ f.lo
+ adc $Thi,$Thi,$t1 @ T += Sigma1(e)
+ ldr $t1,[sp,#$Foff+4] @ f.hi
+ adds $Tlo,$Tlo,$t2
+ ldr $t2,[sp,#$Goff+0] @ g.lo
+ adc $Thi,$Thi,$t3 @ T += h
+ ldr $t3,[sp,#$Goff+4] @ g.hi
+
+ eor $t0,$t0,$t2
+ str $Elo,[sp,#$Eoff+0]
+ eor $t1,$t1,$t3
+ str $Ehi,[sp,#$Eoff+4]
+ and $t0,$t0,$Elo
+ str $Alo,[sp,#$Aoff+0]
+ and $t1,$t1,$Ehi
+ str $Ahi,[sp,#$Aoff+4]
+ eor $t0,$t0,$t2
+ ldr $t2,[$Ktbl,#$lo] @ K[i].lo
+ eor $t1,$t1,$t3 @ Ch(e,f,g)
+ ldr $t3,[$Ktbl,#$hi] @ K[i].hi
+
+ adds $Tlo,$Tlo,$t0
+ ldr $Elo,[sp,#$Doff+0] @ d.lo
+ adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
+ ldr $Ehi,[sp,#$Doff+4] @ d.hi
+ adds $Tlo,$Tlo,$t2
+ and $t0,$t2,#0xff
+ adc $Thi,$Thi,$t3 @ T += K[i]
+ adds $Elo,$Elo,$Tlo
+ ldr $t2,[sp,#$Boff+0] @ b.lo
+ adc $Ehi,$Ehi,$Thi @ d += T
+ teq $t0,#$magic
+
+ ldr $t3,[sp,#$Coff+0] @ c.lo
+#if __ARM_ARCH__>=7
+ it eq @ Thumb2 thing, sanity check in ARM
+#endif
+ orreq $Ktbl,$Ktbl,#1
+ @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+ @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+ @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+ mov $t0,$Alo,lsr#28
+ mov $t1,$Ahi,lsr#28
+ eor $t0,$t0,$Ahi,lsl#4
+ eor $t1,$t1,$Alo,lsl#4
+ eor $t0,$t0,$Ahi,lsr#2
+ eor $t1,$t1,$Alo,lsr#2
+ eor $t0,$t0,$Alo,lsl#30
+ eor $t1,$t1,$Ahi,lsl#30
+ eor $t0,$t0,$Ahi,lsr#7
+ eor $t1,$t1,$Alo,lsr#7
+ eor $t0,$t0,$Alo,lsl#25
+ eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
+ adds $Tlo,$Tlo,$t0
+ and $t0,$Alo,$t2
+ adc $Thi,$Thi,$t1 @ T += Sigma0(a)
+
+ ldr $t1,[sp,#$Boff+4] @ b.hi
+ orr $Alo,$Alo,$t2
+ ldr $t2,[sp,#$Coff+4] @ c.hi
+ and $Alo,$Alo,$t3
+ and $t3,$Ahi,$t1
+ orr $Ahi,$Ahi,$t1
+ orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
+ and $Ahi,$Ahi,$t2
+ adds $Alo,$Alo,$Tlo
+ orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
+ sub sp,sp,#8
+ adc $Ahi,$Ahi,$Thi @ h += T
+ tst $Ktbl,#1
+ add $Ktbl,$Ktbl,#8
+___
+}
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code 32
+# endif
+#endif
+
+.type K512,%object
+.align 5
+K512:
+WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
+.size K512,.-K512
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha512_block_data_order
+.skip 32-4
+#else
+.skip 32
+#endif
+
+.global sha512_block_data_order
+.type sha512_block_data_order,%function
+sha512_block_data_order:
+.Lsha512_block_data_order:
+#if __ARM_ARCH__<7
+ sub r3,pc,#8 @ sha512_block_data_order
+#else
+ adr r3,.Lsha512_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#1
+ bne .LNEON
+#endif
+ add $len,$inp,$len,lsl#7 @ len to point at the end of inp
+ stmdb sp!,{r4-r12,lr}
+ sub $Ktbl,r3,#672 @ K512
+ sub sp,sp,#9*8
+
+ ldr $Elo,[$ctx,#$Eoff+$lo]
+ ldr $Ehi,[$ctx,#$Eoff+$hi]
+ ldr $t0, [$ctx,#$Goff+$lo]
+ ldr $t1, [$ctx,#$Goff+$hi]
+ ldr $t2, [$ctx,#$Hoff+$lo]
+ ldr $t3, [$ctx,#$Hoff+$hi]
+.Loop:
+ str $t0, [sp,#$Goff+0]
+ str $t1, [sp,#$Goff+4]
+ str $t2, [sp,#$Hoff+0]
+ str $t3, [sp,#$Hoff+4]
+ ldr $Alo,[$ctx,#$Aoff+$lo]
+ ldr $Ahi,[$ctx,#$Aoff+$hi]
+ ldr $Tlo,[$ctx,#$Boff+$lo]
+ ldr $Thi,[$ctx,#$Boff+$hi]
+ ldr $t0, [$ctx,#$Coff+$lo]
+ ldr $t1, [$ctx,#$Coff+$hi]
+ ldr $t2, [$ctx,#$Doff+$lo]
+ ldr $t3, [$ctx,#$Doff+$hi]
+ str $Tlo,[sp,#$Boff+0]
+ str $Thi,[sp,#$Boff+4]
+ str $t0, [sp,#$Coff+0]
+ str $t1, [sp,#$Coff+4]
+ str $t2, [sp,#$Doff+0]
+ str $t3, [sp,#$Doff+4]
+ ldr $Tlo,[$ctx,#$Foff+$lo]
+ ldr $Thi,[$ctx,#$Foff+$hi]
+ str $Tlo,[sp,#$Foff+0]
+ str $Thi,[sp,#$Foff+4]
+
+.L00_15:
+#if __ARM_ARCH__<7
+ ldrb $Tlo,[$inp,#7]
+ ldrb $t0, [$inp,#6]
+ ldrb $t1, [$inp,#5]
+ ldrb $t2, [$inp,#4]
+ ldrb $Thi,[$inp,#3]
+ ldrb $t3, [$inp,#2]
+ orr $Tlo,$Tlo,$t0,lsl#8
+ ldrb $t0, [$inp,#1]
+ orr $Tlo,$Tlo,$t1,lsl#16
+ ldrb $t1, [$inp],#8
+ orr $Tlo,$Tlo,$t2,lsl#24
+ orr $Thi,$Thi,$t3,lsl#8
+ orr $Thi,$Thi,$t0,lsl#16
+ orr $Thi,$Thi,$t1,lsl#24
+#else
+ ldr $Tlo,[$inp,#4]
+ ldr $Thi,[$inp],#8
+#ifdef __ARMEL__
+ rev $Tlo,$Tlo
+ rev $Thi,$Thi
+#endif
+#endif
+___
+ &BODY_00_15(0x94);
+$code.=<<___;
+ tst $Ktbl,#1
+ beq .L00_15
+ ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
+ ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
+ bic $Ktbl,$Ktbl,#1
+.L16_79:
+ @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
+ @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
+ @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
+ mov $Tlo,$t0,lsr#1
+ ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
+ mov $Thi,$t1,lsr#1
+ ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
+ eor $Tlo,$Tlo,$t1,lsl#31
+ eor $Thi,$Thi,$t0,lsl#31
+ eor $Tlo,$Tlo,$t0,lsr#8
+ eor $Thi,$Thi,$t1,lsr#8
+ eor $Tlo,$Tlo,$t1,lsl#24
+ eor $Thi,$Thi,$t0,lsl#24
+ eor $Tlo,$Tlo,$t0,lsr#7
+ eor $Thi,$Thi,$t1,lsr#7
+ eor $Tlo,$Tlo,$t1,lsl#25
+
+ @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+ @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
+ @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
+ mov $t0,$t2,lsr#19
+ mov $t1,$t3,lsr#19
+ eor $t0,$t0,$t3,lsl#13
+ eor $t1,$t1,$t2,lsl#13
+ eor $t0,$t0,$t3,lsr#29
+ eor $t1,$t1,$t2,lsr#29
+ eor $t0,$t0,$t2,lsl#3
+ eor $t1,$t1,$t3,lsl#3
+ eor $t0,$t0,$t2,lsr#6
+ eor $t1,$t1,$t3,lsr#6
+ ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
+ eor $t0,$t0,$t3,lsl#26
+
+ ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
+ adds $Tlo,$Tlo,$t0
+ ldr $t0,[sp,#`$Xoff+8*16`+0]
+ adc $Thi,$Thi,$t1
+
+ ldr $t1,[sp,#`$Xoff+8*16`+4]
+ adds $Tlo,$Tlo,$t2
+ adc $Thi,$Thi,$t3
+ adds $Tlo,$Tlo,$t0
+ adc $Thi,$Thi,$t1
+___
+ &BODY_00_15(0x17);
+$code.=<<___;
+#if __ARM_ARCH__>=7
+ ittt eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
+ ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
+ beq .L16_79
+ bic $Ktbl,$Ktbl,#1
+
+ ldr $Tlo,[sp,#$Boff+0]
+ ldr $Thi,[sp,#$Boff+4]
+ ldr $t0, [$ctx,#$Aoff+$lo]
+ ldr $t1, [$ctx,#$Aoff+$hi]
+ ldr $t2, [$ctx,#$Boff+$lo]
+ ldr $t3, [$ctx,#$Boff+$hi]
+ adds $t0,$Alo,$t0
+ str $t0, [$ctx,#$Aoff+$lo]
+ adc $t1,$Ahi,$t1
+ str $t1, [$ctx,#$Aoff+$hi]
+ adds $t2,$Tlo,$t2
+ str $t2, [$ctx,#$Boff+$lo]
+ adc $t3,$Thi,$t3
+ str $t3, [$ctx,#$Boff+$hi]
+
+ ldr $Alo,[sp,#$Coff+0]
+ ldr $Ahi,[sp,#$Coff+4]
+ ldr $Tlo,[sp,#$Doff+0]
+ ldr $Thi,[sp,#$Doff+4]
+ ldr $t0, [$ctx,#$Coff+$lo]
+ ldr $t1, [$ctx,#$Coff+$hi]
+ ldr $t2, [$ctx,#$Doff+$lo]
+ ldr $t3, [$ctx,#$Doff+$hi]
+ adds $t0,$Alo,$t0
+ str $t0, [$ctx,#$Coff+$lo]
+ adc $t1,$Ahi,$t1
+ str $t1, [$ctx,#$Coff+$hi]
+ adds $t2,$Tlo,$t2
+ str $t2, [$ctx,#$Doff+$lo]
+ adc $t3,$Thi,$t3
+ str $t3, [$ctx,#$Doff+$hi]
+
+ ldr $Tlo,[sp,#$Foff+0]
+ ldr $Thi,[sp,#$Foff+4]
+ ldr $t0, [$ctx,#$Eoff+$lo]
+ ldr $t1, [$ctx,#$Eoff+$hi]
+ ldr $t2, [$ctx,#$Foff+$lo]
+ ldr $t3, [$ctx,#$Foff+$hi]
+ adds $Elo,$Elo,$t0
+ str $Elo,[$ctx,#$Eoff+$lo]
+ adc $Ehi,$Ehi,$t1
+ str $Ehi,[$ctx,#$Eoff+$hi]
+ adds $t2,$Tlo,$t2
+ str $t2, [$ctx,#$Foff+$lo]
+ adc $t3,$Thi,$t3
+ str $t3, [$ctx,#$Foff+$hi]
+
+ ldr $Alo,[sp,#$Goff+0]
+ ldr $Ahi,[sp,#$Goff+4]
+ ldr $Tlo,[sp,#$Hoff+0]
+ ldr $Thi,[sp,#$Hoff+4]
+ ldr $t0, [$ctx,#$Goff+$lo]
+ ldr $t1, [$ctx,#$Goff+$hi]
+ ldr $t2, [$ctx,#$Hoff+$lo]
+ ldr $t3, [$ctx,#$Hoff+$hi]
+ adds $t0,$Alo,$t0
+ str $t0, [$ctx,#$Goff+$lo]
+ adc $t1,$Ahi,$t1
+ str $t1, [$ctx,#$Goff+$hi]
+ adds $t2,$Tlo,$t2
+ str $t2, [$ctx,#$Hoff+$lo]
+ adc $t3,$Thi,$t3
+ str $t3, [$ctx,#$Hoff+$hi]
+
+ add sp,sp,#640
+ sub $Ktbl,$Ktbl,#640
+
+ teq $inp,$len
+ bne .Loop
+
+ add sp,sp,#8*9 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r12,pc}
+#else
+ ldmia sp!,{r4-r12,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size sha512_block_data_order,.-sha512_block_data_order
+___
+
+{
+my @Sigma0=(28,34,39);
+my @Sigma1=(14,18,41);
+my @sigma0=(1, 8, 7);
+my @sigma1=(19,61,6);
+
+my $Ktbl="r3";
+my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
+
+my @X=map("d$_",(0..15));
+my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
+
+sub NEON_00_15() {
+my $i=shift;
+my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
+
+$code.=<<___ if ($i<16 || $i&1);
+ vshr.u64 $t0,$e,#@Sigma1[0] @ $i
+#if $i<16
+ vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
+#endif
+ vshr.u64 $t1,$e,#@Sigma1[1]
+#if $i>0
+ vadd.i64 $a,$Maj @ h+=Maj from the past
+#endif
+ vshr.u64 $t2,$e,#@Sigma1[2]
+___
+$code.=<<___;
+ vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
+ vsli.64 $t0,$e,#`64-@Sigma1[0]`
+ vsli.64 $t1,$e,#`64-@Sigma1[1]`
+ vmov $Ch,$e
+ vsli.64 $t2,$e,#`64-@Sigma1[2]`
+#if $i<16 && defined(__ARMEL__)
+ vrev64.8 @X[$i],@X[$i]
+#endif
+ veor $t1,$t0
+ vbsl $Ch,$f,$g @ Ch(e,f,g)
+ vshr.u64 $t0,$a,#@Sigma0[0]
+ veor $t2,$t1 @ Sigma1(e)
+ vadd.i64 $T1,$Ch,$h
+ vshr.u64 $t1,$a,#@Sigma0[1]
+ vsli.64 $t0,$a,#`64-@Sigma0[0]`
+ vadd.i64 $T1,$t2
+ vshr.u64 $t2,$a,#@Sigma0[2]
+ vadd.i64 $K,@X[$i%16]
+ vsli.64 $t1,$a,#`64-@Sigma0[1]`
+ veor $Maj,$a,$b
+ vsli.64 $t2,$a,#`64-@Sigma0[2]`
+ veor $h,$t0,$t1
+ vadd.i64 $T1,$K
+ vbsl $Maj,$c,$b @ Maj(a,b,c)
+ veor $h,$t2 @ Sigma0(a)
+ vadd.i64 $d,$T1
+ vadd.i64 $Maj,$T1
+ @ vadd.i64 $h,$Maj
+___
+}
+
+sub NEON_16_79() {
+my $i=shift;
+
+if ($i&1) { &NEON_00_15($i,@_); return; }
+
+# 2x-vectorized, therefore runs every 2nd round
+my @X=map("q$_",(0..7)); # view @X as 128-bit vector
+my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
+my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
+my $e=@_[4]; # $e from NEON_00_15
+$i /= 2;
+$code.=<<___;
+ vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
+ vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
+ vadd.i64 @_[0],d30 @ h+=Maj from the past
+ vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
+ vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
+ vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
+ vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
+ veor $s1,$t0
+ vshr.u64 $t0,$s0,#@sigma0[0]
+ veor $s1,$t1 @ sigma1(X[i+14])
+ vshr.u64 $t1,$s0,#@sigma0[1]
+ vadd.i64 @X[$i%8],$s1
+ vshr.u64 $s1,$s0,#@sigma0[2]
+ vsli.64 $t0,$s0,#`64-@sigma0[0]`
+ vsli.64 $t1,$s0,#`64-@sigma0[1]`
+ vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
+ veor $s1,$t0
+ vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
+ vadd.i64 @X[$i%8],$s0
+ vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
+ veor $s1,$t1 @ sigma0(X[i+1])
+ vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
+ vadd.i64 @X[$i%8],$s1
+___
+ &NEON_00_15(2*$i,@_);
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.global sha512_block_data_order_neon
+.type sha512_block_data_order_neon,%function
+.align 4
+sha512_block_data_order_neon:
+.LNEON:
+ dmb @ errata #451034 on early Cortex A8
+ add $len,$inp,$len,lsl#7 @ len to point at the end of inp
+ VFP_ABI_PUSH
+ adr $Ktbl,.Lsha512_block_data_order
+ sub $Ktbl,$Ktbl,.Lsha512_block_data_order-K512
+ vldmia $ctx,{$A-$H} @ load context
+.Loop_neon:
+___
+for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ mov $cnt,#4
+.L16_79_neon:
+ subs $cnt,#1
+___
+for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ bne .L16_79_neon
+
+ vadd.i64 $A,d30 @ h+=Maj from the past
+ vldmia $ctx,{d24-d31} @ load context to temp
+ vadd.i64 q8,q12 @ vectorized accumulate
+ vadd.i64 q9,q13
+ vadd.i64 q10,q14
+ vadd.i64 q11,q15
+ vstmia $ctx,{$A-$H} @ save context
+ teq $inp,$len
+ sub $Ktbl,#640 @ rewind K512
+ bne .Loop_neon
+
+ VFP_ABI_POP
+ ret @ bx lr
+.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx lr/gm;
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+print $code;
+close STDOUT; # enforce flush
diff --git a/lib/crypto/arm/sha512.h b/lib/crypto/arm/sha512.h
new file mode 100644
index 000000000000..ed9bd81d6d78
--- /dev/null
+++ b/lib/crypto/arm/sha512.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * arm32-optimized SHA-512 block function
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+asmlinkage void sha512_block_data_order(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks);
+asmlinkage void sha512_block_data_order_neon(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha512_blocks(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ static_branch_likely(&have_neon) && likely(may_use_simd())) {
+ scoped_ksimd()
+ sha512_block_data_order_neon(state, data, nblocks);
+ } else {
+ sha512_block_data_order(state, data, nblocks);
+ }
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define sha512_mod_init_arch sha512_mod_init_arch
+static void sha512_mod_init_arch(void)
+{
+ if (cpu_has_neon())
+ static_branch_enable(&have_neon);
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
diff --git a/lib/crypto/arm64/.gitignore b/lib/crypto/arm64/.gitignore
new file mode 100644
index 000000000000..f6c4e8ef80da
--- /dev/null
+++ b/lib/crypto/arm64/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+poly1305-core.S
+sha256-core.S
+sha512-core.S
diff --git a/lib/crypto/arm64/chacha-neon-core.S b/lib/crypto/arm64/chacha-neon-core.S
new file mode 100644
index 000000000000..80079586ecc7
--- /dev/null
+++ b/lib/crypto/arm64/chacha-neon-core.S
@@ -0,0 +1,805 @@
+/*
+ * ChaCha/HChaCha NEON helper functions
+ *
+ * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Originally based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/cache.h>
+
+ .text
+ .align 6
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers v0-v3. It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * The round count is given in w3.
+ *
+ * Clobbers: w3, x10, v4, v12
+ */
+SYM_FUNC_START_LOCAL(chacha_permute)
+
+ adr_l x10, ROT8
+ ld1 {v12.4s}, [x10]
+
+.Ldoubleround:
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ add v0.4s, v0.4s, v1.4s
+ eor v3.16b, v3.16b, v0.16b
+ rev32 v3.8h, v3.8h
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ add v2.4s, v2.4s, v3.4s
+ eor v4.16b, v1.16b, v2.16b
+ shl v1.4s, v4.4s, #12
+ sri v1.4s, v4.4s, #20
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ add v0.4s, v0.4s, v1.4s
+ eor v3.16b, v3.16b, v0.16b
+ tbl v3.16b, {v3.16b}, v12.16b
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ add v2.4s, v2.4s, v3.4s
+ eor v4.16b, v1.16b, v2.16b
+ shl v1.4s, v4.4s, #7
+ sri v1.4s, v4.4s, #25
+
+ // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ ext v1.16b, v1.16b, v1.16b, #4
+ // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ ext v2.16b, v2.16b, v2.16b, #8
+ // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ ext v3.16b, v3.16b, v3.16b, #12
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ add v0.4s, v0.4s, v1.4s
+ eor v3.16b, v3.16b, v0.16b
+ rev32 v3.8h, v3.8h
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ add v2.4s, v2.4s, v3.4s
+ eor v4.16b, v1.16b, v2.16b
+ shl v1.4s, v4.4s, #12
+ sri v1.4s, v4.4s, #20
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ add v0.4s, v0.4s, v1.4s
+ eor v3.16b, v3.16b, v0.16b
+ tbl v3.16b, {v3.16b}, v12.16b
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ add v2.4s, v2.4s, v3.4s
+ eor v4.16b, v1.16b, v2.16b
+ shl v1.4s, v4.4s, #7
+ sri v1.4s, v4.4s, #25
+
+ // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ ext v1.16b, v1.16b, v1.16b, #12
+ // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ ext v2.16b, v2.16b, v2.16b, #8
+ // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ ext v3.16b, v3.16b, v3.16b, #4
+
+ subs w3, w3, #2
+ b.ne .Ldoubleround
+
+ ret
+SYM_FUNC_END(chacha_permute)
+
+SYM_FUNC_START(chacha_block_xor_neon)
+ // x0: Input state matrix, s
+ // x1: 1 data block output, o
+ // x2: 1 data block input, i
+ // w3: nrounds
+
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ // x0..3 = s0..3
+ ld1 {v0.4s-v3.4s}, [x0]
+ ld1 {v8.4s-v11.4s}, [x0]
+
+ bl chacha_permute
+
+ ld1 {v4.16b-v7.16b}, [x2]
+
+ // o0 = i0 ^ (x0 + s0)
+ add v0.4s, v0.4s, v8.4s
+ eor v0.16b, v0.16b, v4.16b
+
+ // o1 = i1 ^ (x1 + s1)
+ add v1.4s, v1.4s, v9.4s
+ eor v1.16b, v1.16b, v5.16b
+
+ // o2 = i2 ^ (x2 + s2)
+ add v2.4s, v2.4s, v10.4s
+ eor v2.16b, v2.16b, v6.16b
+
+ // o3 = i3 ^ (x3 + s3)
+ add v3.4s, v3.4s, v11.4s
+ eor v3.16b, v3.16b, v7.16b
+
+ st1 {v0.16b-v3.16b}, [x1]
+
+ ldp x29, x30, [sp], #16
+ ret
+SYM_FUNC_END(chacha_block_xor_neon)
+
+SYM_FUNC_START(hchacha_block_neon)
+ // x0: Input state matrix, s
+ // x1: output (8 32-bit words)
+ // w2: nrounds
+
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ ld1 {v0.4s-v3.4s}, [x0]
+
+ mov w3, w2
+ bl chacha_permute
+
+ st1 {v0.4s}, [x1], #16
+ st1 {v3.4s}, [x1]
+
+ ldp x29, x30, [sp], #16
+ ret
+SYM_FUNC_END(hchacha_block_neon)
+
+ a0 .req w12
+ a1 .req w13
+ a2 .req w14
+ a3 .req w15
+ a4 .req w16
+ a5 .req w17
+ a6 .req w19
+ a7 .req w20
+ a8 .req w21
+ a9 .req w22
+ a10 .req w23
+ a11 .req w24
+ a12 .req w25
+ a13 .req w26
+ a14 .req w27
+ a15 .req w28
+
+ .align 6
+SYM_FUNC_START(chacha_4block_xor_neon)
+ frame_push 10
+
+ // x0: Input state matrix, s
+ // x1: 4 data blocks output, o
+ // x2: 4 data blocks input, i
+ // w3: nrounds
+ // x4: byte count
+
+ adr_l x10, .Lpermute
+ and x5, x4, #63
+ add x10, x10, x5
+
+ //
+ // This function encrypts four consecutive ChaCha blocks by loading
+	// the state matrix into NEON registers four times. The algorithm
+	// performs each operation on the corresponding word of each state
+	// matrix, hence requires no word shuffling. For the final XOR step we
+	// transpose the matrix by interleaving 32-bit and then 64-bit words,
+	// which allows us to do the XOR in NEON registers.
+	//
+	// At the same time, a fifth block is encrypted in parallel using
+	// scalar registers.
+ //
+ adr_l x9, CTRINC // ... and ROT8
+ ld1 {v30.4s-v31.4s}, [x9]
+
+ // x0..15[0-3] = s0..3[0..3]
+ add x8, x0, #16
+ ld4r { v0.4s- v3.4s}, [x0]
+ ld4r { v4.4s- v7.4s}, [x8], #16
+ ld4r { v8.4s-v11.4s}, [x8], #16
+ ld4r {v12.4s-v15.4s}, [x8]
+
+ mov a0, v0.s[0]
+ mov a1, v1.s[0]
+ mov a2, v2.s[0]
+ mov a3, v3.s[0]
+ mov a4, v4.s[0]
+ mov a5, v5.s[0]
+ mov a6, v6.s[0]
+ mov a7, v7.s[0]
+ mov a8, v8.s[0]
+ mov a9, v9.s[0]
+ mov a10, v10.s[0]
+ mov a11, v11.s[0]
+ mov a12, v12.s[0]
+ mov a13, v13.s[0]
+ mov a14, v14.s[0]
+ mov a15, v15.s[0]
+
+ // x12 += counter values 1-4
+ add v12.4s, v12.4s, v30.4s
+
+.Ldoubleround4:
+ // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ add v0.4s, v0.4s, v4.4s
+ add a0, a0, a4
+ add v1.4s, v1.4s, v5.4s
+ add a1, a1, a5
+ add v2.4s, v2.4s, v6.4s
+ add a2, a2, a6
+ add v3.4s, v3.4s, v7.4s
+ add a3, a3, a7
+
+ eor v12.16b, v12.16b, v0.16b
+ eor a12, a12, a0
+ eor v13.16b, v13.16b, v1.16b
+ eor a13, a13, a1
+ eor v14.16b, v14.16b, v2.16b
+ eor a14, a14, a2
+ eor v15.16b, v15.16b, v3.16b
+ eor a15, a15, a3
+
+ rev32 v12.8h, v12.8h
+ ror a12, a12, #16
+ rev32 v13.8h, v13.8h
+ ror a13, a13, #16
+ rev32 v14.8h, v14.8h
+ ror a14, a14, #16
+ rev32 v15.8h, v15.8h
+ ror a15, a15, #16
+
+ // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ add v8.4s, v8.4s, v12.4s
+ add a8, a8, a12
+ add v9.4s, v9.4s, v13.4s
+ add a9, a9, a13
+ add v10.4s, v10.4s, v14.4s
+ add a10, a10, a14
+ add v11.4s, v11.4s, v15.4s
+ add a11, a11, a15
+
+ eor v16.16b, v4.16b, v8.16b
+ eor a4, a4, a8
+ eor v17.16b, v5.16b, v9.16b
+ eor a5, a5, a9
+ eor v18.16b, v6.16b, v10.16b
+ eor a6, a6, a10
+ eor v19.16b, v7.16b, v11.16b
+ eor a7, a7, a11
+
+ shl v4.4s, v16.4s, #12
+ shl v5.4s, v17.4s, #12
+ shl v6.4s, v18.4s, #12
+ shl v7.4s, v19.4s, #12
+
+ sri v4.4s, v16.4s, #20
+ ror a4, a4, #20
+ sri v5.4s, v17.4s, #20
+ ror a5, a5, #20
+ sri v6.4s, v18.4s, #20
+ ror a6, a6, #20
+ sri v7.4s, v19.4s, #20
+ ror a7, a7, #20
+
+ // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ add v0.4s, v0.4s, v4.4s
+ add a0, a0, a4
+ add v1.4s, v1.4s, v5.4s
+ add a1, a1, a5
+ add v2.4s, v2.4s, v6.4s
+ add a2, a2, a6
+ add v3.4s, v3.4s, v7.4s
+ add a3, a3, a7
+
+ eor v12.16b, v12.16b, v0.16b
+ eor a12, a12, a0
+ eor v13.16b, v13.16b, v1.16b
+ eor a13, a13, a1
+ eor v14.16b, v14.16b, v2.16b
+ eor a14, a14, a2
+ eor v15.16b, v15.16b, v3.16b
+ eor a15, a15, a3
+
+ tbl v12.16b, {v12.16b}, v31.16b
+ ror a12, a12, #24
+ tbl v13.16b, {v13.16b}, v31.16b
+ ror a13, a13, #24
+ tbl v14.16b, {v14.16b}, v31.16b
+ ror a14, a14, #24
+ tbl v15.16b, {v15.16b}, v31.16b
+ ror a15, a15, #24
+
+ // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ add v8.4s, v8.4s, v12.4s
+ add a8, a8, a12
+ add v9.4s, v9.4s, v13.4s
+ add a9, a9, a13
+ add v10.4s, v10.4s, v14.4s
+ add a10, a10, a14
+ add v11.4s, v11.4s, v15.4s
+ add a11, a11, a15
+
+ eor v16.16b, v4.16b, v8.16b
+ eor a4, a4, a8
+ eor v17.16b, v5.16b, v9.16b
+ eor a5, a5, a9
+ eor v18.16b, v6.16b, v10.16b
+ eor a6, a6, a10
+ eor v19.16b, v7.16b, v11.16b
+ eor a7, a7, a11
+
+ shl v4.4s, v16.4s, #7
+ shl v5.4s, v17.4s, #7
+ shl v6.4s, v18.4s, #7
+ shl v7.4s, v19.4s, #7
+
+ sri v4.4s, v16.4s, #25
+ ror a4, a4, #25
+ sri v5.4s, v17.4s, #25
+ ror a5, a5, #25
+ sri v6.4s, v18.4s, #25
+ ror a6, a6, #25
+ sri v7.4s, v19.4s, #25
+ ror a7, a7, #25
+
+ // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ add v0.4s, v0.4s, v5.4s
+ add a0, a0, a5
+ add v1.4s, v1.4s, v6.4s
+ add a1, a1, a6
+ add v2.4s, v2.4s, v7.4s
+ add a2, a2, a7
+ add v3.4s, v3.4s, v4.4s
+ add a3, a3, a4
+
+ eor v15.16b, v15.16b, v0.16b
+ eor a15, a15, a0
+ eor v12.16b, v12.16b, v1.16b
+ eor a12, a12, a1
+ eor v13.16b, v13.16b, v2.16b
+ eor a13, a13, a2
+ eor v14.16b, v14.16b, v3.16b
+ eor a14, a14, a3
+
+ rev32 v15.8h, v15.8h
+ ror a15, a15, #16
+ rev32 v12.8h, v12.8h
+ ror a12, a12, #16
+ rev32 v13.8h, v13.8h
+ ror a13, a13, #16
+ rev32 v14.8h, v14.8h
+ ror a14, a14, #16
+
+ // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ add v10.4s, v10.4s, v15.4s
+ add a10, a10, a15
+ add v11.4s, v11.4s, v12.4s
+ add a11, a11, a12
+ add v8.4s, v8.4s, v13.4s
+ add a8, a8, a13
+ add v9.4s, v9.4s, v14.4s
+ add a9, a9, a14
+
+ eor v16.16b, v5.16b, v10.16b
+ eor a5, a5, a10
+ eor v17.16b, v6.16b, v11.16b
+ eor a6, a6, a11
+ eor v18.16b, v7.16b, v8.16b
+ eor a7, a7, a8
+ eor v19.16b, v4.16b, v9.16b
+ eor a4, a4, a9
+
+ shl v5.4s, v16.4s, #12
+ shl v6.4s, v17.4s, #12
+ shl v7.4s, v18.4s, #12
+ shl v4.4s, v19.4s, #12
+
+ sri v5.4s, v16.4s, #20
+ ror a5, a5, #20
+ sri v6.4s, v17.4s, #20
+ ror a6, a6, #20
+ sri v7.4s, v18.4s, #20
+ ror a7, a7, #20
+ sri v4.4s, v19.4s, #20
+ ror a4, a4, #20
+
+ // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ add v0.4s, v0.4s, v5.4s
+ add a0, a0, a5
+ add v1.4s, v1.4s, v6.4s
+ add a1, a1, a6
+ add v2.4s, v2.4s, v7.4s
+ add a2, a2, a7
+ add v3.4s, v3.4s, v4.4s
+ add a3, a3, a4
+
+ eor v15.16b, v15.16b, v0.16b
+ eor a15, a15, a0
+ eor v12.16b, v12.16b, v1.16b
+ eor a12, a12, a1
+ eor v13.16b, v13.16b, v2.16b
+ eor a13, a13, a2
+ eor v14.16b, v14.16b, v3.16b
+ eor a14, a14, a3
+
+ tbl v15.16b, {v15.16b}, v31.16b
+ ror a15, a15, #24
+ tbl v12.16b, {v12.16b}, v31.16b
+ ror a12, a12, #24
+ tbl v13.16b, {v13.16b}, v31.16b
+ ror a13, a13, #24
+ tbl v14.16b, {v14.16b}, v31.16b
+ ror a14, a14, #24
+
+ // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ add v10.4s, v10.4s, v15.4s
+ add a10, a10, a15
+ add v11.4s, v11.4s, v12.4s
+ add a11, a11, a12
+ add v8.4s, v8.4s, v13.4s
+ add a8, a8, a13
+ add v9.4s, v9.4s, v14.4s
+ add a9, a9, a14
+
+ eor v16.16b, v5.16b, v10.16b
+ eor a5, a5, a10
+ eor v17.16b, v6.16b, v11.16b
+ eor a6, a6, a11
+ eor v18.16b, v7.16b, v8.16b
+ eor a7, a7, a8
+ eor v19.16b, v4.16b, v9.16b
+ eor a4, a4, a9
+
+ shl v5.4s, v16.4s, #7
+ shl v6.4s, v17.4s, #7
+ shl v7.4s, v18.4s, #7
+ shl v4.4s, v19.4s, #7
+
+ sri v5.4s, v16.4s, #25
+ ror a5, a5, #25
+ sri v6.4s, v17.4s, #25
+ ror a6, a6, #25
+ sri v7.4s, v18.4s, #25
+ ror a7, a7, #25
+ sri v4.4s, v19.4s, #25
+ ror a4, a4, #25
+
+ subs w3, w3, #2
+ b.ne .Ldoubleround4
+
+ ld4r {v16.4s-v19.4s}, [x0], #16
+ ld4r {v20.4s-v23.4s}, [x0], #16
+
+ // x12 += counter values 0-3
+ add v12.4s, v12.4s, v30.4s
+
+ // x0[0-3] += s0[0]
+ // x1[0-3] += s0[1]
+ // x2[0-3] += s0[2]
+ // x3[0-3] += s0[3]
+ add v0.4s, v0.4s, v16.4s
+ mov w6, v16.s[0]
+ mov w7, v17.s[0]
+ add v1.4s, v1.4s, v17.4s
+ mov w8, v18.s[0]
+ mov w9, v19.s[0]
+ add v2.4s, v2.4s, v18.4s
+ add a0, a0, w6
+ add a1, a1, w7
+ add v3.4s, v3.4s, v19.4s
+ add a2, a2, w8
+ add a3, a3, w9
+CPU_BE( rev a0, a0 )
+CPU_BE( rev a1, a1 )
+CPU_BE( rev a2, a2 )
+CPU_BE( rev a3, a3 )
+
+ ld4r {v24.4s-v27.4s}, [x0], #16
+ ld4r {v28.4s-v31.4s}, [x0]
+
+ // x4[0-3] += s1[0]
+ // x5[0-3] += s1[1]
+ // x6[0-3] += s1[2]
+ // x7[0-3] += s1[3]
+ add v4.4s, v4.4s, v20.4s
+ mov w6, v20.s[0]
+ mov w7, v21.s[0]
+ add v5.4s, v5.4s, v21.4s
+ mov w8, v22.s[0]
+ mov w9, v23.s[0]
+ add v6.4s, v6.4s, v22.4s
+ add a4, a4, w6
+ add a5, a5, w7
+ add v7.4s, v7.4s, v23.4s
+ add a6, a6, w8
+ add a7, a7, w9
+CPU_BE( rev a4, a4 )
+CPU_BE( rev a5, a5 )
+CPU_BE( rev a6, a6 )
+CPU_BE( rev a7, a7 )
+
+ // x8[0-3] += s2[0]
+ // x9[0-3] += s2[1]
+ // x10[0-3] += s2[2]
+ // x11[0-3] += s2[3]
+ add v8.4s, v8.4s, v24.4s
+ mov w6, v24.s[0]
+ mov w7, v25.s[0]
+ add v9.4s, v9.4s, v25.4s
+ mov w8, v26.s[0]
+ mov w9, v27.s[0]
+ add v10.4s, v10.4s, v26.4s
+ add a8, a8, w6
+ add a9, a9, w7
+ add v11.4s, v11.4s, v27.4s
+ add a10, a10, w8
+ add a11, a11, w9
+CPU_BE( rev a8, a8 )
+CPU_BE( rev a9, a9 )
+CPU_BE( rev a10, a10 )
+CPU_BE( rev a11, a11 )
+
+ // x12[0-3] += s3[0]
+ // x13[0-3] += s3[1]
+ // x14[0-3] += s3[2]
+ // x15[0-3] += s3[3]
+ add v12.4s, v12.4s, v28.4s
+ mov w6, v28.s[0]
+ mov w7, v29.s[0]
+ add v13.4s, v13.4s, v29.4s
+ mov w8, v30.s[0]
+ mov w9, v31.s[0]
+ add v14.4s, v14.4s, v30.4s
+ add a12, a12, w6
+ add a13, a13, w7
+ add v15.4s, v15.4s, v31.4s
+ add a14, a14, w8
+ add a15, a15, w9
+CPU_BE( rev a12, a12 )
+CPU_BE( rev a13, a13 )
+CPU_BE( rev a14, a14 )
+CPU_BE( rev a15, a15 )
+
+ // interleave 32-bit words in state n, n+1
+ ldp w6, w7, [x2], #64
+ zip1 v16.4s, v0.4s, v1.4s
+ ldp w8, w9, [x2, #-56]
+ eor a0, a0, w6
+ zip2 v17.4s, v0.4s, v1.4s
+ eor a1, a1, w7
+ zip1 v18.4s, v2.4s, v3.4s
+ eor a2, a2, w8
+ zip2 v19.4s, v2.4s, v3.4s
+ eor a3, a3, w9
+ ldp w6, w7, [x2, #-48]
+ zip1 v20.4s, v4.4s, v5.4s
+ ldp w8, w9, [x2, #-40]
+ eor a4, a4, w6
+ zip2 v21.4s, v4.4s, v5.4s
+ eor a5, a5, w7
+ zip1 v22.4s, v6.4s, v7.4s
+ eor a6, a6, w8
+ zip2 v23.4s, v6.4s, v7.4s
+ eor a7, a7, w9
+ ldp w6, w7, [x2, #-32]
+ zip1 v24.4s, v8.4s, v9.4s
+ ldp w8, w9, [x2, #-24]
+ eor a8, a8, w6
+ zip2 v25.4s, v8.4s, v9.4s
+ eor a9, a9, w7
+ zip1 v26.4s, v10.4s, v11.4s
+ eor a10, a10, w8
+ zip2 v27.4s, v10.4s, v11.4s
+ eor a11, a11, w9
+ ldp w6, w7, [x2, #-16]
+ zip1 v28.4s, v12.4s, v13.4s
+ ldp w8, w9, [x2, #-8]
+ eor a12, a12, w6
+ zip2 v29.4s, v12.4s, v13.4s
+ eor a13, a13, w7
+ zip1 v30.4s, v14.4s, v15.4s
+ eor a14, a14, w8
+ zip2 v31.4s, v14.4s, v15.4s
+ eor a15, a15, w9
+
+ add x3, x2, x4
+ sub x3, x3, #128 // start of last block
+
+ subs x5, x4, #128
+ csel x2, x2, x3, ge
+
+ // interleave 64-bit words in state n, n+2
+ zip1 v0.2d, v16.2d, v18.2d
+ zip2 v4.2d, v16.2d, v18.2d
+ stp a0, a1, [x1], #64
+ zip1 v8.2d, v17.2d, v19.2d
+ zip2 v12.2d, v17.2d, v19.2d
+ stp a2, a3, [x1, #-56]
+
+ subs x6, x4, #192
+ ld1 {v16.16b-v19.16b}, [x2], #64
+ csel x2, x2, x3, ge
+
+ zip1 v1.2d, v20.2d, v22.2d
+ zip2 v5.2d, v20.2d, v22.2d
+ stp a4, a5, [x1, #-48]
+ zip1 v9.2d, v21.2d, v23.2d
+ zip2 v13.2d, v21.2d, v23.2d
+ stp a6, a7, [x1, #-40]
+
+ subs x7, x4, #256
+ ld1 {v20.16b-v23.16b}, [x2], #64
+ csel x2, x2, x3, ge
+
+ zip1 v2.2d, v24.2d, v26.2d
+ zip2 v6.2d, v24.2d, v26.2d
+ stp a8, a9, [x1, #-32]
+ zip1 v10.2d, v25.2d, v27.2d
+ zip2 v14.2d, v25.2d, v27.2d
+ stp a10, a11, [x1, #-24]
+
+ subs x8, x4, #320
+ ld1 {v24.16b-v27.16b}, [x2], #64
+ csel x2, x2, x3, ge
+
+ zip1 v3.2d, v28.2d, v30.2d
+ zip2 v7.2d, v28.2d, v30.2d
+ stp a12, a13, [x1, #-16]
+ zip1 v11.2d, v29.2d, v31.2d
+ zip2 v15.2d, v29.2d, v31.2d
+ stp a14, a15, [x1, #-8]
+
+ tbnz x5, #63, .Lt128
+ ld1 {v28.16b-v31.16b}, [x2]
+
+ // xor with corresponding input, write to output
+ eor v16.16b, v16.16b, v0.16b
+ eor v17.16b, v17.16b, v1.16b
+ eor v18.16b, v18.16b, v2.16b
+ eor v19.16b, v19.16b, v3.16b
+
+ tbnz x6, #63, .Lt192
+
+ eor v20.16b, v20.16b, v4.16b
+ eor v21.16b, v21.16b, v5.16b
+ eor v22.16b, v22.16b, v6.16b
+ eor v23.16b, v23.16b, v7.16b
+
+ st1 {v16.16b-v19.16b}, [x1], #64
+ tbnz x7, #63, .Lt256
+
+ eor v24.16b, v24.16b, v8.16b
+ eor v25.16b, v25.16b, v9.16b
+ eor v26.16b, v26.16b, v10.16b
+ eor v27.16b, v27.16b, v11.16b
+
+ st1 {v20.16b-v23.16b}, [x1], #64
+ tbnz x8, #63, .Lt320
+
+ eor v28.16b, v28.16b, v12.16b
+ eor v29.16b, v29.16b, v13.16b
+ eor v30.16b, v30.16b, v14.16b
+ eor v31.16b, v31.16b, v15.16b
+
+ st1 {v24.16b-v27.16b}, [x1], #64
+ st1 {v28.16b-v31.16b}, [x1]
+
+.Lout: frame_pop
+ ret
+
+ // fewer than 192 bytes of in/output
+.Lt192: cbz x5, 1f // exactly 128 bytes?
+ ld1 {v28.16b-v31.16b}, [x10]
+ add x5, x5, x1
+ tbl v28.16b, {v4.16b-v7.16b}, v28.16b
+ tbl v29.16b, {v4.16b-v7.16b}, v29.16b
+ tbl v30.16b, {v4.16b-v7.16b}, v30.16b
+ tbl v31.16b, {v4.16b-v7.16b}, v31.16b
+
+0: eor v20.16b, v20.16b, v28.16b
+ eor v21.16b, v21.16b, v29.16b
+ eor v22.16b, v22.16b, v30.16b
+ eor v23.16b, v23.16b, v31.16b
+ st1 {v20.16b-v23.16b}, [x5] // overlapping stores
+1: st1 {v16.16b-v19.16b}, [x1]
+ b .Lout
+
+ // fewer than 128 bytes of in/output
+.Lt128: ld1 {v28.16b-v31.16b}, [x10]
+ add x5, x5, x1
+ sub x1, x1, #64
+ tbl v28.16b, {v0.16b-v3.16b}, v28.16b
+ tbl v29.16b, {v0.16b-v3.16b}, v29.16b
+ tbl v30.16b, {v0.16b-v3.16b}, v30.16b
+ tbl v31.16b, {v0.16b-v3.16b}, v31.16b
+ ld1 {v16.16b-v19.16b}, [x1] // reload first output block
+ b 0b
+
+ // fewer than 256 bytes of in/output
+.Lt256: cbz x6, 2f // exactly 192 bytes?
+ ld1 {v4.16b-v7.16b}, [x10]
+ add x6, x6, x1
+ tbl v0.16b, {v8.16b-v11.16b}, v4.16b
+ tbl v1.16b, {v8.16b-v11.16b}, v5.16b
+ tbl v2.16b, {v8.16b-v11.16b}, v6.16b
+ tbl v3.16b, {v8.16b-v11.16b}, v7.16b
+
+ eor v28.16b, v28.16b, v0.16b
+ eor v29.16b, v29.16b, v1.16b
+ eor v30.16b, v30.16b, v2.16b
+ eor v31.16b, v31.16b, v3.16b
+ st1 {v28.16b-v31.16b}, [x6] // overlapping stores
+2: st1 {v20.16b-v23.16b}, [x1]
+ b .Lout
+
+ // fewer than 320 bytes of in/output
+.Lt320: cbz x7, 3f // exactly 256 bytes?
+ ld1 {v4.16b-v7.16b}, [x10]
+ add x7, x7, x1
+ tbl v0.16b, {v12.16b-v15.16b}, v4.16b
+ tbl v1.16b, {v12.16b-v15.16b}, v5.16b
+ tbl v2.16b, {v12.16b-v15.16b}, v6.16b
+ tbl v3.16b, {v12.16b-v15.16b}, v7.16b
+
+ eor v28.16b, v28.16b, v0.16b
+ eor v29.16b, v29.16b, v1.16b
+ eor v30.16b, v30.16b, v2.16b
+ eor v31.16b, v31.16b, v3.16b
+ st1 {v28.16b-v31.16b}, [x7] // overlapping stores
+3: st1 {v24.16b-v27.16b}, [x1]
+ b .Lout
+SYM_FUNC_END(chacha_4block_xor_neon)
+
+ .section ".rodata", "a", %progbits
+ .align L1_CACHE_SHIFT
+.Lpermute:
+ .set .Li, 0
+ .rept 128
+ .byte (.Li - 64)
+ .set .Li, .Li + 1
+ .endr
+
+CTRINC: .word 1, 2, 3, 4
+ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
diff --git a/lib/crypto/arm64/chacha.h b/lib/crypto/arm64/chacha.h
new file mode 100644
index 000000000000..ca8c6a8b0578
--- /dev/null
+++ b/lib/crypto/arm64/chacha.h
@@ -0,0 +1,96 @@
+/*
+ * ChaCha and HChaCha functions (ARM64 optimized)
+ *
+ * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/internal/simd.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+#include <asm/hwcap.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
+ u8 *dst, const u8 *src, int nrounds);
+asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ int nrounds, int bytes);
+asmlinkage void hchacha_block_neon(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
+ int bytes, int nrounds)
+{
+ while (bytes > 0) {
+ int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
+
+ if (l <= CHACHA_BLOCK_SIZE) {
+ u8 buf[CHACHA_BLOCK_SIZE];
+
+ memcpy(buf, src, l);
+ chacha_block_xor_neon(state, buf, buf, nrounds);
+ memcpy(dst, buf, l);
+ state->x[12] += 1;
+ break;
+ }
+ chacha_4block_xor_neon(state, dst, src, nrounds, l);
+ bytes -= l;
+ src += l;
+ dst += l;
+ state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
+ }
+}
+
+static void hchacha_block_arch(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+ if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
+ hchacha_block_generic(state, out, nrounds);
+ } else {
+ scoped_ksimd()
+ hchacha_block_neon(state, out, nrounds);
+ }
+}
+
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int bytes, int nrounds)
+{
+ if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
+ !crypto_simd_usable())
+ return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ scoped_ksimd()
+ chacha_doneon(state, dst, src, todo, nrounds);
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
+}
+
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
+{
+ if (cpu_have_named_feature(ASIMD))
+ static_branch_enable(&have_neon);
+}
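For reference, chacha_crypt_arch() above is meant as a drop-in for the generic chacha_crypt(): the caller fills a struct chacha_state (constants, key, block counter, nonce) and the routine both generates the keystream and XORs it into the output. A hedged caller sketch, assuming the library's chacha_init() helper takes the key as eight 32-bit words and a 16-byte counter-plus-nonce block:

static void chacha20_encrypt_sketch(u8 *dst, const u8 *src, unsigned int len,
				    const u32 key[8], const u8 iv[16])
{
	struct chacha_state state;

	chacha_init(&state, key, iv);			/* constants | key | counter+nonce */
	chacha_crypt_arch(&state, dst, src, len, 20);	/* 20 rounds == ChaCha20 */
}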
diff --git a/lib/crypto/arm64/poly1305-armv8.pl b/lib/crypto/arm64/poly1305-armv8.pl
new file mode 100644
index 000000000000..f1930c6b55ce
--- /dev/null
+++ b/lib/crypto/arm64/poly1305-armv8.pl
@@ -0,0 +1,920 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
+# project.
+# ====================================================================
+#
+# This module implements Poly1305 hash for ARMv8.
+#
+# June 2015
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone.
+#
+# IALU/gcc-4.9 NEON
+#
+# Apple A7 1.86/+5% 0.72
+# Cortex-A53 2.69/+58% 1.47
+# Cortex-A57 2.70/+7% 1.14
+# Denver 1.64/+50% 1.18(*)
+# X-Gene 2.13/+68% 2.27
+# Mongoose 1.77/+75% 1.12
+# Kryo 2.70/+55% 1.13
+# ThunderX2 1.17/+95% 1.36
+#
+# (*)	estimate based on resource availability is less than 1.0,
+#	i.e. the measured result is worse than expected; presumably the
+#	binary translator is not almighty;
+
+$flavour=shift;
+$output=shift;
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
+my ($mac,$nonce)=($inp,$len);
+
+my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+.extern OPENSSL_armcap_P
+#else
+# define poly1305_init poly1305_block_init
+# define poly1305_blocks poly1305_blocks_arm64
+#endif
+
+.text
+
+// forward "declarations" are required for Apple
+.globl poly1305_blocks
+.globl poly1305_emit
+
+.globl poly1305_init
+.type poly1305_init,%function
+.align 5
+poly1305_init:
+ cmp $inp,xzr
+ stp xzr,xzr,[$ctx] // zero hash value
+ stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
+
+ csel x0,xzr,x0,eq
+ b.eq .Lno_key
+
+#ifndef __KERNEL__
+ adrp x17,OPENSSL_armcap_P
+ ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
+#endif
+
+ ldp $r0,$r1,[$inp] // load key
+ mov $s1,#0xfffffffc0fffffff
+ movk $s1,#0x0fff,lsl#48
+#ifdef __AARCH64EB__
+ rev $r0,$r0 // flip bytes
+ rev $r1,$r1
+#endif
+ and $r0,$r0,$s1 // &=0ffffffc0fffffff
+ and $s1,$s1,#-4
+ and $r1,$r1,$s1 // &=0ffffffc0ffffffc
+ mov w#$s1,#-1
+ stp $r0,$r1,[$ctx,#32] // save key value
+ str w#$s1,[$ctx,#48] // impossible key power value
+
+#ifndef __KERNEL__
+ tst w17,#ARMV7_NEON
+
+ adr $d0,.Lpoly1305_blocks
+ adr $r0,.Lpoly1305_blocks_neon
+ adr $d1,.Lpoly1305_emit
+
+ csel $d0,$d0,$r0,eq
+
+# ifdef __ILP32__
+ stp w#$d0,w#$d1,[$len]
+# else
+ stp $d0,$d1,[$len]
+# endif
+#endif
+ mov x0,#1
+.Lno_key:
+ ret
+.size poly1305_init,.-poly1305_init
+
+.type poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+.Lpoly1305_blocks:
+ ands $len,$len,#-16
+ b.eq .Lno_data
+
+ ldp $h0,$h1,[$ctx] // load hash value
+ ldp $h2,x17,[$ctx,#16] // [along with is_base2_26]
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+#ifdef __AARCH64EB__
+ lsr $d0,$h0,#32
+ mov w#$d1,w#$h0
+ lsr $d2,$h1,#32
+ mov w15,w#$h1
+ lsr x16,$h2,#32
+#else
+ mov w#$d0,w#$h0
+ lsr $d1,$h0,#32
+ mov w#$d2,w#$h1
+ lsr x15,$h1,#32
+ mov w16,w#$h2
+#endif
+
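+	// Repack the five 26-bit limbs (held in 32-bit words w0..w4) of the
+	// base 2^26 hash value, h = w0 + w1*2^26 + w2*2^52 + w3*2^78 + w4*2^104,
+	// into three 64-bit limbs holding the same value in base 2^64.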
+ add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
+ lsr $d1,$d2,#12
+ adds $d0,$d0,$d2,lsl#52
+ add $d1,$d1,x15,lsl#14
+ adc $d1,$d1,xzr
+ lsr $d2,x16,#24
+ adds $d1,$d1,x16,lsl#40
+ adc $d2,$d2,xzr
+
+ cmp x17,#0 // is_base2_26?
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+ csel $h0,$h0,$d0,eq // choose between radixes
+ csel $h1,$h1,$d1,eq
+ csel $h2,$h2,$d2,eq
+
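+	// One block per iteration:  h = (h + m + padbit*2^128) * r  mod  2^130-5,
+	// with h = h2:h1:h0 and r = r1:r0 (r1 is a multiple of 4 after clamping).
+	// Partial products landing at 2^128 and above are folded back using
+	// 2^130 == 5 (mod p), which is why the h1*r1 and h2*r1 products appear
+	// below as h1*s1 and h2*s1, with s1 = r1 + (r1 >> 2) = 5*r1/4.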
+.Loop:
+ ldp $t0,$t1,[$inp],#16 // load input
+ sub $len,$len,#16
+#ifdef __AARCH64EB__
+ rev $t0,$t0
+ rev $t1,$t1
+#endif
+ adds $h0,$h0,$t0 // accumulate input
+ adcs $h1,$h1,$t1
+
+ mul $d0,$h0,$r0 // h0*r0
+ adc $h2,$h2,$padbit
+ umulh $d1,$h0,$r0
+
+ mul $t0,$h1,$s1 // h1*5*r1
+ umulh $t1,$h1,$s1
+
+ adds $d0,$d0,$t0
+ mul $t0,$h0,$r1 // h0*r1
+ adc $d1,$d1,$t1
+ umulh $d2,$h0,$r1
+
+ adds $d1,$d1,$t0
+ mul $t0,$h1,$r0 // h1*r0
+ adc $d2,$d2,xzr
+ umulh $t1,$h1,$r0
+
+ adds $d1,$d1,$t0
+ mul $t0,$h2,$s1 // h2*5*r1
+ adc $d2,$d2,$t1
+ mul $t1,$h2,$r0 // h2*r0
+
+ adds $d1,$d1,$t0
+ adc $d2,$d2,$t1
+
+ and $t0,$d2,#-4 // final reduction
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$d0,$t0
+ adcs $h1,$d1,xzr
+ adc $h2,$h2,xzr
+
+ cbnz $len,.Loop
+
+ stp $h0,$h1,[$ctx] // store hash value
+ stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26]
+
+.Lno_data:
+ ret
+.size poly1305_blocks,.-poly1305_blocks
+
+.type poly1305_emit,%function
+.align 5
+poly1305_emit:
+.Lpoly1305_emit:
+ ldp $h0,$h1,[$ctx] // load hash base 2^64
+ ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26]
+ ldp $t0,$t1,[$nonce] // load nonce
+
+#ifdef __AARCH64EB__
+ lsr $d0,$h0,#32
+ mov w#$d1,w#$h0
+ lsr $d2,$h1,#32
+ mov w15,w#$h1
+ lsr x16,$h2,#32
+#else
+ mov w#$d0,w#$h0
+ lsr $d1,$h0,#32
+ mov w#$d2,w#$h1
+ lsr x15,$h1,#32
+ mov w16,w#$h2
+#endif
+
+ add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
+ lsr $d1,$d2,#12
+ adds $d0,$d0,$d2,lsl#52
+ add $d1,$d1,x15,lsl#14
+ adc $d1,$d1,xzr
+ lsr $d2,x16,#24
+ adds $d1,$d1,x16,lsl#40
+ adc $d2,$d2,xzr
+
+ cmp $r0,#0 // is_base2_26?
+ csel $h0,$h0,$d0,eq // choose between radixes
+ csel $h1,$h1,$d1,eq
+ csel $h2,$h2,$d2,eq
+
+ adds $d0,$h0,#5 // compare to modulus
+ adcs $d1,$h1,xzr
+ adc $d2,$h2,xzr
+
+ tst $d2,#-4 // see if it's carried/borrowed
+
+ csel $h0,$h0,$d0,eq
+ csel $h1,$h1,$d1,eq
+
+#ifdef __AARCH64EB__
+ ror $t0,$t0,#32 // flip nonce words
+ ror $t1,$t1,#32
+#endif
+ adds $h0,$h0,$t0 // accumulate nonce
+ adc $h1,$h1,$t1
+#ifdef __AARCH64EB__
+ rev $h0,$h0 // flip output bytes
+ rev $h1,$h1
+#endif
+ stp $h0,$h1,[$mac] // write result
+
+ ret
+.size poly1305_emit,.-poly1305_emit
+___
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
+my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
+my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
+my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
+my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
+my ($T0,$T1,$MASK) = map("v$_",(29..31));
+
+my ($in2,$zeros)=("x16","x17");
+my $is_base2_26 = $zeros; # borrow
+
+$code.=<<___;
+.type poly1305_mult,%function
+.align 5
+poly1305_mult:
+ mul $d0,$h0,$r0 // h0*r0
+ umulh $d1,$h0,$r0
+
+ mul $t0,$h1,$s1 // h1*5*r1
+ umulh $t1,$h1,$s1
+
+ adds $d0,$d0,$t0
+ mul $t0,$h0,$r1 // h0*r1
+ adc $d1,$d1,$t1
+ umulh $d2,$h0,$r1
+
+ adds $d1,$d1,$t0
+ mul $t0,$h1,$r0 // h1*r0
+ adc $d2,$d2,xzr
+ umulh $t1,$h1,$r0
+
+ adds $d1,$d1,$t0
+ mul $t0,$h2,$s1 // h2*5*r1
+ adc $d2,$d2,$t1
+ mul $t1,$h2,$r0 // h2*r0
+
+ adds $d1,$d1,$t0
+ adc $d2,$d2,$t1
+
+ and $t0,$d2,#-4 // final reduction
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$d0,$t0
+ adcs $h1,$d1,xzr
+ adc $h2,$h2,xzr
+
+ ret
+.size poly1305_mult,.-poly1305_mult
+
+.type poly1305_splat,%function
+.align 4
+poly1305_splat:
+ and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x13,$h0,#26,#26
+ extr x14,$h1,$h0,#52
+ and x14,x14,#0x03ffffff
+ ubfx x15,$h1,#14,#26
+ extr x16,$h2,$h1,#40
+
+ str w12,[$ctx,#16*0] // r0
+ add w12,w13,w13,lsl#2 // r1*5
+ str w13,[$ctx,#16*1] // r1
+ add w13,w14,w14,lsl#2 // r2*5
+ str w12,[$ctx,#16*2] // s1
+ str w14,[$ctx,#16*3] // r2
+ add w14,w15,w15,lsl#2 // r3*5
+ str w13,[$ctx,#16*4] // s2
+ str w15,[$ctx,#16*5] // r3
+ add w15,w16,w16,lsl#2 // r4*5
+ str w14,[$ctx,#16*6] // s3
+ str w16,[$ctx,#16*7] // r4
+ str w15,[$ctx,#16*8] // s4
+
+ ret
+.size poly1305_splat,.-poly1305_splat
+
+#ifdef __KERNEL__
+.globl poly1305_blocks_neon
+#endif
+.type poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
+ ldr $is_base2_26,[$ctx,#24]
+ cmp $len,#128
+ b.lo .Lpoly1305_blocks
+
+ .inst 0xd503233f // paciasp
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+
+ stp d8,d9,[sp,#16] // meet ABI requirements
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+
+ cbz $is_base2_26,.Lbase2_64_neon
+
+ ldp w10,w11,[$ctx] // load hash value base 2^26
+ ldp w12,w13,[$ctx,#8]
+ ldr w14,[$ctx,#16]
+
+ tst $len,#31
+ b.eq .Leven_neon
+
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+ add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
+ lsr $h1,x12,#12
+ adds $h0,$h0,x12,lsl#52
+ add $h1,$h1,x13,lsl#14
+ adc $h1,$h1,xzr
+ lsr $h2,x14,#24
+ adds $h1,$h1,x14,lsl#40
+ adc $d2,$h2,xzr // can be partially reduced...
+
+ ldp $d0,$d1,[$inp],#16 // load input
+ sub $len,$len,#16
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+
+#ifdef __AARCH64EB__
+ rev $d0,$d0
+ rev $d1,$d1
+#endif
+ adds $h0,$h0,$d0 // accumulate input
+ adcs $h1,$h1,$d1
+ adc $h2,$h2,$padbit
+
+ bl poly1305_mult
+
+ and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,$h0,#26,#26
+ extr x12,$h1,$h0,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,$h1,#14,#26
+ extr x14,$h2,$h1,#40
+
+ b .Leven_neon
+
+.align 4
+.Lbase2_64_neon:
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+ ldp $h0,$h1,[$ctx] // load hash value base 2^64
+ ldr $h2,[$ctx,#16]
+
+ tst $len,#31
+ b.eq .Linit_neon
+
+ ldp $d0,$d1,[$inp],#16 // load input
+ sub $len,$len,#16
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+#ifdef __AARCH64EB__
+ rev $d0,$d0
+ rev $d1,$d1
+#endif
+ adds $h0,$h0,$d0 // accumulate input
+ adcs $h1,$h1,$d1
+ adc $h2,$h2,$padbit
+
+ bl poly1305_mult
+
+.Linit_neon:
+ ldr w17,[$ctx,#48] // first table element
+ and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,$h0,#26,#26
+ extr x12,$h1,$h0,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,$h1,#14,#26
+ extr x14,$h2,$h1,#40
+
+ cmp w17,#-1 // is value impossible?
+ b.ne .Leven_neon
+
+ fmov ${H0},x10
+ fmov ${H1},x11
+ fmov ${H2},x12
+ fmov ${H3},x13
+ fmov ${H4},x14
+
+ ////////////////////////////////// initialize r^n table
+ mov $h0,$r0 // r^1
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+ mov $h1,$r1
+ mov $h2,xzr
+ add $ctx,$ctx,#48+12
+ bl poly1305_splat
+
+ bl poly1305_mult // r^2
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^3
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^4
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+ sub $ctx,$ctx,#48 // restore original $ctx
+ b .Ldo_neon
+
+.align 4
+.Leven_neon:
+ fmov ${H0},x10
+ fmov ${H1},x11
+ fmov ${H2},x12
+ fmov ${H3},x13
+ fmov ${H4},x14
+
+.Ldo_neon:
+ ldp x8,x12,[$inp,#32] // inp[2:3]
+ subs $len,$len,#64
+ ldp x9,x13,[$inp,#48]
+ add $in2,$inp,#96
+ adrp $zeros,.Lzeros
+ add $zeros,$zeros,#:lo12:.Lzeros
+
+ lsl $padbit,$padbit,#24
+ add x15,$ctx,#48
+
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov $IN23_0,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,$padbit,x12,lsr#40
+ add x13,$padbit,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov $IN23_1,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ fmov $IN23_2,x8
+ fmov $IN23_3,x10
+ fmov $IN23_4,x12
+
+ ldp x8,x12,[$inp],#16 // inp[0:1]
+ ldp x9,x13,[$inp],#48
+
+ ld1 {$R0,$R1,$S1,$R2},[x15],#64
+ ld1 {$S2,$R3,$S3,$R4},[x15],#64
+ ld1 {$S4},[x15]
+
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov $IN01_0,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,$padbit,x12,lsr#40
+ add x13,$padbit,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov $IN01_1,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ movi $MASK.2d,#-1
+ fmov $IN01_2,x8
+ fmov $IN01_3,x10
+ fmov $IN01_4,x12
+ ushr $MASK.2d,$MASK.2d,#38
+
+ b.ls .Lskip_loop
+
+.align 4
+.Loop_neon:
+ ////////////////////////////////////////////////////////////////
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ // \___________________/
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ // \___________________/ \____________________/
+ //
+ // Note that we start with inp[2:3]*r^2. This is because it
+	// doesn't depend on the reduction in the previous iteration.
+ ////////////////////////////////////////////////////////////////
+ // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
+ // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
+ // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
+ // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
+ // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
+
+ subs $len,$len,#64
+ umull $ACC4,$IN23_0,${R4}[2]
+ csel $in2,$zeros,$in2,lo
+ umull $ACC3,$IN23_0,${R3}[2]
+ umull $ACC2,$IN23_0,${R2}[2]
+ ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
+ umull $ACC1,$IN23_0,${R1}[2]
+ ldp x9,x13,[$in2],#48
+ umull $ACC0,$IN23_0,${R0}[2]
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ umlal $ACC4,$IN23_1,${R3}[2]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal $ACC3,$IN23_1,${R2}[2]
+ and x5,x9,#0x03ffffff
+ umlal $ACC2,$IN23_1,${R1}[2]
+ ubfx x6,x8,#26,#26
+ umlal $ACC1,$IN23_1,${R0}[2]
+ ubfx x7,x9,#26,#26
+ umlal $ACC0,$IN23_1,${S4}[2]
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+
+ umlal $ACC4,$IN23_2,${R2}[2]
+ extr x8,x12,x8,#52
+ umlal $ACC3,$IN23_2,${R1}[2]
+ extr x9,x13,x9,#52
+ umlal $ACC2,$IN23_2,${R0}[2]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal $ACC1,$IN23_2,${S4}[2]
+ fmov $IN23_0,x4
+ umlal $ACC0,$IN23_2,${S3}[2]
+ and x8,x8,#0x03ffffff
+
+ umlal $ACC4,$IN23_3,${R1}[2]
+ and x9,x9,#0x03ffffff
+ umlal $ACC3,$IN23_3,${R0}[2]
+ ubfx x10,x12,#14,#26
+ umlal $ACC2,$IN23_3,${S4}[2]
+ ubfx x11,x13,#14,#26
+ umlal $ACC1,$IN23_3,${S3}[2]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal $ACC0,$IN23_3,${S2}[2]
+ fmov $IN23_1,x6
+
+ add $IN01_2,$IN01_2,$H2
+ add x12,$padbit,x12,lsr#40
+ umlal $ACC4,$IN23_4,${R0}[2]
+ add x13,$padbit,x13,lsr#40
+ umlal $ACC3,$IN23_4,${S4}[2]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal $ACC2,$IN23_4,${S3}[2]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal $ACC1,$IN23_4,${S2}[2]
+ fmov $IN23_2,x8
+ umlal $ACC0,$IN23_4,${S1}[2]
+ fmov $IN23_3,x10
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4 and accumulate
+
+ add $IN01_0,$IN01_0,$H0
+ fmov $IN23_4,x12
+ umlal $ACC3,$IN01_2,${R1}[0]
+ ldp x8,x12,[$inp],#16 // inp[0:1]
+ umlal $ACC0,$IN01_2,${S3}[0]
+ ldp x9,x13,[$inp],#48
+ umlal $ACC4,$IN01_2,${R2}[0]
+ umlal $ACC1,$IN01_2,${S4}[0]
+ umlal $ACC2,$IN01_2,${R0}[0]
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ add $IN01_1,$IN01_1,$H1
+ umlal $ACC3,$IN01_0,${R3}[0]
+ umlal $ACC4,$IN01_0,${R4}[0]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal $ACC2,$IN01_0,${R2}[0]
+ and x5,x9,#0x03ffffff
+ umlal $ACC0,$IN01_0,${R0}[0]
+ ubfx x6,x8,#26,#26
+ umlal $ACC1,$IN01_0,${R1}[0]
+ ubfx x7,x9,#26,#26
+
+ add $IN01_3,$IN01_3,$H3
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ umlal $ACC3,$IN01_1,${R2}[0]
+ extr x8,x12,x8,#52
+ umlal $ACC4,$IN01_1,${R3}[0]
+ extr x9,x13,x9,#52
+ umlal $ACC0,$IN01_1,${S4}[0]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal $ACC2,$IN01_1,${R1}[0]
+ fmov $IN01_0,x4
+ umlal $ACC1,$IN01_1,${R0}[0]
+ and x8,x8,#0x03ffffff
+
+ add $IN01_4,$IN01_4,$H4
+ and x9,x9,#0x03ffffff
+ umlal $ACC3,$IN01_3,${R0}[0]
+ ubfx x10,x12,#14,#26
+ umlal $ACC0,$IN01_3,${S2}[0]
+ ubfx x11,x13,#14,#26
+ umlal $ACC4,$IN01_3,${R1}[0]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal $ACC1,$IN01_3,${S3}[0]
+ fmov $IN01_1,x6
+ umlal $ACC2,$IN01_3,${S4}[0]
+ add x12,$padbit,x12,lsr#40
+
+ umlal $ACC3,$IN01_4,${S4}[0]
+ add x13,$padbit,x13,lsr#40
+ umlal $ACC0,$IN01_4,${S1}[0]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal $ACC4,$IN01_4,${R0}[0]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal $ACC1,$IN01_4,${S2}[0]
+ fmov $IN01_2,x8
+ umlal $ACC2,$IN01_4,${S3}[0]
+ fmov $IN01_3,x10
+ fmov $IN01_4,x12
+
+ /////////////////////////////////////////////////////////////////
+ // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ // and P. Schwabe
+ //
+ // [see discussion in poly1305-armv4 module]
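+	//
+	// Only one carry is propagated out of each limb per step, and the
+	// carry out of h4 wraps around into h0 multiplied by 5 (added once,
+	// then again shifted left by 2), since 2^130 == 5 (mod 2^130 - 5).
+	// The limbs therefore stay only partially reduced between iterations.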
+
+ ushr $T0.2d,$ACC3,#26
+ xtn $H3,$ACC3
+ ushr $T1.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+ add $ACC4,$ACC4,$T0.2d // h3 -> h4
+ bic $H3,#0xfc,lsl#24 // &=0x03ffffff
+ add $ACC1,$ACC1,$T1.2d // h0 -> h1
+
+ ushr $T0.2d,$ACC4,#26
+ xtn $H4,$ACC4
+ ushr $T1.2d,$ACC1,#26
+ xtn $H1,$ACC1
+ bic $H4,#0xfc,lsl#24
+ add $ACC2,$ACC2,$T1.2d // h1 -> h2
+
+ add $ACC0,$ACC0,$T0.2d
+ shl $T0.2d,$T0.2d,#2
+ shrn $T1.2s,$ACC2,#26
+ xtn $H2,$ACC2
+ add $ACC0,$ACC0,$T0.2d // h4 -> h0
+ bic $H1,#0xfc,lsl#24
+ add $H3,$H3,$T1.2s // h2 -> h3
+ bic $H2,#0xfc,lsl#24
+
+ shrn $T0.2s,$ACC0,#26
+ xtn $H0,$ACC0
+ ushr $T1.2s,$H3,#26
+ bic $H3,#0xfc,lsl#24
+ bic $H0,#0xfc,lsl#24
+ add $H1,$H1,$T0.2s // h0 -> h1
+ add $H4,$H4,$T1.2s // h3 -> h4
+
+ b.hi .Loop_neon
+
+.Lskip_loop:
+ dup $IN23_2,${IN23_2}[0]
+ add $IN01_2,$IN01_2,$H2
+
+ ////////////////////////////////////////////////////////////////
+ // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ adds $len,$len,#32
+ b.ne .Long_tail
+
+ dup $IN23_2,${IN01_2}[0]
+ add $IN23_0,$IN01_0,$H0
+ add $IN23_3,$IN01_3,$H3
+ add $IN23_1,$IN01_1,$H1
+ add $IN23_4,$IN01_4,$H4
+
+.Long_tail:
+ dup $IN23_0,${IN23_0}[0]
+ umull2 $ACC0,$IN23_2,${S3}
+ umull2 $ACC3,$IN23_2,${R1}
+ umull2 $ACC4,$IN23_2,${R2}
+ umull2 $ACC2,$IN23_2,${R0}
+ umull2 $ACC1,$IN23_2,${S4}
+
+ dup $IN23_1,${IN23_1}[0]
+ umlal2 $ACC0,$IN23_0,${R0}
+ umlal2 $ACC2,$IN23_0,${R2}
+ umlal2 $ACC3,$IN23_0,${R3}
+ umlal2 $ACC4,$IN23_0,${R4}
+ umlal2 $ACC1,$IN23_0,${R1}
+
+ dup $IN23_3,${IN23_3}[0]
+ umlal2 $ACC0,$IN23_1,${S4}
+ umlal2 $ACC3,$IN23_1,${R2}
+ umlal2 $ACC2,$IN23_1,${R1}
+ umlal2 $ACC4,$IN23_1,${R3}
+ umlal2 $ACC1,$IN23_1,${R0}
+
+ dup $IN23_4,${IN23_4}[0]
+ umlal2 $ACC3,$IN23_3,${R0}
+ umlal2 $ACC4,$IN23_3,${R1}
+ umlal2 $ACC0,$IN23_3,${S2}
+ umlal2 $ACC1,$IN23_3,${S3}
+ umlal2 $ACC2,$IN23_3,${S4}
+
+ umlal2 $ACC3,$IN23_4,${S4}
+ umlal2 $ACC0,$IN23_4,${S1}
+ umlal2 $ACC4,$IN23_4,${R0}
+ umlal2 $ACC1,$IN23_4,${S2}
+ umlal2 $ACC2,$IN23_4,${S3}
+
+ b.eq .Lshort_tail
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ add $IN01_0,$IN01_0,$H0
+ umlal $ACC3,$IN01_2,${R1}
+ umlal $ACC0,$IN01_2,${S3}
+ umlal $ACC4,$IN01_2,${R2}
+ umlal $ACC1,$IN01_2,${S4}
+ umlal $ACC2,$IN01_2,${R0}
+
+ add $IN01_1,$IN01_1,$H1
+ umlal $ACC3,$IN01_0,${R3}
+ umlal $ACC0,$IN01_0,${R0}
+ umlal $ACC4,$IN01_0,${R4}
+ umlal $ACC1,$IN01_0,${R1}
+ umlal $ACC2,$IN01_0,${R2}
+
+ add $IN01_3,$IN01_3,$H3
+ umlal $ACC3,$IN01_1,${R2}
+ umlal $ACC0,$IN01_1,${S4}
+ umlal $ACC4,$IN01_1,${R3}
+ umlal $ACC1,$IN01_1,${R0}
+ umlal $ACC2,$IN01_1,${R1}
+
+ add $IN01_4,$IN01_4,$H4
+ umlal $ACC3,$IN01_3,${R0}
+ umlal $ACC0,$IN01_3,${S2}
+ umlal $ACC4,$IN01_3,${R1}
+ umlal $ACC1,$IN01_3,${S3}
+ umlal $ACC2,$IN01_3,${S4}
+
+ umlal $ACC3,$IN01_4,${S4}
+ umlal $ACC0,$IN01_4,${S1}
+ umlal $ACC4,$IN01_4,${R0}
+ umlal $ACC1,$IN01_4,${S2}
+ umlal $ACC2,$IN01_4,${S3}
+
+.Lshort_tail:
+ ////////////////////////////////////////////////////////////////
+ // horizontal add
+
+ addp $ACC3,$ACC3,$ACC3
+ ldp d8,d9,[sp,#16] // meet ABI requirements
+ addp $ACC0,$ACC0,$ACC0
+ ldp d10,d11,[sp,#32]
+ addp $ACC4,$ACC4,$ACC4
+ ldp d12,d13,[sp,#48]
+ addp $ACC1,$ACC1,$ACC1
+ ldp d14,d15,[sp,#64]
+ addp $ACC2,$ACC2,$ACC2
+ ldr x30,[sp,#8]
+
+ ////////////////////////////////////////////////////////////////
+ // lazy reduction, but without narrowing
+
+ ushr $T0.2d,$ACC3,#26
+ and $ACC3,$ACC3,$MASK.2d
+ ushr $T1.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+
+ add $ACC4,$ACC4,$T0.2d // h3 -> h4
+ add $ACC1,$ACC1,$T1.2d // h0 -> h1
+
+ ushr $T0.2d,$ACC4,#26
+ and $ACC4,$ACC4,$MASK.2d
+ ushr $T1.2d,$ACC1,#26
+ and $ACC1,$ACC1,$MASK.2d
+ add $ACC2,$ACC2,$T1.2d // h1 -> h2
+
+ add $ACC0,$ACC0,$T0.2d
+ shl $T0.2d,$T0.2d,#2
+ ushr $T1.2d,$ACC2,#26
+ and $ACC2,$ACC2,$MASK.2d
+ add $ACC0,$ACC0,$T0.2d // h4 -> h0
+ add $ACC3,$ACC3,$T1.2d // h2 -> h3
+
+ ushr $T0.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+ ushr $T1.2d,$ACC3,#26
+ and $ACC3,$ACC3,$MASK.2d
+ add $ACC1,$ACC1,$T0.2d // h0 -> h1
+ add $ACC4,$ACC4,$T1.2d // h3 -> h4
+
+ ////////////////////////////////////////////////////////////////
+ // write the result, can be partially reduced
+
+ st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
+ mov x4,#1
+ st1 {$ACC4}[0],[$ctx]
+ str x4,[$ctx,#8] // set is_base2_26
+
+ ldr x29,[sp],#80
+ .inst 0xd50323bf // autiasp
+ ret
+.size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.pushsection .rodata
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0
+.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
+.popsection
+
+.align 2
+#if !defined(__KERNEL__) && !defined(_WIN64)
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+#endif
+___
+
+foreach (split("\n",$code)) {
+ s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
+ s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
+ (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
+ (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
+ (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
+ (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
+ (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
+
+ s/\.[124]([sd])\[/.$1\[/;
+ s/w#x([0-9]+)/w$1/g;
+
+ print $_,"\n";
+}
+close STDOUT;
diff --git a/lib/crypto/arm64/poly1305.h b/lib/crypto/arm64/poly1305.h
new file mode 100644
index 000000000000..b77669767cd6
--- /dev/null
+++ b/lib/crypto/arm64/poly1305.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/hwcap.h>
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+asmlinkage void poly1305_block_init(struct poly1305_block_state *state,
+ const u8 raw_key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks_arm64(struct poly1305_block_state *state,
+ const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
+ const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit(const struct poly1305_state *state,
+ u8 digest[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
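+/*
+ * Use the NEON implementation only when the SIMD unit is usable; otherwise
+ * fall back to the scalar AArch64 code.  The NEON path is chunked at 4 KiB
+ * per scoped_ksimd() section so that rescheduling remains possible on long
+ * inputs.
+ */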
+static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src,
+ unsigned int len, u32 padbit)
+{
+ if (static_branch_likely(&have_neon) && likely(may_use_simd())) {
+ do {
+ unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+ scoped_ksimd()
+ poly1305_blocks_neon(state, src, todo, padbit);
+
+ len -= todo;
+ src += todo;
+ } while (len);
+	} else {
+		poly1305_blocks_arm64(state, src, len, padbit);
+	}
+}
+
+#define poly1305_mod_init_arch poly1305_mod_init_arch
+static void poly1305_mod_init_arch(void)
+{
+ if (cpu_have_named_feature(ASIMD))
+ static_branch_enable(&have_neon);
+}
diff --git a/lib/crypto/arm64/polyval-ce-core.S b/lib/crypto/arm64/polyval-ce-core.S
new file mode 100644
index 000000000000..7c731a044d02
--- /dev/null
+++ b/lib/crypto/arm64/polyval-ce-core.S
@@ -0,0 +1,359 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Implementation of POLYVAL using ARMv8 Crypto Extensions.
+ *
+ * Copyright 2021 Google LLC
+ */
+/*
+ * This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions.
+ * It works on 8 blocks at a time by precomputing the first 8 key powers h^8,
+ * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split
+ * finite field multiplication into two steps.
+ *
+ * In the first step, we consider h^i, m_i as normal polynomials of degree less
+ * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
+ * is simply polynomial multiplication.
+ *
+ * In the second step, we compute the reduction of p(x) modulo the finite field
+ * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * This two-step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
+ * multiplication is finite field multiplication. The advantage is that the
+ * two-step process only requires 1 finite field reduction for every 8
+ * polynomial multiplications. Further parallelism is gained by interleaving the
+ * multiplications and polynomial reductions.
+ */
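+
+/*
+ * Per 8-block stride, the accumulator S is thus updated as
+ *   S' = REDUCE(h^8*(S + m_0) + h^7*m_1 + ... + h^1*m_7)
+ * where REDUCE is the single per-stride reduction described above (performed
+ * lazily, interleaved with the next stride's multiplications).
+ */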
+
+#include <linux/linkage.h>
+#define STRIDE_BLOCKS 8
+
+ACCUMULATOR .req x0
+KEY_POWERS .req x1
+MSG .req x2
+BLOCKS_LEFT .req x3
+KEY_START .req x10
+EXTRA_BYTES .req x11
+TMP .req x13
+
+M0 .req v0
+M1 .req v1
+M2 .req v2
+M3 .req v3
+M4 .req v4
+M5 .req v5
+M6 .req v6
+M7 .req v7
+KEY8 .req v8
+KEY7 .req v9
+KEY6 .req v10
+KEY5 .req v11
+KEY4 .req v12
+KEY3 .req v13
+KEY2 .req v14
+KEY1 .req v15
+PL .req v16
+PH .req v17
+TMP_V .req v18
+LO .req v20
+MI .req v21
+HI .req v22
+SUM .req v23
+GSTAR .req v24
+
+ .text
+
+ .arch armv8-a+crypto
+ .align 4
+
+.Lgstar:
+ .quad 0xc200000000000000, 0xc200000000000000
+
+/*
+ * Computes the product of two 128-bit polynomials in X and Y and XORs the
+ * components of the 256-bit product into LO, MI, HI.
+ *
+ * Given:
+ * X = [X_1 : X_0]
+ * Y = [Y_1 : Y_0]
+ *
+ * We compute:
+ * LO += X_0 * Y_0
+ * MI += (X_0 + X_1) * (Y_0 + Y_1)
+ * HI += X_1 * Y_1
+ *
+ * Later, the 256-bit result can be extracted as:
+ * [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0]
+ * This step is done when computing the polynomial reduction for efficiency
+ * reasons.
+ *
+ * Karatsuba multiplication is used instead of Schoolbook multiplication because
+ * it was found to be slightly faster on ARM64 CPUs.
+ *
+ */
+.macro karatsuba1 X Y
+ X .req \X
+ Y .req \Y
+ ext v25.16b, X.16b, X.16b, #8
+ ext v26.16b, Y.16b, Y.16b, #8
+ eor v25.16b, v25.16b, X.16b
+ eor v26.16b, v26.16b, Y.16b
+ pmull2 v28.1q, X.2d, Y.2d
+ pmull v29.1q, X.1d, Y.1d
+ pmull v27.1q, v25.1d, v26.1d
+ eor HI.16b, HI.16b, v28.16b
+ eor LO.16b, LO.16b, v29.16b
+ eor MI.16b, MI.16b, v27.16b
+ .unreq X
+ .unreq Y
+.endm
+
+/*
+ * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into
+ * them.
+ */
+.macro karatsuba1_store X Y
+ X .req \X
+ Y .req \Y
+ ext v25.16b, X.16b, X.16b, #8
+ ext v26.16b, Y.16b, Y.16b, #8
+ eor v25.16b, v25.16b, X.16b
+ eor v26.16b, v26.16b, Y.16b
+ pmull2 HI.1q, X.2d, Y.2d
+ pmull LO.1q, X.1d, Y.1d
+ pmull MI.1q, v25.1d, v26.1d
+ .unreq X
+ .unreq Y
+.endm
+
+/*
+ * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
+ * the result in PL, PH.
+ * [PH : PL] =
+ * [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
+ */
+.macro karatsuba2
+ // v4 = [HI_1 + MI_1 : HI_0 + MI_0]
+ eor v4.16b, HI.16b, MI.16b
+ // v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
+ eor v4.16b, v4.16b, LO.16b
+ // v5 = [HI_0 : LO_1]
+ ext v5.16b, LO.16b, HI.16b, #8
+ // v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
+ eor v4.16b, v4.16b, v5.16b
+ // HI = [HI_0 : HI_1]
+ ext HI.16b, HI.16b, HI.16b, #8
+ // LO = [LO_0 : LO_1]
+ ext LO.16b, LO.16b, LO.16b, #8
+ // PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
+ ext PH.16b, v4.16b, HI.16b, #8
+ // PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
+ ext PL.16b, LO.16b, v4.16b, #8
+.endm
+
+/*
+ * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
+ *
+ * This macro computes p(x) mod g(x) where p(x) is in Montgomery form and g(x) =
+ * x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
+ * product of two 128-bit polynomials in Montgomery form. We need to reduce it
+ * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
+ * of x^128, this product has two extra factors of x^128. To get it back into
+ * Montgomery form, we need to remove one of these factors by dividing by x^128.
+ *
+ * To accomplish both of these goals, we add multiples of g(x) that cancel out
+ * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
+ * bits are zero, the polynomial division by x^128 can be done by right
+ * shifting.
+ *
+ * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
+ * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
+ * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
+ * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
+ * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
+ * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
+ *
+ * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
+ * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
+ * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
+ * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
+ * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
+ *
+ * So our final computation is:
+ * T = T_1 : T_0 = g*(x) * P_0
+ * V = V_1 : V_0 = g*(x) * (P_1 + T_0)
+ * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
+ *
+ * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
+ * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
+ * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
+ */
+.macro montgomery_reduction dest
+ DEST .req \dest
+ // TMP_V = T_1 : T_0 = P_0 * g*(x)
+ pmull TMP_V.1q, PL.1d, GSTAR.1d
+ // TMP_V = T_0 : T_1
+ ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
+ // TMP_V = P_1 + T_0 : P_0 + T_1
+ eor TMP_V.16b, PL.16b, TMP_V.16b
+ // PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
+ eor PH.16b, PH.16b, TMP_V.16b
+ // TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
+ pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
+ eor DEST.16b, PH.16b, TMP_V.16b
+ .unreq DEST
+.endm
+
+/*
+ * Compute Polyval on 8 blocks.
+ *
+ * If reduce is set, also computes the montgomery reduction of the
+ * previous full_stride call and XORs with the first message block.
+ * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
+ * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
+ *
+ * Sets PL, PH.
+ */
+.macro full_stride reduce
+ eor LO.16b, LO.16b, LO.16b
+ eor MI.16b, MI.16b, MI.16b
+ eor HI.16b, HI.16b, HI.16b
+
+ ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
+ ld1 {M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64
+
+ karatsuba1 M7 KEY1
+ .if \reduce
+ pmull TMP_V.1q, PL.1d, GSTAR.1d
+ .endif
+
+ karatsuba1 M6 KEY2
+ .if \reduce
+ ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
+ .endif
+
+ karatsuba1 M5 KEY3
+ .if \reduce
+ eor TMP_V.16b, PL.16b, TMP_V.16b
+ .endif
+
+ karatsuba1 M4 KEY4
+ .if \reduce
+ eor PH.16b, PH.16b, TMP_V.16b
+ .endif
+
+ karatsuba1 M3 KEY5
+ .if \reduce
+ pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
+ .endif
+
+ karatsuba1 M2 KEY6
+ .if \reduce
+ eor SUM.16b, PH.16b, TMP_V.16b
+ .endif
+
+ karatsuba1 M1 KEY7
+ eor M0.16b, M0.16b, SUM.16b
+
+ karatsuba1 M0 KEY8
+ karatsuba2
+.endm
+
+/*
+ * Handle any extra blocks after the full_stride loop.
+ */
+.macro partial_stride
+ add KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
+ sub KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4
+ ld1 {KEY1.16b}, [KEY_POWERS], #16
+
+ ld1 {TMP_V.16b}, [MSG], #16
+ eor SUM.16b, SUM.16b, TMP_V.16b
+ karatsuba1_store KEY1 SUM
+ sub BLOCKS_LEFT, BLOCKS_LEFT, #1
+
+ tst BLOCKS_LEFT, #4
+ beq .Lpartial4BlocksDone
+ ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
+ ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
+ karatsuba1 M0 KEY8
+ karatsuba1 M1 KEY7
+ karatsuba1 M2 KEY6
+ karatsuba1 M3 KEY5
+.Lpartial4BlocksDone:
+ tst BLOCKS_LEFT, #2
+ beq .Lpartial2BlocksDone
+ ld1 {M0.16b, M1.16b}, [MSG], #32
+ ld1 {KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
+ karatsuba1 M0 KEY8
+ karatsuba1 M1 KEY7
+.Lpartial2BlocksDone:
+ tst BLOCKS_LEFT, #1
+ beq .LpartialDone
+ ld1 {M0.16b}, [MSG], #16
+ ld1 {KEY8.16b}, [KEY_POWERS], #16
+ karatsuba1 M0 KEY8
+.LpartialDone:
+ karatsuba2
+ montgomery_reduction SUM
+.endm
+
+/*
+ * Computes a = a * b * x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * void polyval_mul_pmull(struct polyval_elem *a,
+ * const struct polyval_elem *b);
+ */
+SYM_FUNC_START(polyval_mul_pmull)
+ adr TMP, .Lgstar
+ ld1 {GSTAR.2d}, [TMP]
+ ld1 {v0.16b}, [x0]
+ ld1 {v1.16b}, [x1]
+ karatsuba1_store v0 v1
+ karatsuba2
+ montgomery_reduction SUM
+ st1 {SUM.16b}, [x0]
+ ret
+SYM_FUNC_END(polyval_mul_pmull)
+
+/*
+ * Perform polynomial evaluation as specified by POLYVAL. This computes:
+ * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
+ * where n=nblocks, h is the hash key, and m_i are the message blocks.
+ *
+ * x0 - pointer to accumulator
+ * x1 - pointer to precomputed key powers h^8 ... h^1
+ * x2 - pointer to message blocks
+ * x3 - number of blocks to hash
+ *
+ * void polyval_blocks_pmull(struct polyval_elem *acc,
+ * const struct polyval_key *key,
+ * const u8 *data, size_t nblocks);
+ */
+SYM_FUNC_START(polyval_blocks_pmull)
+ adr TMP, .Lgstar
+ mov KEY_START, KEY_POWERS
+ ld1 {GSTAR.2d}, [TMP]
+ ld1 {SUM.16b}, [ACCUMULATOR]
+ subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ blt .LstrideLoopExit
+ ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
+ ld1 {KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
+ full_stride 0
+ subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ blt .LstrideLoopExitReduce
+.LstrideLoop:
+ full_stride 1
+ subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ bge .LstrideLoop
+.LstrideLoopExitReduce:
+ montgomery_reduction SUM
+.LstrideLoopExit:
+ adds BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ beq .LskipPartial
+ partial_stride
+.LskipPartial:
+ st1 {SUM.16b}, [ACCUMULATOR]
+ ret
+SYM_FUNC_END(polyval_blocks_pmull)
diff --git a/lib/crypto/arm64/polyval.h b/lib/crypto/arm64/polyval.h
new file mode 100644
index 000000000000..a39763395e9b
--- /dev/null
+++ b/lib/crypto/arm64/polyval.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * POLYVAL library functions, arm64 optimized
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+#define NUM_H_POWERS 8
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
+
+asmlinkage void polyval_mul_pmull(struct polyval_elem *a,
+ const struct polyval_elem *b);
+asmlinkage void polyval_blocks_pmull(struct polyval_elem *acc,
+ const struct polyval_key *key,
+ const u8 *data, size_t nblocks);
+
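+/*
+ * Expand the raw key h into the table of key powers consumed by the 8-way
+ * assembly code: h_powers[7] = h^1, h_powers[6] = h^2, ..., h_powers[0] = h^8,
+ * i.e. highest power first, matching the h^8 ... h^1 order documented in
+ * polyval-ce-core.S.  Each entry is the previous entry multiplied by h in the
+ * POLYVAL field.
+ */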
+static void polyval_preparekey_arch(struct polyval_key *key,
+ const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+ static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS);
+ memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE);
+ if (static_branch_likely(&have_pmull) && may_use_simd()) {
+ scoped_ksimd() {
+ for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+ key->h_powers[i] = key->h_powers[i + 1];
+ polyval_mul_pmull(
+ &key->h_powers[i],
+ &key->h_powers[NUM_H_POWERS - 1]);
+ }
+ }
+ } else {
+ for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+ key->h_powers[i] = key->h_powers[i + 1];
+ polyval_mul_generic(&key->h_powers[i],
+ &key->h_powers[NUM_H_POWERS - 1]);
+ }
+ }
+}
+
+static void polyval_mul_arch(struct polyval_elem *acc,
+ const struct polyval_key *key)
+{
+ if (static_branch_likely(&have_pmull) && may_use_simd()) {
+ scoped_ksimd()
+ polyval_mul_pmull(acc, &key->h_powers[NUM_H_POWERS - 1]);
+ } else {
+ polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]);
+ }
+}
+
+static void polyval_blocks_arch(struct polyval_elem *acc,
+ const struct polyval_key *key,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_pmull) && may_use_simd()) {
+ do {
+ /* Allow rescheduling every 4 KiB. */
+ size_t n = min_t(size_t, nblocks,
+ 4096 / POLYVAL_BLOCK_SIZE);
+
+ scoped_ksimd()
+ polyval_blocks_pmull(acc, key, data, n);
+ data += n * POLYVAL_BLOCK_SIZE;
+ nblocks -= n;
+ } while (nblocks);
+ } else {
+ polyval_blocks_generic(acc, &key->h_powers[NUM_H_POWERS - 1],
+ data, nblocks);
+ }
+}
+
+#define polyval_mod_init_arch polyval_mod_init_arch
+static void polyval_mod_init_arch(void)
+{
+ if (cpu_have_named_feature(PMULL))
+ static_branch_enable(&have_pmull);
+}
diff --git a/lib/crypto/arm64/sha1-ce-core.S b/lib/crypto/arm64/sha1-ce-core.S
new file mode 100644
index 000000000000..8fbd4767f0f0
--- /dev/null
+++ b/lib/crypto/arm64/sha1-ce-core.S
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SHA-1 secure hash using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+ .arch armv8-a+crypto
+
+ k0 .req v0
+ k1 .req v1
+ k2 .req v2
+ k3 .req v3
+
+ t0 .req v4
+ t1 .req v5
+
+ dga .req q6
+ dgav .req v6
+ dgb .req s7
+ dgbv .req v7
+
+ dg0q .req q12
+ dg0s .req s12
+ dg0v .req v12
+ dg1s .req s13
+ dg1v .req v13
+ dg2s .req s14
+
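+	/*
+	 * One group of four SHA-1 rounds.  The even/odd (\ev) variants
+	 * ping-pong between t0/t1 for the "schedule word + round constant"
+	 * vector and between dg1s/dg2s for the sha1h output, so that the
+	 * vector for the next group (\s0 + \rc) can be prepared while the
+	 * current group's sha1c/sha1p/sha1m instruction (\op) executes.
+	 * add_update additionally extends the message schedule in place
+	 * using sha1su0/sha1su1.
+	 */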
+ .macro add_only, op, ev, rc, s0, dg1
+ .ifc \ev, ev
+ add t1.4s, v\s0\().4s, \rc\().4s
+ sha1h dg2s, dg0s
+ .ifnb \dg1
+ sha1\op dg0q, \dg1, t0.4s
+ .else
+ sha1\op dg0q, dg1s, t0.4s
+ .endif
+ .else
+ .ifnb \s0
+ add t0.4s, v\s0\().4s, \rc\().4s
+ .endif
+ sha1h dg1s, dg0s
+ sha1\op dg0q, dg2s, t1.4s
+ .endif
+ .endm
+
+ .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1
+ sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s
+ add_only \op, \ev, \rc, \s1, \dg1
+ sha1su1 v\s0\().4s, v\s3\().4s
+ .endm
+
+ .macro loadrc, k, val, tmp
+ movz \tmp, :abs_g0_nc:\val
+ movk \tmp, :abs_g1:\val
+ dup \k, \tmp
+ .endm
+
+ /*
+ * size_t __sha1_ce_transform(struct sha1_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+SYM_FUNC_START(__sha1_ce_transform)
+ /* load round constants */
+ loadrc k0.4s, 0x5a827999, w6
+ loadrc k1.4s, 0x6ed9eba1, w6
+ loadrc k2.4s, 0x8f1bbcdc, w6
+ loadrc k3.4s, 0xca62c1d6, w6
+
+ /* load state */
+ ld1 {dgav.4s}, [x0]
+ ldr dgb, [x0, #16]
+
+ /* load input */
+0: ld1 {v8.4s-v11.4s}, [x1], #64
+ sub x2, x2, #1
+
+CPU_LE( rev32 v8.16b, v8.16b )
+CPU_LE( rev32 v9.16b, v9.16b )
+CPU_LE( rev32 v10.16b, v10.16b )
+CPU_LE( rev32 v11.16b, v11.16b )
+
+ add t0.4s, v8.4s, k0.4s
+ mov dg0v.16b, dgav.16b
+
+ add_update c, ev, k0, 8, 9, 10, 11, dgb
+ add_update c, od, k0, 9, 10, 11, 8
+ add_update c, ev, k0, 10, 11, 8, 9
+ add_update c, od, k0, 11, 8, 9, 10
+ add_update c, ev, k1, 8, 9, 10, 11
+
+ add_update p, od, k1, 9, 10, 11, 8
+ add_update p, ev, k1, 10, 11, 8, 9
+ add_update p, od, k1, 11, 8, 9, 10
+ add_update p, ev, k1, 8, 9, 10, 11
+ add_update p, od, k2, 9, 10, 11, 8
+
+ add_update m, ev, k2, 10, 11, 8, 9
+ add_update m, od, k2, 11, 8, 9, 10
+ add_update m, ev, k2, 8, 9, 10, 11
+ add_update m, od, k2, 9, 10, 11, 8
+ add_update m, ev, k3, 10, 11, 8, 9
+
+ add_update p, od, k3, 11, 8, 9, 10
+ add_only p, ev, k3, 9
+ add_only p, od, k3, 10
+ add_only p, ev, k3, 11
+ add_only p, od
+
+ /* update state */
+ add dgbv.2s, dgbv.2s, dg1v.2s
+ add dgav.4s, dgav.4s, dg0v.4s
+
+ /* return early if voluntary preemption is needed */
+ cond_yield 1f, x5, x6
+
+ /* handled all input blocks? */
+ cbnz x2, 0b
+
+ /* store new state */
+1: st1 {dgav.4s}, [x0]
+ str dgb, [x0, #16]
+ mov x0, x2
+ ret
+SYM_FUNC_END(__sha1_ce_transform)
diff --git a/lib/crypto/arm64/sha1.h b/lib/crypto/arm64/sha1.h
new file mode 100644
index 000000000000..bc7071f1be09
--- /dev/null
+++ b/lib/crypto/arm64/sha1.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-1 optimized for ARM64
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+asmlinkage size_t __sha1_ce_transform(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks);
+
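+/*
+ * __sha1_ce_transform() returns the number of blocks left unprocessed: it may
+ * return early when voluntary preemption is needed (see cond_yield in
+ * sha1-ce-core.S), so loop here and re-enter the SIMD section for the
+ * remainder.
+ */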
+static void sha1_blocks(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_ce) && likely(may_use_simd())) {
+ do {
+ size_t rem;
+
+ scoped_ksimd()
+ rem = __sha1_ce_transform(state, data, nblocks);
+
+ data += (nblocks - rem) * SHA1_BLOCK_SIZE;
+ nblocks = rem;
+ } while (nblocks);
+ } else {
+ sha1_blocks_generic(state, data, nblocks);
+ }
+}
+
+#define sha1_mod_init_arch sha1_mod_init_arch
+static void sha1_mod_init_arch(void)
+{
+ if (cpu_have_named_feature(SHA1))
+ static_branch_enable(&have_ce);
+}
diff --git a/lib/crypto/arm64/sha2-armv8.pl b/lib/crypto/arm64/sha2-armv8.pl
new file mode 100644
index 000000000000..35ec9ae99fe1
--- /dev/null
+++ b/lib/crypto/arm64/sha2-armv8.pl
@@ -0,0 +1,786 @@
+#! /usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from the OpenSSL project but the author (Andy Polyakov)
+# has relicensed it under the GPLv2. Therefore this program is free software;
+# you can redistribute it and/or modify it under the terms of the GNU General
+# Public License version 2 as published by the Free Software Foundation.
+#
+# The original headers, including the original license headers, are
+# included below for completeness.
+
+# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA256/512 for ARMv8.
+#
+# Performance in cycles per processed byte and improvement coefficient
+# over code generated with "default" compiler:
+#
+# SHA256-hw SHA256(*) SHA512
+# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
+# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
+# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
+# Denver 2.01 10.5 (+26%) 6.70 (+8%)
+# X-Gene 20.0 (+100%) 12.8 (+300%(***))
+# Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
+#
+# (*) Software SHA256 results are of lesser relevance, presented
+# mostly for informational purposes.
+# (**) The result is a trade-off: it's possible to improve it by
+# 10% (or by 1 cycle per round), but at the cost of 20% loss
+# on Cortex-A53 (or by 4 cycles per round).
+# (***)	Super-impressive coefficients over gcc-generated code are an
+#	indication of some compiler "pathology"; most notably, code
+#	generated with -mgeneral-regs-only is significantly faster
+#	and the gap is only 40-90%.
+#
+# October 2016.
+#
+# Originally it was reckoned that it made no sense to implement a NEON
+# version of SHA256 for 64-bit processors. This is because the performance
+# improvement on the most widespread Cortex-A5x processors was observed
+# to be marginal: the same on Cortex-A53 and ~10% on A57. But then it was
+# observed that 32-bit NEON SHA256 performs significantly better than the
+# 64-bit scalar version on *some* of the more recent processors. As a
+# result, a 64-bit NEON version of SHA256 was added to provide the best
+# all-round performance. For example, it executes ~30% faster on X-Gene
+# and Mongoose. [For reference, a NEON version of SHA512 is bound to
+# deliver much less improvement, likely *negative* on Cortex-A5x,
+# which is why NEON support is limited to SHA256.]
+
+$output=pop;
+$flavour=pop;
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open OUT,"| \"$^X\" $xlate $flavour $output";
+ *STDOUT=*OUT;
+} else {
+ open STDOUT,">$output";
+}
+
+if ($output =~ /512/) {
+ $BITS=512;
+ $SZ=8;
+ @Sigma0=(28,34,39);
+ @Sigma1=(14,18,41);
+ @sigma0=(1, 8, 7);
+ @sigma1=(19,61, 6);
+ $rounds=80;
+ $reg_t="x";
+} else {
+ $BITS=256;
+ $SZ=4;
+ @Sigma0=( 2,13,22);
+ @Sigma1=( 6,11,25);
+ @sigma0=( 7,18, 3);
+ @sigma1=(17,19,10);
+ $rounds=64;
+ $reg_t="w";
+}
+
+$func="sha${BITS}_block_data_order";
+
+($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
+
+@X=map("$reg_t$_",(3..15,0..2));
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
+($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
+
+sub BODY_00_xx {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my $j=($i+1)&15;
+my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
+ $T0=@X[$i+3] if ($i<11);
+
+$code.=<<___ if ($i<16);
+#ifndef __AARCH64EB__
+ rev @X[$i],@X[$i] // $i
+#endif
+___
+$code.=<<___ if ($i<13 && ($i&1));
+ ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
+___
+$code.=<<___ if ($i==13);
+ ldp @X[14],@X[15],[$inp]
+___
+$code.=<<___ if ($i>=14);
+ ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
+___
+$code.=<<___ if ($i>0 && $i<16);
+ add $a,$a,$t1 // h+=Sigma0(a)
+___
+$code.=<<___ if ($i>=11);
+ str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
+___
+# While ARMv8 specifies merged rotate-and-logical operations such as
+# 'eor x,y,z,ror#n', they were found to negatively affect performance
+# on Apple A7. The reason seems to be that they require even 'y' to
+# be available earlier. This means that such a merged instruction is
+# not necessarily the best choice on the critical path... On the other
+# hand, Cortex-A5x handles merged instructions much better than disjoint
+# rotate and logical ones... See the (**) footnote above.
+$code.=<<___ if ($i<15);
+ ror $t0,$e,#$Sigma1[0]
+ add $h,$h,$t2 // h+=K[i]
+ eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
+ and $t1,$f,$e
+ bic $t2,$g,$e
+ add $h,$h,@X[$i&15] // h+=X[i]
+ orr $t1,$t1,$t2 // Ch(e,f,g)
+ eor $t2,$a,$b // a^b, b^c in next round
+ eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
+ ror $T0,$a,#$Sigma0[0]
+ add $h,$h,$t1 // h+=Ch(e,f,g)
+ eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
+ add $h,$h,$t0 // h+=Sigma1(e)
+ and $t3,$t3,$t2 // (b^c)&=(a^b)
+ add $d,$d,$h // d+=h
+ eor $t3,$t3,$b // Maj(a,b,c)
+ eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
+ add $h,$h,$t3 // h+=Maj(a,b,c)
+ ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
+ //add $h,$h,$t1 // h+=Sigma0(a)
+___
+$code.=<<___ if ($i>=15);
+ ror $t0,$e,#$Sigma1[0]
+ add $h,$h,$t2 // h+=K[i]
+ ror $T1,@X[($j+1)&15],#$sigma0[0]
+ and $t1,$f,$e
+ ror $T2,@X[($j+14)&15],#$sigma1[0]
+ bic $t2,$g,$e
+ ror $T0,$a,#$Sigma0[0]
+ add $h,$h,@X[$i&15] // h+=X[i]
+ eor $t0,$t0,$e,ror#$Sigma1[1]
+ eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
+ orr $t1,$t1,$t2 // Ch(e,f,g)
+ eor $t2,$a,$b // a^b, b^c in next round
+ eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
+ eor $T0,$T0,$a,ror#$Sigma0[1]
+ add $h,$h,$t1 // h+=Ch(e,f,g)
+ and $t3,$t3,$t2 // (b^c)&=(a^b)
+ eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
+ eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
+ add $h,$h,$t0 // h+=Sigma1(e)
+ eor $t3,$t3,$b // Maj(a,b,c)
+ eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
+ eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14])
+ add @X[$j],@X[$j],@X[($j+9)&15]
+ add $d,$d,$h // d+=h
+ add $h,$h,$t3 // h+=Maj(a,b,c)
+ ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
+ add @X[$j],@X[$j],$T1
+ add $h,$h,$t1 // h+=Sigma0(a)
+ add @X[$j],@X[$j],$T2
+___
+ ($t2,$t3)=($t3,$t2);
+}
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#endif
+
+.text
+
+.extern OPENSSL_armcap_P
+.globl $func
+.type $func,%function
+.align 6
+$func:
+___
+$code.=<<___ if ($SZ==4);
+#ifndef __KERNEL__
+# ifdef __ILP32__
+ ldrsw x16,.LOPENSSL_armcap_P
+# else
+ ldr x16,.LOPENSSL_armcap_P
+# endif
+ adr x17,.LOPENSSL_armcap_P
+ add x16,x16,x17
+ ldr w16,[x16]
+ tst w16,#ARMV8_SHA256
+ b.ne .Lv8_entry
+ tst w16,#ARMV7_NEON
+ b.ne .Lneon_entry
+#endif
+___
+$code.=<<___;
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#4*$SZ
+
+ ldp $A,$B,[$ctx] // load context
+ ldp $C,$D,[$ctx,#2*$SZ]
+ ldp $E,$F,[$ctx,#4*$SZ]
+ add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
+ ldp $G,$H,[$ctx,#6*$SZ]
+ adr $Ktbl,.LK$BITS
+ stp $ctx,$num,[x29,#96]
+
+.Loop:
+ ldp @X[0],@X[1],[$inp],#2*$SZ
+ ldr $t2,[$Ktbl],#$SZ // *K++
+ eor $t3,$B,$C // magic seed
+ str $inp,[x29,#112]
+___
+for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=".Loop_16_xx:\n";
+for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+ cbnz $t2,.Loop_16_xx
+
+ ldp $ctx,$num,[x29,#96]
+ ldr $inp,[x29,#112]
+ sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
+
+ ldp @X[0],@X[1],[$ctx]
+ ldp @X[2],@X[3],[$ctx,#2*$SZ]
+ add $inp,$inp,#14*$SZ // advance input pointer
+ ldp @X[4],@X[5],[$ctx,#4*$SZ]
+ add $A,$A,@X[0]
+ ldp @X[6],@X[7],[$ctx,#6*$SZ]
+ add $B,$B,@X[1]
+ add $C,$C,@X[2]
+ add $D,$D,@X[3]
+ stp $A,$B,[$ctx]
+ add $E,$E,@X[4]
+ add $F,$F,@X[5]
+ stp $C,$D,[$ctx,#2*$SZ]
+ add $G,$G,@X[6]
+ add $H,$H,@X[7]
+ cmp $inp,$num
+ stp $E,$F,[$ctx,#4*$SZ]
+ stp $G,$H,[$ctx,#6*$SZ]
+ b.ne .Loop
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#4*$SZ
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+ ret
+.size $func,.-$func
+
+.align 6
+.type .LK$BITS,%object
+.LK$BITS:
+___
+$code.=<<___ if ($SZ==8);
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+ .quad 0 // terminator
+___
+$code.=<<___ if ($SZ==4);
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+ .long 0 //terminator
+___
+$code.=<<___;
+.size .LK$BITS,.-.LK$BITS
+#ifndef __KERNEL__
+.align 3
+.LOPENSSL_armcap_P:
+# ifdef __ILP32__
+ .long OPENSSL_armcap_P-.
+# else
+ .quad OPENSSL_armcap_P-.
+# endif
+#endif
+.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+if ($SZ==4) {
+my $Ktbl="x3";
+
+my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
+my @MSG=map("v$_.16b",(4..7));
+my ($W0,$W1)=("v16.4s","v17.4s");
+my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
+
+$code.=<<___;
+#ifndef __KERNEL__
+.type sha256_block_armv8,%function
+.align 6
+sha256_block_armv8:
+.Lv8_entry:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1.32 {$ABCD,$EFGH},[$ctx]
+ adr $Ktbl,.LK256
+
+.Loop_hw:
+ ld1 {@MSG[0]-@MSG[3]},[$inp],#64
+ sub $num,$num,#1
+ ld1.32 {$W0},[$Ktbl],#16
+ rev32 @MSG[0],@MSG[0]
+ rev32 @MSG[1],@MSG[1]
+ rev32 @MSG[2],@MSG[2]
+ rev32 @MSG[3],@MSG[3]
+ orr $ABCD_SAVE,$ABCD,$ABCD // offload
+ orr $EFGH_SAVE,$EFGH,$EFGH
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+ ld1.32 {$W1},[$Ktbl],#16
+ add.i32 $W0,$W0,@MSG[0]
+ sha256su0 @MSG[0],@MSG[1]
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+ sha256su1 @MSG[0],@MSG[2],@MSG[3]
+___
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+ ld1.32 {$W1},[$Ktbl],#16
+ add.i32 $W0,$W0,@MSG[0]
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ ld1.32 {$W0},[$Ktbl],#16
+ add.i32 $W1,$W1,@MSG[1]
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ ld1.32 {$W1},[$Ktbl]
+ add.i32 $W0,$W0,@MSG[2]
+ sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ add.i32 $W1,$W1,@MSG[3]
+ orr $abcd,$ABCD,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ add.i32 $ABCD,$ABCD,$ABCD_SAVE
+ add.i32 $EFGH,$EFGH,$EFGH_SAVE
+
+ cbnz $num,.Loop_hw
+
+ st1.32 {$ABCD,$EFGH},[$ctx]
+
+ ldr x29,[sp],#16
+ ret
+.size sha256_block_armv8,.-sha256_block_armv8
+#endif
+___
+}
+
+if ($SZ==4) { ######################################### NEON stuff #
+# You'll surely note a lot of similarities with sha256-armv4 module,
+# and of course it's not a coincidence. sha256-armv4 was used as
+# initial template, but was adapted for ARMv8 instruction set and
+# extensively re-tuned for all-round performance.
+
+my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
+my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
+my $Ktbl="x16";
+my $Xfer="x17";
+my @X = map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
+my $j=0;
+
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+ my $arg = pop;
+ $arg = "#$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
+sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
+sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
+
+sub Xupdate()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ &ext_8 ($T0,@X[0],@X[1],4); # X[1..4]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ext_8 ($T3,@X[2],@X[3],4); # X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ushr_32 ($T2,$T0,$sigma0[0]);
+ eval(shift(@insns));
+ &ushr_32 ($T1,$T0,$sigma0[2]);
+ eval(shift(@insns));
+ &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12]
+ eval(shift(@insns));
+ &sli_32 ($T2,$T0,32-$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ushr_32 ($T3,$T0,$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &eor_8 ($T1,$T1,$T2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &sli_32 ($T3,$T0,32-$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ushr_32 ($T4,$T7,$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &sli_32 ($T4,$T7,32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ushr_32 ($T5,$T7,$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ushr_32 ($T3,$T7,$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &sli_u32 ($T3,$T7,32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &eor_8 ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ushr_32 ($T6,@X[0],$sigma1[0]);
+ eval(shift(@insns));
+ &ushr_32 ($T7,@X[0],$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &sli_32 ($T6,@X[0],32-$sigma1[0]);
+ eval(shift(@insns));
+ &ushr_32 ($T5,@X[0],$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &eor_8 ($T7,$T7,$T6);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &sli_32 ($T5,@X[0],32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ld1_32 ("{$T0}","[$Ktbl], #16");
+ eval(shift(@insns));
+ &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &eor_8 ($T5,$T5,$T5);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &mov (&Dhi($T5), &Dlo($T7));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &add_32 ($T0,$T0,@X[0]);
+ while($#insns>=1) { eval(shift(@insns)); }
+ &st1_32 ("{$T0}","[$Xfer], #16");
+ eval(shift(@insns));
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ld1_8 ("{@X[0]}","[$inp],#16");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &ld1_32 ("{$T0}","[$Ktbl],#16");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &rev32 (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &add_32 ($T0,$T0,@X[0]);
+ foreach (@insns) { eval; } # remaining instructions
+ &st1_32 ("{$T0}","[$Xfer], #16");
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub body_00_15 () {
+ (
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+ '&add ($h,$h,$t1)', # h+=X[i]+K[i]
+ '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past
+ '&and ($t1,$f,$e)',
+ '&bic ($t4,$g,$e)',
+ '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+ '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
+ '&orr ($t1,$t1,$t4)', # Ch(e,f,g)
+ '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
+ '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+ '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
+ '&ror ($t0,$t0,"#$Sigma1[0]")',
+ '&eor ($t2,$a,$b)', # a^b, b^c in next round
+ '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
+ '&add ($h,$h,$t0)', # h+=Sigma1(e)
+ '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
+ '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
+ '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
+ '&ror ($t4,$t4,"#$Sigma0[0]")',
+ '&add ($d,$d,$h)', # d+=h
+ '&eor ($t3,$t3,$b)', # Maj(a,b,c)
+ '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+ )
+}
+
+$code.=<<___;
+#ifdef __KERNEL__
+.globl sha256_block_neon
+#endif
+.type sha256_block_neon,%function
+.align 4
+sha256_block_neon:
+.Lneon_entry:
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+ sub sp,sp,#16*4
+
+ adr $Ktbl,.LK256
+ add $num,$inp,$num,lsl#6 // len to point at the end of inp
+
+ ld1.8 {@X[0]},[$inp], #16
+ ld1.8 {@X[1]},[$inp], #16
+ ld1.8 {@X[2]},[$inp], #16
+ ld1.8 {@X[3]},[$inp], #16
+ ld1.32 {$T0},[$Ktbl], #16
+ ld1.32 {$T1},[$Ktbl], #16
+ ld1.32 {$T2},[$Ktbl], #16
+ ld1.32 {$T3},[$Ktbl], #16
+ rev32 @X[0],@X[0] // yes, even on
+ rev32 @X[1],@X[1] // big-endian
+ rev32 @X[2],@X[2]
+ rev32 @X[3],@X[3]
+ mov $Xfer,sp
+ add.32 $T0,$T0,@X[0]
+ add.32 $T1,$T1,@X[1]
+ add.32 $T2,$T2,@X[2]
+ st1.32 {$T0-$T1},[$Xfer], #32
+ add.32 $T3,$T3,@X[3]
+ st1.32 {$T2-$T3},[$Xfer]
+ sub $Xfer,$Xfer,#32
+
+ ldp $A,$B,[$ctx]
+ ldp $C,$D,[$ctx,#8]
+ ldp $E,$F,[$ctx,#16]
+ ldp $G,$H,[$ctx,#24]
+ ldr $t1,[sp,#0]
+ mov $t2,wzr
+ eor $t3,$B,$C
+ mov $t4,wzr
+ b .L_00_48
+
+.align 4
+.L_00_48:
+___
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+$code.=<<___;
+ cmp $t1,#0 // check for K256 terminator
+ ldr $t1,[sp,#0]
+ sub $Xfer,$Xfer,#64
+ bne .L_00_48
+
+ sub $Ktbl,$Ktbl,#256 // rewind $Ktbl
+ cmp $inp,$num
+ mov $Xfer, #64
+ csel $Xfer, $Xfer, xzr, eq
+ sub $inp,$inp,$Xfer // avoid SEGV
+ mov $Xfer,sp
+___
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+$code.=<<___;
+ add $A,$A,$t4 // h+=Sigma0(a) from the past
+ ldp $t0,$t1,[$ctx,#0]
+ add $A,$A,$t2 // h+=Maj(a,b,c) from the past
+ ldp $t2,$t3,[$ctx,#8]
+ add $A,$A,$t0 // accumulate
+ add $B,$B,$t1
+ ldp $t0,$t1,[$ctx,#16]
+ add $C,$C,$t2
+ add $D,$D,$t3
+ ldp $t2,$t3,[$ctx,#24]
+ add $E,$E,$t0
+ add $F,$F,$t1
+ ldr $t1,[sp,#0]
+ stp $A,$B,[$ctx,#0]
+ add $G,$G,$t2
+ mov $t2,wzr
+ stp $C,$D,[$ctx,#8]
+ add $H,$H,$t3
+ stp $E,$F,[$ctx,#16]
+ eor $t3,$B,$C
+ stp $G,$H,[$ctx,#24]
+ mov $t4,wzr
+ mov $Xfer,sp
+ b.ne .L_00_48
+
+ ldr x29,[x29]
+ add sp,sp,#16*4+16
+ ret
+.size sha256_block_neon,.-sha256_block_neon
+___
+}
+
+$code.=<<___;
+#ifndef __KERNEL__
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+
+{ my %opcode = (
+ "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
+ "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
+
+ sub unsha256 {
+ my ($mnemonic,$arg)=@_;
+
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+ $mnemonic,$arg;
+ }
+}
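
The table above lets the script emit raw .inst words when the assembler lacks the SHA-256 Crypto Extension mnemonics: the destination register lands in bits 0-4, the first source in bits 5-9, and the second source in bits 16-20, exactly as unsha256() assembles them. A standalone C sketch of the same arithmetic, for illustration only (not part of the patch):

#include <stdio.h>
#include <stdint.h>

/* Mirrors the unsha256() rule above: opcode | rd | (rn << 5) | (rm << 16),
 * here for "sha256h v0, v1, v2" with the 0x5e004000 base opcode. */
int main(void)
{
	uint32_t insn = 0x5e004000u | 0u | (1u << 5) | (2u << 16);

	printf(".inst\t0x%08x\t//sha256h v0,v1,v2\n", insn); /* 0x5e024020 */
	return 0;
}
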
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/\/\// and !/^$/);
+ print;
+}
+close SELF;
+
+foreach(split("\n",$code)) {
+
+ s/\`([^\`]*)\`/eval($1)/ge;
+
+ s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
+
+ s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers
+
+ s/\.[ui]?8(\s)/$1/;
+ s/\.\w?32\b// and s/\.16b/\.4s/g;
+ m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g;
+
+ print $_,"\n";
+}
+
+close STDOUT;
diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S
new file mode 100644
index 000000000000..e4bfe42a61a9
--- /dev/null
+++ b/lib/crypto/arm64/sha256-ce.S
@@ -0,0 +1,408 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Core SHA-224/SHA-256 transform using v8 Crypto Extensions
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+ .arch armv8-a+crypto
+
+ dga .req q20
+ dgav .req v20
+ dgb .req q21
+ dgbv .req v21
+
+ t0 .req v22
+ t1 .req v23
+
+ dg0q .req q24
+ dg0v .req v24
+ dg1q .req q25
+ dg1v .req v25
+ dg2q .req q26
+ dg2v .req v26
+
+ .macro add_only, ev, rc, s0
+ mov dg2v.16b, dg0v.16b
+ .ifeq \ev
+ add t1.4s, v\s0\().4s, \rc\().4s
+ sha256h dg0q, dg1q, t0.4s
+ sha256h2 dg1q, dg2q, t0.4s
+ .else
+ .ifnb \s0
+ add t0.4s, v\s0\().4s, \rc\().4s
+ .endif
+ sha256h dg0q, dg1q, t1.4s
+ sha256h2 dg1q, dg2q, t1.4s
+ .endif
+ .endm
+
+ .macro add_update, ev, rc, s0, s1, s2, s3
+ sha256su0 v\s0\().4s, v\s1\().4s
+ add_only \ev, \rc, \s1
+ sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s
+ .endm
+
+ /*
+ * The SHA-256 round constants
+ */
+ .section ".rodata", "a"
+ .align 4
+.Lsha2_rcon:
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+ .macro load_round_constants tmp
+ adr_l \tmp, .Lsha2_rcon
+ ld1 { v0.4s- v3.4s}, [\tmp], #64
+ ld1 { v4.4s- v7.4s}, [\tmp], #64
+ ld1 { v8.4s-v11.4s}, [\tmp], #64
+ ld1 {v12.4s-v15.4s}, [\tmp]
+ .endm
+
+ /*
+ * size_t __sha256_ce_transform(struct sha256_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+ .text
+SYM_FUNC_START(__sha256_ce_transform)
+
+ load_round_constants x8
+
+ /* load state */
+ ld1 {dgav.4s, dgbv.4s}, [x0]
+
+ /* load input */
+0: ld1 {v16.4s-v19.4s}, [x1], #64
+ sub x2, x2, #1
+
+CPU_LE( rev32 v16.16b, v16.16b )
+CPU_LE( rev32 v17.16b, v17.16b )
+CPU_LE( rev32 v18.16b, v18.16b )
+CPU_LE( rev32 v19.16b, v19.16b )
+
+ add t0.4s, v16.4s, v0.4s
+ mov dg0v.16b, dgav.16b
+ mov dg1v.16b, dgbv.16b
+
+ add_update 0, v1, 16, 17, 18, 19
+ add_update 1, v2, 17, 18, 19, 16
+ add_update 0, v3, 18, 19, 16, 17
+ add_update 1, v4, 19, 16, 17, 18
+
+ add_update 0, v5, 16, 17, 18, 19
+ add_update 1, v6, 17, 18, 19, 16
+ add_update 0, v7, 18, 19, 16, 17
+ add_update 1, v8, 19, 16, 17, 18
+
+ add_update 0, v9, 16, 17, 18, 19
+ add_update 1, v10, 17, 18, 19, 16
+ add_update 0, v11, 18, 19, 16, 17
+ add_update 1, v12, 19, 16, 17, 18
+
+ add_only 0, v13, 17
+ add_only 1, v14, 18
+ add_only 0, v15, 19
+ add_only 1
+
+ /* update state */
+ add dgav.4s, dgav.4s, dg0v.4s
+ add dgbv.4s, dgbv.4s, dg1v.4s
+
+ /* return early if voluntary preemption is needed */
+ cond_yield 1f, x5, x6
+
+ /* handled all input blocks? */
+ cbnz x2, 0b
+
+ /* store new state */
+1: st1 {dgav.4s, dgbv.4s}, [x0]
+ mov x0, x2
+ ret
+SYM_FUNC_END(__sha256_ce_transform)
+
+ .unreq dga
+ .unreq dgav
+ .unreq dgb
+ .unreq dgbv
+ .unreq t0
+ .unreq t1
+ .unreq dg0q
+ .unreq dg0v
+ .unreq dg1q
+ .unreq dg1v
+ .unreq dg2q
+ .unreq dg2v
+
+ // parameters for sha256_ce_finup2x()
+ ctx .req x0
+ data1 .req x1
+ data2 .req x2
+ len .req w3
+ out1 .req x4
+ out2 .req x5
+
+ // other scalar variables
+ count .req x6
+ final_step .req w7
+
+ // x8-x9 are used as temporaries.
+
+ // v0-v15 are used to cache the SHA-256 round constants.
+ // v16-v19 are used for the message schedule for the first message.
+ // v20-v23 are used for the message schedule for the second message.
+ // v24-v31 are used for the state and temporaries as given below.
+ // *_a are for the first message and *_b for the second.
+ state0_a_q .req q24
+ state0_a .req v24
+ state1_a_q .req q25
+ state1_a .req v25
+ state0_b_q .req q26
+ state0_b .req v26
+ state1_b_q .req q27
+ state1_b .req v27
+ t0_a .req v28
+ t0_b .req v29
+ t1_a_q .req q30
+ t1_a .req v30
+ t1_b_q .req q31
+ t1_b .req v31
+
+#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
+#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
+// offsetof(struct __sha256_ctx, state) is assumed to be 0.
+
+ // Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a
+ // and m0_b contain the current 4 message schedule words for the first
+ // and second message respectively.
+ //
+ // If not all the message schedule words have been computed yet, then
+ // this also computes 4 more message schedule words for each message.
+ // m1_a-m3_a contain the next 3 groups of 4 message schedule words for
+ // the first message, and likewise m1_b-m3_b for the second. After
+ // consuming the current value of m0_a, this macro computes the group
+ // after m3_a and writes it to m0_a, and likewise for *_b. This means
+ // that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
+ // m3_a, m0_a), and likewise for *_b, so the caller must cycle through
+ // the registers accordingly.
+ .macro do_4rounds_2x i, k, m0_a, m1_a, m2_a, m3_a, \
+ m0_b, m1_b, m2_b, m3_b
+ add t0_a\().4s, \m0_a\().4s, \k\().4s
+ add t0_b\().4s, \m0_b\().4s, \k\().4s
+ .if \i < 48
+ sha256su0 \m0_a\().4s, \m1_a\().4s
+ sha256su0 \m0_b\().4s, \m1_b\().4s
+ sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s
+ sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s
+ .endif
+ mov t1_a.16b, state0_a.16b
+ mov t1_b.16b, state0_b.16b
+ sha256h state0_a_q, state1_a_q, t0_a\().4s
+ sha256h state0_b_q, state1_b_q, t0_b\().4s
+ sha256h2 state1_a_q, t1_a_q, t0_a\().4s
+ sha256h2 state1_b_q, t1_b_q, t0_b\().4s
+ .endm
+
+ .macro do_16rounds_2x i, k0, k1, k2, k3
+ do_4rounds_2x \i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23
+ do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20
+ do_4rounds_2x \i + 8, \k2, v18, v19, v16, v17, v22, v23, v20, v21
+ do_4rounds_2x \i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22
+ .endm
+
+//
+// void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
+// const u8 *data1, const u8 *data2, int len,
+// u8 out1[SHA256_DIGEST_SIZE],
+// u8 out2[SHA256_DIGEST_SIZE]);
+//
+// This function computes the SHA-256 digests of two messages |data1| and
+// |data2| that are both |len| bytes long, starting from the initial context
+// |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
+//
+// The instructions for the two SHA-256 operations are interleaved. On many
+// CPUs, this is almost twice as fast as hashing each message individually,
+// since it makes better use of the CPU's SHA-256 and SIMD throughput.
+//
+SYM_FUNC_START(sha256_ce_finup2x)
+ sub sp, sp, #128
+ mov final_step, #0
+ load_round_constants x8
+
+ // Load the initial state from ctx->state.
+ ld1 {state0_a.4s-state1_a.4s}, [ctx]
+
+ // Load ctx->bytecount. Take the mod 64 of it to get the number of
+ // bytes that are buffered in ctx->buf. Also save it in a register with
+ // len added to it.
+ ldr x8, [ctx, #OFFSETOF_BYTECOUNT]
+ add count, x8, len, sxtw
+ and x8, x8, #63
+ cbz x8, .Lfinup2x_enter_loop // No bytes buffered?
+
+ // x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them
+ // followed by the first 64 - x8 bytes of data. Since len >= 64, we
+ // just load 64 bytes from each of ctx->buf, data1, and data2
+ // unconditionally and rearrange the data as needed.
+ add x9, ctx, #OFFSETOF_BUF
+ ld1 {v16.16b-v19.16b}, [x9]
+ st1 {v16.16b-v19.16b}, [sp]
+
+ ld1 {v16.16b-v19.16b}, [data1], #64
+ add x9, sp, x8
+ st1 {v16.16b-v19.16b}, [x9]
+ ld1 {v16.4s-v19.4s}, [sp]
+
+ ld1 {v20.16b-v23.16b}, [data2], #64
+ st1 {v20.16b-v23.16b}, [x9]
+ ld1 {v20.4s-v23.4s}, [sp]
+
+ sub len, len, #64
+ sub data1, data1, x8
+ sub data2, data2, x8
+ add len, len, w8
+ mov state0_b.16b, state0_a.16b
+ mov state1_b.16b, state1_a.16b
+ b .Lfinup2x_loop_have_data
+
+.Lfinup2x_enter_loop:
+ sub len, len, #64
+ mov state0_b.16b, state0_a.16b
+ mov state1_b.16b, state1_a.16b
+.Lfinup2x_loop:
+ // Load the next two data blocks.
+ ld1 {v16.4s-v19.4s}, [data1], #64
+ ld1 {v20.4s-v23.4s}, [data2], #64
+.Lfinup2x_loop_have_data:
+ // Convert the words of the data blocks from big endian.
+CPU_LE( rev32 v16.16b, v16.16b )
+CPU_LE( rev32 v17.16b, v17.16b )
+CPU_LE( rev32 v18.16b, v18.16b )
+CPU_LE( rev32 v19.16b, v19.16b )
+CPU_LE( rev32 v20.16b, v20.16b )
+CPU_LE( rev32 v21.16b, v21.16b )
+CPU_LE( rev32 v22.16b, v22.16b )
+CPU_LE( rev32 v23.16b, v23.16b )
+.Lfinup2x_loop_have_bswapped_data:
+
+ // Save the original state for each block.
+ st1 {state0_a.4s-state1_b.4s}, [sp]
+
+ // Do the SHA-256 rounds on each block.
+ do_16rounds_2x 0, v0, v1, v2, v3
+ do_16rounds_2x 16, v4, v5, v6, v7
+ do_16rounds_2x 32, v8, v9, v10, v11
+ do_16rounds_2x 48, v12, v13, v14, v15
+
+ // Add the original state for each block.
+ ld1 {v16.4s-v19.4s}, [sp]
+ add state0_a.4s, state0_a.4s, v16.4s
+ add state1_a.4s, state1_a.4s, v17.4s
+ add state0_b.4s, state0_b.4s, v18.4s
+ add state1_b.4s, state1_b.4s, v19.4s
+
+ // Update len and loop back if more blocks remain.
+ sub len, len, #64
+ tbz len, #31, .Lfinup2x_loop // len >= 0?
+
+ // Check if any final blocks need to be handled.
+ // final_step = 2: all done
+ // final_step = 1: need to do count-only padding block
+ // final_step = 0: need to do the block with 0x80 padding byte
+ tbnz final_step, #1, .Lfinup2x_done
+ tbnz final_step, #0, .Lfinup2x_finalize_countonly
+ add len, len, #64
+ cbz len, .Lfinup2x_finalize_blockaligned
+
+ // Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block.
+ // To do this, write the padding starting with the 0x80 byte to
+ // &sp[64]. Then for each message, copy the last 64 data bytes to sp
+ // and load from &sp[64 - len] to get the needed padding block. This
+ // code relies on the data buffers being >= 64 bytes in length.
+ sub w8, len, #64 // w8 = len - 64
+ add data1, data1, w8, sxtw // data1 += len - 64
+ add data2, data2, w8, sxtw // data2 += len - 64
+CPU_LE( mov x9, #0x80 )
+CPU_LE( fmov d16, x9 )
+CPU_BE( movi v16.16b, #0 )
+CPU_BE( mov x9, #0x8000000000000000 )
+CPU_BE( mov v16.d[1], x9 )
+ movi v17.16b, #0
+ stp q16, q17, [sp, #64]
+ stp q17, q17, [sp, #96]
+ sub x9, sp, w8, sxtw // x9 = &sp[64 - len]
+ cmp len, #56
+ b.ge 1f // will count spill into its own block?
+ lsl count, count, #3
+CPU_LE( rev count, count )
+ str count, [x9, #56]
+ mov final_step, #2 // won't need count-only block
+ b 2f
+1:
+ mov final_step, #1 // will need count-only block
+2:
+ ld1 {v16.16b-v19.16b}, [data1]
+ st1 {v16.16b-v19.16b}, [sp]
+ ld1 {v16.4s-v19.4s}, [x9]
+ ld1 {v20.16b-v23.16b}, [data2]
+ st1 {v20.16b-v23.16b}, [sp]
+ ld1 {v20.4s-v23.4s}, [x9]
+ b .Lfinup2x_loop_have_data
+
+ // Prepare a padding block, either:
+ //
+ // {0x80, 0, 0, 0, ..., count (as __be64)}
+ // This is for a block aligned message.
+ //
+ // { 0, 0, 0, 0, ..., count (as __be64)}
+ // This is for a message whose length mod 64 is >= 56.
+ //
+ // Pre-swap the endianness of the words.
+.Lfinup2x_finalize_countonly:
+ movi v16.2d, #0
+ b 1f
+.Lfinup2x_finalize_blockaligned:
+ mov x8, #0x80000000
+ fmov d16, x8
+1:
+ movi v17.2d, #0
+ movi v18.2d, #0
+ ror count, count, #29 // ror(lsl(count, 3), 32)
+ mov v19.d[0], xzr
+ mov v19.d[1], count
+ mov v20.16b, v16.16b
+ movi v21.2d, #0
+ movi v22.2d, #0
+ mov v23.16b, v19.16b
+ mov final_step, #2
+ b .Lfinup2x_loop_have_bswapped_data
+
+.Lfinup2x_done:
+ // Write the two digests with all bytes in the correct order.
+CPU_LE( rev32 state0_a.16b, state0_a.16b )
+CPU_LE( rev32 state1_a.16b, state1_a.16b )
+CPU_LE( rev32 state0_b.16b, state0_b.16b )
+CPU_LE( rev32 state1_b.16b, state1_b.16b )
+ st1 {state0_a.4s-state1_a.4s}, [out1]
+ st1 {state0_b.4s-state1_b.4s}, [out2]
+ add sp, sp, #128
+ ret
+SYM_FUNC_END(sha256_ce_finup2x)
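
The finalization paths above build the standard SHA-256 padding in registers: append a single 0x80 byte, zero-fill, and place the total message length in bits as a big-endian 64-bit value at the end of the last block, spilling into an extra count-only block when fewer than 8 bytes of room remain. A minimal C sketch of that rule, under the assumption of a caller-provided 128-byte scratch block that already holds the len_mod_64 tail bytes (an illustration, not the kernel's helper):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Returns how many padded bytes (64 or 128) still need to be compressed. */
static size_t sha256_pad(uint8_t block[128], size_t len_mod_64,
			 uint64_t total_bytes)
{
	/* The 0x80 byte plus the 8-byte length need 9 spare bytes. */
	size_t n = (len_mod_64 < 56) ? 64 : 128;
	uint64_t bits = total_bytes * 8;

	memset(block + len_mod_64, 0, n - len_mod_64);
	block[len_mod_64] = 0x80;
	for (int i = 0; i < 8; i++)		/* big-endian bit count */
		block[n - 1 - i] = (uint8_t)(bits >> (8 * i));
	return n;
}

The len < 56 test corresponds to the "will count spill into its own block?" comparison in the assembly above.
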
diff --git a/lib/crypto/arm64/sha256.h b/lib/crypto/arm64/sha256.h
new file mode 100644
index 000000000000..568dff0f276a
--- /dev/null
+++ b/lib/crypto/arm64/sha256.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 optimized for ARM64
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+asmlinkage void sha256_block_data_order(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks);
+asmlinkage void sha256_block_neon(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks);
+asmlinkage size_t __sha256_ce_transform(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha256_blocks(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ static_branch_likely(&have_neon) && likely(may_use_simd())) {
+ if (static_branch_likely(&have_ce)) {
+ do {
+ size_t rem;
+
+ scoped_ksimd()
+ rem = __sha256_ce_transform(state, data,
+ nblocks);
+
+ data += (nblocks - rem) * SHA256_BLOCK_SIZE;
+ nblocks = rem;
+ } while (nblocks);
+ } else {
+ scoped_ksimd()
+ sha256_block_neon(state, data, nblocks);
+ }
+ } else {
+ sha256_block_data_order(state, data, nblocks);
+ }
+}
+
+static_assert(offsetof(struct __sha256_ctx, state) == 0);
+static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
+static_assert(offsetof(struct __sha256_ctx, buf) == 40);
+asmlinkage void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, int len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE]);
+
+#define sha256_finup_2x_arch sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, size_t len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ /*
+ * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
+ * Further limit len to 65536 to avoid spending too long with preemption
+ * disabled. (Of course, in practice len is nearly always 4096 anyway.)
+ */
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE &&
+ len <= 65536 && likely(may_use_simd())) {
+ scoped_ksimd()
+ sha256_ce_finup2x(ctx, data1, data2, len, out1, out2);
+ kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
+ kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
+ return true;
+ }
+ return false;
+}
+
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+ return static_key_enabled(&have_ce);
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define sha256_mod_init_arch sha256_mod_init_arch
+static void sha256_mod_init_arch(void)
+{
+ if (cpu_have_named_feature(ASIMD)) {
+ static_branch_enable(&have_neon);
+ if (cpu_have_named_feature(SHA2))
+ static_branch_enable(&have_ce);
+ }
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
diff --git a/lib/crypto/arm64/sha3-ce-core.S b/lib/crypto/arm64/sha3-ce-core.S
new file mode 100644
index 000000000000..ace90b506490
--- /dev/null
+++ b/lib/crypto/arm64/sha3-ce-core.S
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Core SHA-3 transform using v8.2 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+ .set .Lv\b\().2d, \b
+ .set .Lv\b\().16b, \b
+ .endr
+
+ /*
+ * ARMv8.2 Crypto Extensions instructions
+ */
+ .macro eor3, rd, rn, rm, ra
+ .inst 0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
+ .endm
+
+ .macro rax1, rd, rn, rm
+ .inst 0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
+ .endm
+
+ .macro bcax, rd, rn, rm, ra
+ .inst 0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
+ .endm
+
+ .macro xar, rd, rn, rm, imm6
+ .inst 0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16)
+ .endm
+
+ /*
+ * size_t sha3_ce_transform(struct sha3_state *state, const u8 *data,
+ * size_t nblocks, size_t block_size)
+ *
+ * block_size is assumed to be one of 72 (SHA3-512), 104 (SHA3-384), 136
+ * (SHA3-256 and SHAKE256), 144 (SHA3-224), or 168 (SHAKE128).
+ */
+ .text
+SYM_FUNC_START(sha3_ce_transform)
+ /* load state */
+ add x8, x0, #32
+ ld1 { v0.1d- v3.1d}, [x0]
+ ld1 { v4.1d- v7.1d}, [x8], #32
+ ld1 { v8.1d-v11.1d}, [x8], #32
+ ld1 {v12.1d-v15.1d}, [x8], #32
+ ld1 {v16.1d-v19.1d}, [x8], #32
+ ld1 {v20.1d-v23.1d}, [x8], #32
+ ld1 {v24.1d}, [x8]
+
+0: sub x2, x2, #1
+ mov w8, #24
+ adr_l x9, .Lsha3_rcon
+
+ /* load input */
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ ld1 {v29.8b}, [x1], #8
+ eor v0.8b, v0.8b, v25.8b
+ eor v1.8b, v1.8b, v26.8b
+ eor v2.8b, v2.8b, v27.8b
+ eor v3.8b, v3.8b, v28.8b
+ eor v4.8b, v4.8b, v29.8b
+
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ eor v5.8b, v5.8b, v25.8b
+ eor v6.8b, v6.8b, v26.8b
+ eor v7.8b, v7.8b, v27.8b
+ eor v8.8b, v8.8b, v28.8b
+ cmp x3, #72
+ b.eq 3f /* SHA3-512 (block_size=72)? */
+
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ eor v9.8b, v9.8b, v25.8b
+ eor v10.8b, v10.8b, v26.8b
+ eor v11.8b, v11.8b, v27.8b
+ eor v12.8b, v12.8b, v28.8b
+ cmp x3, #104
+ b.eq 3f /* SHA3-384 (block_size=104)? */
+
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ eor v13.8b, v13.8b, v25.8b
+ eor v14.8b, v14.8b, v26.8b
+ eor v15.8b, v15.8b, v27.8b
+ eor v16.8b, v16.8b, v28.8b
+ cmp x3, #144
+ b.lt 3f /* SHA3-256 or SHAKE256 (block_size=136)? */
+ b.eq 2f /* SHA3-224 (block_size=144)? */
+
+ /* SHAKE128 (block_size=168) */
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ eor v17.8b, v17.8b, v25.8b
+ eor v18.8b, v18.8b, v26.8b
+ eor v19.8b, v19.8b, v27.8b
+ eor v20.8b, v20.8b, v28.8b
+ b 3f
+2:
+ /* SHA3-224 (block_size=144) */
+ ld1 {v25.8b}, [x1], #8
+ eor v17.8b, v17.8b, v25.8b
+
+3: sub w8, w8, #1
+
+ eor3 v29.16b, v4.16b, v9.16b, v14.16b
+ eor3 v26.16b, v1.16b, v6.16b, v11.16b
+ eor3 v28.16b, v3.16b, v8.16b, v13.16b
+ eor3 v25.16b, v0.16b, v5.16b, v10.16b
+ eor3 v27.16b, v2.16b, v7.16b, v12.16b
+ eor3 v29.16b, v29.16b, v19.16b, v24.16b
+ eor3 v26.16b, v26.16b, v16.16b, v21.16b
+ eor3 v28.16b, v28.16b, v18.16b, v23.16b
+ eor3 v25.16b, v25.16b, v15.16b, v20.16b
+ eor3 v27.16b, v27.16b, v17.16b, v22.16b
+
+ rax1 v30.2d, v29.2d, v26.2d // bc[0]
+ rax1 v26.2d, v26.2d, v28.2d // bc[2]
+ rax1 v28.2d, v28.2d, v25.2d // bc[4]
+ rax1 v25.2d, v25.2d, v27.2d // bc[1]
+ rax1 v27.2d, v27.2d, v29.2d // bc[3]
+
+ eor v0.16b, v0.16b, v30.16b
+ xar v29.2d, v1.2d, v25.2d, (64 - 1)
+ xar v1.2d, v6.2d, v25.2d, (64 - 44)
+ xar v6.2d, v9.2d, v28.2d, (64 - 20)
+ xar v9.2d, v22.2d, v26.2d, (64 - 61)
+ xar v22.2d, v14.2d, v28.2d, (64 - 39)
+ xar v14.2d, v20.2d, v30.2d, (64 - 18)
+ xar v31.2d, v2.2d, v26.2d, (64 - 62)
+ xar v2.2d, v12.2d, v26.2d, (64 - 43)
+ xar v12.2d, v13.2d, v27.2d, (64 - 25)
+ xar v13.2d, v19.2d, v28.2d, (64 - 8)
+ xar v19.2d, v23.2d, v27.2d, (64 - 56)
+ xar v23.2d, v15.2d, v30.2d, (64 - 41)
+ xar v15.2d, v4.2d, v28.2d, (64 - 27)
+ xar v28.2d, v24.2d, v28.2d, (64 - 14)
+ xar v24.2d, v21.2d, v25.2d, (64 - 2)
+ xar v8.2d, v8.2d, v27.2d, (64 - 55)
+ xar v4.2d, v16.2d, v25.2d, (64 - 45)
+ xar v16.2d, v5.2d, v30.2d, (64 - 36)
+ xar v5.2d, v3.2d, v27.2d, (64 - 28)
+ xar v27.2d, v18.2d, v27.2d, (64 - 21)
+ xar v3.2d, v17.2d, v26.2d, (64 - 15)
+ xar v25.2d, v11.2d, v25.2d, (64 - 10)
+ xar v26.2d, v7.2d, v26.2d, (64 - 6)
+ xar v30.2d, v10.2d, v30.2d, (64 - 3)
+
+ bcax v20.16b, v31.16b, v22.16b, v8.16b
+ bcax v21.16b, v8.16b, v23.16b, v22.16b
+ bcax v22.16b, v22.16b, v24.16b, v23.16b
+ bcax v23.16b, v23.16b, v31.16b, v24.16b
+ bcax v24.16b, v24.16b, v8.16b, v31.16b
+
+ ld1r {v31.2d}, [x9], #8
+
+ bcax v17.16b, v25.16b, v19.16b, v3.16b
+ bcax v18.16b, v3.16b, v15.16b, v19.16b
+ bcax v19.16b, v19.16b, v16.16b, v15.16b
+ bcax v15.16b, v15.16b, v25.16b, v16.16b
+ bcax v16.16b, v16.16b, v3.16b, v25.16b
+
+ bcax v10.16b, v29.16b, v12.16b, v26.16b
+ bcax v11.16b, v26.16b, v13.16b, v12.16b
+ bcax v12.16b, v12.16b, v14.16b, v13.16b
+ bcax v13.16b, v13.16b, v29.16b, v14.16b
+ bcax v14.16b, v14.16b, v26.16b, v29.16b
+
+ bcax v7.16b, v30.16b, v9.16b, v4.16b
+ bcax v8.16b, v4.16b, v5.16b, v9.16b
+ bcax v9.16b, v9.16b, v6.16b, v5.16b
+ bcax v5.16b, v5.16b, v30.16b, v6.16b
+ bcax v6.16b, v6.16b, v4.16b, v30.16b
+
+ bcax v3.16b, v27.16b, v0.16b, v28.16b
+ bcax v4.16b, v28.16b, v1.16b, v0.16b
+ bcax v0.16b, v0.16b, v2.16b, v1.16b
+ bcax v1.16b, v1.16b, v27.16b, v2.16b
+ bcax v2.16b, v2.16b, v28.16b, v27.16b
+
+ eor v0.16b, v0.16b, v31.16b
+
+ cbnz w8, 3b
+ cond_yield 4f, x8, x9
+ cbnz x2, 0b
+
+ /* save state */
+4: st1 { v0.1d- v3.1d}, [x0], #32
+ st1 { v4.1d- v7.1d}, [x0], #32
+ st1 { v8.1d-v11.1d}, [x0], #32
+ st1 {v12.1d-v15.1d}, [x0], #32
+ st1 {v16.1d-v19.1d}, [x0], #32
+ st1 {v20.1d-v23.1d}, [x0], #32
+ st1 {v24.1d}, [x0]
+ mov x0, x2
+ ret
+SYM_FUNC_END(sha3_ce_transform)
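
The block sizes that sha3_ce_transform() accepts are simply the Keccak rates for the 1600-bit (200-byte) state: rate = 200 minus twice the capacity parameter, where the capacity parameter is the digest size for SHA3-n and the security strength for the SHAKE variants. A small standalone check of the SHA3-n values listed in the comment above (illustration only):

#include <stdio.h>

int main(void)
{
	/* SHA3-224, SHA3-256, SHA3-384, SHA3-512 digest sizes in bytes */
	static const int d[] = { 28, 32, 48, 64 };

	for (int i = 0; i < 4; i++)
		printf("SHA3-%d block size = %d bytes\n",
		       d[i] * 8, 200 - 2 * d[i]);	/* 144, 136, 104, 72 */
	return 0;
}
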
+
+ .section ".rodata", "a"
+ .align 8
+.Lsha3_rcon:
+ .quad 0x0000000000000001, 0x0000000000008082, 0x800000000000808a
+ .quad 0x8000000080008000, 0x000000000000808b, 0x0000000080000001
+ .quad 0x8000000080008081, 0x8000000000008009, 0x000000000000008a
+ .quad 0x0000000000000088, 0x0000000080008009, 0x000000008000000a
+ .quad 0x000000008000808b, 0x800000000000008b, 0x8000000000008089
+ .quad 0x8000000000008003, 0x8000000000008002, 0x8000000000000080
+ .quad 0x000000000000800a, 0x800000008000000a, 0x8000000080008081
+ .quad 0x8000000000008080, 0x0000000080000001, 0x8000000080008008
diff --git a/lib/crypto/arm64/sha3.h b/lib/crypto/arm64/sha3.h
new file mode 100644
index 000000000000..b602f1b3b282
--- /dev/null
+++ b/lib/crypto/arm64/sha3.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3);
+
+asmlinkage size_t sha3_ce_transform(struct sha3_state *state, const u8 *data,
+ size_t nblocks, size_t block_size);
+
+static void sha3_absorb_blocks(struct sha3_state *state, const u8 *data,
+ size_t nblocks, size_t block_size)
+{
+ if (static_branch_likely(&have_sha3) && likely(may_use_simd())) {
+ do {
+ size_t rem;
+
+ scoped_ksimd()
+ rem = sha3_ce_transform(state, data, nblocks,
+ block_size);
+ data += (nblocks - rem) * block_size;
+ nblocks = rem;
+ } while (nblocks);
+ } else {
+ sha3_absorb_blocks_generic(state, data, nblocks, block_size);
+ }
+}
+
+static void sha3_keccakf(struct sha3_state *state)
+{
+ if (static_branch_likely(&have_sha3) && likely(may_use_simd())) {
+ /*
+ * Passing zeroes into sha3_ce_transform() gives the plain
+ * Keccak-f permutation, which is what we want here. Any
+ * supported block size may be used. Use SHA3_512_BLOCK_SIZE
+ * since it's the shortest.
+ */
+ static const u8 zeroes[SHA3_512_BLOCK_SIZE];
+
+ scoped_ksimd()
+ sha3_ce_transform(state, zeroes, 1, sizeof(zeroes));
+ } else {
+ sha3_keccakf_generic(state);
+ }
+}
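
A short sketch of why the zero-block trick above works: the sponge absorb step XORs the input block into the state before running the permutation, and XOR with zero is a no-op, so only Keccak-f itself takes effect. The helper below is generic illustration code with assumed names, not the kernel's internal routines:

#include <stdint.h>
#include <stddef.h>
#include <string.h>

static void absorb_and_permute(uint64_t st[25], const uint8_t *block,
			       size_t block_size,
			       void (*keccakf)(uint64_t st[25]))
{
	for (size_t i = 0; i < block_size / 8; i++) {
		uint64_t lane;

		memcpy(&lane, block + 8 * i, sizeof(lane)); /* LE lanes assumed */
		st[i] ^= lane;	/* an all-zero block leaves st[] untouched */
	}
	keccakf(st);		/* ...so only the permutation remains */
}
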
+
+#define sha3_mod_init_arch sha3_mod_init_arch
+static void sha3_mod_init_arch(void)
+{
+ if (cpu_have_named_feature(SHA3))
+ static_branch_enable(&have_sha3);
+}
diff --git a/lib/crypto/arm64/sha512-ce-core.S b/lib/crypto/arm64/sha512-ce-core.S
new file mode 100644
index 000000000000..ffd51acfd1ee
--- /dev/null
+++ b/lib/crypto/arm64/sha512-ce-core.S
@@ -0,0 +1,197 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Core SHA-384/SHA-512 transform using v8 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ /*
+ * We have to specify the "sha3" feature here, since the GNU and clang
+ * assemblers both consider the SHA-512 instructions to be part of the
+ * "sha3" feature. (Except binutils 2.30 through 2.42, which used
+ * "sha2". But "sha3" implies "sha2", so "sha3" still works in those
+ * versions.) "sha3" doesn't make a lot of sense, since SHA-512 is part
+ * of the SHA-2 family of algorithms, and also the Arm Architecture
+ * Reference Manual defines FEAT_SHA512 and FEAT_SHA3 separately.
+ * Regardless, we must use "sha3" to be compatible with the assemblers.
+ */
+ .arch armv8-a+sha3
+
+ /*
+ * The SHA-512 round constants
+ */
+ .section ".rodata", "a"
+ .align 4
+.Lsha512_rcon:
+ .quad 0x428a2f98d728ae22, 0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538, 0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242, 0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235, 0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+ .quad 0x983e5152ee66dfab, 0xa831c66d2db43210
+ .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725
+ .quad 0x06ca6351e003826f, 0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+ .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6, 0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218, 0xd69906245565a910
+ .quad 0xf40e35855771202a, 0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec
+ .quad 0x90befffa23631e28, 0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b
+ .quad 0xca273eceea26619c, 0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae, 0x1b710b35131c471b
+ .quad 0x28db77f523047d84, 0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+
+ .macro dround, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4
+ .ifnb \rc1
+ ld1 {v\rc1\().2d}, [x4], #16
+ .endif
+ add v5.2d, v\rc0\().2d, v\in0\().2d
+ ext v6.16b, v\i2\().16b, v\i3\().16b, #8
+ ext v5.16b, v5.16b, v5.16b, #8
+ ext v7.16b, v\i1\().16b, v\i2\().16b, #8
+ add v\i3\().2d, v\i3\().2d, v5.2d
+ .ifnb \in1
+ ext v5.16b, v\in3\().16b, v\in4\().16b, #8
+ sha512su0 v\in0\().2d, v\in1\().2d
+ .endif
+ sha512h q\i3, q6, v7.2d
+ .ifnb \in1
+ sha512su1 v\in0\().2d, v\in2\().2d, v5.2d
+ .endif
+ add v\i4\().2d, v\i1\().2d, v\i3\().2d
+ sha512h2 q\i3, q\i1, v\i0\().2d
+ .endm
+
+ /*
+ * size_t __sha512_ce_transform(struct sha512_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+ .text
+SYM_FUNC_START(__sha512_ce_transform)
+ /* load state */
+ ld1 {v8.2d-v11.2d}, [x0]
+
+ /* load first 4 round constants */
+ adr_l x3, .Lsha512_rcon
+ ld1 {v20.2d-v23.2d}, [x3], #64
+
+ /* load input */
+0: ld1 {v12.2d-v15.2d}, [x1], #64
+ ld1 {v16.2d-v19.2d}, [x1], #64
+ sub x2, x2, #1
+
+CPU_LE( rev64 v12.16b, v12.16b )
+CPU_LE( rev64 v13.16b, v13.16b )
+CPU_LE( rev64 v14.16b, v14.16b )
+CPU_LE( rev64 v15.16b, v15.16b )
+CPU_LE( rev64 v16.16b, v16.16b )
+CPU_LE( rev64 v17.16b, v17.16b )
+CPU_LE( rev64 v18.16b, v18.16b )
+CPU_LE( rev64 v19.16b, v19.16b )
+
+ mov x4, x3 // rc pointer
+
+ mov v0.16b, v8.16b
+ mov v1.16b, v9.16b
+ mov v2.16b, v10.16b
+ mov v3.16b, v11.16b
+
+ // v0 ab cd -- ef gh ab
+ // v1 cd -- ef gh ab cd
+ // v2 ef gh ab cd -- ef
+ // v3 gh ab cd -- ef gh
+ // v4 -- ef gh ab cd --
+
+ dround 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17
+ dround 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18
+ dround 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19
+ dround 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12
+ dround 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13
+
+ dround 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14
+ dround 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15
+ dround 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16
+ dround 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17
+ dround 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18
+
+ dround 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19
+ dround 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12
+ dround 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13
+ dround 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14
+ dround 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15
+
+ dround 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16
+ dround 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17
+ dround 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18
+ dround 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19
+ dround 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12
+
+ dround 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13
+ dround 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14
+ dround 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15
+ dround 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16
+ dround 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17
+
+ dround 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18
+ dround 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19
+ dround 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12
+ dround 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13
+ dround 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14
+
+ dround 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15
+ dround 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16
+ dround 2, 3, 1, 4, 0, 28, 24, 12
+ dround 4, 2, 0, 1, 3, 29, 25, 13
+ dround 1, 4, 3, 0, 2, 30, 26, 14
+
+ dround 0, 1, 2, 3, 4, 31, 27, 15
+ dround 3, 0, 4, 2, 1, 24, , 16
+ dround 2, 3, 1, 4, 0, 25, , 17
+ dround 4, 2, 0, 1, 3, 26, , 18
+ dround 1, 4, 3, 0, 2, 27, , 19
+
+ /* update state */
+ add v8.2d, v8.2d, v0.2d
+ add v9.2d, v9.2d, v1.2d
+ add v10.2d, v10.2d, v2.2d
+ add v11.2d, v11.2d, v3.2d
+
+ cond_yield 3f, x4, x5
+ /* handled all input blocks? */
+ cbnz x2, 0b
+
+ /* store new state */
+3: st1 {v8.2d-v11.2d}, [x0]
+ mov x0, x2
+ ret
+SYM_FUNC_END(__sha512_ce_transform)
diff --git a/lib/crypto/arm64/sha512.h b/lib/crypto/arm64/sha512.h
new file mode 100644
index 000000000000..7eb7ef04d268
--- /dev/null
+++ b/lib/crypto/arm64/sha512.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * arm64-optimized SHA-512 block function
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha512_insns);
+
+asmlinkage void sha512_block_data_order(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks);
+asmlinkage size_t __sha512_ce_transform(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha512_blocks(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ static_branch_likely(&have_sha512_insns) &&
+ likely(may_use_simd())) {
+ do {
+ size_t rem;
+
+ scoped_ksimd()
+ rem = __sha512_ce_transform(state, data, nblocks);
+
+ data += (nblocks - rem) * SHA512_BLOCK_SIZE;
+ nblocks = rem;
+ } while (nblocks);
+ } else {
+ sha512_block_data_order(state, data, nblocks);
+ }
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define sha512_mod_init_arch sha512_mod_init_arch
+static void sha512_mod_init_arch(void)
+{
+ if (cpu_have_named_feature(SHA512))
+ static_branch_enable(&have_sha512_insns);
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
diff --git a/lib/crypto/blake2b.c b/lib/crypto/blake2b.c
new file mode 100644
index 000000000000..09c6d65d8a6e
--- /dev/null
+++ b/lib/crypto/blake2b.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2025 Google LLC
+ *
+ * This is an implementation of the BLAKE2b hash and PRF functions.
+ *
+ * Information: https://blake2.net/
+ */
+
+#include <crypto/blake2b.h>
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+static const u8 blake2b_sigma[12][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
+};
+
+static inline void blake2b_increment_counter(struct blake2b_ctx *ctx, u32 inc)
+{
+ ctx->t[0] += inc;
+ ctx->t[1] += (ctx->t[0] < inc);
+}
+
+static void __maybe_unused
+blake2b_compress_generic(struct blake2b_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc)
+{
+ u64 m[16];
+ u64 v[16];
+ int i;
+
+ WARN_ON(IS_ENABLED(DEBUG) &&
+ (nblocks > 1 && inc != BLAKE2B_BLOCK_SIZE));
+
+ while (nblocks > 0) {
+ blake2b_increment_counter(ctx, inc);
+ memcpy(m, data, BLAKE2B_BLOCK_SIZE);
+ le64_to_cpu_array(m, ARRAY_SIZE(m));
+ memcpy(v, ctx->h, 64);
+ v[ 8] = BLAKE2B_IV0;
+ v[ 9] = BLAKE2B_IV1;
+ v[10] = BLAKE2B_IV2;
+ v[11] = BLAKE2B_IV3;
+ v[12] = BLAKE2B_IV4 ^ ctx->t[0];
+ v[13] = BLAKE2B_IV5 ^ ctx->t[1];
+ v[14] = BLAKE2B_IV6 ^ ctx->f[0];
+ v[15] = BLAKE2B_IV7 ^ ctx->f[1];
+
+#define G(r, i, a, b, c, d) do { \
+ a += b + m[blake2b_sigma[r][2 * i + 0]]; \
+ d = ror64(d ^ a, 32); \
+ c += d; \
+ b = ror64(b ^ c, 24); \
+ a += b + m[blake2b_sigma[r][2 * i + 1]]; \
+ d = ror64(d ^ a, 16); \
+ c += d; \
+ b = ror64(b ^ c, 63); \
+} while (0)
+
+#define ROUND(r) do { \
+ G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
+ G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
+ G(r, 2, v[2], v[ 6], v[10], v[14]); \
+ G(r, 3, v[3], v[ 7], v[11], v[15]); \
+ G(r, 4, v[0], v[ 5], v[10], v[15]); \
+ G(r, 5, v[1], v[ 6], v[11], v[12]); \
+ G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
+ G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
+} while (0)
+ ROUND(0);
+ ROUND(1);
+ ROUND(2);
+ ROUND(3);
+ ROUND(4);
+ ROUND(5);
+ ROUND(6);
+ ROUND(7);
+ ROUND(8);
+ ROUND(9);
+ ROUND(10);
+ ROUND(11);
+
+#undef G
+#undef ROUND
+
+ for (i = 0; i < 8; ++i)
+ ctx->h[i] ^= v[i] ^ v[i + 8];
+
+ data += BLAKE2B_BLOCK_SIZE;
+ --nblocks;
+ }
+}
+
+#ifdef CONFIG_CRYPTO_LIB_BLAKE2B_ARCH
+#include "blake2b.h" /* $(SRCARCH)/blake2b.h */
+#else
+#define blake2b_compress blake2b_compress_generic
+#endif
+
+static inline void blake2b_set_lastblock(struct blake2b_ctx *ctx)
+{
+ ctx->f[0] = -1;
+}
+
+void blake2b_update(struct blake2b_ctx *ctx, const u8 *in, size_t inlen)
+{
+ const size_t fill = BLAKE2B_BLOCK_SIZE - ctx->buflen;
+
+ if (unlikely(!inlen))
+ return;
+ if (inlen > fill) {
+ memcpy(ctx->buf + ctx->buflen, in, fill);
+ blake2b_compress(ctx, ctx->buf, 1, BLAKE2B_BLOCK_SIZE);
+ ctx->buflen = 0;
+ in += fill;
+ inlen -= fill;
+ }
+ if (inlen > BLAKE2B_BLOCK_SIZE) {
+ const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2B_BLOCK_SIZE);
+
+ blake2b_compress(ctx, in, nblocks - 1, BLAKE2B_BLOCK_SIZE);
+ in += BLAKE2B_BLOCK_SIZE * (nblocks - 1);
+ inlen -= BLAKE2B_BLOCK_SIZE * (nblocks - 1);
+ }
+ memcpy(ctx->buf + ctx->buflen, in, inlen);
+ ctx->buflen += inlen;
+}
+EXPORT_SYMBOL(blake2b_update);
+
+void blake2b_final(struct blake2b_ctx *ctx, u8 *out)
+{
+ WARN_ON(IS_ENABLED(DEBUG) && !out);
+ blake2b_set_lastblock(ctx);
+ memset(ctx->buf + ctx->buflen, 0,
+ BLAKE2B_BLOCK_SIZE - ctx->buflen); /* Padding */
+ blake2b_compress(ctx, ctx->buf, 1, ctx->buflen);
+ cpu_to_le64_array(ctx->h, ARRAY_SIZE(ctx->h));
+ memcpy(out, ctx->h, ctx->outlen);
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL(blake2b_final);
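
For context, a minimal one-shot usage sketch of the update/final API added above. The blake2b_init() call, its signature, and the 32-byte output length are assumptions for illustration; the init helpers live in <crypto/blake2b.h> and are not part of this hunk:

#include <crypto/blake2b.h>

/* Hedged example: blake2b_init() and its signature are assumed here. */
static void blake2b_hash_example(const u8 *msg, size_t len, u8 digest[32])
{
	struct blake2b_ctx ctx;

	blake2b_init(&ctx, 32);			/* assumed init helper */
	blake2b_update(&ctx, msg, len);		/* as defined above */
	blake2b_final(&ctx, digest);		/* also wipes the context */
}
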
+
+#ifdef blake2b_mod_init_arch
+static int __init blake2b_mod_init(void)
+{
+ blake2b_mod_init_arch();
+ return 0;
+}
+subsys_initcall(blake2b_mod_init);
+
+static void __exit blake2b_mod_exit(void)
+{
+}
+module_exit(blake2b_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("BLAKE2b hash function");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c
deleted file mode 100644
index 75ccb3e633e6..000000000000
--- a/lib/crypto/blake2s-generic.c
+++ /dev/null
@@ -1,115 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * This is an implementation of the BLAKE2s hash and PRF functions.
- *
- * Information: https://blake2.net/
- *
- */
-
-#include <crypto/internal/blake2s.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/bug.h>
-#include <asm/unaligned.h>
-
-static const u8 blake2s_sigma[10][16] = {
- { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
- { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
- { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
- { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
- { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
- { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
- { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
- { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
- { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
- { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
-};
-
-static inline void blake2s_increment_counter(struct blake2s_state *state,
- const u32 inc)
-{
- state->t[0] += inc;
- state->t[1] += (state->t[0] < inc);
-}
-
-void blake2s_compress(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
- __weak __alias(blake2s_compress_generic);
-
-void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
-{
- u32 m[16];
- u32 v[16];
- int i;
-
- WARN_ON(IS_ENABLED(DEBUG) &&
- (nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));
-
- while (nblocks > 0) {
- blake2s_increment_counter(state, inc);
- memcpy(m, block, BLAKE2S_BLOCK_SIZE);
- le32_to_cpu_array(m, ARRAY_SIZE(m));
- memcpy(v, state->h, 32);
- v[ 8] = BLAKE2S_IV0;
- v[ 9] = BLAKE2S_IV1;
- v[10] = BLAKE2S_IV2;
- v[11] = BLAKE2S_IV3;
- v[12] = BLAKE2S_IV4 ^ state->t[0];
- v[13] = BLAKE2S_IV5 ^ state->t[1];
- v[14] = BLAKE2S_IV6 ^ state->f[0];
- v[15] = BLAKE2S_IV7 ^ state->f[1];
-
-#define G(r, i, a, b, c, d) do { \
- a += b + m[blake2s_sigma[r][2 * i + 0]]; \
- d = ror32(d ^ a, 16); \
- c += d; \
- b = ror32(b ^ c, 12); \
- a += b + m[blake2s_sigma[r][2 * i + 1]]; \
- d = ror32(d ^ a, 8); \
- c += d; \
- b = ror32(b ^ c, 7); \
-} while (0)
-
-#define ROUND(r) do { \
- G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
- G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
- G(r, 2, v[2], v[ 6], v[10], v[14]); \
- G(r, 3, v[3], v[ 7], v[11], v[15]); \
- G(r, 4, v[0], v[ 5], v[10], v[15]); \
- G(r, 5, v[1], v[ 6], v[11], v[12]); \
- G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
- G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
-} while (0)
- ROUND(0);
- ROUND(1);
- ROUND(2);
- ROUND(3);
- ROUND(4);
- ROUND(5);
- ROUND(6);
- ROUND(7);
- ROUND(8);
- ROUND(9);
-
-#undef G
-#undef ROUND
-
- for (i = 0; i < 8; ++i)
- state->h[i] ^= v[i] ^ v[i + 8];
-
- block += BLAKE2S_BLOCK_SIZE;
- --nblocks;
- }
-}
-
-EXPORT_SYMBOL(blake2s_compress_generic);
-
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("BLAKE2s hash function");
-MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/lib/crypto/blake2s-selftest.c b/lib/crypto/blake2s-selftest.c
deleted file mode 100644
index 7d77dea15587..000000000000
--- a/lib/crypto/blake2s-selftest.c
+++ /dev/null
@@ -1,632 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <crypto/internal/blake2s.h>
-#include <linux/kernel.h>
-#include <linux/random.h>
-#include <linux/string.h>
-
-/*
- * blake2s_testvecs[] generated with the program below (using libb2-dev and
- * libssl-dev [OpenSSL])
- *
- * #include <blake2.h>
- * #include <stdint.h>
- * #include <stdio.h>
- *
- * #include <openssl/evp.h>
- *
- * #define BLAKE2S_TESTVEC_COUNT 256
- *
- * static void print_vec(const uint8_t vec[], int len)
- * {
- * int i;
- *
- * printf(" { ");
- * for (i = 0; i < len; i++) {
- * if (i && (i % 12) == 0)
- * printf("\n ");
- * printf("0x%02x, ", vec[i]);
- * }
- * printf("},\n");
- * }
- *
- * int main(void)
- * {
- * uint8_t key[BLAKE2S_KEYBYTES];
- * uint8_t buf[BLAKE2S_TESTVEC_COUNT];
- * uint8_t hash[BLAKE2S_OUTBYTES];
- * int i, j;
- *
- * key[0] = key[1] = 1;
- * for (i = 2; i < BLAKE2S_KEYBYTES; ++i)
- * key[i] = key[i - 2] + key[i - 1];
- *
- * for (i = 0; i < BLAKE2S_TESTVEC_COUNT; ++i)
- * buf[i] = (uint8_t)i;
- *
- * printf("static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {\n");
- *
- * for (i = 0; i < BLAKE2S_TESTVEC_COUNT; ++i) {
- * int outlen = 1 + i % BLAKE2S_OUTBYTES;
- * int keylen = (13 * i) % (BLAKE2S_KEYBYTES + 1);
- *
- * blake2s(hash, buf, key + BLAKE2S_KEYBYTES - keylen, outlen, i,
- * keylen);
- * print_vec(hash, outlen);
- * }
- * printf("};\n\n");
- *
- * return 0;
- *}
- */
-static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {
- { 0xa1, },
- { 0x7c, 0x89, },
- { 0x74, 0x0e, 0xd4, },
- { 0x47, 0x0c, 0x21, 0x15, },
- { 0x18, 0xd6, 0x9c, 0xa6, 0xc4, },
- { 0x13, 0x5d, 0x16, 0x63, 0x2e, 0xf9, },
- { 0x2c, 0xb5, 0x04, 0xb7, 0x99, 0xe2, 0x73, },
- { 0x9a, 0x0f, 0xd2, 0x39, 0xd6, 0x68, 0x1b, 0x92, },
- { 0xc8, 0xde, 0x7a, 0xea, 0x2f, 0xf4, 0xd2, 0xe3, 0x2b, },
- { 0x5b, 0xf9, 0x43, 0x52, 0x0c, 0x12, 0xba, 0xb5, 0x93, 0x9f, },
- { 0xc6, 0x2c, 0x4e, 0x80, 0xfc, 0x32, 0x5b, 0x33, 0xb8, 0xb8, 0x0a, },
- { 0xa7, 0x5c, 0xfd, 0x3a, 0xcc, 0xbf, 0x90, 0xca, 0xb7, 0x97, 0xde, 0xd8, },
- { 0x66, 0xca, 0x3c, 0xc4, 0x19, 0xef, 0x92, 0x66, 0x3f, 0x21, 0x8f, 0xda,
- 0xb7, },
- { 0xba, 0xe5, 0xbb, 0x30, 0x25, 0x94, 0x6d, 0xc3, 0x89, 0x09, 0xc4, 0x25,
- 0x52, 0x3e, },
- { 0xa2, 0xef, 0x0e, 0x52, 0x0b, 0x5f, 0xa2, 0x01, 0x6d, 0x0a, 0x25, 0xbc,
- 0x57, 0xe2, 0x27, },
- { 0x4f, 0xe0, 0xf9, 0x52, 0x12, 0xda, 0x84, 0xb7, 0xab, 0xae, 0xb0, 0xa6,
- 0x47, 0x2a, 0xc7, 0xf5, },
- { 0x56, 0xe7, 0xa8, 0x1c, 0x4c, 0xca, 0xed, 0x90, 0x31, 0xec, 0x87, 0x43,
- 0xe7, 0x72, 0x08, 0xec, 0xbe, },
- { 0x7e, 0xdf, 0x80, 0x1c, 0x93, 0x33, 0xfd, 0x53, 0x44, 0xba, 0xfd, 0x96,
- 0xe1, 0xbb, 0xb5, 0x65, 0xa5, 0x00, },
- { 0xec, 0x6b, 0xed, 0xf7, 0x7b, 0x62, 0x1d, 0x7d, 0xf4, 0x82, 0xf3, 0x1e,
- 0x18, 0xff, 0x2b, 0xc4, 0x06, 0x20, 0x2a, },
- { 0x74, 0x98, 0xd7, 0x68, 0x63, 0xed, 0x87, 0xe4, 0x5d, 0x8d, 0x9e, 0x1d,
- 0xfd, 0x2a, 0xbb, 0x86, 0xac, 0xe9, 0x2a, 0x89, },
- { 0x89, 0xc3, 0x88, 0xce, 0x2b, 0x33, 0x1e, 0x10, 0xd1, 0x37, 0x20, 0x86,
- 0x28, 0x43, 0x70, 0xd9, 0xfb, 0x96, 0xd9, 0xb5, 0xd3, },
- { 0xcb, 0x56, 0x74, 0x41, 0x8d, 0x80, 0x01, 0x9a, 0x6b, 0x38, 0xe1, 0x41,
- 0xad, 0x9c, 0x62, 0x74, 0xce, 0x35, 0xd5, 0x6c, 0x89, 0x6e, },
- { 0x79, 0xaf, 0x94, 0x59, 0x99, 0x26, 0xe1, 0xc9, 0x34, 0xfe, 0x7c, 0x22,
- 0xf7, 0x43, 0xd7, 0x65, 0xd4, 0x48, 0x18, 0xac, 0x3d, 0xfd, 0x93, },
- { 0x85, 0x0d, 0xff, 0xb8, 0x3e, 0x87, 0x41, 0xb0, 0x95, 0xd3, 0x3d, 0x00,
- 0x47, 0x55, 0x9e, 0xd2, 0x69, 0xea, 0xbf, 0xe9, 0x7a, 0x2d, 0x61, 0x45, },
- { 0x03, 0xe0, 0x85, 0xec, 0x54, 0xb5, 0x16, 0x53, 0xa8, 0xc4, 0x71, 0xe9,
- 0x6a, 0xe7, 0xcb, 0xc4, 0x15, 0x02, 0xfc, 0x34, 0xa4, 0xa4, 0x28, 0x13,
- 0xd1, },
- { 0xe3, 0x34, 0x4b, 0xe1, 0xd0, 0x4b, 0x55, 0x61, 0x8f, 0xc0, 0x24, 0x05,
- 0xe6, 0xe0, 0x3d, 0x70, 0x24, 0x4d, 0xda, 0xb8, 0x91, 0x05, 0x29, 0x07,
- 0x01, 0x3e, },
- { 0x61, 0xff, 0x01, 0x72, 0xb1, 0x4d, 0xf6, 0xfe, 0xd1, 0xd1, 0x08, 0x74,
- 0xe6, 0x91, 0x44, 0xeb, 0x61, 0xda, 0x40, 0xaf, 0xfc, 0x8c, 0x91, 0x6b,
- 0xec, 0x13, 0xed, },
- { 0xd4, 0x40, 0xd2, 0xa0, 0x7f, 0xc1, 0x58, 0x0c, 0x85, 0xa0, 0x86, 0xc7,
- 0x86, 0xb9, 0x61, 0xc9, 0xea, 0x19, 0x86, 0x1f, 0xab, 0x07, 0xce, 0x37,
- 0x72, 0x67, 0x09, 0xfc, },
- { 0x9e, 0xf8, 0x18, 0x67, 0x93, 0x10, 0x9b, 0x39, 0x75, 0xe8, 0x8b, 0x38,
- 0x82, 0x7d, 0xb8, 0xb7, 0xa5, 0xaf, 0xe6, 0x6a, 0x22, 0x5e, 0x1f, 0x9c,
- 0x95, 0x29, 0x19, 0xf2, 0x4b, },
- { 0xc8, 0x62, 0x25, 0xf5, 0x98, 0xc9, 0xea, 0xe5, 0x29, 0x3a, 0xd3, 0x22,
- 0xeb, 0xeb, 0x07, 0x7c, 0x15, 0x07, 0xee, 0x15, 0x61, 0xbb, 0x05, 0x30,
- 0x99, 0x7f, 0x11, 0xf6, 0x0a, 0x1d, },
- { 0x68, 0x70, 0xf7, 0x90, 0xa1, 0x8b, 0x1f, 0x0f, 0xbb, 0xce, 0xd2, 0x0e,
- 0x33, 0x1f, 0x7f, 0xa9, 0x78, 0xa8, 0xa6, 0x81, 0x66, 0xab, 0x8d, 0xcd,
- 0x58, 0x55, 0x3a, 0x0b, 0x7a, 0xdb, 0xb5, },
- { 0xdd, 0x35, 0xd2, 0xb4, 0xf6, 0xc7, 0xea, 0xab, 0x64, 0x24, 0x4e, 0xfe,
- 0xe5, 0x3d, 0x4e, 0x95, 0x8b, 0x6d, 0x6c, 0xbc, 0xb0, 0xf8, 0x88, 0x61,
- 0x09, 0xb7, 0x78, 0xa3, 0x31, 0xfe, 0xd9, 0x2f, },
- { 0x0a, },
- { 0x6e, 0xd4, },
- { 0x64, 0xe9, 0xd1, },
- { 0x30, 0xdd, 0x71, 0xef, },
- { 0x11, 0xb5, 0x0c, 0x87, 0xc9, },
- { 0x06, 0x1c, 0x6d, 0x04, 0x82, 0xd0, },
- { 0x5c, 0x42, 0x0b, 0xee, 0xc5, 0x9c, 0xb2, },
- { 0xe8, 0x29, 0xd6, 0xb4, 0x5d, 0xf7, 0x2b, 0x93, },
- { 0x18, 0xca, 0x27, 0x72, 0x43, 0x39, 0x16, 0xbc, 0x6a, },
- { 0x39, 0x8f, 0xfd, 0x64, 0xf5, 0x57, 0x23, 0xb0, 0x45, 0xf8, },
- { 0xbb, 0x3a, 0x78, 0x6b, 0x02, 0x1d, 0x0b, 0x16, 0xe3, 0xb2, 0x9a, },
- { 0xb8, 0xb4, 0x0b, 0xe5, 0xd4, 0x1d, 0x0d, 0x85, 0x49, 0x91, 0x35, 0xfa, },
- { 0x6d, 0x48, 0x2a, 0x0c, 0x42, 0x08, 0xbd, 0xa9, 0x78, 0x6f, 0x18, 0xaf,
- 0xe2, },
- { 0x10, 0x45, 0xd4, 0x58, 0x88, 0xec, 0x4e, 0x1e, 0xf6, 0x14, 0x92, 0x64,
- 0x7e, 0xb0, },
- { 0x8b, 0x0b, 0x95, 0xee, 0x92, 0xc6, 0x3b, 0x91, 0xf1, 0x1e, 0xeb, 0x51,
- 0x98, 0x0a, 0x8d, },
- { 0xa3, 0x50, 0x4d, 0xa5, 0x1d, 0x03, 0x68, 0xe9, 0x57, 0x78, 0xd6, 0x04,
- 0xf1, 0xc3, 0x94, 0xd8, },
- { 0xb8, 0x66, 0x6e, 0xdd, 0x46, 0x15, 0xae, 0x3d, 0x83, 0x7e, 0xcf, 0xe7,
- 0x2c, 0xe8, 0x8f, 0xc7, 0x34, },
- { 0x2e, 0xc0, 0x1f, 0x29, 0xea, 0xf6, 0xb9, 0xe2, 0xc2, 0x93, 0xeb, 0x41,
- 0x0d, 0xf0, 0x0a, 0x13, 0x0e, 0xa2, },
- { 0x71, 0xb8, 0x33, 0xa9, 0x1b, 0xac, 0xf1, 0xb5, 0x42, 0x8f, 0x5e, 0x81,
- 0x34, 0x43, 0xb7, 0xa4, 0x18, 0x5c, 0x47, },
- { 0xda, 0x45, 0xb8, 0x2e, 0x82, 0x1e, 0xc0, 0x59, 0x77, 0x9d, 0xfa, 0xb4,
- 0x1c, 0x5e, 0xa0, 0x2b, 0x33, 0x96, 0x5a, 0x58, },
- { 0xe3, 0x09, 0x05, 0xa9, 0xeb, 0x48, 0x13, 0xad, 0x71, 0x88, 0x81, 0x9a,
- 0x3e, 0x2c, 0xe1, 0x23, 0x99, 0x13, 0x35, 0x9f, 0xb5, },
- { 0xb7, 0x86, 0x2d, 0x16, 0xe1, 0x04, 0x00, 0x47, 0x47, 0x61, 0x31, 0xfb,
- 0x14, 0xac, 0xd8, 0xe9, 0xe3, 0x49, 0xbd, 0xf7, 0x9c, 0x3f, },
- { 0x7f, 0xd9, 0x95, 0xa8, 0xa7, 0xa0, 0xcc, 0xba, 0xef, 0xb1, 0x0a, 0xa9,
- 0x21, 0x62, 0x08, 0x0f, 0x1b, 0xff, 0x7b, 0x9d, 0xae, 0xb2, 0x95, },
- { 0x85, 0x99, 0xea, 0x33, 0xe0, 0x56, 0xff, 0x13, 0xc6, 0x61, 0x8c, 0xf9,
- 0x57, 0x05, 0x03, 0x11, 0xf9, 0xfb, 0x3a, 0xf7, 0xce, 0xbb, 0x52, 0x30, },
- { 0xb2, 0x72, 0x9c, 0xf8, 0x77, 0x4e, 0x8f, 0x6b, 0x01, 0x6c, 0xff, 0x4e,
- 0x4f, 0x02, 0xd2, 0xbc, 0xeb, 0x51, 0x28, 0x99, 0x50, 0xab, 0xc4, 0x42,
- 0xe3, },
- { 0x8b, 0x0a, 0xb5, 0x90, 0x8f, 0xf5, 0x7b, 0xdd, 0xba, 0x47, 0x37, 0xc9,
- 0x2a, 0xd5, 0x4b, 0x25, 0x08, 0x8b, 0x02, 0x17, 0xa7, 0x9e, 0x6b, 0x6e,
- 0xe3, 0x90, },
- { 0x90, 0xdd, 0xf7, 0x75, 0xa7, 0xa3, 0x99, 0x5e, 0x5b, 0x7d, 0x75, 0xc3,
- 0x39, 0x6b, 0xa0, 0xe2, 0x44, 0x53, 0xb1, 0x9e, 0xc8, 0xf1, 0x77, 0x10,
- 0x58, 0x06, 0x9a, },
- { 0x99, 0x52, 0xf0, 0x49, 0xa8, 0x8c, 0xec, 0xa6, 0x97, 0x32, 0x13, 0xb5,
- 0xf7, 0xa3, 0x8e, 0xfb, 0x4b, 0x59, 0x31, 0x3d, 0x01, 0x59, 0x98, 0x5d,
- 0x53, 0x03, 0x1a, 0x39, },
- { 0x9f, 0xe0, 0xc2, 0xe5, 0x5d, 0x93, 0xd6, 0x9b, 0x47, 0x8f, 0x9b, 0xe0,
- 0x26, 0x35, 0x84, 0x20, 0x1d, 0xc5, 0x53, 0x10, 0x0f, 0x22, 0xb9, 0xb5,
- 0xd4, 0x36, 0xb1, 0xac, 0x73, },
- { 0x30, 0x32, 0x20, 0x3b, 0x10, 0x28, 0xec, 0x1f, 0x4f, 0x9b, 0x47, 0x59,
- 0xeb, 0x7b, 0xee, 0x45, 0xfb, 0x0c, 0x49, 0xd8, 0x3d, 0x69, 0xbd, 0x90,
- 0x2c, 0xf0, 0x9e, 0x8d, 0xbf, 0xd5, },
- { 0x2a, 0x37, 0x73, 0x7f, 0xf9, 0x96, 0x19, 0xaa, 0x25, 0xd8, 0x13, 0x28,
- 0x01, 0x29, 0x89, 0xdf, 0x6e, 0x0c, 0x9b, 0x43, 0x44, 0x51, 0xe9, 0x75,
- 0x26, 0x0c, 0xb7, 0x87, 0x66, 0x0b, 0x5f, },
- { 0x23, 0xdf, 0x96, 0x68, 0x91, 0x86, 0xd0, 0x93, 0x55, 0x33, 0x24, 0xf6,
- 0xba, 0x08, 0x75, 0x5b, 0x59, 0x11, 0x69, 0xb8, 0xb9, 0xe5, 0x2c, 0x77,
- 0x02, 0xf6, 0x47, 0xee, 0x81, 0xdd, 0xb9, 0x06, },
- { 0x9d, },
- { 0x9d, 0x7d, },
- { 0xfd, 0xc3, 0xda, },
- { 0xe8, 0x82, 0xcd, 0x21, },
- { 0xc3, 0x1d, 0x42, 0x4c, 0x74, },
- { 0xe9, 0xda, 0xf1, 0xa2, 0xe5, 0x7c, },
- { 0x52, 0xb8, 0x6f, 0x81, 0x5c, 0x3a, 0x4c, },
- { 0x5b, 0x39, 0x26, 0xfc, 0x92, 0x5e, 0xe0, 0x49, },
- { 0x59, 0xe4, 0x7c, 0x93, 0x1c, 0xf9, 0x28, 0x93, 0xde, },
- { 0xde, 0xdf, 0xb2, 0x43, 0x61, 0x0b, 0x86, 0x16, 0x4c, 0x2e, },
- { 0x14, 0x8f, 0x75, 0x51, 0xaf, 0xb9, 0xee, 0x51, 0x5a, 0xae, 0x23, },
- { 0x43, 0x5f, 0x50, 0xd5, 0x70, 0xb0, 0x5b, 0x87, 0xf5, 0xd9, 0xb3, 0x6d, },
- { 0x66, 0x0a, 0x64, 0x93, 0x79, 0x71, 0x94, 0x40, 0xb7, 0x68, 0x2d, 0xd3,
- 0x63, },
- { 0x15, 0x00, 0xc4, 0x0c, 0x7d, 0x1b, 0x10, 0xa9, 0x73, 0x1b, 0x90, 0x6f,
- 0xe6, 0xa9, },
- { 0x34, 0x75, 0xf3, 0x86, 0x8f, 0x56, 0xcf, 0x2a, 0x0a, 0xf2, 0x62, 0x0a,
- 0xf6, 0x0e, 0x20, },
- { 0xb1, 0xde, 0xc9, 0xf5, 0xdb, 0xf3, 0x2f, 0x4c, 0xd6, 0x41, 0x7d, 0x39,
- 0x18, 0x3e, 0xc7, 0xc3, },
- { 0xc5, 0x89, 0xb2, 0xf8, 0xb8, 0xc0, 0xa3, 0xb9, 0x3b, 0x10, 0x6d, 0x7c,
- 0x92, 0xfc, 0x7f, 0x34, 0x41, },
- { 0xc4, 0xd8, 0xef, 0xba, 0xef, 0xd2, 0xaa, 0xc5, 0x6c, 0x8e, 0x3e, 0xbb,
- 0x12, 0xfc, 0x0f, 0x72, 0xbf, 0x0f, },
- { 0xdd, 0x91, 0xd1, 0x15, 0x9e, 0x7d, 0xf8, 0xc1, 0xb9, 0x14, 0x63, 0x96,
- 0xb5, 0xcb, 0x83, 0x1d, 0x35, 0x1c, 0xec, },
- { 0xa9, 0xf8, 0x52, 0xc9, 0x67, 0x76, 0x2b, 0xad, 0xfb, 0xd8, 0x3a, 0xa6,
- 0x74, 0x02, 0xae, 0xb8, 0x25, 0x2c, 0x63, 0x49, },
- { 0x77, 0x1f, 0x66, 0x70, 0xfd, 0x50, 0x29, 0xaa, 0xeb, 0xdc, 0xee, 0xba,
- 0x75, 0x98, 0xdc, 0x93, 0x12, 0x3f, 0xdc, 0x7c, 0x38, },
- { 0xe2, 0xe1, 0x89, 0x5c, 0x37, 0x38, 0x6a, 0xa3, 0x40, 0xac, 0x3f, 0xb0,
- 0xca, 0xfc, 0xa7, 0xf3, 0xea, 0xf9, 0x0f, 0x5d, 0x8e, 0x39, },
- { 0x0f, 0x67, 0xc8, 0x38, 0x01, 0xb1, 0xb7, 0xb8, 0xa2, 0xe7, 0x0a, 0x6d,
- 0xd2, 0x63, 0x69, 0x9e, 0xcc, 0xf0, 0xf2, 0xbe, 0x9b, 0x98, 0xdd, },
- { 0x13, 0xe1, 0x36, 0x30, 0xfe, 0xc6, 0x01, 0x8a, 0xa1, 0x63, 0x96, 0x59,
- 0xc2, 0xa9, 0x68, 0x3f, 0x58, 0xd4, 0x19, 0x0c, 0x40, 0xf3, 0xde, 0x02, },
- { 0xa3, 0x9e, 0xce, 0xda, 0x42, 0xee, 0x8c, 0x6c, 0x5a, 0x7d, 0xdc, 0x89,
- 0x02, 0x77, 0xdd, 0xe7, 0x95, 0xbb, 0xff, 0x0d, 0xa4, 0xb5, 0x38, 0x1e,
- 0xaf, },
- { 0x9a, 0xf6, 0xb5, 0x9a, 0x4f, 0xa9, 0x4f, 0x2c, 0x35, 0x3c, 0x24, 0xdc,
- 0x97, 0x6f, 0xd9, 0xa1, 0x7d, 0x1a, 0x85, 0x0b, 0xf5, 0xda, 0x2e, 0xe7,
- 0xb1, 0x1d, },
- { 0x84, 0x1e, 0x8e, 0x3d, 0x45, 0xa5, 0xf2, 0x27, 0xf3, 0x31, 0xfe, 0xb9,
- 0xfb, 0xc5, 0x45, 0x99, 0x99, 0xdd, 0x93, 0x43, 0x02, 0xee, 0x58, 0xaf,
- 0xee, 0x6a, 0xbe, },
- { 0x07, 0x2f, 0xc0, 0xa2, 0x04, 0xc4, 0xab, 0x7c, 0x26, 0xbb, 0xa8, 0xd8,
- 0xe3, 0x1c, 0x75, 0x15, 0x64, 0x5d, 0x02, 0x6a, 0xf0, 0x86, 0xe9, 0xcd,
- 0x5c, 0xef, 0xa3, 0x25, },
- { 0x2f, 0x3b, 0x1f, 0xb5, 0x91, 0x8f, 0x86, 0xe0, 0xdc, 0x31, 0x48, 0xb6,
- 0xa1, 0x8c, 0xfd, 0x75, 0xbb, 0x7d, 0x3d, 0xc1, 0xf0, 0x10, 0x9a, 0xd8,
- 0x4b, 0x0e, 0xe3, 0x94, 0x9f, },
- { 0x29, 0xbb, 0x8f, 0x6c, 0xd1, 0xf2, 0xb6, 0xaf, 0xe5, 0xe3, 0x2d, 0xdc,
- 0x6f, 0xa4, 0x53, 0x88, 0xd8, 0xcf, 0x4d, 0x45, 0x42, 0x62, 0xdb, 0xdf,
- 0xf8, 0x45, 0xc2, 0x13, 0xec, 0x35, },
- { 0x06, 0x3c, 0xe3, 0x2c, 0x15, 0xc6, 0x43, 0x03, 0x81, 0xfb, 0x08, 0x76,
- 0x33, 0xcb, 0x02, 0xc1, 0xba, 0x33, 0xe5, 0xe0, 0xd1, 0x92, 0xa8, 0x46,
- 0x28, 0x3f, 0x3e, 0x9d, 0x2c, 0x44, 0x54, },
- { 0xea, 0xbb, 0x96, 0xf8, 0xd1, 0x8b, 0x04, 0x11, 0x40, 0x78, 0x42, 0x02,
- 0x19, 0xd1, 0xbc, 0x65, 0x92, 0xd3, 0xc3, 0xd6, 0xd9, 0x19, 0xe7, 0xc3,
- 0x40, 0x97, 0xbd, 0xd4, 0xed, 0xfa, 0x5e, 0x28, },
- { 0x02, },
- { 0x52, 0xa8, },
- { 0x38, 0x25, 0x0d, },
- { 0xe3, 0x04, 0xd4, 0x92, },
- { 0x97, 0xdb, 0xf7, 0x81, 0xca, },
- { 0x8a, 0x56, 0x9d, 0x62, 0x56, 0xcc, },
- { 0xa1, 0x8e, 0x3c, 0x72, 0x8f, 0x63, 0x03, },
- { 0xf7, 0xf3, 0x39, 0x09, 0x0a, 0xa1, 0xbb, 0x23, },
- { 0x6b, 0x03, 0xc0, 0xe9, 0xd9, 0x83, 0x05, 0x22, 0x01, },
- { 0x1b, 0x4b, 0xf5, 0xd6, 0x4f, 0x05, 0x75, 0x91, 0x4c, 0x7f, },
- { 0x4c, 0x8c, 0x25, 0x20, 0x21, 0xcb, 0xc2, 0x4b, 0x3a, 0x5b, 0x8d, },
- { 0x56, 0xe2, 0x77, 0xa0, 0xb6, 0x9f, 0x81, 0xec, 0x83, 0x75, 0xc4, 0xf9, },
- { 0x71, 0x70, 0x0f, 0xad, 0x4d, 0x35, 0x81, 0x9d, 0x88, 0x69, 0xf9, 0xaa,
- 0xd3, },
- { 0x50, 0x6e, 0x86, 0x6e, 0x43, 0xc0, 0xc2, 0x44, 0xc2, 0xe2, 0xa0, 0x1c,
- 0xb7, 0x9a, },
- { 0xe4, 0x7e, 0x72, 0xc6, 0x12, 0x8e, 0x7c, 0xfc, 0xbd, 0xe2, 0x08, 0x31,
- 0x3d, 0x47, 0x3d, },
- { 0x08, 0x97, 0x5b, 0x80, 0xae, 0xc4, 0x1d, 0x50, 0x77, 0xdf, 0x1f, 0xd0,
- 0x24, 0xf0, 0x17, 0xc0, },
- { 0x01, 0xb6, 0x29, 0xf4, 0xaf, 0x78, 0x5f, 0xb6, 0x91, 0xdd, 0x76, 0x76,
- 0xd2, 0xfd, 0x0c, 0x47, 0x40, },
- { 0xa1, 0xd8, 0x09, 0x97, 0x7a, 0xa6, 0xc8, 0x94, 0xf6, 0x91, 0x7b, 0xae,
- 0x2b, 0x9f, 0x0d, 0x83, 0x48, 0xf7, },
- { 0x12, 0xd5, 0x53, 0x7d, 0x9a, 0xb0, 0xbe, 0xd9, 0xed, 0xe9, 0x9e, 0xee,
- 0x61, 0x5b, 0x42, 0xf2, 0xc0, 0x73, 0xc0, },
- { 0xd5, 0x77, 0xd6, 0x5c, 0x6e, 0xa5, 0x69, 0x2b, 0x3b, 0x8c, 0xd6, 0x7d,
- 0x1d, 0xbe, 0x2c, 0xa1, 0x02, 0x21, 0xcd, 0x29, },
- { 0xa4, 0x98, 0x80, 0xca, 0x22, 0xcf, 0x6a, 0xab, 0x5e, 0x40, 0x0d, 0x61,
- 0x08, 0x21, 0xef, 0xc0, 0x6c, 0x52, 0xb4, 0xb0, 0x53, },
- { 0xbf, 0xaf, 0x8f, 0x3b, 0x7a, 0x97, 0x33, 0xe5, 0xca, 0x07, 0x37, 0xfd,
- 0x15, 0xdf, 0xce, 0x26, 0x2a, 0xb1, 0xa7, 0x0b, 0xb3, 0xac, },
- { 0x16, 0x22, 0xe1, 0xbc, 0x99, 0x4e, 0x01, 0xf0, 0xfa, 0xff, 0x8f, 0xa5,
- 0x0c, 0x61, 0xb0, 0xad, 0xcc, 0xb1, 0xe1, 0x21, 0x46, 0xfa, 0x2e, },
- { 0x11, 0x5b, 0x0b, 0x2b, 0xe6, 0x14, 0xc1, 0xd5, 0x4d, 0x71, 0x5e, 0x17,
- 0xea, 0x23, 0xdd, 0x6c, 0xbd, 0x1d, 0xbe, 0x12, 0x1b, 0xee, 0x4c, 0x1a, },
- { 0x40, 0x88, 0x22, 0xf3, 0x20, 0x6c, 0xed, 0xe1, 0x36, 0x34, 0x62, 0x2c,
- 0x98, 0x83, 0x52, 0xe2, 0x25, 0xee, 0xe9, 0xf5, 0xe1, 0x17, 0xf0, 0x5c,
- 0xae, },
- { 0xc3, 0x76, 0x37, 0xde, 0x95, 0x8c, 0xca, 0x2b, 0x0c, 0x23, 0xe7, 0xb5,
- 0x38, 0x70, 0x61, 0xcc, 0xff, 0xd3, 0x95, 0x7b, 0xf3, 0xff, 0x1f, 0x9d,
- 0x59, 0x00, },
- { 0x0c, 0x19, 0x52, 0x05, 0x22, 0x53, 0xcb, 0x48, 0xd7, 0x10, 0x0e, 0x7e,
- 0x14, 0x69, 0xb5, 0xa2, 0x92, 0x43, 0xa3, 0x9e, 0x4b, 0x8f, 0x51, 0x2c,
- 0x5a, 0x2c, 0x3b, },
- { 0xe1, 0x9d, 0x70, 0x70, 0x28, 0xec, 0x86, 0x40, 0x55, 0x33, 0x56, 0xda,
- 0x88, 0xca, 0xee, 0xc8, 0x6a, 0x20, 0xb1, 0xe5, 0x3d, 0x57, 0xf8, 0x3c,
- 0x10, 0x07, 0x2a, 0xc4, },
- { 0x0b, 0xae, 0xf1, 0xc4, 0x79, 0xee, 0x1b, 0x3d, 0x27, 0x35, 0x8d, 0x14,
- 0xd6, 0xae, 0x4e, 0x3c, 0xe9, 0x53, 0x50, 0xb5, 0xcc, 0x0c, 0xf7, 0xdf,
- 0xee, 0xa1, 0x74, 0xd6, 0x71, },
- { 0xe6, 0xa4, 0xf4, 0x99, 0x98, 0xb9, 0x80, 0xea, 0x96, 0x7f, 0x4f, 0x33,
- 0xcf, 0x74, 0x25, 0x6f, 0x17, 0x6c, 0xbf, 0xf5, 0x5c, 0x38, 0xd0, 0xff,
- 0x96, 0xcb, 0x13, 0xf9, 0xdf, 0xfd, },
- { 0xbe, 0x92, 0xeb, 0xba, 0x44, 0x2c, 0x24, 0x74, 0xd4, 0x03, 0x27, 0x3c,
- 0x5d, 0x5b, 0x03, 0x30, 0x87, 0x63, 0x69, 0xe0, 0xb8, 0x94, 0xf4, 0x44,
- 0x7e, 0xad, 0xcd, 0x20, 0x12, 0x16, 0x79, },
- { 0x30, 0xf1, 0xc4, 0x8e, 0x05, 0x90, 0x2a, 0x97, 0x63, 0x94, 0x46, 0xff,
- 0xce, 0xd8, 0x67, 0xa7, 0xac, 0x33, 0x8c, 0x95, 0xb7, 0xcd, 0xa3, 0x23,
- 0x98, 0x9d, 0x76, 0x6c, 0x9d, 0xa8, 0xd6, 0x8a, },
- { 0xbe, },
- { 0x17, 0x6c, },
- { 0x1a, 0x42, 0x4f, },
- { 0xba, 0xaf, 0xb7, 0x65, },
- { 0xc2, 0x63, 0x43, 0x6a, 0xea, },
- { 0xe4, 0x4d, 0xad, 0xf2, 0x0b, 0x02, },
- { 0x04, 0xc7, 0xc4, 0x7f, 0xa9, 0x2b, 0xce, },
- { 0x66, 0xf6, 0x67, 0xcb, 0x03, 0x53, 0xc8, 0xf1, },
- { 0x56, 0xa3, 0x60, 0x78, 0xc9, 0x5f, 0x70, 0x1b, 0x5e, },
- { 0x99, 0xff, 0x81, 0x7c, 0x13, 0x3c, 0x29, 0x79, 0x4b, 0x65, },
- { 0x51, 0x10, 0x50, 0x93, 0x01, 0x93, 0xb7, 0x01, 0xc9, 0x18, 0xb7, },
- { 0x8e, 0x3c, 0x42, 0x1e, 0x5e, 0x7d, 0xc1, 0x50, 0x70, 0x1f, 0x00, 0x98, },
- { 0x5f, 0xd9, 0x9b, 0xc8, 0xd7, 0xb2, 0x72, 0x62, 0x1a, 0x1e, 0xba, 0x92,
- 0xe9, },
- { 0x70, 0x2b, 0xba, 0xfe, 0xad, 0x5d, 0x96, 0x3f, 0x27, 0xc2, 0x41, 0x6d,
- 0xc4, 0xb3, },
- { 0xae, 0xe0, 0xd5, 0xd4, 0xc7, 0xae, 0x15, 0x5e, 0xdc, 0xdd, 0x33, 0x60,
- 0xd7, 0xd3, 0x5e, },
- { 0x79, 0x8e, 0xbc, 0x9e, 0x20, 0xb9, 0x19, 0x4b, 0x63, 0x80, 0xf3, 0x16,
- 0xaf, 0x39, 0xbd, 0x92, },
- { 0xc2, 0x0e, 0x85, 0xa0, 0x0b, 0x9a, 0xb0, 0xec, 0xde, 0x38, 0xd3, 0x10,
- 0xd9, 0xa7, 0x66, 0x27, 0xcf, },
- { 0x0e, 0x3b, 0x75, 0x80, 0x67, 0x14, 0x0c, 0x02, 0x90, 0xd6, 0xb3, 0x02,
- 0x81, 0xf6, 0xa6, 0x87, 0xce, 0x58, },
- { 0x79, 0xb5, 0xe9, 0x5d, 0x52, 0x4d, 0xf7, 0x59, 0xf4, 0x2e, 0x27, 0xdd,
- 0xb3, 0xed, 0x57, 0x5b, 0x82, 0xea, 0x6f, },
- { 0xa2, 0x97, 0xf5, 0x80, 0x02, 0x3d, 0xde, 0xa3, 0xf9, 0xf6, 0xab, 0xe3,
- 0x57, 0x63, 0x7b, 0x9b, 0x10, 0x42, 0x6f, 0xf2, },
- { 0x12, 0x7a, 0xfc, 0xb7, 0x67, 0x06, 0x0c, 0x78, 0x1a, 0xfe, 0x88, 0x4f,
- 0xc6, 0xac, 0x52, 0x96, 0x64, 0x28, 0x97, 0x84, 0x06, },
- { 0xc5, 0x04, 0x44, 0x6b, 0xb2, 0xa5, 0xa4, 0x66, 0xe1, 0x76, 0xa2, 0x51,
- 0xf9, 0x59, 0x69, 0x97, 0x56, 0x0b, 0xbf, 0x50, 0xb3, 0x34, },
- { 0x21, 0x32, 0x6b, 0x42, 0xb5, 0xed, 0x71, 0x8d, 0xf7, 0x5a, 0x35, 0xe3,
- 0x90, 0xe2, 0xee, 0xaa, 0x89, 0xf6, 0xc9, 0x9c, 0x4d, 0x73, 0xf4, },
- { 0x4c, 0xa6, 0x09, 0xf4, 0x48, 0xe7, 0x46, 0xbc, 0x49, 0xfc, 0xe5, 0xda,
- 0xd1, 0x87, 0x13, 0x17, 0x4c, 0x59, 0x71, 0x26, 0x5b, 0x2c, 0x42, 0xb7, },
- { 0x13, 0x63, 0xf3, 0x40, 0x02, 0xe5, 0xa3, 0x3a, 0x5e, 0x8e, 0xf8, 0xb6,
- 0x8a, 0x49, 0x60, 0x76, 0x34, 0x72, 0x94, 0x73, 0xf6, 0xd9, 0x21, 0x6a,
- 0x26, },
- { 0xdf, 0x75, 0x16, 0x10, 0x1b, 0x5e, 0x81, 0xc3, 0xc8, 0xde, 0x34, 0x24,
- 0xb0, 0x98, 0xeb, 0x1b, 0x8f, 0xa1, 0x9b, 0x05, 0xee, 0xa5, 0xe9, 0x35,
- 0xf4, 0x1d, },
- { 0xcd, 0x21, 0x93, 0x6e, 0x5b, 0xa0, 0x26, 0x2b, 0x21, 0x0e, 0xa0, 0xb9,
- 0x1c, 0xb5, 0xbb, 0xb8, 0xf8, 0x1e, 0xff, 0x5c, 0xa8, 0xf9, 0x39, 0x46,
- 0x4e, 0x29, 0x26, },
- { 0x73, 0x7f, 0x0e, 0x3b, 0x0b, 0x5c, 0xf9, 0x60, 0xaa, 0x88, 0xa1, 0x09,
- 0xb1, 0x5d, 0x38, 0x7b, 0x86, 0x8f, 0x13, 0x7a, 0x8d, 0x72, 0x7a, 0x98,
- 0x1a, 0x5b, 0xff, 0xc9, },
- { 0xd3, 0x3c, 0x61, 0x71, 0x44, 0x7e, 0x31, 0x74, 0x98, 0x9d, 0x9a, 0xd2,
- 0x27, 0xf3, 0x46, 0x43, 0x42, 0x51, 0xd0, 0x5f, 0xe9, 0x1c, 0x5c, 0x69,
- 0xbf, 0xf6, 0xbe, 0x3c, 0x40, },
- { 0x31, 0x99, 0x31, 0x9f, 0xaa, 0x43, 0x2e, 0x77, 0x3e, 0x74, 0x26, 0x31,
- 0x5e, 0x61, 0xf1, 0x87, 0xe2, 0xeb, 0x9b, 0xcd, 0xd0, 0x3a, 0xee, 0x20,
- 0x7e, 0x10, 0x0a, 0x0b, 0x7e, 0xfa, },
- { 0xa4, 0x27, 0x80, 0x67, 0x81, 0x2a, 0xa7, 0x62, 0xf7, 0x6e, 0xda, 0xd4,
- 0x5c, 0x39, 0x74, 0xad, 0x7e, 0xbe, 0xad, 0xa5, 0x84, 0x7f, 0xa9, 0x30,
- 0x5d, 0xdb, 0xe2, 0x05, 0x43, 0xf7, 0x1b, },
- { 0x0b, 0x37, 0xd8, 0x02, 0xe1, 0x83, 0xd6, 0x80, 0xf2, 0x35, 0xc2, 0xb0,
- 0x37, 0xef, 0xef, 0x5e, 0x43, 0x93, 0xf0, 0x49, 0x45, 0x0a, 0xef, 0xb5,
- 0x76, 0x70, 0x12, 0x44, 0xc4, 0xdb, 0xf5, 0x7a, },
- { 0x1f, },
- { 0x82, 0x60, },
- { 0xcc, 0xe3, 0x08, },
- { 0x56, 0x17, 0xe4, 0x59, },
- { 0xe2, 0xd7, 0x9e, 0xc4, 0x4c, },
- { 0xb2, 0xad, 0xd3, 0x78, 0x58, 0x5a, },
- { 0xce, 0x43, 0xb4, 0x02, 0x96, 0xab, 0x3c, },
- { 0xe6, 0x05, 0x1a, 0x73, 0x22, 0x32, 0xbb, 0x77, },
- { 0x23, 0xe7, 0xda, 0xfe, 0x2c, 0xef, 0x8c, 0x22, 0xec, },
- { 0xe9, 0x8e, 0x55, 0x38, 0xd1, 0xd7, 0x35, 0x23, 0x98, 0xc7, },
- { 0xb5, 0x81, 0x1a, 0xe5, 0xb5, 0xa5, 0xd9, 0x4d, 0xca, 0x41, 0xe7, },
- { 0x41, 0x16, 0x16, 0x95, 0x8d, 0x9e, 0x0c, 0xea, 0x8c, 0x71, 0x9a, 0xc1, },
- { 0x7c, 0x33, 0xc0, 0xa4, 0x00, 0x62, 0xea, 0x60, 0x67, 0xe4, 0x20, 0xbc,
- 0x5b, },
- { 0xdb, 0xb1, 0xdc, 0xfd, 0x08, 0xc0, 0xde, 0x82, 0xd1, 0xde, 0x38, 0xc0,
- 0x90, 0x48, },
- { 0x37, 0x18, 0x2e, 0x0d, 0x61, 0xaa, 0x61, 0xd7, 0x86, 0x20, 0x16, 0x60,
- 0x04, 0xd9, 0xd5, },
- { 0xb0, 0xcf, 0x2c, 0x4c, 0x5e, 0x5b, 0x4f, 0x2a, 0x23, 0x25, 0x58, 0x47,
- 0xe5, 0x31, 0x06, 0x70, },
- { 0x91, 0xa0, 0xa3, 0x86, 0x4e, 0xe0, 0x72, 0x38, 0x06, 0x67, 0x59, 0x5c,
- 0x70, 0x25, 0xdb, 0x33, 0x27, },
- { 0x44, 0x58, 0x66, 0xb8, 0x58, 0xc7, 0x13, 0xed, 0x4c, 0xc0, 0xf4, 0x9a,
- 0x1e, 0x67, 0x75, 0x33, 0xb6, 0xb8, },
- { 0x7f, 0x98, 0x4a, 0x8e, 0x50, 0xa2, 0x5c, 0xcd, 0x59, 0xde, 0x72, 0xb3,
- 0x9d, 0xc3, 0x09, 0x8a, 0xab, 0x56, 0xf1, },
- { 0x80, 0x96, 0x49, 0x1a, 0x59, 0xa2, 0xc5, 0xd5, 0xa7, 0x20, 0x8a, 0xb7,
- 0x27, 0x62, 0x84, 0x43, 0xc6, 0xe1, 0x1b, 0x5d, },
- { 0x6b, 0xb7, 0x2b, 0x26, 0x62, 0x14, 0x70, 0x19, 0x3d, 0x4d, 0xac, 0xac,
- 0x63, 0x58, 0x5e, 0x94, 0xb5, 0xb7, 0xe8, 0xe8, 0xa2, },
- { 0x20, 0xa8, 0xc0, 0xfd, 0x63, 0x3d, 0x6e, 0x98, 0xcf, 0x0c, 0x49, 0x98,
- 0xe4, 0x5a, 0xfe, 0x8c, 0xaa, 0x70, 0x82, 0x1c, 0x7b, 0x74, },
- { 0xc8, 0xe8, 0xdd, 0xdf, 0x69, 0x30, 0x01, 0xc2, 0x0f, 0x7e, 0x2f, 0x11,
- 0xcc, 0x3e, 0x17, 0xa5, 0x69, 0x40, 0x3f, 0x0e, 0x79, 0x7f, 0xcf, },
- { 0xdb, 0x61, 0xc0, 0xe2, 0x2e, 0x49, 0x07, 0x31, 0x1d, 0x91, 0x42, 0x8a,
- 0xfc, 0x5e, 0xd3, 0xf8, 0x56, 0x1f, 0x2b, 0x73, 0xfd, 0x9f, 0xb2, 0x8e, },
- { 0x0c, 0x89, 0x55, 0x0c, 0x1f, 0x59, 0x2c, 0x9d, 0x1b, 0x29, 0x1d, 0x41,
- 0x1d, 0xe6, 0x47, 0x8f, 0x8c, 0x2b, 0xea, 0x8f, 0xf0, 0xff, 0x21, 0x70,
- 0x88, },
- { 0x12, 0x18, 0x95, 0xa6, 0x59, 0xb1, 0x31, 0x24, 0x45, 0x67, 0x55, 0xa4,
- 0x1a, 0x2d, 0x48, 0x67, 0x1b, 0x43, 0x88, 0x2d, 0x8e, 0xa0, 0x70, 0xb3,
- 0xc6, 0xbb, },
- { 0xe7, 0xb1, 0x1d, 0xb2, 0x76, 0x4d, 0x68, 0x68, 0x68, 0x23, 0x02, 0x55,
- 0x3a, 0xe2, 0xe5, 0xd5, 0x4b, 0x43, 0xf9, 0x34, 0x77, 0x5c, 0xa1, 0xf5,
- 0x55, 0xfd, 0x4f, },
- { 0x8c, 0x87, 0x5a, 0x08, 0x3a, 0x73, 0xad, 0x61, 0xe1, 0xe7, 0x99, 0x7e,
- 0xf0, 0x5d, 0xe9, 0x5d, 0x16, 0x43, 0x80, 0x2f, 0xd0, 0x66, 0x34, 0xe2,
- 0x42, 0x64, 0x3b, 0x1a, },
- { 0x39, 0xc1, 0x99, 0xcf, 0x22, 0xbf, 0x16, 0x8f, 0x9f, 0x80, 0x7f, 0x95,
- 0x0a, 0x05, 0x67, 0x27, 0xe7, 0x15, 0xdf, 0x9d, 0xb2, 0xfe, 0x1c, 0xb5,
- 0x1d, 0x60, 0x8f, 0x8a, 0x1d, },
- { 0x9b, 0x6e, 0x08, 0x09, 0x06, 0x73, 0xab, 0x68, 0x02, 0x62, 0x1a, 0xe4,
- 0xd4, 0xdf, 0xc7, 0x02, 0x4c, 0x6a, 0x5f, 0xfd, 0x23, 0xac, 0xae, 0x6d,
- 0x43, 0xa4, 0x7a, 0x50, 0x60, 0x3c, },
- { 0x1d, 0xb4, 0xc6, 0xe1, 0xb1, 0x4b, 0xe3, 0xf2, 0xe2, 0x1a, 0x73, 0x1b,
- 0xa0, 0x92, 0xa7, 0xf5, 0xff, 0x8f, 0x8b, 0x5d, 0xdf, 0xa8, 0x04, 0xb3,
- 0xb0, 0xf7, 0xcc, 0x12, 0xfa, 0x35, 0x46, },
- { 0x49, 0x45, 0x97, 0x11, 0x0f, 0x1c, 0x60, 0x8e, 0xe8, 0x47, 0x30, 0xcf,
- 0x60, 0xa8, 0x71, 0xc5, 0x1b, 0xe9, 0x39, 0x4d, 0x49, 0xb6, 0x12, 0x1f,
- 0x24, 0xab, 0x37, 0xff, 0x83, 0xc2, 0xe1, 0x3a, },
- { 0x60, },
- { 0x24, 0x26, },
- { 0x47, 0xeb, 0xc9, },
- { 0x4a, 0xd0, 0xbc, 0xf0, },
- { 0x8e, 0x2b, 0xc9, 0x85, 0x3c, },
- { 0xa2, 0x07, 0x15, 0xb8, 0x12, 0x74, },
- { 0x0f, 0xdb, 0x5b, 0x33, 0x69, 0xfe, 0x4b, },
- { 0xa2, 0x86, 0x54, 0xf4, 0xfd, 0xb2, 0xd4, 0xe6, },
- { 0xbb, 0x84, 0x78, 0x49, 0x27, 0x8e, 0x61, 0xda, 0x60, },
- { 0x04, 0xc3, 0xcd, 0xaa, 0x8f, 0xa7, 0x03, 0xc9, 0xf9, 0xb6, },
- { 0xf8, 0x27, 0x1d, 0x61, 0xdc, 0x21, 0x42, 0xdd, 0xad, 0x92, 0x40, },
- { 0x12, 0x87, 0xdf, 0xc2, 0x41, 0x45, 0x5a, 0x36, 0x48, 0x5b, 0x51, 0x2b, },
- { 0xbb, 0x37, 0x5d, 0x1f, 0xf1, 0x68, 0x7a, 0xc4, 0xa5, 0xd2, 0xa4, 0x91,
- 0x8d, },
- { 0x5b, 0x27, 0xd1, 0x04, 0x54, 0x52, 0x9f, 0xa3, 0x47, 0x86, 0x33, 0x33,
- 0xbf, 0xa0, },
- { 0xcf, 0x04, 0xea, 0xf8, 0x03, 0x2a, 0x43, 0xff, 0xa6, 0x68, 0x21, 0x4c,
- 0xd5, 0x4b, 0xed, },
- { 0xaf, 0xb8, 0xbc, 0x63, 0x0f, 0x18, 0x4d, 0xe2, 0x7a, 0xdd, 0x46, 0x44,
- 0xc8, 0x24, 0x0a, 0xb7, },
- { 0x3e, 0xdc, 0x36, 0xe4, 0x89, 0xb1, 0xfa, 0xc6, 0x40, 0x93, 0x2e, 0x75,
- 0xb2, 0x15, 0xd1, 0xb1, 0x10, },
- { 0x6c, 0xd8, 0x20, 0x3b, 0x82, 0x79, 0xf9, 0xc8, 0xbc, 0x9d, 0xe0, 0x35,
- 0xbe, 0x1b, 0x49, 0x1a, 0xbc, 0x3a, },
- { 0x78, 0x65, 0x2c, 0xbe, 0x35, 0x67, 0xdc, 0x78, 0xd4, 0x41, 0xf6, 0xc9,
- 0xde, 0xde, 0x1f, 0x18, 0x13, 0x31, 0x11, },
- { 0x8a, 0x7f, 0xb1, 0x33, 0x8f, 0x0c, 0x3c, 0x0a, 0x06, 0x61, 0xf0, 0x47,
- 0x29, 0x1b, 0x29, 0xbc, 0x1c, 0x47, 0xef, 0x7a, },
- { 0x65, 0x91, 0xf1, 0xe6, 0xb3, 0x96, 0xd3, 0x8c, 0xc2, 0x4a, 0x59, 0x35,
- 0x72, 0x8e, 0x0b, 0x9a, 0x87, 0xca, 0x34, 0x7b, 0x63, },
- { 0x5f, 0x08, 0x87, 0x80, 0x56, 0x25, 0x89, 0x77, 0x61, 0x8c, 0x64, 0xa1,
- 0x59, 0x6d, 0x59, 0x62, 0xe8, 0x4a, 0xc8, 0x58, 0x99, 0xd1, },
- { 0x23, 0x87, 0x1d, 0xed, 0x6f, 0xf2, 0x91, 0x90, 0xe2, 0xfe, 0x43, 0x21,
- 0xaf, 0x97, 0xc6, 0xbc, 0xd7, 0x15, 0xc7, 0x2d, 0x08, 0x77, 0x91, },
- { 0x90, 0x47, 0x9a, 0x9e, 0x3a, 0xdf, 0xf3, 0xc9, 0x4c, 0x1e, 0xa7, 0xd4,
- 0x6a, 0x32, 0x90, 0xfe, 0xb7, 0xb6, 0x7b, 0xfa, 0x96, 0x61, 0xfb, 0xa4, },
- { 0xb1, 0x67, 0x60, 0x45, 0xb0, 0x96, 0xc5, 0x15, 0x9f, 0x4d, 0x26, 0xd7,
- 0x9d, 0xf1, 0xf5, 0x6d, 0x21, 0x00, 0x94, 0x31, 0x64, 0x94, 0xd3, 0xa7,
- 0xd3, },
- { 0x02, 0x3e, 0xaf, 0xf3, 0x79, 0x73, 0xa5, 0xf5, 0xcc, 0x7a, 0x7f, 0xfb,
- 0x79, 0x2b, 0x85, 0x8c, 0x88, 0x72, 0x06, 0xbe, 0xfe, 0xaf, 0xc1, 0x16,
- 0xa6, 0xd6, },
- { 0x2a, 0xb0, 0x1a, 0xe5, 0xaa, 0x6e, 0xb3, 0xae, 0x53, 0x85, 0x33, 0x80,
- 0x75, 0xae, 0x30, 0xe6, 0xb8, 0x72, 0x42, 0xf6, 0x25, 0x4f, 0x38, 0x88,
- 0x55, 0xd1, 0xa9, },
- { 0x90, 0xd8, 0x0c, 0xc0, 0x93, 0x4b, 0x4f, 0x9e, 0x65, 0x6c, 0xa1, 0x54,
- 0xa6, 0xf6, 0x6e, 0xca, 0xd2, 0xbb, 0x7e, 0x6a, 0x1c, 0xd3, 0xce, 0x46,
- 0xef, 0xb0, 0x00, 0x8d, },
- { 0xed, 0x9c, 0x49, 0xcd, 0xc2, 0xde, 0x38, 0x0e, 0xe9, 0x98, 0x6c, 0xc8,
- 0x90, 0x9e, 0x3c, 0xd4, 0xd3, 0xeb, 0x88, 0x32, 0xc7, 0x28, 0xe3, 0x94,
- 0x1c, 0x9f, 0x8b, 0xf3, 0xcb, },
- { 0xac, 0xe7, 0x92, 0x16, 0xb4, 0x14, 0xa0, 0xe4, 0x04, 0x79, 0xa2, 0xf4,
- 0x31, 0xe6, 0x0c, 0x26, 0xdc, 0xbf, 0x2f, 0x69, 0x1b, 0x55, 0x94, 0x67,
- 0xda, 0x0c, 0xd7, 0x32, 0x1f, 0xef, },
- { 0x68, 0x63, 0x85, 0x57, 0x95, 0x9e, 0x42, 0x27, 0x41, 0x43, 0x42, 0x02,
- 0xa5, 0x78, 0xa7, 0xc6, 0x43, 0xc1, 0x6a, 0xba, 0x70, 0x80, 0xcd, 0x04,
- 0xb6, 0x78, 0x76, 0x29, 0xf3, 0xe8, 0xa0, },
- { 0xe6, 0xac, 0x8d, 0x9d, 0xf0, 0xc0, 0xf7, 0xf7, 0xe3, 0x3e, 0x4e, 0x28,
- 0x0f, 0x59, 0xb2, 0x67, 0x9e, 0x84, 0x34, 0x42, 0x96, 0x30, 0x2b, 0xca,
- 0x49, 0xb6, 0xc5, 0x9a, 0x84, 0x59, 0xa7, 0x81, },
- { 0x7e, },
- { 0x1e, 0x21, },
- { 0x26, 0xd3, 0xdd, },
- { 0x2c, 0xd4, 0xb3, 0x3d, },
- { 0x86, 0x7b, 0x76, 0x3c, 0xf0, },
- { 0x12, 0xc3, 0x70, 0x1d, 0x55, 0x18, },
- { 0x96, 0xc2, 0xbd, 0x61, 0x55, 0xf4, 0x24, },
- { 0x20, 0x51, 0xf7, 0x86, 0x58, 0x8f, 0x07, 0x2a, },
- { 0x93, 0x15, 0xa8, 0x1d, 0xda, 0x97, 0xee, 0x0e, 0x6c, },
- { 0x39, 0x93, 0xdf, 0xd5, 0x0e, 0xca, 0xdc, 0x7a, 0x92, 0xce, },
- { 0x60, 0xd5, 0xfd, 0xf5, 0x1b, 0x26, 0x82, 0x26, 0x73, 0x02, 0xbc, },
- { 0x98, 0xf2, 0x34, 0xe1, 0xf5, 0xfb, 0x00, 0xac, 0x10, 0x4a, 0x38, 0x9f, },
- { 0xda, 0x3a, 0x92, 0x8a, 0xd0, 0xcd, 0x12, 0xcd, 0x15, 0xbb, 0xab, 0x77,
- 0x66, },
- { 0xa2, 0x92, 0x1a, 0xe5, 0xca, 0x0c, 0x30, 0x75, 0xeb, 0xaf, 0x00, 0x31,
- 0x55, 0x66, },
- { 0x06, 0xea, 0xfd, 0x3e, 0x86, 0x38, 0x62, 0x4e, 0xa9, 0x12, 0xa4, 0x12,
- 0x43, 0xbf, 0xa1, },
- { 0xe4, 0x71, 0x7b, 0x94, 0xdb, 0xa0, 0xd2, 0xff, 0x9b, 0xeb, 0xad, 0x8e,
- 0x95, 0x8a, 0xc5, 0xed, },
- { 0x25, 0x5a, 0x77, 0x71, 0x41, 0x0e, 0x7a, 0xe9, 0xed, 0x0c, 0x10, 0xef,
- 0xf6, 0x2b, 0x3a, 0xba, 0x60, },
- { 0xee, 0xe2, 0xa3, 0x67, 0x64, 0x1d, 0xc6, 0x04, 0xc4, 0xe1, 0x68, 0xd2,
- 0x6e, 0xd2, 0x91, 0x75, 0x53, 0x07, },
- { 0xe0, 0xf6, 0x4d, 0x8f, 0x68, 0xfc, 0x06, 0x7e, 0x18, 0x79, 0x7f, 0x2b,
- 0x6d, 0xef, 0x46, 0x7f, 0xab, 0xb2, 0xad, },
- { 0x3d, 0x35, 0x88, 0x9f, 0x2e, 0xcf, 0x96, 0x45, 0x07, 0x60, 0x71, 0x94,
- 0x00, 0x8d, 0xbf, 0xf4, 0xef, 0x46, 0x2e, 0x3c, },
- { 0x43, 0xcf, 0x98, 0xf7, 0x2d, 0xf4, 0x17, 0xe7, 0x8c, 0x05, 0x2d, 0x9b,
- 0x24, 0xfb, 0x4d, 0xea, 0x4a, 0xec, 0x01, 0x25, 0x29, },
- { 0x8e, 0x73, 0x9a, 0x78, 0x11, 0xfe, 0x48, 0xa0, 0x3b, 0x1a, 0x26, 0xdf,
- 0x25, 0xe9, 0x59, 0x1c, 0x70, 0x07, 0x9f, 0xdc, 0xa0, 0xa6, },
- { 0xe8, 0x47, 0x71, 0xc7, 0x3e, 0xdf, 0xb5, 0x13, 0xb9, 0x85, 0x13, 0xa8,
- 0x54, 0x47, 0x6e, 0x59, 0x96, 0x09, 0x13, 0x5f, 0x82, 0x16, 0x0b, },
- { 0xfb, 0xc0, 0x8c, 0x03, 0x21, 0xb3, 0xc4, 0xb5, 0x43, 0x32, 0x6c, 0xea,
- 0x7f, 0xa8, 0x43, 0x91, 0xe8, 0x4e, 0x3f, 0xbf, 0x45, 0x58, 0x6a, 0xa3, },
- { 0x55, 0xf8, 0xf3, 0x00, 0x76, 0x09, 0xef, 0x69, 0x5d, 0xd2, 0x8a, 0xf2,
- 0x65, 0xc3, 0xcb, 0x9b, 0x43, 0xfd, 0xb1, 0x7e, 0x7f, 0xa1, 0x94, 0xb0,
- 0xd7, },
- { 0xaa, 0x13, 0xc1, 0x51, 0x40, 0x6d, 0x8d, 0x4c, 0x0a, 0x95, 0x64, 0x7b,
- 0xd1, 0x96, 0xb6, 0x56, 0xb4, 0x5b, 0xcf, 0xd6, 0xd9, 0x15, 0x97, 0xdd,
- 0xb6, 0xef, },
- { 0xaf, 0xb7, 0x36, 0xb0, 0x04, 0xdb, 0xd7, 0x9c, 0x9a, 0x44, 0xc4, 0xf6,
- 0x1f, 0x12, 0x21, 0x2d, 0x59, 0x30, 0x54, 0xab, 0x27, 0x61, 0xa3, 0x57,
- 0xef, 0xf8, 0x53, },
- { 0x97, 0x34, 0x45, 0x3e, 0xce, 0x7c, 0x35, 0xa2, 0xda, 0x9f, 0x4b, 0x46,
- 0x6c, 0x11, 0x67, 0xff, 0x2f, 0x76, 0x58, 0x15, 0x71, 0xfa, 0x44, 0x89,
- 0x89, 0xfd, 0xf7, 0x99, },
- { 0x1f, 0xb1, 0x62, 0xeb, 0x83, 0xc5, 0x9c, 0x89, 0xf9, 0x2c, 0xd2, 0x03,
- 0x61, 0xbc, 0xbb, 0xa5, 0x74, 0x0e, 0x9b, 0x7e, 0x82, 0x3e, 0x70, 0x0a,
- 0xa9, 0x8f, 0x2b, 0x59, 0xfb, },
- { 0xf8, 0xca, 0x5e, 0x3a, 0x4f, 0x9e, 0x10, 0x69, 0x10, 0xd5, 0x4c, 0xeb,
- 0x1a, 0x0f, 0x3c, 0x6a, 0x98, 0xf5, 0xb0, 0x97, 0x5b, 0x37, 0x2f, 0x0d,
- 0xbd, 0x42, 0x4b, 0x69, 0xa1, 0x82, },
- { 0x12, 0x8c, 0x6d, 0x52, 0x08, 0xef, 0x74, 0xb2, 0xe6, 0xaa, 0xd3, 0xb0,
- 0x26, 0xb0, 0xd9, 0x94, 0xb6, 0x11, 0x45, 0x0e, 0x36, 0x71, 0x14, 0x2d,
- 0x41, 0x8c, 0x21, 0x53, 0x31, 0xe9, 0x68, },
- { 0xee, 0xea, 0x0d, 0x89, 0x47, 0x7e, 0x72, 0xd1, 0xd8, 0xce, 0x58, 0x4c,
- 0x94, 0x1f, 0x0d, 0x51, 0x08, 0xa3, 0xb6, 0x3d, 0xe7, 0x82, 0x46, 0x92,
- 0xd6, 0x98, 0x6b, 0x07, 0x10, 0x65, 0x52, 0x65, },
-};
-
-bool __init blake2s_selftest(void)
-{
- u8 key[BLAKE2S_KEY_SIZE];
- u8 buf[ARRAY_SIZE(blake2s_testvecs)];
- u8 hash[BLAKE2S_HASH_SIZE];
- struct blake2s_state state;
- bool success = true;
- int i, l;
-
- key[0] = key[1] = 1;
- for (i = 2; i < sizeof(key); ++i)
- key[i] = key[i - 2] + key[i - 1];
-
- for (i = 0; i < sizeof(buf); ++i)
- buf[i] = (u8)i;
-
- for (i = l = 0; i < ARRAY_SIZE(blake2s_testvecs); l = (l + 37) % ++i) {
- int outlen = 1 + i % BLAKE2S_HASH_SIZE;
- int keylen = (13 * i) % (BLAKE2S_KEY_SIZE + 1);
-
- blake2s(hash, buf, key + BLAKE2S_KEY_SIZE - keylen, outlen, i,
- keylen);
- if (memcmp(hash, blake2s_testvecs[i], outlen)) {
- pr_err("blake2s self-test %d: FAIL\n", i + 1);
- success = false;
- }
-
- if (!keylen)
- blake2s_init(&state, outlen);
- else
- blake2s_init_key(&state, outlen,
- key + BLAKE2S_KEY_SIZE - keylen,
- keylen);
-
- blake2s_update(&state, buf, l);
- blake2s_update(&state, buf + l, i - l);
- blake2s_final(&state, hash);
- if (memcmp(hash, blake2s_testvecs[i], outlen)) {
- pr_err("blake2s init/update/final self-test %d: FAIL\n",
- i + 1);
- success = false;
- }
- }
-
- for (i = 0; i < 32; ++i) {
- enum { TEST_ALIGNMENT = 16 };
- u8 unaligned_block[BLAKE2S_BLOCK_SIZE + TEST_ALIGNMENT - 1]
- __aligned(TEST_ALIGNMENT);
- u8 blocks[BLAKE2S_BLOCK_SIZE * 2];
- struct blake2s_state state1, state2;
-
- get_random_bytes(blocks, sizeof(blocks));
- get_random_bytes(&state, sizeof(state));
-
-#if defined(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) && \
- defined(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S)
- memcpy(&state1, &state, sizeof(state1));
- memcpy(&state2, &state, sizeof(state2));
- blake2s_compress(&state1, blocks, 2, BLAKE2S_BLOCK_SIZE);
- blake2s_compress_generic(&state2, blocks, 2, BLAKE2S_BLOCK_SIZE);
- if (memcmp(&state1, &state2, sizeof(state1))) {
- pr_err("blake2s random compress self-test %d: FAIL\n",
- i + 1);
- success = false;
- }
-#endif
-
- memcpy(&state1, &state, sizeof(state1));
- blake2s_compress(&state1, blocks, 1, BLAKE2S_BLOCK_SIZE);
- for (l = 1; l < TEST_ALIGNMENT; ++l) {
- memcpy(unaligned_block + l, blocks,
- BLAKE2S_BLOCK_SIZE);
- memcpy(&state2, &state, sizeof(state2));
- blake2s_compress(&state2, unaligned_block + l, 1,
- BLAKE2S_BLOCK_SIZE);
- if (memcmp(&state1, &state2, sizeof(state1))) {
- pr_err("blake2s random compress align %d self-test %d: FAIL\n",
- l, i + 1);
- success = false;
- }
- }
- }
-
- return success;
-}
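
The selftest deleted above checked the one-shot blake2s() helper against the
incremental init/update/final interface for every test vector, varying both the
output length and the key length. A minimal sketch of that call pattern, using
the same signatures that appear in the deleted file (buf, len, key, keylen and
outlen are assumed to be set up by the caller, with keylen nonzero):

	u8 digest_oneshot[BLAKE2S_HASH_SIZE];
	u8 digest_incremental[BLAKE2S_HASH_SIZE];
	struct blake2s_state state;

	/* One-shot: keyed hash of the whole buffer. */
	blake2s(digest_oneshot, buf, key, outlen, len, keylen);

	/* Incremental: same input, split across two update calls. */
	blake2s_init_key(&state, outlen, key, keylen);
	blake2s_update(&state, buf, len / 2);
	blake2s_update(&state, buf + len / 2, len - len / 2);
	blake2s_final(&state, digest_incremental);

	/* Both paths must yield the same outlen-byte digest. */
	WARN_ON(memcmp(digest_oneshot, digest_incremental, outlen));
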
diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
index 98e688c6d891..6182c21ed943 100644
--- a/lib/crypto/blake2s.c
+++ b/lib/crypto/blake2s.c
@@ -8,65 +8,158 @@
*
*/
-#include <crypto/internal/blake2s.h>
-#include <linux/types.h>
-#include <linux/string.h>
+#include <crypto/blake2s.h>
+#include <linux/bug.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/bug.h>
+#include <linux/string.h>
+#include <linux/types.h>
-static inline void blake2s_set_lastblock(struct blake2s_state *state)
+static const u8 blake2s_sigma[10][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+};
+
+static inline void blake2s_increment_counter(struct blake2s_ctx *ctx, u32 inc)
{
- state->f[0] = -1;
+ ctx->t[0] += inc;
+ ctx->t[1] += (ctx->t[0] < inc);
}
-void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
+static void __maybe_unused
+blake2s_compress_generic(struct blake2s_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc)
{
- const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
+ u32 m[16];
+ u32 v[16];
+ int i;
+
+ WARN_ON(IS_ENABLED(DEBUG) &&
+ (nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));
+
+ while (nblocks > 0) {
+ blake2s_increment_counter(ctx, inc);
+ memcpy(m, data, BLAKE2S_BLOCK_SIZE);
+ le32_to_cpu_array(m, ARRAY_SIZE(m));
+ memcpy(v, ctx->h, 32);
+ v[ 8] = BLAKE2S_IV0;
+ v[ 9] = BLAKE2S_IV1;
+ v[10] = BLAKE2S_IV2;
+ v[11] = BLAKE2S_IV3;
+ v[12] = BLAKE2S_IV4 ^ ctx->t[0];
+ v[13] = BLAKE2S_IV5 ^ ctx->t[1];
+ v[14] = BLAKE2S_IV6 ^ ctx->f[0];
+ v[15] = BLAKE2S_IV7 ^ ctx->f[1];
+
+#define G(r, i, a, b, c, d) do { \
+ a += b + m[blake2s_sigma[r][2 * i + 0]]; \
+ d = ror32(d ^ a, 16); \
+ c += d; \
+ b = ror32(b ^ c, 12); \
+ a += b + m[blake2s_sigma[r][2 * i + 1]]; \
+ d = ror32(d ^ a, 8); \
+ c += d; \
+ b = ror32(b ^ c, 7); \
+} while (0)
+
+#define ROUND(r) do { \
+ G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
+ G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
+ G(r, 2, v[2], v[ 6], v[10], v[14]); \
+ G(r, 3, v[3], v[ 7], v[11], v[15]); \
+ G(r, 4, v[0], v[ 5], v[10], v[15]); \
+ G(r, 5, v[1], v[ 6], v[11], v[12]); \
+ G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
+ G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
+} while (0)
+ ROUND(0);
+ ROUND(1);
+ ROUND(2);
+ ROUND(3);
+ ROUND(4);
+ ROUND(5);
+ ROUND(6);
+ ROUND(7);
+ ROUND(8);
+ ROUND(9);
+
+#undef G
+#undef ROUND
+
+ for (i = 0; i < 8; ++i)
+ ctx->h[i] ^= v[i] ^ v[i + 8];
+
+ data += BLAKE2S_BLOCK_SIZE;
+ --nblocks;
+ }
+}
+
+#ifdef CONFIG_CRYPTO_LIB_BLAKE2S_ARCH
+#include "blake2s.h" /* $(SRCARCH)/blake2s.h */
+#else
+#define blake2s_compress blake2s_compress_generic
+#endif
+
+static inline void blake2s_set_lastblock(struct blake2s_ctx *ctx)
+{
+ ctx->f[0] = -1;
+}
+
+void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen)
+{
+ const size_t fill = BLAKE2S_BLOCK_SIZE - ctx->buflen;
if (unlikely(!inlen))
return;
if (inlen > fill) {
- memcpy(state->buf + state->buflen, in, fill);
- blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
- state->buflen = 0;
+ memcpy(ctx->buf + ctx->buflen, in, fill);
+ blake2s_compress(ctx, ctx->buf, 1, BLAKE2S_BLOCK_SIZE);
+ ctx->buflen = 0;
in += fill;
inlen -= fill;
}
if (inlen > BLAKE2S_BLOCK_SIZE) {
const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
- blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
+
+ blake2s_compress(ctx, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
}
- memcpy(state->buf + state->buflen, in, inlen);
- state->buflen += inlen;
+ memcpy(ctx->buf + ctx->buflen, in, inlen);
+ ctx->buflen += inlen;
}
EXPORT_SYMBOL(blake2s_update);
-void blake2s_final(struct blake2s_state *state, u8 *out)
+void blake2s_final(struct blake2s_ctx *ctx, u8 *out)
{
WARN_ON(IS_ENABLED(DEBUG) && !out);
- blake2s_set_lastblock(state);
- memset(state->buf + state->buflen, 0,
- BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
- blake2s_compress(state, state->buf, 1, state->buflen);
- cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
- memcpy(out, state->h, state->outlen);
- memzero_explicit(state, sizeof(*state));
+ blake2s_set_lastblock(ctx);
+ memset(ctx->buf + ctx->buflen, 0,
+ BLAKE2S_BLOCK_SIZE - ctx->buflen); /* Padding */
+ blake2s_compress(ctx, ctx->buf, 1, ctx->buflen);
+ cpu_to_le32_array(ctx->h, ARRAY_SIZE(ctx->h));
+ memcpy(out, ctx->h, ctx->outlen);
+ memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL(blake2s_final);
+#ifdef blake2s_mod_init_arch
static int __init blake2s_mod_init(void)
{
- if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) &&
- WARN_ON(!blake2s_selftest()))
- return -ENODEV;
+ blake2s_mod_init_arch();
return 0;
}
+subsys_initcall(blake2s_mod_init);
+#endif
-module_init(blake2s_mod_init);
-MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("BLAKE2s hash function");
MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
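
With the context renamed to struct blake2s_ctx, a caller of the library
interface looks roughly as follows. This is a sketch only: blake2s_update() and
blake2s_final() come from the hunk above, while blake2s_init() is assumed to
have been converted to the new struct name in the header by the same series;
data and data_len stand in for the caller's input.

	struct blake2s_ctx ctx;
	u8 digest[BLAKE2S_HASH_SIZE];

	blake2s_init(&ctx, BLAKE2S_HASH_SIZE);	/* unkeyed, full-length digest */
	blake2s_update(&ctx, data, data_len);
	blake2s_final(&ctx, digest);		/* also wipes the context */
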
diff --git a/lib/crypto/chacha-block-generic.c b/lib/crypto/chacha-block-generic.c
new file mode 100644
index 000000000000..77f68de71066
--- /dev/null
+++ b/lib/crypto/chacha-block-generic.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <crypto/chacha.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+static void chacha_permute(struct chacha_state *state, int nrounds)
+{
+ u32 *x = state->x;
+ int i;
+
+ /* whitelist the allowed round counts */
+ WARN_ON_ONCE(nrounds != 20 && nrounds != 12);
+
+ for (i = 0; i < nrounds; i += 2) {
+ x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16);
+ x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16);
+ x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16);
+ x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16);
+
+ x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12);
+ x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12);
+ x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12);
+ x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12);
+
+ x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8);
+ x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8);
+ x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8);
+ x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8);
+
+ x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7);
+ x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7);
+ x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7);
+ x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7);
+
+ x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16);
+ x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16);
+ x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16);
+ x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16);
+
+ x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12);
+ x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12);
+ x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12);
+ x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12);
+
+ x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8);
+ x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8);
+ x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8);
+ x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8);
+
+ x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7);
+ x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7);
+ x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7);
+ x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7);
+ }
+}
+
+/**
+ * chacha_block_generic - generate one keystream block and increment block counter
+ * @state: input state matrix
+ * @out: output keystream block
+ * @nrounds: number of rounds (20 or 12; 20 is recommended)
+ *
+ * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
+ * The caller has already converted the endianness of the input. This function
+ * also handles incrementing the block counter in the input matrix.
+ */
+void chacha_block_generic(struct chacha_state *state,
+ u8 out[CHACHA_BLOCK_SIZE], int nrounds)
+{
+ struct chacha_state permuted_state = *state;
+ int i;
+
+ chacha_permute(&permuted_state, nrounds);
+
+ for (i = 0; i < ARRAY_SIZE(state->x); i++)
+ put_unaligned_le32(permuted_state.x[i] + state->x[i],
+ &out[i * sizeof(u32)]);
+
+ state->x[12]++;
+}
+EXPORT_SYMBOL(chacha_block_generic);
+
+/**
+ * hchacha_block_generic - abbreviated ChaCha core, for XChaCha
+ * @state: input state matrix
+ * @out: the output words
+ * @nrounds: number of rounds (20 or 12; 20 is recommended)
+ *
+ * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
+ * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha
+ * skips the final addition of the initial state, and outputs only certain words
+ * of the state. It should not be used for streaming directly.
+ */
+void hchacha_block_generic(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+ struct chacha_state permuted_state = *state;
+
+ chacha_permute(&permuted_state, nrounds);
+
+ memcpy(&out[0], &permuted_state.x[0], 16);
+ memcpy(&out[4], &permuted_state.x[12], 16);
+}
+EXPORT_SYMBOL(hchacha_block_generic);
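
chacha_permute() above unrolls the column and diagonal rounds and interleaves
the four lanes; each group of four statements is one standard ChaCha
quarter-round. Purely as a readability reference, the following is equivalent
to one "i += 2" iteration of that loop (not part of the patch):

	static void chacha_quarterround(u32 x[16], int a, int b, int c, int d)
	{
		x[a] += x[b]; x[d] = rol32(x[d] ^ x[a], 16);
		x[c] += x[d]; x[b] = rol32(x[b] ^ x[c], 12);
		x[a] += x[b]; x[d] = rol32(x[d] ^ x[a], 8);
		x[c] += x[d]; x[b] = rol32(x[b] ^ x[c], 7);
	}

	static void chacha_doubleround(u32 x[16])
	{
		/* column round */
		chacha_quarterround(x, 0, 4,  8, 12);
		chacha_quarterround(x, 1, 5,  9, 13);
		chacha_quarterround(x, 2, 6, 10, 14);
		chacha_quarterround(x, 3, 7, 11, 15);
		/* diagonal round */
		chacha_quarterround(x, 0, 5, 10, 15);
		chacha_quarterround(x, 1, 6, 11, 12);
		chacha_quarterround(x, 2, 7,  8, 13);
		chacha_quarterround(x, 3, 4,  9, 14);
	}
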
diff --git a/lib/crypto/chacha.c b/lib/crypto/chacha.c
index b748fd3d256e..e0c7cb4af318 100644
--- a/lib/crypto/chacha.c
+++ b/lib/crypto/chacha.c
@@ -1,114 +1,70 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
+ * The ChaCha stream cipher (RFC7539)
*
* Copyright (C) 2015 Martin Willi
*/
-#include <linux/bug.h>
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/bitops.h>
-#include <linux/string.h>
-#include <asm/unaligned.h>
+#include <crypto/algapi.h> // for crypto_xor_cpy
#include <crypto/chacha.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
-static void chacha_permute(u32 *x, int nrounds)
+static void __maybe_unused
+chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
{
- int i;
-
- /* whitelist the allowed round counts */
- WARN_ON_ONCE(nrounds != 20 && nrounds != 12);
-
- for (i = 0; i < nrounds; i += 2) {
- x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16);
- x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16);
- x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16);
- x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16);
-
- x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12);
- x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12);
- x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12);
- x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12);
-
- x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8);
- x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8);
- x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8);
- x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8);
-
- x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7);
- x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7);
- x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7);
- x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7);
-
- x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16);
- x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16);
- x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16);
- x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16);
-
- x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12);
- x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12);
- x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12);
- x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12);
-
- x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8);
- x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8);
- x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8);
- x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8);
-
- x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7);
- x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7);
- x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7);
- x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7);
+ /* aligned to potentially speed up crypto_xor() */
+ u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));
+
+ while (bytes >= CHACHA_BLOCK_SIZE) {
+ chacha_block_generic(state, stream, nrounds);
+ crypto_xor_cpy(dst, src, stream, CHACHA_BLOCK_SIZE);
+ bytes -= CHACHA_BLOCK_SIZE;
+ dst += CHACHA_BLOCK_SIZE;
+ src += CHACHA_BLOCK_SIZE;
+ }
+ if (bytes) {
+ chacha_block_generic(state, stream, nrounds);
+ crypto_xor_cpy(dst, src, stream, bytes);
}
}
-/**
- * chacha_block_generic - generate one keystream block and increment block counter
- * @state: input state matrix (16 32-bit words)
- * @stream: output keystream block (64 bytes)
- * @nrounds: number of rounds (20 or 12; 20 is recommended)
- *
- * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
- * The caller has already converted the endianness of the input. This function
- * also handles incrementing the block counter in the input matrix.
- */
-void chacha_block_generic(u32 *state, u8 *stream, int nrounds)
-{
- u32 x[16];
- int i;
-
- memcpy(x, state, 64);
-
- chacha_permute(x, nrounds);
+#ifdef CONFIG_CRYPTO_LIB_CHACHA_ARCH
+#include "chacha.h" /* $(SRCARCH)/chacha.h */
+#else
+#define chacha_crypt_arch chacha_crypt_generic
+#define hchacha_block_arch hchacha_block_generic
+#endif
- for (i = 0; i < ARRAY_SIZE(x); i++)
- put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]);
-
- state[12]++;
+void chacha_crypt(struct chacha_state *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ chacha_crypt_arch(state, dst, src, bytes, nrounds);
}
-EXPORT_SYMBOL(chacha_block_generic);
+EXPORT_SYMBOL_GPL(chacha_crypt);
-/**
- * hchacha_block_generic - abbreviated ChaCha core, for XChaCha
- * @state: input state matrix (16 32-bit words)
- * @stream: output (8 32-bit words)
- * @nrounds: number of rounds (20 or 12; 20 is recommended)
- *
- * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
- * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha
- * skips the final addition of the initial state, and outputs only certain words
- * of the state. It should not be used for streaming directly.
- */
-void hchacha_block_generic(const u32 *state, u32 *stream, int nrounds)
+void hchacha_block(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds)
{
- u32 x[16];
-
- memcpy(x, state, 64);
+ hchacha_block_arch(state, out, nrounds);
+}
+EXPORT_SYMBOL_GPL(hchacha_block);
- chacha_permute(x, nrounds);
+#ifdef chacha_mod_init_arch
+static int __init chacha_mod_init(void)
+{
+ chacha_mod_init_arch();
+ return 0;
+}
+subsys_initcall(chacha_mod_init);
- memcpy(&stream[0], &x[0], 16);
- memcpy(&stream[4], &x[12], 16);
+static void __exit chacha_mod_exit(void)
+{
}
-EXPORT_SYMBOL(hchacha_block_generic);
+module_exit(chacha_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("ChaCha stream cipher (RFC7539)");
+MODULE_LICENSE("GPL");
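
The new chacha_crypt() entry point dispatches to the architecture override or
to chacha_crypt_generic(). A minimal caller, following the chacha_init()
pattern used in the chacha20poly1305 hunks below (a sketch: key, dst, src and
len are assumed, and filling the 16-byte IV with the block counter and nonce is
left to the caller):

	struct chacha_state state;
	u32 key_words[CHACHA_KEY_WORDS];
	u8 iv[CHACHA_IV_SIZE];	/* block counter followed by the nonce */
	int i;

	for (i = 0; i < CHACHA_KEY_WORDS; i++)
		key_words[i] = get_unaligned_le32(key + 4 * i);
	/* ... fill iv ... */

	chacha_init(&state, key_words, iv);
	chacha_crypt(&state, dst, src, len, 20);	/* 20 rounds: ChaCha20 */
	chacha_zeroize_state(&state);
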
diff --git a/lib/crypto/chacha20poly1305-selftest.c b/lib/crypto/chacha20poly1305-selftest.c
index fa43deda2660..e4c85bc5a6d7 100644
--- a/lib/crypto/chacha20poly1305-selftest.c
+++ b/lib/crypto/chacha20poly1305-selftest.c
@@ -7,7 +7,7 @@
#include <crypto/chacha.h>
#include <crypto/poly1305.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/mm.h>
@@ -8832,7 +8832,7 @@ chacha20poly1305_encrypt_bignonce(u8 *dst, const u8 *src, const size_t src_len,
{
const u8 *pad0 = page_address(ZERO_PAGE(0));
struct poly1305_desc_ctx poly1305_state;
- u32 chacha20_state[CHACHA_STATE_WORDS];
+ struct chacha_state chacha20_state;
union {
u8 block0[POLY1305_KEY_SIZE];
__le64 lens[2];
@@ -8844,12 +8844,12 @@ chacha20poly1305_encrypt_bignonce(u8 *dst, const u8 *src, const size_t src_len,
memcpy(&bottom_row[4], nonce, 12);
for (i = 0; i < 8; ++i)
le_key[i] = get_unaligned_le32(key + sizeof(le_key[i]) * i);
- chacha_init(chacha20_state, le_key, bottom_row);
- chacha20_crypt(chacha20_state, b.block0, b.block0, sizeof(b.block0));
+ chacha_init(&chacha20_state, le_key, bottom_row);
+ chacha20_crypt(&chacha20_state, b.block0, b.block0, sizeof(b.block0));
poly1305_init(&poly1305_state, b.block0);
poly1305_update(&poly1305_state, ad, ad_len);
poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf);
- chacha20_crypt(chacha20_state, dst, src, src_len);
+ chacha20_crypt(&chacha20_state, dst, src, src_len);
poly1305_update(&poly1305_state, dst, src_len);
poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf);
b.lens[0] = cpu_to_le64(ad_len);
diff --git a/lib/crypto/chacha20poly1305.c b/lib/crypto/chacha20poly1305.c
index fa6a9440fc95..212ce33562af 100644
--- a/lib/crypto/chacha20poly1305.c
+++ b/lib/crypto/chacha20poly1305.c
@@ -7,19 +7,16 @@
* Information: https://tools.ietf.org/html/rfc8439
*/
-#include <crypto/algapi.h>
-#include <crypto/chacha20poly1305.h>
#include <crypto/chacha.h>
+#include <crypto/chacha20poly1305.h>
#include <crypto/poly1305.h>
-#include <crypto/scatterwalk.h>
-
-#include <asm/unaligned.h>
-#include <linux/kernel.h>
+#include <crypto/utils.h>
+#include <linux/export.h>
#include <linux/init.h>
+#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
-
-#define CHACHA_KEY_WORDS (CHACHA_KEY_SIZE / sizeof(u32))
+#include <linux/unaligned.h>
static void chacha_load_key(u32 *k, const u8 *in)
{
@@ -33,7 +30,8 @@ static void chacha_load_key(u32 *k, const u8 *in)
k[7] = get_unaligned_le32(in + 28);
}
-static void xchacha_init(u32 *chacha_state, const u8 *key, const u8 *nonce)
+static void xchacha_init(struct chacha_state *chacha_state,
+ const u8 *key, const u8 *nonce)
{
u32 k[CHACHA_KEY_WORDS];
u8 iv[CHACHA_IV_SIZE];
@@ -55,7 +53,8 @@ static void xchacha_init(u32 *chacha_state, const u8 *key, const u8 *nonce)
static void
__chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
- const u8 *ad, const size_t ad_len, u32 *chacha_state)
+ const u8 *ad, const size_t ad_len,
+ struct chacha_state *chacha_state)
{
const u8 *pad0 = page_address(ZERO_PAGE(0));
struct poly1305_desc_ctx poly1305_state;
@@ -83,16 +82,16 @@ __chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
poly1305_final(&poly1305_state, dst + src_len);
- memzero_explicit(chacha_state, CHACHA_STATE_WORDS * sizeof(u32));
+ chacha_zeroize_state(chacha_state);
memzero_explicit(&b, sizeof(b));
}
void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
const u8 *ad, const size_t ad_len,
const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
{
- u32 chacha_state[CHACHA_STATE_WORDS];
+ struct chacha_state chacha_state;
u32 k[CHACHA_KEY_WORDS];
__le64 iv[2];
@@ -101,8 +100,9 @@ void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
iv[0] = 0;
iv[1] = cpu_to_le64(nonce);
- chacha_init(chacha_state, k, (u8 *)iv);
- __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, chacha_state);
+ chacha_init(&chacha_state, k, (u8 *)iv);
+ __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len,
+ &chacha_state);
memzero_explicit(iv, sizeof(iv));
memzero_explicit(k, sizeof(k));
@@ -111,19 +111,21 @@ EXPORT_SYMBOL(chacha20poly1305_encrypt);
void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
const u8 *ad, const size_t ad_len,
- const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
+ const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE],
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
{
- u32 chacha_state[CHACHA_STATE_WORDS];
+ struct chacha_state chacha_state;
- xchacha_init(chacha_state, key, nonce);
- __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, chacha_state);
+ xchacha_init(&chacha_state, key, nonce);
+ __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len,
+ &chacha_state);
}
EXPORT_SYMBOL(xchacha20poly1305_encrypt);
static bool
__chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
- const u8 *ad, const size_t ad_len, u32 *chacha_state)
+ const u8 *ad, const size_t ad_len,
+ struct chacha_state *chacha_state)
{
const u8 *pad0 = page_address(ZERO_PAGE(0));
struct poly1305_desc_ctx poly1305_state;
@@ -168,9 +170,9 @@ __chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
const u8 *ad, const size_t ad_len,
const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
{
- u32 chacha_state[CHACHA_STATE_WORDS];
+ struct chacha_state chacha_state;
u32 k[CHACHA_KEY_WORDS];
__le64 iv[2];
bool ret;
@@ -180,11 +182,11 @@ bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
iv[0] = 0;
iv[1] = cpu_to_le64(nonce);
- chacha_init(chacha_state, k, (u8 *)iv);
+ chacha_init(&chacha_state, k, (u8 *)iv);
ret = __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len,
- chacha_state);
+ &chacha_state);
- memzero_explicit(chacha_state, sizeof(chacha_state));
+ chacha_zeroize_state(&chacha_state);
memzero_explicit(iv, sizeof(iv));
memzero_explicit(k, sizeof(k));
return ret;
@@ -193,14 +195,14 @@ EXPORT_SYMBOL(chacha20poly1305_decrypt);
bool xchacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
const u8 *ad, const size_t ad_len,
- const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
+ const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE],
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
{
- u32 chacha_state[CHACHA_STATE_WORDS];
+ struct chacha_state chacha_state;
- xchacha_init(chacha_state, key, nonce);
+ xchacha_init(&chacha_state, key, nonce);
return __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len,
- chacha_state);
+ &chacha_state);
}
EXPORT_SYMBOL(xchacha20poly1305_decrypt);
@@ -209,12 +211,12 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src,
const size_t src_len,
const u8 *ad, const size_t ad_len,
const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE],
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE],
int encrypt)
{
const u8 *pad0 = page_address(ZERO_PAGE(0));
struct poly1305_desc_ctx poly1305_state;
- u32 chacha_state[CHACHA_STATE_WORDS];
+ struct chacha_state chacha_state;
struct sg_mapping_iter miter;
size_t partial = 0;
unsigned int flags;
@@ -241,8 +243,8 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src,
b.iv[0] = 0;
b.iv[1] = cpu_to_le64(nonce);
- chacha_init(chacha_state, b.k, (u8 *)b.iv);
- chacha20_crypt(chacha_state, b.block0, pad0, sizeof(b.block0));
+ chacha_init(&chacha_state, b.k, (u8 *)b.iv);
+ chacha20_crypt(&chacha_state, b.block0, pad0, sizeof(b.block0));
poly1305_init(&poly1305_state, b.block0);
if (unlikely(ad_len)) {
@@ -277,13 +279,13 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src,
if (unlikely(length < sl))
l &= ~(CHACHA_BLOCK_SIZE - 1);
- chacha20_crypt(chacha_state, addr, addr, l);
+ chacha20_crypt(&chacha_state, addr, addr, l);
addr += l;
length -= l;
}
if (unlikely(length > 0)) {
- chacha20_crypt(chacha_state, b.chacha_stream, pad0,
+ chacha20_crypt(&chacha_state, b.chacha_stream, pad0,
CHACHA_BLOCK_SIZE);
crypto_xor(addr, b.chacha_stream, length);
partial = length;
@@ -318,13 +320,13 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src,
if (unlikely(sl > -POLY1305_DIGEST_SIZE)) {
poly1305_final(&poly1305_state, b.mac[1]);
- scatterwalk_map_and_copy(b.mac[encrypt], src, src_len,
- sizeof(b.mac[1]), encrypt);
+ sg_copy_buffer(src, sg_nents(src), b.mac[encrypt],
+ sizeof(b.mac[1]), src_len, !encrypt);
ret = encrypt ||
!crypto_memneq(b.mac[0], b.mac[1], POLY1305_DIGEST_SIZE);
}
- memzero_explicit(chacha_state, sizeof(chacha_state));
+ chacha_zeroize_state(&chacha_state);
memzero_explicit(&b, sizeof(b));
return ret;
@@ -333,7 +335,7 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src,
bool chacha20poly1305_encrypt_sg_inplace(struct scatterlist *src, size_t src_len,
const u8 *ad, const size_t ad_len,
const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
{
return chacha20poly1305_crypt_sg_inplace(src, src_len, ad, ad_len,
nonce, key, 1);
@@ -343,7 +345,7 @@ EXPORT_SYMBOL(chacha20poly1305_encrypt_sg_inplace);
bool chacha20poly1305_decrypt_sg_inplace(struct scatterlist *src, size_t src_len,
const u8 *ad, const size_t ad_len,
const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
{
if (unlikely(src_len < POLY1305_DIGEST_SIZE))
return false;
@@ -356,7 +358,7 @@ EXPORT_SYMBOL(chacha20poly1305_decrypt_sg_inplace);
static int __init chacha20poly1305_init(void)
{
- if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) &&
+ if (IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) &&
WARN_ON(!chacha20poly1305_selftest()))
return -ENODEV;
return 0;
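
The exported AEAD entry points keep their calling convention; only the internal
state type changes to struct chacha_state. A typical caller looks roughly like
this (a sketch: ct, pt, ad and their lengths are assumed, and ct must have room
for the 16-byte Poly1305 tag appended to the ciphertext):

	u8 key[CHACHA20POLY1305_KEY_SIZE];
	u64 nonce = 1;	/* must never repeat under the same key */

	chacha20poly1305_encrypt(ct, pt, pt_len, ad, ad_len, nonce, key);

	if (!chacha20poly1305_decrypt(pt, ct, pt_len + POLY1305_DIGEST_SIZE,
				      ad, ad_len, nonce, key))
		return -EBADMSG;	/* authentication failed */
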
diff --git a/lib/crypto/curve25519-fiat32.c b/lib/crypto/curve25519-fiat32.c
index 2fde0ec33dbd..2e0ba634e299 100644
--- a/lib/crypto/curve25519-fiat32.c
+++ b/lib/crypto/curve25519-fiat32.c
@@ -10,7 +10,7 @@
* with 128-bit integer types.
*/
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <crypto/curve25519.h>
#include <linux/string.h>
diff --git a/lib/crypto/curve25519-generic.c b/lib/crypto/curve25519-generic.c
deleted file mode 100644
index de7c99172fa2..000000000000
--- a/lib/crypto/curve25519-generic.c
+++ /dev/null
@@ -1,24 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * This is an implementation of the Curve25519 ECDH algorithm, using either
- * a 32-bit implementation or a 64-bit implementation with 128-bit integers,
- * depending on what is supported by the target compiler.
- *
- * Information: https://cr.yp.to/ecdh.html
- */
-
-#include <crypto/curve25519.h>
-#include <linux/module.h>
-
-const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 };
-const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 };
-
-EXPORT_SYMBOL(curve25519_null_point);
-EXPORT_SYMBOL(curve25519_base_point);
-EXPORT_SYMBOL(curve25519_generic);
-
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("Curve25519 scalar multiplication");
-MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/lib/crypto/curve25519-hacl64.c b/lib/crypto/curve25519-hacl64.c
index 771d82dc5f14..c4204133afb7 100644
--- a/lib/crypto/curve25519-hacl64.c
+++ b/lib/crypto/curve25519-hacl64.c
@@ -10,12 +10,10 @@
* integer types.
*/
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <crypto/curve25519.h>
#include <linux/string.h>
-typedef __uint128_t u128;
-
static __always_inline u64 u64_eq_mask(u64 a, u64 b)
{
u64 x = a ^ b;
diff --git a/lib/crypto/curve25519.c b/lib/crypto/curve25519.c
index 064b352c6907..01e265dfbcd9 100644
--- a/lib/crypto/curve25519.c
+++ b/lib/crypto/curve25519.c
@@ -2,32 +2,77 @@
/*
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*
- * This is an implementation of the Curve25519 ECDH algorithm, using either
- * a 32-bit implementation or a 64-bit implementation with 128-bit integers,
+ * This is an implementation of the Curve25519 ECDH algorithm, using either an
+ * architecture-optimized implementation or a generic implementation. The
+ * generic implementation is either 32-bit, or 64-bit with 128-bit integers,
* depending on what is supported by the target compiler.
*
* Information: https://cr.yp.to/ecdh.html
*/
#include <crypto/curve25519.h>
-#include <linux/module.h>
+#include <crypto/utils.h>
+#include <linux/export.h>
#include <linux/init.h>
+#include <linux/module.h>
-static int __init curve25519_init(void)
+static const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 };
+static const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 };
+
+#ifdef CONFIG_CRYPTO_LIB_CURVE25519_ARCH
+#include "curve25519.h" /* $(SRCARCH)/curve25519.h */
+#else
+static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE])
{
- if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) &&
- WARN_ON(!curve25519_selftest()))
- return -ENODEV;
- return 0;
+ curve25519_generic(mypublic, secret, basepoint);
+}
+
+static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
+{
+ curve25519_generic(pub, secret, curve25519_base_point);
+}
+#endif
+
+bool __must_check
+curve25519(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE])
+{
+ curve25519_arch(mypublic, secret, basepoint);
+ return crypto_memneq(mypublic, curve25519_null_point,
+ CURVE25519_KEY_SIZE);
+}
+EXPORT_SYMBOL(curve25519);
+
+bool __must_check
+curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
+{
+ if (unlikely(!crypto_memneq(secret, curve25519_null_point,
+ CURVE25519_KEY_SIZE)))
+ return false;
+ curve25519_base_arch(pub, secret);
+ return crypto_memneq(pub, curve25519_null_point, CURVE25519_KEY_SIZE);
}
+EXPORT_SYMBOL(curve25519_generate_public);
-static void __exit curve25519_exit(void)
+#ifdef curve25519_mod_init_arch
+static int __init curve25519_mod_init(void)
{
+ curve25519_mod_init_arch();
+ return 0;
}
+subsys_initcall(curve25519_mod_init);
-module_init(curve25519_init);
-module_exit(curve25519_exit);
+static void __exit curve25519_mod_exit(void)
+{
+}
+module_exit(curve25519_mod_exit);
+#endif
MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("Curve25519 scalar multiplication");
+MODULE_DESCRIPTION("Curve25519 algorithm");
MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
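
The __must_check bool returns added here reject an all-zero result, which
catches the null point and other degenerate inputs. A minimal key-exchange
sketch using the two exported helpers (illustrative only; generating and
clamping the secret, and obtaining peer_pub, are out of scope):

	u8 secret[CURVE25519_KEY_SIZE], pub[CURVE25519_KEY_SIZE];
	u8 shared[CURVE25519_KEY_SIZE];

	if (!curve25519_generate_public(pub, secret))
		return -EKEYREJECTED;	/* all-zero secret or public key */

	/* Later, with the peer's public key: */
	if (!curve25519(shared, secret, peer_pub))
		return -EKEYREJECTED;	/* contributory-behaviour check failed */
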
diff --git a/lib/crypto/des.c b/lib/crypto/des.c
index ef5bb8822aba..a906070136dc 100644
--- a/lib/crypto/des.c
+++ b/lib/crypto/des.c
@@ -7,20 +7,19 @@
* Copyright (c) 2005 Dag Arne Osvik <da@osvik.no>
*/
+#include <crypto/des.h>
+#include <crypto/internal/des.h>
#include <linux/bitops.h>
#include <linux/compiler.h>
#include <linux/crypto.h>
#include <linux/errno.h>
+#include <linux/export.h>
#include <linux/fips.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/types.h>
-
-#include <asm/unaligned.h>
-
-#include <crypto/des.h>
-#include <crypto/internal/des.h>
+#include <linux/unaligned.h>
#define ROL(x, r) ((x) = rol32((x), (r)))
#define ROR(x, r) ((x) = ror32((x), (r)))
@@ -899,4 +898,5 @@ void des3_ede_decrypt(const struct des3_ede_ctx *dctx, u8 *dst, const u8 *src)
}
EXPORT_SYMBOL_GPL(des3_ede_decrypt);
+MODULE_DESCRIPTION("DES & Triple DES EDE Cipher Algorithms");
MODULE_LICENSE("GPL");
diff --git a/lib/crypto/fips.h b/lib/crypto/fips.h
new file mode 100644
index 000000000000..023410c2e0db
--- /dev/null
+++ b/lib/crypto/fips.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: gen-fips-testvecs.py */
+
+#include <linux/fips.h>
+
+static const u8 fips_test_data[] __initconst __maybe_unused = {
+ 0x66, 0x69, 0x70, 0x73, 0x20, 0x74, 0x65, 0x73,
+ 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00,
+};
+
+static const u8 fips_test_key[] __initconst __maybe_unused = {
+ 0x66, 0x69, 0x70, 0x73, 0x20, 0x74, 0x65, 0x73,
+ 0x74, 0x20, 0x6b, 0x65, 0x79, 0x00, 0x00, 0x00,
+};
+
+static const u8 fips_test_hmac_sha1_value[] __initconst __maybe_unused = {
+ 0x29, 0xa9, 0x88, 0xb8, 0x5c, 0xb4, 0xaf, 0x4b,
+ 0x97, 0x2a, 0xee, 0x87, 0x5b, 0x0a, 0x02, 0x55,
+ 0x99, 0xbf, 0x86, 0x78,
+};
+
+static const u8 fips_test_hmac_sha256_value[] __initconst __maybe_unused = {
+ 0x59, 0x25, 0x85, 0xcc, 0x40, 0xe9, 0x64, 0x2f,
+ 0xe9, 0xbf, 0x82, 0xb7, 0xd3, 0x15, 0x3d, 0x43,
+ 0x22, 0x0b, 0x4c, 0x00, 0x90, 0x14, 0x25, 0xcf,
+ 0x9e, 0x13, 0x2b, 0xc2, 0x30, 0xe6, 0xe8, 0x93,
+};
+
+static const u8 fips_test_hmac_sha512_value[] __initconst __maybe_unused = {
+ 0x6b, 0xea, 0x5d, 0x27, 0x49, 0x5b, 0x3f, 0xea,
+ 0xde, 0x2d, 0xfa, 0x32, 0x75, 0xdb, 0x77, 0xc8,
+ 0x26, 0xe9, 0x4e, 0x95, 0x4d, 0xad, 0x88, 0x02,
+ 0x87, 0xf9, 0x52, 0x0a, 0xd1, 0x92, 0x80, 0x1d,
+ 0x92, 0x7e, 0x3c, 0xbd, 0xb1, 0x3c, 0x49, 0x98,
+ 0x44, 0x9c, 0x8f, 0xee, 0x3f, 0x02, 0x71, 0x51,
+ 0x57, 0x0b, 0x15, 0x38, 0x95, 0xd8, 0xa3, 0x81,
+ 0xba, 0xb3, 0x15, 0x37, 0x5c, 0x6d, 0x57, 0x2b,
+};
+
+static const u8 fips_test_sha3_256_value[] __initconst __maybe_unused = {
+ 0x77, 0xc4, 0x8b, 0x69, 0x70, 0x5f, 0x0a, 0xb1,
+ 0xb1, 0xa5, 0x82, 0x0a, 0x22, 0x2b, 0x49, 0x31,
+ 0xba, 0x9b, 0xb6, 0xaa, 0x32, 0xa7, 0x97, 0x00,
+ 0x98, 0xdb, 0xff, 0xe7, 0xc6, 0xde, 0xb5, 0x82,
+};
diff --git a/lib/crypto/gf128mul.c b/lib/crypto/gf128mul.c
index 8f8c45e0cdcf..2a34590fe3f1 100644
--- a/lib/crypto/gf128mul.c
+++ b/lib/crypto/gf128mul.c
@@ -49,6 +49,7 @@
*/
#include <crypto/gf128mul.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -225,44 +226,6 @@ void gf128mul_lle(be128 *r, const be128 *b)
}
EXPORT_SYMBOL(gf128mul_lle);
-void gf128mul_bbe(be128 *r, const be128 *b)
-{
- be128 p[8];
- int i;
-
- p[0] = *r;
- for (i = 0; i < 7; ++i)
- gf128mul_x_bbe(&p[i + 1], &p[i]);
-
- memset(r, 0, sizeof(*r));
- for (i = 0;;) {
- u8 ch = ((u8 *)b)[i];
-
- if (ch & 0x80)
- be128_xor(r, r, &p[7]);
- if (ch & 0x40)
- be128_xor(r, r, &p[6]);
- if (ch & 0x20)
- be128_xor(r, r, &p[5]);
- if (ch & 0x10)
- be128_xor(r, r, &p[4]);
- if (ch & 0x08)
- be128_xor(r, r, &p[3]);
- if (ch & 0x04)
- be128_xor(r, r, &p[2]);
- if (ch & 0x02)
- be128_xor(r, r, &p[1]);
- if (ch & 0x01)
- be128_xor(r, r, &p[0]);
-
- if (++i >= 16)
- break;
-
- gf128mul_x8_bbe(r);
- }
-}
-EXPORT_SYMBOL(gf128mul_bbe);
-
/* This version uses 64k bytes of table space.
A 16 byte buffer has to be multiplied by a 16 byte key
value in GF(2^128). If we consider a GF(2^128) value in
@@ -380,28 +343,6 @@ out:
}
EXPORT_SYMBOL(gf128mul_init_4k_lle);
-struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g)
-{
- struct gf128mul_4k *t;
- int j, k;
-
- t = kzalloc(sizeof(*t), GFP_KERNEL);
- if (!t)
- goto out;
-
- t->t[1] = *g;
- for (j = 1; j <= 64; j <<= 1)
- gf128mul_x_bbe(&t->t[j + j], &t->t[j]);
-
- for (j = 2; j < 256; j += j)
- for (k = 1; k < j; ++k)
- be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
-
-out:
- return t;
-}
-EXPORT_SYMBOL(gf128mul_init_4k_bbe);
-
void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t)
{
u8 *ap = (u8 *)a;
@@ -417,20 +358,5 @@ void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t)
}
EXPORT_SYMBOL(gf128mul_4k_lle);
-void gf128mul_4k_bbe(be128 *a, const struct gf128mul_4k *t)
-{
- u8 *ap = (u8 *)a;
- be128 r[1];
- int i = 0;
-
- *r = t->t[ap[0]];
- while (++i < 16) {
- gf128mul_x8_bbe(r);
- be128_xor(r, r, &t->t[ap[i]]);
- }
- *a = *r;
-}
-EXPORT_SYMBOL(gf128mul_4k_bbe);
-
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");
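
With the bbe variants removed, the remaining table-driven path is the lle one:
build a 4 KiB table from the 16-byte hash key once, then multiply each block in
place. A sketch of that usage (h is an assumed be128 key variable; the table is
freed with kfree_sensitive() since it encodes key material):

	struct gf128mul_4k *t;
	be128 x;	/* current 16-byte block, updated in place */

	t = gf128mul_init_4k_lle(&h);
	if (!t)
		return -ENOMEM;

	/* ... load a block into x ... */
	gf128mul_4k_lle(&x, t);		/* x := x * h, lle bit convention */

	kfree_sensitive(t);
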
diff --git a/lib/crypto/hash_info.c b/lib/crypto/hash_info.c
new file mode 100644
index 000000000000..9a467638c971
--- /dev/null
+++ b/lib/crypto/hash_info.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Hash Info: Hash algorithms information
+ *
+ * Copyright (c) 2013 Dmitry Kasatkin <d.kasatkin@samsung.com>
+ */
+
+#include <linux/export.h>
+#include <crypto/hash_info.h>
+
+const char *const hash_algo_name[HASH_ALGO__LAST] = {
+ [HASH_ALGO_MD4] = "md4",
+ [HASH_ALGO_MD5] = "md5",
+ [HASH_ALGO_SHA1] = "sha1",
+ [HASH_ALGO_RIPE_MD_160] = "rmd160",
+ [HASH_ALGO_SHA256] = "sha256",
+ [HASH_ALGO_SHA384] = "sha384",
+ [HASH_ALGO_SHA512] = "sha512",
+ [HASH_ALGO_SHA224] = "sha224",
+ [HASH_ALGO_RIPE_MD_128] = "rmd128",
+ [HASH_ALGO_RIPE_MD_256] = "rmd256",
+ [HASH_ALGO_RIPE_MD_320] = "rmd320",
+ [HASH_ALGO_WP_256] = "wp256",
+ [HASH_ALGO_WP_384] = "wp384",
+ [HASH_ALGO_WP_512] = "wp512",
+ [HASH_ALGO_TGR_128] = "tgr128",
+ [HASH_ALGO_TGR_160] = "tgr160",
+ [HASH_ALGO_TGR_192] = "tgr192",
+ [HASH_ALGO_SM3_256] = "sm3",
+ [HASH_ALGO_STREEBOG_256] = "streebog256",
+ [HASH_ALGO_STREEBOG_512] = "streebog512",
+ [HASH_ALGO_SHA3_256] = "sha3-256",
+ [HASH_ALGO_SHA3_384] = "sha3-384",
+ [HASH_ALGO_SHA3_512] = "sha3-512",
+};
+EXPORT_SYMBOL_GPL(hash_algo_name);
+
+const int hash_digest_size[HASH_ALGO__LAST] = {
+ [HASH_ALGO_MD4] = MD5_DIGEST_SIZE,
+ [HASH_ALGO_MD5] = MD5_DIGEST_SIZE,
+ [HASH_ALGO_SHA1] = SHA1_DIGEST_SIZE,
+ [HASH_ALGO_RIPE_MD_160] = RMD160_DIGEST_SIZE,
+ [HASH_ALGO_SHA256] = SHA256_DIGEST_SIZE,
+ [HASH_ALGO_SHA384] = SHA384_DIGEST_SIZE,
+ [HASH_ALGO_SHA512] = SHA512_DIGEST_SIZE,
+ [HASH_ALGO_SHA224] = SHA224_DIGEST_SIZE,
+ [HASH_ALGO_RIPE_MD_128] = RMD128_DIGEST_SIZE,
+ [HASH_ALGO_RIPE_MD_256] = RMD256_DIGEST_SIZE,
+ [HASH_ALGO_RIPE_MD_320] = RMD320_DIGEST_SIZE,
+ [HASH_ALGO_WP_256] = WP256_DIGEST_SIZE,
+ [HASH_ALGO_WP_384] = WP384_DIGEST_SIZE,
+ [HASH_ALGO_WP_512] = WP512_DIGEST_SIZE,
+ [HASH_ALGO_TGR_128] = TGR128_DIGEST_SIZE,
+ [HASH_ALGO_TGR_160] = TGR160_DIGEST_SIZE,
+ [HASH_ALGO_TGR_192] = TGR192_DIGEST_SIZE,
+ [HASH_ALGO_SM3_256] = SM3256_DIGEST_SIZE,
+ [HASH_ALGO_STREEBOG_256] = STREEBOG256_DIGEST_SIZE,
+ [HASH_ALGO_STREEBOG_512] = STREEBOG512_DIGEST_SIZE,
+ [HASH_ALGO_SHA3_256] = SHA3_256_DIGEST_SIZE,
+ [HASH_ALGO_SHA3_384] = SHA3_384_DIGEST_SIZE,
+ [HASH_ALGO_SHA3_512] = SHA3_512_DIGEST_SIZE,
+};
+EXPORT_SYMBOL_GPL(hash_digest_size);
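The two exported tables above are parallel arrays indexed by the HASH_ALGO_* enum. A minimal, hypothetical consumer (not part of this patch) could look like this:

#include <crypto/hash_info.h>
#include <linux/printk.h>

/* Hypothetical helper: report the canonical name and digest length. */
static void report_hash_algo(enum hash_algo algo)
{
	if (algo >= HASH_ALGO__LAST)
		return;

	pr_info("%s produces a %d-byte digest\n",
		hash_algo_name[algo], hash_digest_size[algo]);
}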
diff --git a/lib/crypto/libchacha.c b/lib/crypto/libchacha.c
deleted file mode 100644
index dabc3accae05..000000000000
--- a/lib/crypto/libchacha.c
+++ /dev/null
@@ -1,35 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * The ChaCha stream cipher (RFC7539)
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/module.h>
-
-#include <crypto/algapi.h> // for crypto_xor_cpy
-#include <crypto/chacha.h>
-
-void chacha_crypt_generic(u32 *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- /* aligned to potentially speed up crypto_xor() */
- u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));
-
- while (bytes >= CHACHA_BLOCK_SIZE) {
- chacha_block_generic(state, stream, nrounds);
- crypto_xor_cpy(dst, src, stream, CHACHA_BLOCK_SIZE);
- bytes -= CHACHA_BLOCK_SIZE;
- dst += CHACHA_BLOCK_SIZE;
- src += CHACHA_BLOCK_SIZE;
- }
- if (bytes) {
- chacha_block_generic(state, stream, nrounds);
- crypto_xor_cpy(dst, src, stream, bytes);
- }
-}
-EXPORT_SYMBOL(chacha_crypt_generic);
-
-MODULE_LICENSE("GPL");
diff --git a/lib/crypto/md5.c b/lib/crypto/md5.c
new file mode 100644
index 000000000000..c0610ea1370e
--- /dev/null
+++ b/lib/crypto/md5.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * MD5 and HMAC-MD5 library functions
+ *
+ * md5_block_generic() is derived from cryptoapi implementation, originally
+ * based on the public domain implementation written by Colin Plumb in 1993.
+ *
+ * Copyright (c) Cryptoapi developers.
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * Copyright 2025 Google LLC
+ */
+
+#include <crypto/hmac.h>
+#include <crypto/md5.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+#include <linux/wordpart.h>
+
+static const struct md5_block_state md5_iv = {
+ .h = { MD5_H0, MD5_H1, MD5_H2, MD5_H3 },
+};
+
+#define F1(x, y, z) (z ^ (x & (y ^ z)))
+#define F2(x, y, z) F1(z, x, y)
+#define F3(x, y, z) (x ^ y ^ z)
+#define F4(x, y, z) (y ^ (x | ~z))
+
+#define MD5STEP(f, w, x, y, z, in, s) \
+ (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x)
+
+static void md5_block_generic(struct md5_block_state *state,
+ const u8 data[MD5_BLOCK_SIZE])
+{
+ u32 in[MD5_BLOCK_WORDS];
+ u32 a, b, c, d;
+
+ memcpy(in, data, MD5_BLOCK_SIZE);
+ le32_to_cpu_array(in, ARRAY_SIZE(in));
+
+ a = state->h[0];
+ b = state->h[1];
+ c = state->h[2];
+ d = state->h[3];
+
+ MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
+ MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
+ MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
+ MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
+ MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
+ MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
+ MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
+ MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
+ MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
+ MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
+ MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
+ MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
+ MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
+ MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
+ MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
+ MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
+
+ MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
+ MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
+ MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
+ MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
+ MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
+ MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
+ MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
+ MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
+ MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
+ MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
+ MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
+ MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
+ MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
+ MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
+ MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
+ MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
+
+ MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
+ MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
+ MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
+ MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
+ MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
+ MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
+ MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
+ MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
+ MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
+ MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
+ MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
+ MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
+ MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
+ MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
+ MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
+ MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
+
+ MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
+ MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
+ MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
+ MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
+ MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
+ MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
+ MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
+ MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
+ MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
+ MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
+ MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
+ MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
+ MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
+ MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
+ MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
+ MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
+
+ state->h[0] += a;
+ state->h[1] += b;
+ state->h[2] += c;
+ state->h[3] += d;
+}
+
+static void __maybe_unused md5_blocks_generic(struct md5_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ do {
+ md5_block_generic(state, data);
+ data += MD5_BLOCK_SIZE;
+ } while (--nblocks);
+}
+
+#ifdef CONFIG_CRYPTO_LIB_MD5_ARCH
+#include "md5.h" /* $(SRCARCH)/md5.h */
+#else
+#define md5_blocks md5_blocks_generic
+#endif
+
+void md5_init(struct md5_ctx *ctx)
+{
+ ctx->state = md5_iv;
+ ctx->bytecount = 0;
+}
+EXPORT_SYMBOL_GPL(md5_init);
+
+void md5_update(struct md5_ctx *ctx, const u8 *data, size_t len)
+{
+ size_t partial = ctx->bytecount % MD5_BLOCK_SIZE;
+
+ ctx->bytecount += len;
+
+ if (partial + len >= MD5_BLOCK_SIZE) {
+ size_t nblocks;
+
+ if (partial) {
+ size_t l = MD5_BLOCK_SIZE - partial;
+
+ memcpy(&ctx->buf[partial], data, l);
+ data += l;
+ len -= l;
+
+ md5_blocks(&ctx->state, ctx->buf, 1);
+ }
+
+ nblocks = len / MD5_BLOCK_SIZE;
+ len %= MD5_BLOCK_SIZE;
+
+ if (nblocks) {
+ md5_blocks(&ctx->state, data, nblocks);
+ data += nblocks * MD5_BLOCK_SIZE;
+ }
+ partial = 0;
+ }
+ if (len)
+ memcpy(&ctx->buf[partial], data, len);
+}
+EXPORT_SYMBOL_GPL(md5_update);
+
+static void __md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE])
+{
+ u64 bitcount = ctx->bytecount << 3;
+ size_t partial = ctx->bytecount % MD5_BLOCK_SIZE;
+
+ ctx->buf[partial++] = 0x80;
+ if (partial > MD5_BLOCK_SIZE - 8) {
+ memset(&ctx->buf[partial], 0, MD5_BLOCK_SIZE - partial);
+ md5_blocks(&ctx->state, ctx->buf, 1);
+ partial = 0;
+ }
+ memset(&ctx->buf[partial], 0, MD5_BLOCK_SIZE - 8 - partial);
+ *(__le64 *)&ctx->buf[MD5_BLOCK_SIZE - 8] = cpu_to_le64(bitcount);
+ md5_blocks(&ctx->state, ctx->buf, 1);
+
+ cpu_to_le32_array(ctx->state.h, ARRAY_SIZE(ctx->state.h));
+ memcpy(out, ctx->state.h, MD5_DIGEST_SIZE);
+}
+
+void md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE])
+{
+ __md5_final(ctx, out);
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(md5_final);
+
+void md5(const u8 *data, size_t len, u8 out[MD5_DIGEST_SIZE])
+{
+ struct md5_ctx ctx;
+
+ md5_init(&ctx);
+ md5_update(&ctx, data, len);
+ md5_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(md5);
+
+static void __hmac_md5_preparekey(struct md5_block_state *istate,
+ struct md5_block_state *ostate,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ union {
+ u8 b[MD5_BLOCK_SIZE];
+ unsigned long w[MD5_BLOCK_SIZE / sizeof(unsigned long)];
+ } derived_key = { 0 };
+
+ if (unlikely(raw_key_len > MD5_BLOCK_SIZE))
+ md5(raw_key, raw_key_len, derived_key.b);
+ else
+ memcpy(derived_key.b, raw_key, raw_key_len);
+
+ for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+ derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
+ *istate = md5_iv;
+ md5_blocks(istate, derived_key.b, 1);
+
+ for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+ derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
+ HMAC_IPAD_VALUE);
+ *ostate = md5_iv;
+ md5_blocks(ostate, derived_key.b, 1);
+
+ memzero_explicit(&derived_key, sizeof(derived_key));
+}
+
+void hmac_md5_preparekey(struct hmac_md5_key *key,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_md5_preparekey(&key->istate, &key->ostate, raw_key, raw_key_len);
+}
+EXPORT_SYMBOL_GPL(hmac_md5_preparekey);
+
+void hmac_md5_init(struct hmac_md5_ctx *ctx, const struct hmac_md5_key *key)
+{
+ ctx->hash_ctx.state = key->istate;
+ ctx->hash_ctx.bytecount = MD5_BLOCK_SIZE;
+ ctx->ostate = key->ostate;
+}
+EXPORT_SYMBOL_GPL(hmac_md5_init);
+
+void hmac_md5_init_usingrawkey(struct hmac_md5_ctx *ctx,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_md5_preparekey(&ctx->hash_ctx.state, &ctx->ostate,
+ raw_key, raw_key_len);
+ ctx->hash_ctx.bytecount = MD5_BLOCK_SIZE;
+}
+EXPORT_SYMBOL_GPL(hmac_md5_init_usingrawkey);
+
+void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE])
+{
+ /* Generate the padded input for the outer hash in ctx->hash_ctx.buf. */
+ __md5_final(&ctx->hash_ctx, ctx->hash_ctx.buf);
+ memset(&ctx->hash_ctx.buf[MD5_DIGEST_SIZE], 0,
+ MD5_BLOCK_SIZE - MD5_DIGEST_SIZE);
+ ctx->hash_ctx.buf[MD5_DIGEST_SIZE] = 0x80;
+ *(__le64 *)&ctx->hash_ctx.buf[MD5_BLOCK_SIZE - 8] =
+ cpu_to_le64(8 * (MD5_BLOCK_SIZE + MD5_DIGEST_SIZE));
+
+ /* Compute the outer hash, which gives the HMAC value. */
+ md5_blocks(&ctx->ostate, ctx->hash_ctx.buf, 1);
+ cpu_to_le32_array(ctx->ostate.h, ARRAY_SIZE(ctx->ostate.h));
+ memcpy(out, ctx->ostate.h, MD5_DIGEST_SIZE);
+
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(hmac_md5_final);
+
+void hmac_md5(const struct hmac_md5_key *key,
+ const u8 *data, size_t data_len, u8 out[MD5_DIGEST_SIZE])
+{
+ struct hmac_md5_ctx ctx;
+
+ hmac_md5_init(&ctx, key);
+ hmac_md5_update(&ctx, data, data_len);
+ hmac_md5_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_md5);
+
+void hmac_md5_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+ const u8 *data, size_t data_len,
+ u8 out[MD5_DIGEST_SIZE])
+{
+ struct hmac_md5_ctx ctx;
+
+ hmac_md5_init_usingrawkey(&ctx, raw_key, raw_key_len);
+ hmac_md5_update(&ctx, data, data_len);
+ hmac_md5_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_md5_usingrawkey);
+
+#ifdef md5_mod_init_arch
+static int __init md5_mod_init(void)
+{
+ md5_mod_init_arch();
+ return 0;
+}
+subsys_initcall(md5_mod_init);
+
+static void __exit md5_mod_exit(void)
+{
+}
+module_exit(md5_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("MD5 and HMAC-MD5 library functions");
+MODULE_LICENSE("GPL");
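The exported API above follows the usual init/update/final shape, and the HMAC variants precompute the RFC 2104 inner and outer states (the MD5 states after absorbing one block of the key XORed with ipad and opad, respectively). A hedged usage sketch that only calls functions defined or referenced in this file; md5_example() and its buffers are illustrative:

#include <crypto/md5.h>

static void md5_example(const u8 *msg, size_t len,
			const u8 *key, size_t key_len)
{
	struct md5_ctx ctx;
	u8 digest[MD5_DIGEST_SIZE];
	u8 mac[MD5_DIGEST_SIZE];

	/* Incremental hashing: init, any number of updates, then final. */
	md5_init(&ctx);
	md5_update(&ctx, msg, len);
	md5_final(&ctx, digest);	/* md5_final() also wipes ctx */

	/* One-shot HMAC-MD5 from a raw (unprepared) key. */
	hmac_md5_usingrawkey(key, key_len, msg, len, mac);
}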
diff --git a/lib/crypto/memneq.c b/lib/crypto/memneq.c
index 243d8677cc51..44daacb8cb51 100644
--- a/lib/crypto/memneq.c
+++ b/lib/crypto/memneq.c
@@ -59,9 +59,10 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#include <asm/unaligned.h>
#include <crypto/algapi.h>
+#include <linux/export.h>
#include <linux/module.h>
+#include <linux/unaligned.h>
/* Generic path for arbitrary size */
static inline unsigned long
diff --git a/lib/crypto/mips/.gitignore b/lib/crypto/mips/.gitignore
new file mode 100644
index 000000000000..0d47d4f21c6d
--- /dev/null
+++ b/lib/crypto/mips/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+poly1305-core.S
diff --git a/lib/crypto/mips/chacha-core.S b/lib/crypto/mips/chacha-core.S
new file mode 100644
index 000000000000..706aeb850fb0
--- /dev/null
+++ b/lib/crypto/mips/chacha-core.S
@@ -0,0 +1,491 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#define MASK_U32 0x3c
+#define CHACHA20_BLOCK_SIZE 64
+#define STACK_SIZE 32
+
+#define X0 $t0
+#define X1 $t1
+#define X2 $t2
+#define X3 $t3
+#define X4 $t4
+#define X5 $t5
+#define X6 $t6
+#define X7 $t7
+#define X8 $t8
+#define X9 $t9
+#define X10 $v1
+#define X11 $s6
+#define X12 $s5
+#define X13 $s4
+#define X14 $s3
+#define X15 $s2
+/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
+#define T0 $s1
+#define T1 $s0
+#define T(n) T ## n
+#define X(n) X ## n
+
+/* Input arguments */
+#define STATE $a0
+#define OUT $a1
+#define IN $a2
+#define BYTES $a3
+
+/* Output argument */
+/* NONCE[0] is kept in a register and not in memory.
+ * We don't want to touch the original value in memory.
+ * It must be incremented every loop iteration.
+ */
+#define NONCE_0 $v0
+
+/* SAVED_X and SAVED_CA are set in the jump table.
+ * Use regs which are overwritten on exit so we don't leak clear data.
+ * They are used to handle the last bytes, which are not a multiple of 4.
+ */
+#define SAVED_X X15
+#define SAVED_CA $s7
+
+#define IS_UNALIGNED $s7
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define MSB 0
+#define LSB 3
+#define CPU_TO_LE32(n) \
+ wsbh n, n; \
+ rotr n, 16;
+#else
+#define MSB 3
+#define LSB 0
+#define CPU_TO_LE32(n)
+#endif
+
+#define FOR_EACH_WORD(x) \
+ x( 0); \
+ x( 1); \
+ x( 2); \
+ x( 3); \
+ x( 4); \
+ x( 5); \
+ x( 6); \
+ x( 7); \
+ x( 8); \
+ x( 9); \
+ x(10); \
+ x(11); \
+ x(12); \
+ x(13); \
+ x(14); \
+ x(15);
+
+#define FOR_EACH_WORD_REV(x) \
+ x(15); \
+ x(14); \
+ x(13); \
+ x(12); \
+ x(11); \
+ x(10); \
+ x( 9); \
+ x( 8); \
+ x( 7); \
+ x( 6); \
+ x( 5); \
+ x( 4); \
+ x( 3); \
+ x( 2); \
+ x( 1); \
+ x( 0);
+
+#define PLUS_ONE_0 1
+#define PLUS_ONE_1 2
+#define PLUS_ONE_2 3
+#define PLUS_ONE_3 4
+#define PLUS_ONE_4 5
+#define PLUS_ONE_5 6
+#define PLUS_ONE_6 7
+#define PLUS_ONE_7 8
+#define PLUS_ONE_8 9
+#define PLUS_ONE_9 10
+#define PLUS_ONE_10 11
+#define PLUS_ONE_11 12
+#define PLUS_ONE_12 13
+#define PLUS_ONE_13 14
+#define PLUS_ONE_14 15
+#define PLUS_ONE_15 16
+#define PLUS_ONE(x) PLUS_ONE_ ## x
+#define _CONCAT3(a,b,c) a ## b ## c
+#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
+
+#define STORE_UNALIGNED(x) \
+CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
+ .if (x != 12); \
+ lw T0, (x*4)(STATE); \
+ .endif; \
+ lwl T1, (x*4)+MSB ## (IN); \
+ lwr T1, (x*4)+LSB ## (IN); \
+ .if (x == 12); \
+ addu X ## x, NONCE_0; \
+ .else; \
+ addu X ## x, T0; \
+ .endif; \
+ CPU_TO_LE32(X ## x); \
+ xor X ## x, T1; \
+ swl X ## x, (x*4)+MSB ## (OUT); \
+ swr X ## x, (x*4)+LSB ## (OUT);
+
+#define STORE_ALIGNED(x) \
+CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
+ .if (x != 12); \
+ lw T0, (x*4)(STATE); \
+ .endif; \
+ lw T1, (x*4) ## (IN); \
+ .if (x == 12); \
+ addu X ## x, NONCE_0; \
+ .else; \
+ addu X ## x, T0; \
+ .endif; \
+ CPU_TO_LE32(X ## x); \
+ xor X ## x, T1; \
+ sw X ## x, (x*4) ## (OUT);
+
+/* Jump table macro.
+ * Used for setup and for handling the last bytes, which are not a multiple of 4.
+ * X15 is free to store Xn
+ * Every jumptable entry must be equal in size.
+ */
+#define JMPTBL_ALIGNED(x) \
+.Lchacha_mips_jmptbl_aligned_ ## x: ; \
+ .set noreorder; \
+ b .Lchacha_mips_xor_aligned_ ## x ## _b; \
+ .if (x == 12); \
+ addu SAVED_X, X ## x, NONCE_0; \
+ .else; \
+ addu SAVED_X, X ## x, SAVED_CA; \
+ .endif; \
+ .set reorder
+
+#define JMPTBL_UNALIGNED(x) \
+.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
+ .set noreorder; \
+ b .Lchacha_mips_xor_unaligned_ ## x ## _b; \
+ .if (x == 12); \
+ addu SAVED_X, X ## x, NONCE_0; \
+ .else; \
+ addu SAVED_X, X ## x, SAVED_CA; \
+ .endif; \
+ .set reorder
+
+#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
+ addu X(A), X(K); \
+ addu X(B), X(L); \
+ addu X(C), X(M); \
+ addu X(D), X(N); \
+ xor X(V), X(A); \
+ xor X(W), X(B); \
+ xor X(Y), X(C); \
+ xor X(Z), X(D); \
+ rotr X(V), 32 - S; \
+ rotr X(W), 32 - S; \
+ rotr X(Y), 32 - S; \
+ rotr X(Z), 32 - S;
+
+.text
+.set reorder
+.set noat
+.globl chacha_crypt_arch
+.ent chacha_crypt_arch
+chacha_crypt_arch:
+ .frame $sp, STACK_SIZE, $ra
+
+ /* Load number of rounds */
+ lw $at, 16($sp)
+
+ addiu $sp, -STACK_SIZE
+
+	/* Return if BYTES == 0. */
+ beqz BYTES, .Lchacha_mips_end
+
+ lw NONCE_0, 48(STATE)
+
+ /* Save s0-s7 */
+ sw $s0, 0($sp)
+ sw $s1, 4($sp)
+ sw $s2, 8($sp)
+ sw $s3, 12($sp)
+ sw $s4, 16($sp)
+ sw $s5, 20($sp)
+ sw $s6, 24($sp)
+ sw $s7, 28($sp)
+
+	/* Test whether IN or OUT is unaligned.
+ * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
+ */
+ or IS_UNALIGNED, IN, OUT
+ andi IS_UNALIGNED, 0x3
+
+ b .Lchacha_rounds_start
+
+.align 4
+.Loop_chacha_rounds:
+ addiu IN, CHACHA20_BLOCK_SIZE
+ addiu OUT, CHACHA20_BLOCK_SIZE
+ addiu NONCE_0, 1
+
+.Lchacha_rounds_start:
+ lw X0, 0(STATE)
+ lw X1, 4(STATE)
+ lw X2, 8(STATE)
+ lw X3, 12(STATE)
+
+ lw X4, 16(STATE)
+ lw X5, 20(STATE)
+ lw X6, 24(STATE)
+ lw X7, 28(STATE)
+ lw X8, 32(STATE)
+ lw X9, 36(STATE)
+ lw X10, 40(STATE)
+ lw X11, 44(STATE)
+
+ move X12, NONCE_0
+ lw X13, 52(STATE)
+ lw X14, 56(STATE)
+ lw X15, 60(STATE)
+
+.Loop_chacha_xor_rounds:
+ addiu $at, -2
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
+ bnez $at, .Loop_chacha_xor_rounds
+
+ addiu BYTES, -(CHACHA20_BLOCK_SIZE)
+
+ /* Is data src/dst unaligned? Jump */
+ bnez IS_UNALIGNED, .Loop_chacha_unaligned
+
+	/* Set the number of rounds here to fill the delay slot. */
+ lw $at, (STACK_SIZE+16)($sp)
+
+	/* If BYTES < 0, the remaining data is less than a full block. */
+ bltz BYTES, .Lchacha_mips_no_full_block_aligned
+
+ FOR_EACH_WORD_REV(STORE_ALIGNED)
+
+ /* BYTES > 0? Loop again. */
+ bgtz BYTES, .Loop_chacha_rounds
+
+ /* Place this here to fill delay slot */
+ addiu NONCE_0, 1
+
+ /* BYTES < 0? Handle last bytes */
+ bltz BYTES, .Lchacha_mips_xor_bytes
+
+.Lchacha_mips_xor_done:
+ /* Restore used registers */
+ lw $s0, 0($sp)
+ lw $s1, 4($sp)
+ lw $s2, 8($sp)
+ lw $s3, 12($sp)
+ lw $s4, 16($sp)
+ lw $s5, 20($sp)
+ lw $s6, 24($sp)
+ lw $s7, 28($sp)
+
+	/* Write NONCE_0 back to the right location in the state */
+ sw NONCE_0, 48(STATE)
+
+.Lchacha_mips_end:
+ addiu $sp, STACK_SIZE
+ jr $ra
+
+.Lchacha_mips_no_full_block_aligned:
+ /* Restore the offset on BYTES */
+ addiu BYTES, CHACHA20_BLOCK_SIZE
+
+ /* Get number of full WORDS */
+ andi $at, BYTES, MASK_U32
+
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)
+
+ /* Calculate lower half jump table offset */
+ ins T0, $at, 1, 6
+
+ /* Add offset to STATE */
+ addu T1, STATE, $at
+
+ /* Add lower half jump table addr */
+ addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0)
+
+ /* Read value from STATE */
+ lw SAVED_CA, 0(T1)
+
+ /* Store remaining bytecounter as negative value */
+ subu BYTES, $at, BYTES
+
+ jr T0
+
+ /* Jump table */
+ FOR_EACH_WORD(JMPTBL_ALIGNED)
+
+
+.Loop_chacha_unaligned:
+	/* Set the number of rounds here to fill the delay slot. */
+ lw $at, (STACK_SIZE+16)($sp)
+
+	/* If BYTES < 0, the remaining data is less than a full block. */
+ bltz BYTES, .Lchacha_mips_no_full_block_unaligned
+
+ FOR_EACH_WORD_REV(STORE_UNALIGNED)
+
+ /* BYTES > 0? Loop again. */
+ bgtz BYTES, .Loop_chacha_rounds
+
+	/* Write NONCE_0 back to the right location in the state */
+ sw NONCE_0, 48(STATE)
+
+ .set noreorder
+ /* Fall through to byte handling */
+ bgez BYTES, .Lchacha_mips_xor_done
+.Lchacha_mips_xor_unaligned_0_b:
+.Lchacha_mips_xor_aligned_0_b:
+ /* Place this here to fill delay slot */
+ addiu NONCE_0, 1
+ .set reorder
+
+.Lchacha_mips_xor_bytes:
+ addu IN, $at
+ addu OUT, $at
+ /* First byte */
+ lbu T1, 0(IN)
+ addiu $at, BYTES, 1
+ xor T1, SAVED_X
+ sb T1, 0(OUT)
+ beqz $at, .Lchacha_mips_xor_done
+ /* Second byte */
+ lbu T1, 1(IN)
+ addiu $at, BYTES, 2
+ rotr SAVED_X, 8
+ xor T1, SAVED_X
+ sb T1, 1(OUT)
+ beqz $at, .Lchacha_mips_xor_done
+ /* Third byte */
+ lbu T1, 2(IN)
+ rotr SAVED_X, 8
+ xor T1, SAVED_X
+ sb T1, 2(OUT)
+ b .Lchacha_mips_xor_done
+
+.Lchacha_mips_no_full_block_unaligned:
+ /* Restore the offset on BYTES */
+ addiu BYTES, CHACHA20_BLOCK_SIZE
+
+ /* Get number of full WORDS */
+ andi $at, BYTES, MASK_U32
+
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)
+
+ /* Calculate lower half jump table offset */
+ ins T0, $at, 1, 6
+
+ /* Add offset to STATE */
+ addu T1, STATE, $at
+
+ /* Add lower half jump table addr */
+ addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)
+
+ /* Read value from STATE */
+ lw SAVED_CA, 0(T1)
+
+ /* Store remaining bytecounter as negative value */
+ subu BYTES, $at, BYTES
+
+ jr T0
+
+ /* Jump table */
+ FOR_EACH_WORD(JMPTBL_UNALIGNED)
+.end chacha_crypt_arch
+.set at
+
+/* Input arguments
+ * STATE $a0
+ * OUT $a1
+ * NROUND $a2
+ */
+
+#undef X12
+#undef X13
+#undef X14
+#undef X15
+
+#define X12 $a3
+#define X13 $at
+#define X14 $v0
+#define X15 STATE
+
+.set noat
+.globl hchacha_block_arch
+.ent hchacha_block_arch
+hchacha_block_arch:
+ .frame $sp, STACK_SIZE, $ra
+
+ addiu $sp, -STACK_SIZE
+
+ /* Save X11(s6) */
+ sw X11, 0($sp)
+
+ lw X0, 0(STATE)
+ lw X1, 4(STATE)
+ lw X2, 8(STATE)
+ lw X3, 12(STATE)
+ lw X4, 16(STATE)
+ lw X5, 20(STATE)
+ lw X6, 24(STATE)
+ lw X7, 28(STATE)
+ lw X8, 32(STATE)
+ lw X9, 36(STATE)
+ lw X10, 40(STATE)
+ lw X11, 44(STATE)
+ lw X12, 48(STATE)
+ lw X13, 52(STATE)
+ lw X14, 56(STATE)
+ lw X15, 60(STATE)
+
+.Loop_hchacha_xor_rounds:
+ addiu $a2, -2
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
+ bnez $a2, .Loop_hchacha_xor_rounds
+
+ /* Restore used register */
+ lw X11, 0($sp)
+
+ sw X0, 0(OUT)
+ sw X1, 4(OUT)
+ sw X2, 8(OUT)
+ sw X3, 12(OUT)
+ sw X12, 16(OUT)
+ sw X13, 20(OUT)
+ sw X14, 24(OUT)
+ sw X15, 28(OUT)
+
+ addiu $sp, STACK_SIZE
+ jr $ra
+.end hchacha_block_arch
+.set at
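The AXR macro above is the four-lane add/xor/rotate step of the ChaCha double round; "rotr X, 32 - S" is a left rotation of X by S bits, so each AXR performs one quarter-round step across four columns (or diagonals) at once. A scalar, kernel-independent C model of one AXR invocation (hypothetical helper, for reference only):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int s)
{
	return (v << s) | (v >> (32 - s));
}

/* C model of AXR(A,B,C,D, K,L,M,N, V,W,Y,Z, S) on the state array x[16]. */
static void axr(uint32_t x[16], int a, int b, int c, int d,
		int k, int l, int m, int n,
		int v, int w, int y, int z, int s)
{
	x[a] += x[k];  x[b] += x[l];  x[c] += x[m];  x[d] += x[n];
	x[v] ^= x[a];  x[w] ^= x[b];  x[y] ^= x[c];  x[z] ^= x[d];
	x[v] = rotl32(x[v], s);
	x[w] = rotl32(x[w], s);
	x[y] = rotl32(x[y], s);
	x[z] = rotl32(x[z], s);
}

For example, AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16) corresponds to axr(x, 0,1,2,3, 4,5,6,7, 12,13,14,15, 16), the first add/xor/rotate step of the four column quarter-rounds.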
diff --git a/lib/crypto/mips/chacha.h b/lib/crypto/mips/chacha.h
new file mode 100644
index 000000000000..0c18c0dc2a40
--- /dev/null
+++ b/lib/crypto/mips/chacha.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ChaCha and HChaCha functions (MIPS optimized)
+ *
+ * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/kernel.h>
+
+asmlinkage void chacha_crypt_arch(struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds);
+asmlinkage void hchacha_block_arch(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds);
diff --git a/lib/crypto/mips/md5.h b/lib/crypto/mips/md5.h
new file mode 100644
index 000000000000..e08e28aeffa4
--- /dev/null
+++ b/lib/crypto/mips/md5.h
@@ -0,0 +1,65 @@
+/*
+ * Cryptographic API.
+ *
+ * MD5 Message Digest Algorithm (RFC1321).
+ *
+ * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
+ *
+ * Based on crypto/md5.c, which is:
+ *
+ * Derived from cryptoapi implementation, originally based on the
+ * public domain implementation written by Colin Plumb in 1993.
+ *
+ * Copyright (c) Cryptoapi developers.
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <asm/octeon/crypto.h>
+#include <asm/octeon/octeon.h>
+
+/*
+ * We pass everything as 64-bit. OCTEON can handle misaligned data.
+ */
+
+static void md5_blocks(struct md5_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ struct octeon_cop2_state cop2_state;
+ u64 *state64 = (u64 *)state;
+ unsigned long flags;
+
+ if (!octeon_has_crypto())
+ return md5_blocks_generic(state, data, nblocks);
+
+ cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
+
+ flags = octeon_crypto_enable(&cop2_state);
+ write_octeon_64bit_hash_dword(state64[0], 0);
+ write_octeon_64bit_hash_dword(state64[1], 1);
+
+ do {
+ const u64 *block = (const u64 *)data;
+
+ write_octeon_64bit_block_dword(block[0], 0);
+ write_octeon_64bit_block_dword(block[1], 1);
+ write_octeon_64bit_block_dword(block[2], 2);
+ write_octeon_64bit_block_dword(block[3], 3);
+ write_octeon_64bit_block_dword(block[4], 4);
+ write_octeon_64bit_block_dword(block[5], 5);
+ write_octeon_64bit_block_dword(block[6], 6);
+ octeon_md5_start(block[7]);
+
+ data += MD5_BLOCK_SIZE;
+ } while (--nblocks);
+
+ state64[0] = read_octeon_64bit_hash_dword(0);
+ state64[1] = read_octeon_64bit_hash_dword(1);
+ octeon_crypto_disable(&cop2_state, flags);
+
+ le32_to_cpu_array(state->h, ARRAY_SIZE(state->h));
+}
diff --git a/lib/crypto/mips/poly1305-mips.pl b/lib/crypto/mips/poly1305-mips.pl
new file mode 100644
index 000000000000..71347f34f4f9
--- /dev/null
+++ b/lib/crypto/mips/poly1305-mips.pl
@@ -0,0 +1,1269 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
+# project.
+# ====================================================================
+
+# Poly1305 hash for MIPS.
+#
+# May 2016
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone.
+#
+# IALU/gcc
+# R1x000 ~5.5/+130% (big-endian)
+# Octeon II 2.50/+70% (little-endian)
+#
+# March 2019
+#
+# Add 32-bit code path.
+#
+# October 2019
+#
+# Modulo-scheduled reduction makes it possible to omit the dependency
+# chain at the end of the inner loop and improves performance. Also
+# optimize the MIPS32R2 code path for the MIPS 1004K core. Per René
+# von Dorst's suggestions.
+#
+# IALU/gcc
+# R1x000 ~9.8/? (big-endian)
+# Octeon II 3.65/+140% (little-endian)
+# MT7621/1004K 4.75/? (little-endian)
+#
+######################################################################
+# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in an ABI-neutral
+# manner. Therefore let's stick to the NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. The following coding rules
+# facilitate interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp [o32 can be
+# excluded from the rule, because it's specified volatile];
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+# old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
+
+$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
+
+if ($flavour =~ /64|n32/i) {{{
+######################################################################
+# 64-bit code path
+#
+
+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
+
+$code.=<<___;
+#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
+ defined(_MIPS_ARCH_MIPS64R6)) \\
+ && !defined(_MIPS_ARCH_MIPS64R2)
+# define _MIPS_ARCH_MIPS64R2
+#endif
+
+#if defined(_MIPS_ARCH_MIPS64R6)
+# define dmultu(rs,rt)
+# define mflo(rd,rs,rt) dmulu rd,rs,rt
+# define mfhi(rd,rs,rt) dmuhu rd,rs,rt
+#else
+# define dmultu(rs,rt) dmultu rs,rt
+# define mflo(rd,rs,rt) mflo rd
+# define mfhi(rd,rs,rt) mfhi rd
+#endif
+
+#ifdef __KERNEL__
+# define poly1305_init poly1305_block_init
+#endif
+
+#if defined(__MIPSEB__) && !defined(MIPSEB)
+# define MIPSEB
+#endif
+
+#ifdef MIPSEB
+# define MSB 0
+# define LSB 7
+#else
+# define MSB 7
+# define LSB 0
+#endif
+
+.text
+.set noat
+.set noreorder
+
+.align 5
+.globl poly1305_init
+.ent poly1305_init
+poly1305_init:
+ .frame $sp,0,$ra
+ .set reorder
+
+ sd $zero,0($ctx)
+ sd $zero,8($ctx)
+ sd $zero,16($ctx)
+
+ beqz $inp,.Lno_key
+
+#if defined(_MIPS_ARCH_MIPS64R6)
+ andi $tmp0,$inp,7 # $inp % 8
+ dsubu $inp,$inp,$tmp0 # align $inp
+ sll $tmp0,$tmp0,3 # byte to bit offset
+ ld $in0,0($inp)
+ ld $in1,8($inp)
+ beqz $tmp0,.Laligned_key
+ ld $tmp2,16($inp)
+
+ subu $tmp1,$zero,$tmp0
+# ifdef MIPSEB
+ dsllv $in0,$in0,$tmp0
+ dsrlv $tmp3,$in1,$tmp1
+ dsllv $in1,$in1,$tmp0
+ dsrlv $tmp2,$tmp2,$tmp1
+# else
+ dsrlv $in0,$in0,$tmp0
+ dsllv $tmp3,$in1,$tmp1
+ dsrlv $in1,$in1,$tmp0
+ dsllv $tmp2,$tmp2,$tmp1
+# endif
+ or $in0,$in0,$tmp3
+ or $in1,$in1,$tmp2
+.Laligned_key:
+#else
+ ldl $in0,0+MSB($inp)
+ ldl $in1,8+MSB($inp)
+ ldr $in0,0+LSB($inp)
+ ldr $in1,8+LSB($inp)
+#endif
+#ifdef MIPSEB
+# if defined(_MIPS_ARCH_MIPS64R2)
+ dsbh $in0,$in0 # byte swap
+ dsbh $in1,$in1
+ dshd $in0,$in0
+ dshd $in1,$in1
+# else
+ ori $tmp0,$zero,0xFF
+ dsll $tmp2,$tmp0,32
+ or $tmp0,$tmp2 # 0x000000FF000000FF
+
+ and $tmp1,$in0,$tmp0 # byte swap
+ and $tmp3,$in1,$tmp0
+ dsrl $tmp2,$in0,24
+ dsrl $tmp4,$in1,24
+ dsll $tmp1,24
+ dsll $tmp3,24
+ and $tmp2,$tmp0
+ and $tmp4,$tmp0
+ dsll $tmp0,8 # 0x0000FF000000FF00
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ and $tmp2,$in0,$tmp0
+ and $tmp4,$in1,$tmp0
+ dsrl $in0,8
+ dsrl $in1,8
+ dsll $tmp2,8
+ dsll $tmp4,8
+ and $in0,$tmp0
+ and $in1,$tmp0
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ or $in0,$tmp1
+ or $in1,$tmp3
+ dsrl $tmp1,$in0,32
+ dsrl $tmp3,$in1,32
+ dsll $in0,32
+ dsll $in1,32
+ or $in0,$tmp1
+ or $in1,$tmp3
+# endif
+#endif
+ li $tmp0,1
+ dsll $tmp0,32 # 0x0000000100000000
+ daddiu $tmp0,-63 # 0x00000000ffffffc1
+ dsll $tmp0,28 # 0x0ffffffc10000000
+ daddiu $tmp0,-1 # 0x0ffffffc0fffffff
+
+ and $in0,$tmp0
+ daddiu $tmp0,-3 # 0x0ffffffc0ffffffc
+ and $in1,$tmp0
+
+ sd $in0,24($ctx)
+ dsrl $tmp0,$in1,2
+ sd $in1,32($ctx)
+ daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
+ sd $tmp0,40($ctx)
+
+.Lno_key:
+ li $v0,0 # return 0
+ jr $ra
+.end poly1305_init
+___
+{
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
+
+my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
+ ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
+my ($shr,$shl) = ($s6,$s7); # used on R6
+
+$code.=<<___;
+.align 5
+.globl poly1305_blocks
+.ent poly1305_blocks
+poly1305_blocks:
+ .set noreorder
+ dsrl $len,4 # number of complete blocks
+ bnez $len,poly1305_blocks_internal
+ nop
+ jr $ra
+ nop
+.end poly1305_blocks
+
+.align 5
+.ent poly1305_blocks_internal
+poly1305_blocks_internal:
+ .set noreorder
+#if defined(_MIPS_ARCH_MIPS64R6)
+ .frame $sp,8*8,$ra
+ .mask $SAVED_REGS_MASK|0x000c0000,-8
+ dsubu $sp,8*8
+ sd $s7,56($sp)
+ sd $s6,48($sp)
+#else
+ .frame $sp,6*8,$ra
+ .mask $SAVED_REGS_MASK,-8
+ dsubu $sp,6*8
+#endif
+ sd $s5,40($sp)
+ sd $s4,32($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ sd $s3,24($sp)
+ sd $s2,16($sp)
+ sd $s1,8($sp)
+ sd $s0,0($sp)
+___
+$code.=<<___;
+ .set reorder
+
+#if defined(_MIPS_ARCH_MIPS64R6)
+ andi $shr,$inp,7
+ dsubu $inp,$inp,$shr # align $inp
+ sll $shr,$shr,3 # byte to bit offset
+ subu $shl,$zero,$shr
+#endif
+
+ ld $h0,0($ctx) # load hash value
+ ld $h1,8($ctx)
+ ld $h2,16($ctx)
+
+ ld $r0,24($ctx) # load key
+ ld $r1,32($ctx)
+ ld $rs1,40($ctx)
+
+ dsll $len,4
+ daddu $len,$inp # end of buffer
+ b .Loop
+
+.align 4
+.Loop:
+#if defined(_MIPS_ARCH_MIPS64R6)
+ ld $in0,0($inp) # load input
+ ld $in1,8($inp)
+ beqz $shr,.Laligned_inp
+
+ ld $tmp2,16($inp)
+# ifdef MIPSEB
+ dsllv $in0,$in0,$shr
+ dsrlv $tmp3,$in1,$shl
+ dsllv $in1,$in1,$shr
+ dsrlv $tmp2,$tmp2,$shl
+# else
+ dsrlv $in0,$in0,$shr
+ dsllv $tmp3,$in1,$shl
+ dsrlv $in1,$in1,$shr
+ dsllv $tmp2,$tmp2,$shl
+# endif
+ or $in0,$in0,$tmp3
+ or $in1,$in1,$tmp2
+.Laligned_inp:
+#else
+ ldl $in0,0+MSB($inp) # load input
+ ldl $in1,8+MSB($inp)
+ ldr $in0,0+LSB($inp)
+ ldr $in1,8+LSB($inp)
+#endif
+ daddiu $inp,16
+#ifdef MIPSEB
+# if defined(_MIPS_ARCH_MIPS64R2)
+ dsbh $in0,$in0 # byte swap
+ dsbh $in1,$in1
+ dshd $in0,$in0
+ dshd $in1,$in1
+# else
+ ori $tmp0,$zero,0xFF
+ dsll $tmp2,$tmp0,32
+ or $tmp0,$tmp2 # 0x000000FF000000FF
+
+ and $tmp1,$in0,$tmp0 # byte swap
+ and $tmp3,$in1,$tmp0
+ dsrl $tmp2,$in0,24
+ dsrl $tmp4,$in1,24
+ dsll $tmp1,24
+ dsll $tmp3,24
+ and $tmp2,$tmp0
+ and $tmp4,$tmp0
+ dsll $tmp0,8 # 0x0000FF000000FF00
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ and $tmp2,$in0,$tmp0
+ and $tmp4,$in1,$tmp0
+ dsrl $in0,8
+ dsrl $in1,8
+ dsll $tmp2,8
+ dsll $tmp4,8
+ and $in0,$tmp0
+ and $in1,$tmp0
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ or $in0,$tmp1
+ or $in1,$tmp3
+ dsrl $tmp1,$in0,32
+ dsrl $tmp3,$in1,32
+ dsll $in0,32
+ dsll $in1,32
+ or $in0,$tmp1
+ or $in1,$tmp3
+# endif
+#endif
+ dsrl $tmp1,$h2,2 # modulo-scheduled reduction
+ andi $h2,$h2,3
+ dsll $tmp0,$tmp1,2
+
+ daddu $d0,$h0,$in0 # accumulate input
+ daddu $tmp1,$tmp0
+ sltu $tmp0,$d0,$h0
+ daddu $d0,$d0,$tmp1 # ... and residue
+ sltu $tmp1,$d0,$tmp1
+ daddu $d1,$h1,$in1
+ daddu $tmp0,$tmp1
+ sltu $tmp1,$d1,$h1
+ daddu $d1,$tmp0
+
+ dmultu ($r0,$d0) # h0*r0
+ daddu $d2,$h2,$padbit
+ sltu $tmp0,$d1,$tmp0
+ mflo ($h0,$r0,$d0)
+ mfhi ($h1,$r0,$d0)
+
+ dmultu ($rs1,$d1) # h1*5*r1
+ daddu $d2,$tmp1
+ daddu $d2,$tmp0
+ mflo ($tmp0,$rs1,$d1)
+ mfhi ($tmp1,$rs1,$d1)
+
+ dmultu ($r1,$d0) # h0*r1
+ mflo ($tmp2,$r1,$d0)
+ mfhi ($h2,$r1,$d0)
+ daddu $h0,$tmp0
+ daddu $h1,$tmp1
+ sltu $tmp0,$h0,$tmp0
+
+ dmultu ($r0,$d1) # h1*r0
+ daddu $h1,$tmp0
+ daddu $h1,$tmp2
+ mflo ($tmp0,$r0,$d1)
+ mfhi ($tmp1,$r0,$d1)
+
+ dmultu ($rs1,$d2) # h2*5*r1
+ sltu $tmp2,$h1,$tmp2
+ daddu $h2,$tmp2
+ mflo ($tmp2,$rs1,$d2)
+
+ dmultu ($r0,$d2) # h2*r0
+ daddu $h1,$tmp0
+ daddu $h2,$tmp1
+ mflo ($tmp3,$r0,$d2)
+ sltu $tmp0,$h1,$tmp0
+ daddu $h2,$tmp0
+
+ daddu $h1,$tmp2
+ sltu $tmp2,$h1,$tmp2
+ daddu $h2,$tmp2
+ daddu $h2,$tmp3
+
+ bne $inp,$len,.Loop
+
+ sd $h0,0($ctx) # store hash value
+ sd $h1,8($ctx)
+ sd $h2,16($ctx)
+
+ .set noreorder
+#if defined(_MIPS_ARCH_MIPS64R6)
+ ld $s7,56($sp)
+ ld $s6,48($sp)
+#endif
+ ld $s5,40($sp) # epilogue
+ ld $s4,32($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
+ ld $s3,24($sp)
+ ld $s2,16($sp)
+ ld $s1,8($sp)
+ ld $s0,0($sp)
+___
+$code.=<<___;
+ jr $ra
+#if defined(_MIPS_ARCH_MIPS64R6)
+ daddu $sp,8*8
+#else
+ daddu $sp,6*8
+#endif
+.end poly1305_blocks_internal
+___
+}
+{
+my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
+
+$code.=<<___;
+.align 5
+.globl poly1305_emit
+.ent poly1305_emit
+poly1305_emit:
+ .frame $sp,0,$ra
+ .set reorder
+
+ ld $tmp2,16($ctx)
+ ld $tmp0,0($ctx)
+ ld $tmp1,8($ctx)
+
+ li $in0,-4 # final reduction
+ dsrl $in1,$tmp2,2
+ and $in0,$tmp2
+ andi $tmp2,$tmp2,3
+ daddu $in0,$in1
+
+ daddu $tmp0,$tmp0,$in0
+ sltu $in1,$tmp0,$in0
+ daddiu $in0,$tmp0,5 # compare to modulus
+ daddu $tmp1,$tmp1,$in1
+ sltiu $tmp3,$in0,5
+ sltu $tmp4,$tmp1,$in1
+ daddu $in1,$tmp1,$tmp3
+ daddu $tmp2,$tmp2,$tmp4
+ sltu $tmp3,$in1,$tmp3
+ daddu $tmp2,$tmp2,$tmp3
+
+ dsrl $tmp2,2 # see if it carried/borrowed
+ dsubu $tmp2,$zero,$tmp2
+
+ xor $in0,$tmp0
+ xor $in1,$tmp1
+ and $in0,$tmp2
+ and $in1,$tmp2
+ xor $in0,$tmp0
+ xor $in1,$tmp1
+
+ lwu $tmp0,0($nonce) # load nonce
+ lwu $tmp1,4($nonce)
+ lwu $tmp2,8($nonce)
+ lwu $tmp3,12($nonce)
+ dsll $tmp1,32
+ dsll $tmp3,32
+ or $tmp0,$tmp1
+ or $tmp2,$tmp3
+
+ daddu $in0,$tmp0 # accumulate nonce
+ daddu $in1,$tmp2
+ sltu $tmp0,$in0,$tmp0
+ daddu $in1,$tmp0
+
+ dsrl $tmp0,$in0,8 # write mac value
+ dsrl $tmp1,$in0,16
+ dsrl $tmp2,$in0,24
+ sb $in0,0($mac)
+ dsrl $tmp3,$in0,32
+ sb $tmp0,1($mac)
+ dsrl $tmp0,$in0,40
+ sb $tmp1,2($mac)
+ dsrl $tmp1,$in0,48
+ sb $tmp2,3($mac)
+ dsrl $tmp2,$in0,56
+ sb $tmp3,4($mac)
+ dsrl $tmp3,$in1,8
+ sb $tmp0,5($mac)
+ dsrl $tmp0,$in1,16
+ sb $tmp1,6($mac)
+ dsrl $tmp1,$in1,24
+ sb $tmp2,7($mac)
+
+ sb $in1,8($mac)
+ dsrl $tmp2,$in1,32
+ sb $tmp3,9($mac)
+ dsrl $tmp3,$in1,40
+ sb $tmp0,10($mac)
+ dsrl $tmp0,$in1,48
+ sb $tmp1,11($mac)
+ dsrl $tmp1,$in1,56
+ sb $tmp2,12($mac)
+ sb $tmp3,13($mac)
+ sb $tmp0,14($mac)
+ sb $tmp1,15($mac)
+
+ jr $ra
+.end poly1305_emit
+.rdata
+.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
+.align 2
+___
+}
+}}} else {{{
+######################################################################
+# 32-bit code path
+#
+
+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
+ ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
+
+$code.=<<___;
+#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
+ defined(_MIPS_ARCH_MIPS32R6)) \\
+ && !defined(_MIPS_ARCH_MIPS32R2)
+# define _MIPS_ARCH_MIPS32R2
+#endif
+
+#if defined(_MIPS_ARCH_MIPS32R6)
+# define multu(rs,rt)
+# define mflo(rd,rs,rt) mulu rd,rs,rt
+# define mfhi(rd,rs,rt) muhu rd,rs,rt
+#else
+# define multu(rs,rt) multu rs,rt
+# define mflo(rd,rs,rt) mflo rd
+# define mfhi(rd,rs,rt) mfhi rd
+#endif
+
+#ifdef __KERNEL__
+# define poly1305_init poly1305_block_init
+#endif
+
+#if defined(__MIPSEB__) && !defined(MIPSEB)
+# define MIPSEB
+#endif
+
+#ifdef MIPSEB
+# define MSB 0
+# define LSB 3
+#else
+# define MSB 3
+# define LSB 0
+#endif
+
+.text
+.set noat
+.set noreorder
+
+.align 5
+.globl poly1305_init
+.ent poly1305_init
+poly1305_init:
+ .frame $sp,0,$ra
+ .set reorder
+
+ sw $zero,0($ctx)
+ sw $zero,4($ctx)
+ sw $zero,8($ctx)
+ sw $zero,12($ctx)
+ sw $zero,16($ctx)
+
+ beqz $inp,.Lno_key
+
+#if defined(_MIPS_ARCH_MIPS32R6)
+ andi $tmp0,$inp,3 # $inp % 4
+ subu $inp,$inp,$tmp0 # align $inp
+ sll $tmp0,$tmp0,3 # byte to bit offset
+ lw $in0,0($inp)
+ lw $in1,4($inp)
+ lw $in2,8($inp)
+ lw $in3,12($inp)
+ beqz $tmp0,.Laligned_key
+
+ lw $tmp2,16($inp)
+ subu $tmp1,$zero,$tmp0
+# ifdef MIPSEB
+ sllv $in0,$in0,$tmp0
+ srlv $tmp3,$in1,$tmp1
+ sllv $in1,$in1,$tmp0
+ or $in0,$in0,$tmp3
+ srlv $tmp3,$in2,$tmp1
+ sllv $in2,$in2,$tmp0
+ or $in1,$in1,$tmp3
+ srlv $tmp3,$in3,$tmp1
+ sllv $in3,$in3,$tmp0
+ or $in2,$in2,$tmp3
+ srlv $tmp2,$tmp2,$tmp1
+ or $in3,$in3,$tmp2
+# else
+ srlv $in0,$in0,$tmp0
+ sllv $tmp3,$in1,$tmp1
+ srlv $in1,$in1,$tmp0
+ or $in0,$in0,$tmp3
+ sllv $tmp3,$in2,$tmp1
+ srlv $in2,$in2,$tmp0
+ or $in1,$in1,$tmp3
+ sllv $tmp3,$in3,$tmp1
+ srlv $in3,$in3,$tmp0
+ or $in2,$in2,$tmp3
+ sllv $tmp2,$tmp2,$tmp1
+ or $in3,$in3,$tmp2
+# endif
+.Laligned_key:
+#else
+ lwl $in0,0+MSB($inp)
+ lwl $in1,4+MSB($inp)
+ lwl $in2,8+MSB($inp)
+ lwl $in3,12+MSB($inp)
+ lwr $in0,0+LSB($inp)
+ lwr $in1,4+LSB($inp)
+ lwr $in2,8+LSB($inp)
+ lwr $in3,12+LSB($inp)
+#endif
+#ifdef MIPSEB
+# if defined(_MIPS_ARCH_MIPS32R2)
+ wsbh $in0,$in0 # byte swap
+ wsbh $in1,$in1
+ wsbh $in2,$in2
+ wsbh $in3,$in3
+ rotr $in0,$in0,16
+ rotr $in1,$in1,16
+ rotr $in2,$in2,16
+ rotr $in3,$in3,16
+# else
+ srl $tmp0,$in0,24 # byte swap
+ srl $tmp1,$in0,8
+ andi $tmp2,$in0,0xFF00
+ sll $in0,$in0,24
+ andi $tmp1,0xFF00
+ sll $tmp2,$tmp2,8
+ or $in0,$tmp0
+ srl $tmp0,$in1,24
+ or $tmp1,$tmp2
+ srl $tmp2,$in1,8
+ or $in0,$tmp1
+ andi $tmp1,$in1,0xFF00
+ sll $in1,$in1,24
+ andi $tmp2,0xFF00
+ sll $tmp1,$tmp1,8
+ or $in1,$tmp0
+ srl $tmp0,$in2,24
+ or $tmp2,$tmp1
+ srl $tmp1,$in2,8
+ or $in1,$tmp2
+ andi $tmp2,$in2,0xFF00
+ sll $in2,$in2,24
+ andi $tmp1,0xFF00
+ sll $tmp2,$tmp2,8
+ or $in2,$tmp0
+ srl $tmp0,$in3,24
+ or $tmp1,$tmp2
+ srl $tmp2,$in3,8
+ or $in2,$tmp1
+ andi $tmp1,$in3,0xFF00
+ sll $in3,$in3,24
+ andi $tmp2,0xFF00
+ sll $tmp1,$tmp1,8
+ or $in3,$tmp0
+ or $tmp2,$tmp1
+ or $in3,$tmp2
+# endif
+#endif
+ lui $tmp0,0x0fff
+ ori $tmp0,0xffff # 0x0fffffff
+ and $in0,$in0,$tmp0
+ subu $tmp0,3 # 0x0ffffffc
+ and $in1,$in1,$tmp0
+ and $in2,$in2,$tmp0
+ and $in3,$in3,$tmp0
+
+ sw $in0,20($ctx)
+ sw $in1,24($ctx)
+ sw $in2,28($ctx)
+ sw $in3,32($ctx)
+
+ srl $tmp1,$in1,2
+ srl $tmp2,$in2,2
+ srl $tmp3,$in3,2
+ addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
+ addu $in2,$in2,$tmp2
+ addu $in3,$in3,$tmp3
+ sw $in1,36($ctx)
+ sw $in2,40($ctx)
+ sw $in3,44($ctx)
+.Lno_key:
+ li $v0,0
+ jr $ra
+.end poly1305_init
+___
+{
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
+
+my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
+ ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
+my ($d0,$d1,$d2,$d3) =
+ ($a4,$a5,$a6,$a7);
+my $shr = $t2; # used on R6
+my $one = $t2; # used on R2
+
+$code.=<<___;
+.globl poly1305_blocks
+.align 5
+.ent poly1305_blocks
+poly1305_blocks:
+ .frame $sp,16*4,$ra
+ .mask $SAVED_REGS_MASK,-4
+ .set noreorder
+ subu $sp, $sp,4*12
+ sw $s11,4*11($sp)
+ sw $s10,4*10($sp)
+ sw $s9, 4*9($sp)
+ sw $s8, 4*8($sp)
+ sw $s7, 4*7($sp)
+ sw $s6, 4*6($sp)
+ sw $s5, 4*5($sp)
+ sw $s4, 4*4($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ sw $s3, 4*3($sp)
+ sw $s2, 4*2($sp)
+ sw $s1, 4*1($sp)
+ sw $s0, 4*0($sp)
+___
+$code.=<<___;
+ .set reorder
+
+ srl $len,4 # number of complete blocks
+ li $one,1
+ beqz $len,.Labort
+
+#if defined(_MIPS_ARCH_MIPS32R6)
+ andi $shr,$inp,3
+ subu $inp,$inp,$shr # align $inp
+ sll $shr,$shr,3 # byte to bit offset
+#endif
+
+ lw $h0,0($ctx) # load hash value
+ lw $h1,4($ctx)
+ lw $h2,8($ctx)
+ lw $h3,12($ctx)
+ lw $h4,16($ctx)
+
+ lw $r0,20($ctx) # load key
+ lw $r1,24($ctx)
+ lw $r2,28($ctx)
+ lw $r3,32($ctx)
+ lw $rs1,36($ctx)
+ lw $rs2,40($ctx)
+ lw $rs3,44($ctx)
+
+ sll $len,4
+ addu $len,$len,$inp # end of buffer
+ b .Loop
+
+.align 4
+.Loop:
+#if defined(_MIPS_ARCH_MIPS32R6)
+ lw $d0,0($inp) # load input
+ lw $d1,4($inp)
+ lw $d2,8($inp)
+ lw $d3,12($inp)
+ beqz $shr,.Laligned_inp
+
+ lw $t0,16($inp)
+ subu $t1,$zero,$shr
+# ifdef MIPSEB
+ sllv $d0,$d0,$shr
+ srlv $at,$d1,$t1
+ sllv $d1,$d1,$shr
+ or $d0,$d0,$at
+ srlv $at,$d2,$t1
+ sllv $d2,$d2,$shr
+ or $d1,$d1,$at
+ srlv $at,$d3,$t1
+ sllv $d3,$d3,$shr
+ or $d2,$d2,$at
+ srlv $t0,$t0,$t1
+ or $d3,$d3,$t0
+# else
+ srlv $d0,$d0,$shr
+ sllv $at,$d1,$t1
+ srlv $d1,$d1,$shr
+ or $d0,$d0,$at
+ sllv $at,$d2,$t1
+ srlv $d2,$d2,$shr
+ or $d1,$d1,$at
+ sllv $at,$d3,$t1
+ srlv $d3,$d3,$shr
+ or $d2,$d2,$at
+ sllv $t0,$t0,$t1
+ or $d3,$d3,$t0
+# endif
+.Laligned_inp:
+#else
+ lwl $d0,0+MSB($inp) # load input
+ lwl $d1,4+MSB($inp)
+ lwl $d2,8+MSB($inp)
+ lwl $d3,12+MSB($inp)
+ lwr $d0,0+LSB($inp)
+ lwr $d1,4+LSB($inp)
+ lwr $d2,8+LSB($inp)
+ lwr $d3,12+LSB($inp)
+#endif
+#ifdef MIPSEB
+# if defined(_MIPS_ARCH_MIPS32R2)
+ wsbh $d0,$d0 # byte swap
+ wsbh $d1,$d1
+ wsbh $d2,$d2
+ wsbh $d3,$d3
+ rotr $d0,$d0,16
+ rotr $d1,$d1,16
+ rotr $d2,$d2,16
+ rotr $d3,$d3,16
+# else
+ srl $at,$d0,24 # byte swap
+ srl $t0,$d0,8
+ andi $t1,$d0,0xFF00
+ sll $d0,$d0,24
+ andi $t0,0xFF00
+ sll $t1,$t1,8
+ or $d0,$at
+ srl $at,$d1,24
+ or $t0,$t1
+ srl $t1,$d1,8
+ or $d0,$t0
+ andi $t0,$d1,0xFF00
+ sll $d1,$d1,24
+ andi $t1,0xFF00
+ sll $t0,$t0,8
+ or $d1,$at
+ srl $at,$d2,24
+ or $t1,$t0
+ srl $t0,$d2,8
+ or $d1,$t1
+ andi $t1,$d2,0xFF00
+ sll $d2,$d2,24
+ andi $t0,0xFF00
+ sll $t1,$t1,8
+ or $d2,$at
+ srl $at,$d3,24
+ or $t0,$t1
+ srl $t1,$d3,8
+ or $d2,$t0
+ andi $t0,$d3,0xFF00
+ sll $d3,$d3,24
+ andi $t1,0xFF00
+ sll $t0,$t0,8
+ or $d3,$at
+ or $t1,$t0
+ or $d3,$t1
+# endif
+#endif
+ srl $t0,$h4,2 # modulo-scheduled reduction
+ andi $h4,$h4,3
+ sll $at,$t0,2
+
+ addu $d0,$d0,$h0 # accumulate input
+ addu $t0,$t0,$at
+ sltu $h0,$d0,$h0
+ addu $d0,$d0,$t0 # ... and residue
+ sltu $at,$d0,$t0
+
+ addu $d1,$d1,$h1
+ addu $h0,$h0,$at # carry
+ sltu $h1,$d1,$h1
+ addu $d1,$d1,$h0
+ sltu $h0,$d1,$h0
+
+ addu $d2,$d2,$h2
+ addu $h1,$h1,$h0 # carry
+ sltu $h2,$d2,$h2
+ addu $d2,$d2,$h1
+ sltu $h1,$d2,$h1
+
+ addu $d3,$d3,$h3
+ addu $h2,$h2,$h1 # carry
+ sltu $h3,$d3,$h3
+ addu $d3,$d3,$h2
+
+#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
+ multu $r0,$d0 # d0*r0
+ sltu $h2,$d3,$h2
+ maddu $rs3,$d1 # d1*s3
+ addu $h3,$h3,$h2 # carry
+ maddu $rs2,$d2 # d2*s2
+ addu $h4,$h4,$padbit
+ maddu $rs1,$d3 # d3*s1
+ addu $h4,$h4,$h3
+ mfhi $at
+ mflo $h0
+
+ multu $r1,$d0 # d0*r1
+ maddu $r0,$d1 # d1*r0
+ maddu $rs3,$d2 # d2*s3
+ maddu $rs2,$d3 # d3*s2
+ maddu $rs1,$h4 # h4*s1
+ maddu $at,$one # hi*1
+ mfhi $at
+ mflo $h1
+
+ multu $r2,$d0 # d0*r2
+ maddu $r1,$d1 # d1*r1
+ maddu $r0,$d2 # d2*r0
+ maddu $rs3,$d3 # d3*s3
+ maddu $rs2,$h4 # h4*s2
+ maddu $at,$one # hi*1
+ mfhi $at
+ mflo $h2
+
+ mul $t0,$r0,$h4 # h4*r0
+
+ multu $r3,$d0 # d0*r3
+ maddu $r2,$d1 # d1*r2
+ maddu $r1,$d2 # d2*r1
+ maddu $r0,$d3 # d3*r0
+ maddu $rs3,$h4 # h4*s3
+ maddu $at,$one # hi*1
+ mfhi $at
+ mflo $h3
+
+ addiu $inp,$inp,16
+
+ addu $h4,$t0,$at
+#else
+ multu ($r0,$d0) # d0*r0
+ mflo ($h0,$r0,$d0)
+ mfhi ($h1,$r0,$d0)
+
+ sltu $h2,$d3,$h2
+ addu $h3,$h3,$h2 # carry
+
+ multu ($rs3,$d1) # d1*s3
+ mflo ($at,$rs3,$d1)
+ mfhi ($t0,$rs3,$d1)
+
+ addu $h4,$h4,$padbit
+ addiu $inp,$inp,16
+ addu $h4,$h4,$h3
+
+ multu ($rs2,$d2) # d2*s2
+ mflo ($a3,$rs2,$d2)
+ mfhi ($t1,$rs2,$d2)
+ addu $h0,$h0,$at
+ addu $h1,$h1,$t0
+ multu ($rs1,$d3) # d3*s1
+ sltu $at,$h0,$at
+ addu $h1,$h1,$at
+
+ mflo ($at,$rs1,$d3)
+ mfhi ($t0,$rs1,$d3)
+ addu $h0,$h0,$a3
+ addu $h1,$h1,$t1
+ multu ($r1,$d0) # d0*r1
+ sltu $a3,$h0,$a3
+ addu $h1,$h1,$a3
+
+
+ mflo ($a3,$r1,$d0)
+ mfhi ($h2,$r1,$d0)
+ addu $h0,$h0,$at
+ addu $h1,$h1,$t0
+ multu ($r0,$d1) # d1*r0
+ sltu $at,$h0,$at
+ addu $h1,$h1,$at
+
+ mflo ($at,$r0,$d1)
+ mfhi ($t0,$r0,$d1)
+ addu $h1,$h1,$a3
+ sltu $a3,$h1,$a3
+ multu ($rs3,$d2) # d2*s3
+ addu $h2,$h2,$a3
+
+ mflo ($a3,$rs3,$d2)
+ mfhi ($t1,$rs3,$d2)
+ addu $h1,$h1,$at
+ addu $h2,$h2,$t0
+ multu ($rs2,$d3) # d3*s2
+ sltu $at,$h1,$at
+ addu $h2,$h2,$at
+
+ mflo ($at,$rs2,$d3)
+ mfhi ($t0,$rs2,$d3)
+ addu $h1,$h1,$a3
+ addu $h2,$h2,$t1
+ multu ($rs1,$h4) # h4*s1
+ sltu $a3,$h1,$a3
+ addu $h2,$h2,$a3
+
+ mflo ($a3,$rs1,$h4)
+ addu $h1,$h1,$at
+ addu $h2,$h2,$t0
+ multu ($r2,$d0) # d0*r2
+ sltu $at,$h1,$at
+ addu $h2,$h2,$at
+
+
+ mflo ($at,$r2,$d0)
+ mfhi ($h3,$r2,$d0)
+ addu $h1,$h1,$a3
+ sltu $a3,$h1,$a3
+ multu ($r1,$d1) # d1*r1
+ addu $h2,$h2,$a3
+
+ mflo ($a3,$r1,$d1)
+ mfhi ($t1,$r1,$d1)
+ addu $h2,$h2,$at
+ sltu $at,$h2,$at
+ multu ($r0,$d2) # d2*r0
+ addu $h3,$h3,$at
+
+ mflo ($at,$r0,$d2)
+ mfhi ($t0,$r0,$d2)
+ addu $h2,$h2,$a3
+ addu $h3,$h3,$t1
+ multu ($rs3,$d3) # d3*s3
+ sltu $a3,$h2,$a3
+ addu $h3,$h3,$a3
+
+ mflo ($a3,$rs3,$d3)
+ mfhi ($t1,$rs3,$d3)
+ addu $h2,$h2,$at
+ addu $h3,$h3,$t0
+ multu ($rs2,$h4) # h4*s2
+ sltu $at,$h2,$at
+ addu $h3,$h3,$at
+
+ mflo ($at,$rs2,$h4)
+ addu $h2,$h2,$a3
+ addu $h3,$h3,$t1
+ multu ($r3,$d0) # d0*r3
+ sltu $a3,$h2,$a3
+ addu $h3,$h3,$a3
+
+
+ mflo ($a3,$r3,$d0)
+ mfhi ($t1,$r3,$d0)
+ addu $h2,$h2,$at
+ sltu $at,$h2,$at
+ multu ($r2,$d1) # d1*r2
+ addu $h3,$h3,$at
+
+ mflo ($at,$r2,$d1)
+ mfhi ($t0,$r2,$d1)
+ addu $h3,$h3,$a3
+ sltu $a3,$h3,$a3
+ multu ($r0,$d3) # d3*r0
+ addu $t1,$t1,$a3
+
+ mflo ($a3,$r0,$d3)
+ mfhi ($d3,$r0,$d3)
+ addu $h3,$h3,$at
+ addu $t1,$t1,$t0
+ multu ($r1,$d2) # d2*r1
+ sltu $at,$h3,$at
+ addu $t1,$t1,$at
+
+ mflo ($at,$r1,$d2)
+ mfhi ($t0,$r1,$d2)
+ addu $h3,$h3,$a3
+ addu $t1,$t1,$d3
+ multu ($rs3,$h4) # h4*s3
+ sltu $a3,$h3,$a3
+ addu $t1,$t1,$a3
+
+ mflo ($a3,$rs3,$h4)
+ addu $h3,$h3,$at
+ addu $t1,$t1,$t0
+ multu ($r0,$h4) # h4*r0
+ sltu $at,$h3,$at
+ addu $t1,$t1,$at
+
+
+ mflo ($h4,$r0,$h4)
+ addu $h3,$h3,$a3
+ sltu $a3,$h3,$a3
+ addu $t1,$t1,$a3
+ addu $h4,$h4,$t1
+
+ li $padbit,1 # if we loop, padbit is 1
+#endif
+ bne $inp,$len,.Loop
+
+ sw $h0,0($ctx) # store hash value
+ sw $h1,4($ctx)
+ sw $h2,8($ctx)
+ sw $h3,12($ctx)
+ sw $h4,16($ctx)
+
+ .set noreorder
+.Labort:
+ lw $s11,4*11($sp)
+ lw $s10,4*10($sp)
+ lw $s9, 4*9($sp)
+ lw $s8, 4*8($sp)
+ lw $s7, 4*7($sp)
+ lw $s6, 4*6($sp)
+ lw $s5, 4*5($sp)
+ lw $s4, 4*4($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ lw $s3, 4*3($sp)
+ lw $s2, 4*2($sp)
+ lw $s1, 4*1($sp)
+ lw $s0, 4*0($sp)
+___
+$code.=<<___;
+ jr $ra
+ addu $sp,$sp,4*12
+.end poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
+
+$code.=<<___;
+.align 5
+.globl poly1305_emit
+.ent poly1305_emit
+poly1305_emit:
+ .frame $sp,0,$ra
+ .set reorder
+
+ lw $tmp4,16($ctx)
+ lw $tmp0,0($ctx)
+ lw $tmp1,4($ctx)
+ lw $tmp2,8($ctx)
+ lw $tmp3,12($ctx)
+
+ li $in0,-4 # final reduction
+ srl $ctx,$tmp4,2
+ and $in0,$in0,$tmp4
+ andi $tmp4,$tmp4,3
+ addu $ctx,$ctx,$in0
+
+ addu $tmp0,$tmp0,$ctx
+ sltu $ctx,$tmp0,$ctx
+ addiu $in0,$tmp0,5 # compare to modulus
+ addu $tmp1,$tmp1,$ctx
+ sltiu $in1,$in0,5
+ sltu $ctx,$tmp1,$ctx
+ addu $in1,$in1,$tmp1
+ addu $tmp2,$tmp2,$ctx
+ sltu $in2,$in1,$tmp1
+ sltu $ctx,$tmp2,$ctx
+ addu $in2,$in2,$tmp2
+ addu $tmp3,$tmp3,$ctx
+ sltu $in3,$in2,$tmp2
+ sltu $ctx,$tmp3,$ctx
+ addu $in3,$in3,$tmp3
+ addu $tmp4,$tmp4,$ctx
+ sltu $ctx,$in3,$tmp3
+ addu $ctx,$tmp4
+
+ srl $ctx,2 # see if it carried/borrowed
+ subu $ctx,$zero,$ctx
+
+ xor $in0,$tmp0
+ xor $in1,$tmp1
+ xor $in2,$tmp2
+ xor $in3,$tmp3
+ and $in0,$ctx
+ and $in1,$ctx
+ and $in2,$ctx
+ and $in3,$ctx
+ xor $in0,$tmp0
+ xor $in1,$tmp1
+ xor $in2,$tmp2
+ xor $in3,$tmp3
+
+ lw $tmp0,0($nonce) # load nonce
+ lw $tmp1,4($nonce)
+ lw $tmp2,8($nonce)
+ lw $tmp3,12($nonce)
+
+ addu $in0,$tmp0 # accumulate nonce
+ sltu $ctx,$in0,$tmp0
+
+ addu $in1,$tmp1
+ sltu $tmp1,$in1,$tmp1
+ addu $in1,$ctx
+ sltu $ctx,$in1,$ctx
+ addu $ctx,$tmp1
+
+ addu $in2,$tmp2
+ sltu $tmp2,$in2,$tmp2
+ addu $in2,$ctx
+ sltu $ctx,$in2,$ctx
+ addu $ctx,$tmp2
+
+ addu $in3,$tmp3
+ addu $in3,$ctx
+
+ srl $tmp0,$in0,8 # write mac value
+ srl $tmp1,$in0,16
+ srl $tmp2,$in0,24
+ sb $in0, 0($mac)
+ sb $tmp0,1($mac)
+ srl $tmp0,$in1,8
+ sb $tmp1,2($mac)
+ srl $tmp1,$in1,16
+ sb $tmp2,3($mac)
+ srl $tmp2,$in1,24
+ sb $in1, 4($mac)
+ sb $tmp0,5($mac)
+ srl $tmp0,$in2,8
+ sb $tmp1,6($mac)
+ srl $tmp1,$in2,16
+ sb $tmp2,7($mac)
+ srl $tmp2,$in2,24
+ sb $in2, 8($mac)
+ sb $tmp0,9($mac)
+ srl $tmp0,$in3,8
+ sb $tmp1,10($mac)
+ srl $tmp1,$in3,16
+ sb $tmp2,11($mac)
+ srl $tmp2,$in3,24
+ sb $in3, 12($mac)
+ sb $tmp0,13($mac)
+ sb $tmp1,14($mac)
+ sb $tmp2,15($mac)
+
+ jr $ra
+.end poly1305_emit
+.rdata
+.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
+.align 2
+___
+}
+}}}
+
+$output=pop and open STDOUT,">$output";
+print $code;
+close STDOUT;
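The "final reduction" / "compare to modulus" sequence in poly1305_emit above is the standard constant-time reduction of the accumulator modulo 2^130 - 5: fold the bits at and above 2^130 back in multiplied by 5, add 5, and then use a mask derived from the resulting carry to select either h or h + 5 - 2^130. A kernel-independent C sketch of that selection, with the accumulator held as two 64-bit limbs plus a small top limb (illustrative only, not the kernel's API):

#include <stdint.h>

static void poly1305_final_reduce(uint64_t *h0, uint64_t *h1, uint64_t *h2)
{
	uint64_t top, g0, g1, g2, c, mask;

	/* Fold bits >= 2^130 back in, multiplied by 5 (2^130 == 5 mod p). */
	top = *h2 >> 2;
	*h2 &= 3;
	c = top * 5;
	*h0 += c;	c = (*h0 < c);
	*h1 += c;	c = (*h1 < c);
	*h2 += c;

	/* g = h + 5; if g >= 2^130, h mod p equals the low 130 bits of g. */
	g0 = *h0 + 5;	c = (g0 < 5);
	g1 = *h1 + c;	c = (g1 < c);
	g2 = *h2 + c;

	/* mask is all-ones when the +5 carried into bit 130, else zero. */
	mask = 0 - (g2 >> 2);
	*h0 = (*h0 & ~mask) | (g0 & mask);
	*h1 = (*h1 & ~mask) | (g1 & mask);
	*h2 = (*h2 & ~mask) | (g2 & 3 & mask);
}

In the assembly only the low 128 bits are kept after this step, since poly1305_emit then adds the 128-bit nonce and serializes the result modulo 2^128.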
diff --git a/lib/crypto/mips/poly1305.h b/lib/crypto/mips/poly1305.h
new file mode 100644
index 000000000000..85de450f1a93
--- /dev/null
+++ b/lib/crypto/mips/poly1305.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+asmlinkage void poly1305_block_init(struct poly1305_block_state *state,
+ const u8 raw_key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks(struct poly1305_block_state *state,
+ const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit(const struct poly1305_state *state,
+ u8 digest[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
diff --git a/lib/crypto/mips/sha1.h b/lib/crypto/mips/sha1.h
new file mode 100644
index 000000000000..ba1965002e4a
--- /dev/null
+++ b/lib/crypto/mips/sha1.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Cryptographic API.
+ *
+ * SHA1 Secure Hash Algorithm.
+ *
+ * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
+ *
+ * Based on crypto/sha1_generic.c, which is:
+ *
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ */
+
+#include <asm/octeon/crypto.h>
+#include <asm/octeon/octeon.h>
+
+/*
+ * We pass everything as 64-bit. OCTEON can handle misaligned data.
+ */
+
+static void octeon_sha1_store_hash(struct sha1_block_state *state)
+{
+ u64 *hash = (u64 *)&state->h[0];
+ union {
+ u32 word[2];
+ u64 dword;
+ } hash_tail = { { state->h[4], } };
+
+ write_octeon_64bit_hash_dword(hash[0], 0);
+ write_octeon_64bit_hash_dword(hash[1], 1);
+ write_octeon_64bit_hash_dword(hash_tail.dword, 2);
+ memzero_explicit(&hash_tail.word[0], sizeof(hash_tail.word[0]));
+}
+
+static void octeon_sha1_read_hash(struct sha1_block_state *state)
+{
+ u64 *hash = (u64 *)&state->h[0];
+ union {
+ u32 word[2];
+ u64 dword;
+ } hash_tail;
+
+ hash[0] = read_octeon_64bit_hash_dword(0);
+ hash[1] = read_octeon_64bit_hash_dword(1);
+ hash_tail.dword = read_octeon_64bit_hash_dword(2);
+ state->h[4] = hash_tail.word[0];
+ memzero_explicit(&hash_tail.dword, sizeof(hash_tail.dword));
+}
+
+static void sha1_blocks(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ struct octeon_cop2_state cop2_state;
+ unsigned long flags;
+
+ if (!octeon_has_crypto())
+ return sha1_blocks_generic(state, data, nblocks);
+
+ flags = octeon_crypto_enable(&cop2_state);
+ octeon_sha1_store_hash(state);
+
+ do {
+ const u64 *block = (const u64 *)data;
+
+ write_octeon_64bit_block_dword(block[0], 0);
+ write_octeon_64bit_block_dword(block[1], 1);
+ write_octeon_64bit_block_dword(block[2], 2);
+ write_octeon_64bit_block_dword(block[3], 3);
+ write_octeon_64bit_block_dword(block[4], 4);
+ write_octeon_64bit_block_dword(block[5], 5);
+ write_octeon_64bit_block_dword(block[6], 6);
+ octeon_sha1_start(block[7]);
+
+ data += SHA1_BLOCK_SIZE;
+ } while (--nblocks);
+
+ octeon_sha1_read_hash(state);
+ octeon_crypto_disable(&cop2_state, flags);
+}
diff --git a/lib/crypto/mips/sha256.h b/lib/crypto/mips/sha256.h
new file mode 100644
index 000000000000..ccccfd131634
--- /dev/null
+++ b/lib/crypto/mips/sha256.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 Secure Hash Algorithm.
+ *
+ * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
+ *
+ * Based on crypto/sha256_generic.c, which is:
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
+ */
+
+#include <asm/octeon/crypto.h>
+#include <asm/octeon/octeon.h>
+
+/*
+ * We pass everything as 64-bit. OCTEON can handle misaligned data.
+ */
+
+static void sha256_blocks(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ struct octeon_cop2_state cop2_state;
+ u64 *state64 = (u64 *)state;
+ unsigned long flags;
+
+ if (!octeon_has_crypto())
+ return sha256_blocks_generic(state, data, nblocks);
+
+ flags = octeon_crypto_enable(&cop2_state);
+ write_octeon_64bit_hash_dword(state64[0], 0);
+ write_octeon_64bit_hash_dword(state64[1], 1);
+ write_octeon_64bit_hash_dword(state64[2], 2);
+ write_octeon_64bit_hash_dword(state64[3], 3);
+
+ do {
+ const u64 *block = (const u64 *)data;
+
+ write_octeon_64bit_block_dword(block[0], 0);
+ write_octeon_64bit_block_dword(block[1], 1);
+ write_octeon_64bit_block_dword(block[2], 2);
+ write_octeon_64bit_block_dword(block[3], 3);
+ write_octeon_64bit_block_dword(block[4], 4);
+ write_octeon_64bit_block_dword(block[5], 5);
+ write_octeon_64bit_block_dword(block[6], 6);
+ octeon_sha256_start(block[7]);
+
+ data += SHA256_BLOCK_SIZE;
+ } while (--nblocks);
+
+ state64[0] = read_octeon_64bit_hash_dword(0);
+ state64[1] = read_octeon_64bit_hash_dword(1);
+ state64[2] = read_octeon_64bit_hash_dword(2);
+ state64[3] = read_octeon_64bit_hash_dword(3);
+ octeon_crypto_disable(&cop2_state, flags);
+}
diff --git a/lib/crypto/mips/sha512.h b/lib/crypto/mips/sha512.h
new file mode 100644
index 000000000000..b3ffbc1e8ca8
--- /dev/null
+++ b/lib/crypto/mips/sha512.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Cryptographic API.
+ *
+ * SHA-512 and SHA-384 Secure Hash Algorithm.
+ *
+ * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
+ *
+ * Based on crypto/sha512_generic.c, which is:
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2003 Kyle McMartin <kyle@debian.org>
+ */
+
+#include <asm/octeon/crypto.h>
+#include <asm/octeon/octeon.h>
+
+/*
+ * We pass everything as 64-bit. OCTEON can handle misaligned data.
+ */
+
+static void sha512_blocks(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ struct octeon_cop2_state cop2_state;
+ unsigned long flags;
+
+ if (!octeon_has_crypto())
+ return sha512_blocks_generic(state, data, nblocks);
+
+ flags = octeon_crypto_enable(&cop2_state);
+ write_octeon_64bit_hash_sha512(state->h[0], 0);
+ write_octeon_64bit_hash_sha512(state->h[1], 1);
+ write_octeon_64bit_hash_sha512(state->h[2], 2);
+ write_octeon_64bit_hash_sha512(state->h[3], 3);
+ write_octeon_64bit_hash_sha512(state->h[4], 4);
+ write_octeon_64bit_hash_sha512(state->h[5], 5);
+ write_octeon_64bit_hash_sha512(state->h[6], 6);
+ write_octeon_64bit_hash_sha512(state->h[7], 7);
+
+ do {
+ const u64 *block = (const u64 *)data;
+
+ write_octeon_64bit_block_sha512(block[0], 0);
+ write_octeon_64bit_block_sha512(block[1], 1);
+ write_octeon_64bit_block_sha512(block[2], 2);
+ write_octeon_64bit_block_sha512(block[3], 3);
+ write_octeon_64bit_block_sha512(block[4], 4);
+ write_octeon_64bit_block_sha512(block[5], 5);
+ write_octeon_64bit_block_sha512(block[6], 6);
+ write_octeon_64bit_block_sha512(block[7], 7);
+ write_octeon_64bit_block_sha512(block[8], 8);
+ write_octeon_64bit_block_sha512(block[9], 9);
+ write_octeon_64bit_block_sha512(block[10], 10);
+ write_octeon_64bit_block_sha512(block[11], 11);
+ write_octeon_64bit_block_sha512(block[12], 12);
+ write_octeon_64bit_block_sha512(block[13], 13);
+ write_octeon_64bit_block_sha512(block[14], 14);
+ octeon_sha512_start(block[15]);
+
+ data += SHA512_BLOCK_SIZE;
+ } while (--nblocks);
+
+ state->h[0] = read_octeon_64bit_hash_sha512(0);
+ state->h[1] = read_octeon_64bit_hash_sha512(1);
+ state->h[2] = read_octeon_64bit_hash_sha512(2);
+ state->h[3] = read_octeon_64bit_hash_sha512(3);
+ state->h[4] = read_octeon_64bit_hash_sha512(4);
+ state->h[5] = read_octeon_64bit_hash_sha512(5);
+ state->h[6] = read_octeon_64bit_hash_sha512(6);
+ state->h[7] = read_octeon_64bit_hash_sha512(7);
+ octeon_crypto_disable(&cop2_state, flags);
+}
diff --git a/lib/crypto/mpi/Makefile b/lib/crypto/mpi/Makefile
new file mode 100644
index 000000000000..9ad84079025a
--- /dev/null
+++ b/lib/crypto/mpi/Makefile
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# MPI multiprecision maths library (from gpg)
+#
+
+obj-$(CONFIG_MPILIB) = mpi.o
+
+mpi-y = \
+ generic_mpih-lshift.o \
+ generic_mpih-mul1.o \
+ generic_mpih-mul2.o \
+ generic_mpih-mul3.o \
+ generic_mpih-rshift.o \
+ generic_mpih-sub1.o \
+ generic_mpih-add1.o \
+ mpicoder.o \
+ mpi-add.o \
+ mpi-bit.o \
+ mpi-cmp.o \
+ mpi-sub-ui.o \
+ mpi-div.o \
+ mpi-mod.o \
+ mpi-mul.o \
+ mpih-cmp.o \
+ mpih-div.o \
+ mpih-mul.o \
+ mpi-pow.o \
+ mpiutil.o
diff --git a/lib/crypto/mpi/generic_mpih-add1.c b/lib/crypto/mpi/generic_mpih-add1.c
new file mode 100644
index 000000000000..299308b5461c
--- /dev/null
+++ b/lib/crypto/mpi/generic_mpih-add1.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpihelp-add_1.c - MPI helper functions
+ * Copyright (C) 1994, 1996, 1997, 1998,
+ * 2000 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ * Actually it's the same code with only minor changes in the
+ * way the data is stored; this is to support the abstraction
+ * of an optional secure memory allocation which may be used
+ * to avoid revealing of sensitive data due to paging etc.
+ * The GNU MP Library itself is published under the LGPL;
+ * however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+mpi_limb_t
+mpihelp_add_n(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+ mpi_ptr_t s2_ptr, mpi_size_t size)
+{
+ mpi_limb_t x, y, cy;
+ mpi_size_t j;
+
+ /* The loop counter and index J goes from -SIZE to -1. This way
+ the loop becomes faster. */
+ j = -size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ s1_ptr -= j;
+ s2_ptr -= j;
+ res_ptr -= j;
+
+ cy = 0;
+ do {
+ y = s2_ptr[j];
+ x = s1_ptr[j];
+ y += cy; /* add previous carry to one addend */
+ cy = y < cy; /* get out carry from that addition */
+ y += x; /* add other addend */
+ cy += y < x; /* get out carry from that add, combine */
+ res_ptr[j] = y;
+ } while (++j);
+
+ return cy;
+}
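+
+/*
+ * Illustrative sketch only, assuming 32-bit limbs (BITS_PER_MPI_LIMB == 32);
+ * limbs are least significant first and the carry out of the top limb is
+ * returned rather than stored:
+ *
+ *	mpi_limb_t a[2] = { 0xffffffff, 0x1 };	 a == 0x1ffffffff
+ *	mpi_limb_t b[2] = { 0x00000001, 0x2 };	 b == 0x200000001
+ *	mpi_limb_t r[2];
+ *	mpi_limb_t cy = mpihelp_add_n(r, a, b, 2);
+ *
+ * which yields r = { 0x00000000, 0x4 } and cy = 0, i.e. a + b == 0x400000000.
+ */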
diff --git a/lib/crypto/mpi/generic_mpih-lshift.c b/lib/crypto/mpi/generic_mpih-lshift.c
new file mode 100644
index 000000000000..7b21f5938a50
--- /dev/null
+++ b/lib/crypto/mpi/generic_mpih-lshift.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpihelp-lshift.c - MPI helper functions
+ * Copyright (C) 1994, 1996, 1998, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ * Actually it's the same code with only minor changes in the
+ * way the data is stored; this is to support the abstraction
+ * of an optional secure memory allocation which may be used
+ * to avoid revealing of sensitive data due to paging etc.
+ * The GNU MP Library itself is published under the LGPL;
+ * however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+
+/* Shift U (pointed to by UP and USIZE digits long) CNT bits to the left
+ * and store the USIZE least significant digits of the result at WP.
+ * Return the bits shifted out from the most significant digit.
+ *
+ * Argument constraints:
+ * 1. 0 < CNT < BITS_PER_MP_LIMB
+ * 2. If the result is to be written over the input, WP must be >= UP.
+ */
+
+mpi_limb_t
+mpihelp_lshift(mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize, unsigned int cnt)
+{
+ mpi_limb_t high_limb, low_limb;
+ unsigned sh_1, sh_2;
+ mpi_size_t i;
+ mpi_limb_t retval;
+
+ sh_1 = cnt;
+ wp += 1;
+ sh_2 = BITS_PER_MPI_LIMB - sh_1;
+ i = usize - 1;
+ low_limb = up[i];
+ retval = low_limb >> sh_2;
+ high_limb = low_limb;
+ while (--i >= 0) {
+ low_limb = up[i];
+ wp[i] = (high_limb << sh_1) | (low_limb >> sh_2);
+ high_limb = low_limb;
+ }
+ wp[i] = high_limb << sh_1;
+
+ return retval;
+}
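+
+/*
+ * Illustrative sketch only, assuming 32-bit limbs. With 0 < cnt < 32, the
+ * bits shifted out of the most significant limb come back as the return
+ * value while wp[] receives the in-range limbs:
+ *
+ *	mpi_limb_t u[1] = { 0x80000001 };
+ *	mpi_limb_t w[1];
+ *	mpi_limb_t out = mpihelp_lshift(w, u, 1, 4);
+ *
+ * which yields w = { 0x00000010 } and out = 0x8.
+ */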
diff --git a/lib/crypto/mpi/generic_mpih-mul1.c b/lib/crypto/mpi/generic_mpih-mul1.c
new file mode 100644
index 000000000000..e020e61d47b9
--- /dev/null
+++ b/lib/crypto/mpi/generic_mpih-mul1.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpihelp-mul_1.c - MPI helper functions
+ * Copyright (C) 1994, 1996, 1997, 1998, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ * Actually it's the same code with only minor changes in the
+ * way the data is stored; this is to support the abstraction
+ * of an optional secure memory allocation which may be used
+ * to avoid revealing of sensitive data due to paging etc.
+ * The GNU MP Library itself is published under the LGPL;
+ * however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+mpi_limb_t
+mpihelp_mul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size,
+ mpi_limb_t s2_limb)
+{
+ mpi_limb_t cy_limb;
+ mpi_size_t j;
+ mpi_limb_t prod_high, prod_low;
+
+ /* The loop counter and index J goes from -S1_SIZE to -1. This way
+ * the loop becomes faster. */
+ j = -s1_size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ s1_ptr -= j;
+ res_ptr -= j;
+
+ cy_limb = 0;
+ do {
+ umul_ppmm(prod_high, prod_low, s1_ptr[j], s2_limb);
+ prod_low += cy_limb;
+ cy_limb = (prod_low < cy_limb ? 1 : 0) + prod_high;
+ res_ptr[j] = prod_low;
+ } while (++j);
+
+ return cy_limb;
+}
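+
+/*
+ * Illustrative sketch only, assuming 32-bit limbs: the low limbs of the
+ * product go to res_ptr[] and the limb that overflows the top is returned.
+ *
+ *	mpi_limb_t s1[1] = { 0x80000000 };
+ *	mpi_limb_t res[1];
+ *	mpi_limb_t hi = mpihelp_mul_1(res, s1, 1, 4);
+ *
+ * which yields res = { 0x00000000 } and hi = 0x2, since
+ * 0x80000000 * 4 == 0x200000000.
+ */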
diff --git a/lib/crypto/mpi/generic_mpih-mul2.c b/lib/crypto/mpi/generic_mpih-mul2.c
new file mode 100644
index 000000000000..9484d8528243
--- /dev/null
+++ b/lib/crypto/mpi/generic_mpih-mul2.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpihelp-mul_2.c - MPI helper functions
+ * Copyright (C) 1994, 1996, 1997, 1998, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ * Actually it's the same code with only minor changes in the
+ * way the data is stored; this is to support the abstraction
+ * of an optional secure memory allocation which may be used
+ * to avoid revealing of sensitive data due to paging etc.
+ * The GNU MP Library itself is published under the LGPL;
+ * however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+mpi_limb_t
+mpihelp_addmul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+ mpi_size_t s1_size, mpi_limb_t s2_limb)
+{
+ mpi_limb_t cy_limb;
+ mpi_size_t j;
+ mpi_limb_t prod_high, prod_low;
+ mpi_limb_t x;
+
+ /* The loop counter and index J goes from -SIZE to -1. This way
+ * the loop becomes faster. */
+ j = -s1_size;
+ res_ptr -= j;
+ s1_ptr -= j;
+
+ cy_limb = 0;
+ do {
+ umul_ppmm(prod_high, prod_low, s1_ptr[j], s2_limb);
+
+ prod_low += cy_limb;
+ cy_limb = (prod_low < cy_limb ? 1 : 0) + prod_high;
+
+ x = res_ptr[j];
+ prod_low = x + prod_low;
+ cy_limb += prod_low < x ? 1 : 0;
+ res_ptr[j] = prod_low;
+ } while (++j);
+ return cy_limb;
+}
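+
+/*
+ * Illustrative sketch only, assuming 32-bit limbs: the limb product is added
+ * into res_ptr[] in place and the final carry is returned.
+ *
+ *	mpi_limb_t res[1] = { 0xffffffff };
+ *	mpi_limb_t s1[1]  = { 0x00000002 };
+ *	mpi_limb_t cy = mpihelp_addmul_1(res, s1, 1, 3);
+ *
+ * which yields res = { 0x00000005 } and cy = 1, since
+ * 0xffffffff + 2 * 3 == 0x100000005.
+ */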
diff --git a/lib/crypto/mpi/generic_mpih-mul3.c b/lib/crypto/mpi/generic_mpih-mul3.c
new file mode 100644
index 000000000000..ccdbab4121e0
--- /dev/null
+++ b/lib/crypto/mpi/generic_mpih-mul3.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpihelp-mul_3.c - MPI helper functions
+ * Copyright (C) 1994, 1996, 1997, 1998, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ * Actually it's the same code with only minor changes in the
+ * way the data is stored; this is to support the abstraction
+ * of an optional secure memory allocation which may be used
+ * to avoid revealing of sensitive data due to paging etc.
+ * The GNU MP Library itself is published under the LGPL;
+ * however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+mpi_limb_t
+mpihelp_submul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+ mpi_size_t s1_size, mpi_limb_t s2_limb)
+{
+ mpi_limb_t cy_limb;
+ mpi_size_t j;
+ mpi_limb_t prod_high, prod_low;
+ mpi_limb_t x;
+
+ /* The loop counter and index J goes from -SIZE to -1. This way
+ * the loop becomes faster. */
+ j = -s1_size;
+ res_ptr -= j;
+ s1_ptr -= j;
+
+ cy_limb = 0;
+ do {
+ umul_ppmm(prod_high, prod_low, s1_ptr[j], s2_limb);
+
+ prod_low += cy_limb;
+ cy_limb = (prod_low < cy_limb ? 1 : 0) + prod_high;
+
+ x = res_ptr[j];
+ prod_low = x - prod_low;
+ cy_limb += prod_low > x ? 1 : 0;
+ res_ptr[j] = prod_low;
+ } while (++j);
+
+ return cy_limb;
+}
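+
+/*
+ * Illustrative sketch only, assuming 32-bit limbs: the limb product is
+ * subtracted from res_ptr[] in place and the final borrow is returned.
+ *
+ *	mpi_limb_t res[1] = { 0x00000005 };
+ *	mpi_limb_t s1[1]  = { 0x00000002 };
+ *	mpi_limb_t bw = mpihelp_submul_1(res, s1, 1, 3);
+ *
+ * which yields res = { 0xffffffff } and bw = 1 (5 - 6 borrows one limb).
+ */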
diff --git a/lib/crypto/mpi/generic_mpih-rshift.c b/lib/crypto/mpi/generic_mpih-rshift.c
new file mode 100644
index 000000000000..e07bc69aa898
--- /dev/null
+++ b/lib/crypto/mpi/generic_mpih-rshift.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpih-rshift.c - MPI helper functions
+ * Copyright (C) 1994, 1996, 1998, 1999,
+ * 2000, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GNUPG
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ * Actually it's the same code with only minor changes in the
+ * way the data is stored; this is to support the abstraction
+ * of an optional secure memory allocation which may be used
+ * to avoid revealing of sensitive data due to paging etc.
+ * The GNU MP Library itself is published under the LGPL;
+ * however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+
+/* Shift U (pointed to by UP and USIZE limbs long) CNT bits to the right
+ * and store the USIZE least significant limbs of the result at WP.
+ * The bits shifted out to the right are returned.
+ *
+ * Argument constraints:
+ * 1. 0 < CNT < BITS_PER_MP_LIMB
+ * 2. If the result is to be written over the input, WP must be <= UP.
+ */
+
+mpi_limb_t
+mpihelp_rshift(mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize, unsigned cnt)
+{
+ mpi_limb_t high_limb, low_limb;
+ unsigned sh_1, sh_2;
+ mpi_size_t i;
+ mpi_limb_t retval;
+
+ sh_1 = cnt;
+ wp -= 1;
+ sh_2 = BITS_PER_MPI_LIMB - sh_1;
+ high_limb = up[0];
+ retval = high_limb << sh_2;
+ low_limb = high_limb;
+ for (i = 1; i < usize; i++) {
+ high_limb = up[i];
+ wp[i] = (low_limb >> sh_1) | (high_limb << sh_2);
+ low_limb = high_limb;
+ }
+ wp[i] = low_limb >> sh_1;
+
+ return retval;
+}
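+
+/*
+ * Illustrative sketch only, assuming 32-bit limbs. The bits shifted out on
+ * the right are returned left-justified in a limb:
+ *
+ *	mpi_limb_t u[1] = { 0x80000001 };
+ *	mpi_limb_t w[1];
+ *	mpi_limb_t out = mpihelp_rshift(w, u, 1, 4);
+ *
+ * which yields w = { 0x08000000 } and out = 0x10000000 (the shifted-out 0x1,
+ * placed in the top four bits).
+ */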
diff --git a/lib/crypto/mpi/generic_mpih-sub1.c b/lib/crypto/mpi/generic_mpih-sub1.c
new file mode 100644
index 000000000000..eea4382aad5f
--- /dev/null
+++ b/lib/crypto/mpi/generic_mpih-sub1.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpihelp-sub_1.c - MPI helper functions
+ * Copyright (C) 1994, 1996, 1997, 1998, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ * Actually it's the same code with only minor changes in the
+ * way the data is stored; this is to support the abstraction
+ * of an optional secure memory allocation which may be used
+ * to avoid revealing of sensitive data due to paging etc.
+ * The GNU MP Library itself is published under the LGPL;
+ * however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+mpi_limb_t
+mpihelp_sub_n(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+ mpi_ptr_t s2_ptr, mpi_size_t size)
+{
+ mpi_limb_t x, y, cy;
+ mpi_size_t j;
+
+ /* The loop counter and index J goes from -SIZE to -1. This way
+ the loop becomes faster. */
+ j = -size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ s1_ptr -= j;
+ s2_ptr -= j;
+ res_ptr -= j;
+
+ cy = 0;
+ do {
+ y = s2_ptr[j];
+ x = s1_ptr[j];
+ y += cy; /* add previous carry to subtrahend */
+ cy = y < cy; /* get out carry from that addition */
+ y = x - y; /* main subtract */
+ cy += y > x; /* get out carry from the subtract, combine */
+ res_ptr[j] = y;
+ } while (++j);
+
+ return cy;
+}
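+
+/*
+ * Illustrative sketch only, assuming 32-bit limbs: the borrow out of the top
+ * limb is returned, so a non-zero return means s1 < s2.
+ *
+ *	mpi_limb_t a[2] = { 0x0, 0x1 };	 a == 0x100000000
+ *	mpi_limb_t b[2] = { 0x1, 0x0 };	 b == 0x1
+ *	mpi_limb_t r[2];
+ *	mpi_limb_t bw = mpihelp_sub_n(r, a, b, 2);
+ *
+ * which yields r = { 0xffffffff, 0x0 } and bw = 0.
+ */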
diff --git a/lib/crypto/mpi/longlong.h b/lib/crypto/mpi/longlong.h
new file mode 100644
index 000000000000..b6fa1d08fb55
--- /dev/null
+++ b/lib/crypto/mpi/longlong.h
@@ -0,0 +1,1361 @@
+/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
+ * Note: I added some stuff for use with gnupg
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994, 1996, 1998,
+ * 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Library General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public License
+ * along with this file; see the file COPYING.LIB. If not, write to
+ * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ * MA 02111-1307, USA. */
+
+#include <linux/count_zeros.h>
+
+/* You have to define the following before including this file:
+ *
+ * UWtype -- An unsigned type, default type for operations (typically a "word")
+ * UHWtype -- An unsigned type, at least half the size of UWtype.
+ * UDWtype -- An unsigned type, at least twice as large as UWtype
+ * W_TYPE_SIZE -- size in bits of UWtype
+ *
+ * SItype, USItype -- Signed and unsigned 32 bit types.
+ * DItype, UDItype -- Signed and unsigned 64 bit types.
+ *
+ * On a 32 bit machine UWtype should typically be USItype;
+ * on a 64 bit machine, UWtype should typically be UDItype.
+*/
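+
+/* One way a 32-bit build could satisfy this, purely for illustration (the
+ * real definitions are supplied by the code that includes this header):
+ *
+ *	#define W_TYPE_SIZE 32
+ *	typedef unsigned int UWtype;
+ *	typedef unsigned short UHWtype;
+ *	typedef unsigned long long UDWtype;
+ *	typedef int SItype;
+ *	typedef unsigned int USItype;
+ *	typedef long long DItype;
+ *	typedef unsigned long long UDItype;
+ */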
+
+#define __BITS4 (W_TYPE_SIZE / 4)
+#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
+#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
+#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
+
+/* This is used to make sure that no undesirable sharing takes place between
+   different libraries that use this file. */
+#ifndef __MPN
+#define __MPN(x) __##x
+#endif
+
+/* Define auxiliary asm macros.
+ *
+ * 1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
+ * UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
+ * word product in HIGH_PROD and LOW_PROD.
+ *
+ * 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
+ * UDWtype product. This is just a variant of umul_ppmm.
+ *
+ * 3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
+ * denominator) divides a UDWtype, composed by the UWtype integers
+ * HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
+ * in QUOTIENT and the remainder in REMAINDER. HIGH_NUMERATOR must be less
+ * than DENOMINATOR for correct operation. If the implementation additionally
+ * requires the most significant bit of DENOMINATOR to be 1, the pre-processor
+ * symbol UDIV_NEEDS_NORMALIZATION is defined to 1.
+ *
+ * 4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
+ * denominator). Like udiv_qrnnd but the numbers are signed. The quotient
+ * is rounded towards 0.
+ *
+ * 5) count_leading_zeros(count, x) counts the number of zero-bits from the
+ * msb to the first non-zero bit in the UWtype X. This is the number of
+ * steps X needs to be shifted left to set the msb. Undefined for X == 0,
+ * unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
+ *
+ * 6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
+ * from the least significant end.
+ *
+ * 7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
+ * high_addend_2, low_addend_2) adds two UWtype integers, composed by
+ * HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
+ * respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow
+ * (i.e. carry out) is not stored anywhere, and is lost.
+ *
+ * 8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
+ * high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
+ * composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
+ * LOW_SUBTRAHEND respectively. The result is placed in HIGH_DIFFERENCE
+ * and LOW_DIFFERENCE. Overflow (i.e. carry out) is not stored anywhere,
+ * and is lost.
+ *
+ * If any of these macros are left undefined for a particular CPU,
+ * C macros are used. */
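+
+/* A quick illustration of the two central macros, assuming W_TYPE_SIZE == 32
+ * (sketch only):
+ *
+ *	UWtype hi, lo, q, r;
+ *	umul_ppmm(hi, lo, 0x80000000, 0x10);	 hi == 0x8, lo == 0x0
+ *	udiv_qrnnd(q, r, 0x8, 0x0, 0x80000000);	 q == 0x10, r == 0x0
+ *
+ * i.e. umul_ppmm() satisfies hi * 2^32 + lo == u * v, and udiv_qrnnd()
+ * inverts it whenever the high numerator word is below the denominator
+ * (which here also has its most significant bit set, so the example is
+ * valid even when UDIV_NEEDS_NORMALIZATION is 1).
+ */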
+
+/* The CPUs come in alphabetical order below.
+ *
+ * Please add support for more CPUs here, or improve the current support
+ * for the CPUs below! */
+
+#if defined(__GNUC__) && !defined(NO_ASM)
+
+/* We sometimes need to clobber "cc" with gcc2, but that would not be
+ understood by gcc1. Use cpp to avoid major code duplication. */
+#if __GNUC__ < 2
+#define __CLOBBER_CC
+#define __AND_CLOBBER_CC
+#else /* __GNUC__ >= 2 */
+#define __CLOBBER_CC : "cc"
+#define __AND_CLOBBER_CC , "cc"
+#endif /* __GNUC__ < 2 */
+
+/***************************************
+ ************** A29K *****************
+ ***************************************/
+#if (defined(__a29k__) || defined(_AM29K)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("add %1,%4,%5\n" \
+ "addc %0,%2,%3" \
+ : "=r" ((USItype)(sh)), \
+ "=&r" ((USItype)(sl)) \
+ : "%r" ((USItype)(ah)), \
+ "rI" ((USItype)(bh)), \
+ "%r" ((USItype)(al)), \
+ "rI" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("sub %1,%4,%5\n" \
+ "subc %0,%2,%3" \
+ : "=r" ((USItype)(sh)), \
+ "=&r" ((USItype)(sl)) \
+ : "r" ((USItype)(ah)), \
+ "rI" ((USItype)(bh)), \
+ "r" ((USItype)(al)), \
+ "rI" ((USItype)(bl)))
+#define umul_ppmm(xh, xl, m0, m1) \
+do { \
+ USItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("multiplu %0,%1,%2" \
+ : "=r" ((USItype)(xl)) \
+ : "r" (__m0), \
+ "r" (__m1)); \
+ __asm__ ("multmu %0,%1,%2" \
+ : "=r" ((USItype)(xh)) \
+ : "r" (__m0), \
+ "r" (__m1)); \
+} while (0)
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ __asm__ ("dividu %0,%3,%4" \
+ : "=r" ((USItype)(q)), \
+ "=q" ((USItype)(r)) \
+ : "1" ((USItype)(n1)), \
+ "r" ((USItype)(n0)), \
+ "r" ((USItype)(d)))
+#endif /* __a29k__ */
+
+#if defined(__alpha) && W_TYPE_SIZE == 64
+#define umul_ppmm(ph, pl, m0, m1) \
+do { \
+ UDItype __m0 = (m0), __m1 = (m1); \
+ (ph) = __builtin_alpha_umulh(__m0, __m1); \
+ (pl) = __m0 * __m1; \
+} while (0)
+#define UMUL_TIME 46
+#ifndef LONGLONG_STANDALONE
+#define udiv_qrnnd(q, r, n1, n0, d) \
+do { UDItype __r; \
+ (q) = __udiv_qrnnd(&__r, (n1), (n0), (d)); \
+ (r) = __r; \
+} while (0)
+extern UDItype __udiv_qrnnd(UDItype *, UDItype, UDItype, UDItype);
+#define UDIV_TIME 220
+#endif /* LONGLONG_STANDALONE */
+#endif /* __alpha */
+
+/***************************************
+ ************** ARM ******************
+ ***************************************/
+#if defined(__arm__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("adds %1, %4, %5\n" \
+ "adc %0, %2, %3" \
+ : "=r" (sh), \
+ "=&r" (sl) \
+ : "%r" ((USItype)(ah)), \
+ "rI" ((USItype)(bh)), \
+ "%r" ((USItype)(al)), \
+ "rI" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("subs %1, %4, %5\n" \
+ "sbc %0, %2, %3" \
+ : "=r" (sh), \
+ "=&r" (sl) \
+ : "r" ((USItype)(ah)), \
+ "rI" ((USItype)(bh)), \
+ "r" ((USItype)(al)), \
+ "rI" ((USItype)(bl)))
+#if defined __ARM_ARCH_2__ || defined __ARM_ARCH_3__
+#define umul_ppmm(xh, xl, a, b) \
+ __asm__ ("@ Inlined umul_ppmm\n" \
+ "mov %|r0, %2, lsr #16 @ AAAA\n" \
+ "mov %|r2, %3, lsr #16 @ BBBB\n" \
+ "bic %|r1, %2, %|r0, lsl #16 @ aaaa\n" \
+ "bic %0, %3, %|r2, lsl #16 @ bbbb\n" \
+ "mul %1, %|r1, %|r2 @ aaaa * BBBB\n" \
+ "mul %|r2, %|r0, %|r2 @ AAAA * BBBB\n" \
+ "mul %|r1, %0, %|r1 @ aaaa * bbbb\n" \
+ "mul %0, %|r0, %0 @ AAAA * bbbb\n" \
+ "adds %|r0, %1, %0 @ central sum\n" \
+ "addcs %|r2, %|r2, #65536\n" \
+ "adds %1, %|r1, %|r0, lsl #16\n" \
+ "adc %0, %|r2, %|r0, lsr #16" \
+ : "=&r" (xh), \
+ "=r" (xl) \
+ : "r" ((USItype)(a)), \
+ "r" ((USItype)(b)) \
+ : "r0", "r1", "r2")
+#else
+#define umul_ppmm(xh, xl, a, b) \
+ __asm__ ("@ Inlined umul_ppmm\n" \
+ "umull %1, %0, %2, %3" \
+ : "=&r" (xh), \
+ "=&r" (xl) \
+ : "r" ((USItype)(a)), \
+ "r" ((USItype)(b)) \
+ : "r0", "r1")
+#endif
+#define UMUL_TIME 20
+#define UDIV_TIME 100
+#endif /* __arm__ */
+
+/***************************************
+ ************** CLIPPER **************
+ ***************************************/
+#if defined(__clipper__) && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+ ({union {UDItype __ll; \
+ struct {USItype __l, __h; } __i; \
+ } __xx; \
+ __asm__ ("mulwux %2,%0" \
+ : "=r" (__xx.__ll) \
+ : "%0" ((USItype)(u)), \
+ "r" ((USItype)(v))); \
+ (w1) = __xx.__i.__h; (w0) = __xx.__i.__l; })
+#define smul_ppmm(w1, w0, u, v) \
+ ({union {DItype __ll; \
+ struct {SItype __l, __h; } __i; \
+ } __xx; \
+ __asm__ ("mulwx %2,%0" \
+ : "=r" (__xx.__ll) \
+ : "%0" ((SItype)(u)), \
+ "r" ((SItype)(v))); \
+ (w1) = __xx.__i.__h; (w0) = __xx.__i.__l; })
+#define __umulsidi3(u, v) \
+ ({UDItype __w; \
+ __asm__ ("mulwux %2,%0" \
+ : "=r" (__w) \
+ : "%0" ((USItype)(u)), \
+ "r" ((USItype)(v))); \
+ __w; })
+#endif /* __clipper__ */
+
+/***************************************
+ ************** GMICRO ***************
+ ***************************************/
+#if defined(__gmicro__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("add.w %5,%1\n" \
+ "addx %3,%0" \
+ : "=g" ((USItype)(sh)), \
+ "=&g" ((USItype)(sl)) \
+ : "%0" ((USItype)(ah)), \
+ "g" ((USItype)(bh)), \
+ "%1" ((USItype)(al)), \
+ "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("sub.w %5,%1\n" \
+ "subx %3,%0" \
+ : "=g" ((USItype)(sh)), \
+ "=&g" ((USItype)(sl)) \
+ : "0" ((USItype)(ah)), \
+ "g" ((USItype)(bh)), \
+ "1" ((USItype)(al)), \
+ "g" ((USItype)(bl)))
+#define umul_ppmm(ph, pl, m0, m1) \
+ __asm__ ("mulx %3,%0,%1" \
+ : "=g" ((USItype)(ph)), \
+ "=r" ((USItype)(pl)) \
+ : "%0" ((USItype)(m0)), \
+ "g" ((USItype)(m1)))
+#define udiv_qrnnd(q, r, nh, nl, d) \
+ __asm__ ("divx %4,%0,%1" \
+ : "=g" ((USItype)(q)), \
+ "=r" ((USItype)(r)) \
+ : "1" ((USItype)(nh)), \
+ "0" ((USItype)(nl)), \
+ "g" ((USItype)(d)))
+#endif
+
+/***************************************
+ ************** HPPA *****************
+ ***************************************/
+#if defined(__hppa) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("add %4,%5,%1\n" \
+ "addc %2,%3,%0" \
+ : "=r" ((USItype)(sh)), \
+ "=&r" ((USItype)(sl)) \
+ : "%rM" ((USItype)(ah)), \
+ "rM" ((USItype)(bh)), \
+ "%rM" ((USItype)(al)), \
+ "rM" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("sub %4,%5,%1\n" \
+ "subb %2,%3,%0" \
+ : "=r" ((USItype)(sh)), \
+ "=&r" ((USItype)(sl)) \
+ : "rM" ((USItype)(ah)), \
+ "rM" ((USItype)(bh)), \
+ "rM" ((USItype)(al)), \
+ "rM" ((USItype)(bl)))
+#if 0 && defined(_PA_RISC1_1)
+/* xmpyu uses floating point register which is not allowed in Linux kernel. */
+#define umul_ppmm(wh, wl, u, v) \
+do { \
+ union {UDItype __ll; \
+ struct {USItype __h, __l; } __i; \
+ } __xx; \
+ __asm__ ("xmpyu %1,%2,%0" \
+ : "=*f" (__xx.__ll) \
+ : "*f" ((USItype)(u)), \
+ "*f" ((USItype)(v))); \
+ (wh) = __xx.__i.__h; \
+ (wl) = __xx.__i.__l; \
+} while (0)
+#define UMUL_TIME 8
+#define UDIV_TIME 60
+#else
+#define UMUL_TIME 40
+#define UDIV_TIME 80
+#endif
+#if 0 /* #ifndef LONGLONG_STANDALONE */
+#define udiv_qrnnd(q, r, n1, n0, d) \
+do { USItype __r; \
+ (q) = __udiv_qrnnd(&__r, (n1), (n0), (d)); \
+ (r) = __r; \
+} while (0)
+extern USItype __udiv_qrnnd();
+#endif /* LONGLONG_STANDALONE */
+#endif /* hppa */
+
+/***************************************
+ ************** I370 *****************
+ ***************************************/
+#if (defined(__i370__) || defined(__mvs__)) && W_TYPE_SIZE == 32
+#define umul_ppmm(xh, xl, m0, m1) \
+do { \
+ union {UDItype __ll; \
+ struct {USItype __h, __l; } __i; \
+ } __xx; \
+ USItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("mr %0,%3" \
+ : "=r" (__xx.__i.__h), \
+ "=r" (__xx.__i.__l) \
+ : "%1" (__m0), \
+ "r" (__m1)); \
+ (xh) = __xx.__i.__h; (xl) = __xx.__i.__l; \
+ (xh) += ((((SItype) __m0 >> 31) & __m1) \
+ + (((SItype) __m1 >> 31) & __m0)); \
+} while (0)
+#define smul_ppmm(xh, xl, m0, m1) \
+do { \
+ union {DItype __ll; \
+ struct {USItype __h, __l; } __i; \
+ } __xx; \
+ __asm__ ("mr %0,%3" \
+ : "=r" (__xx.__i.__h), \
+ "=r" (__xx.__i.__l) \
+ : "%1" (m0), \
+ "r" (m1)); \
+ (xh) = __xx.__i.__h; (xl) = __xx.__i.__l; \
+} while (0)
+#define sdiv_qrnnd(q, r, n1, n0, d) \
+do { \
+ union {DItype __ll; \
+ struct {USItype __h, __l; } __i; \
+ } __xx; \
+ __xx.__i.__h = n1; __xx.__i.__l = n0; \
+ __asm__ ("dr %0,%2" \
+ : "=r" (__xx.__ll) \
+ : "0" (__xx.__ll), "r" (d)); \
+ (q) = __xx.__i.__l; (r) = __xx.__i.__h; \
+} while (0)
+#endif
+
+/***************************************
+ ************** I386 *****************
+ ***************************************/
+#undef __i386__
+#if (defined(__i386__) || defined(__i486__)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("addl %5,%1\n" \
+ "adcl %3,%0" \
+ : "=r" (sh), \
+ "=&r" (sl) \
+ : "%0" ((USItype)(ah)), \
+ "g" ((USItype)(bh)), \
+ "%1" ((USItype)(al)), \
+ "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("subl %5,%1\n" \
+ "sbbl %3,%0" \
+ : "=r" (sh), \
+ "=&r" (sl) \
+ : "0" ((USItype)(ah)), \
+ "g" ((USItype)(bh)), \
+ "1" ((USItype)(al)), \
+ "g" ((USItype)(bl)))
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("mull %3" \
+ : "=a" (w0), \
+ "=d" (w1) \
+ : "%0" ((USItype)(u)), \
+ "rm" ((USItype)(v)))
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ __asm__ ("divl %4" \
+ : "=a" (q), \
+ "=d" (r) \
+ : "0" ((USItype)(n0)), \
+ "1" ((USItype)(n1)), \
+ "rm" ((USItype)(d)))
+#ifndef UMUL_TIME
+#define UMUL_TIME 40
+#endif
+#ifndef UDIV_TIME
+#define UDIV_TIME 40
+#endif
+#endif /* 80x86 */
+
+/***************************************
+ ************** I860 *****************
+ ***************************************/
+#if defined(__i860__) && W_TYPE_SIZE == 32
+#define rshift_rhlc(r, h, l, c) \
+ __asm__ ("shr %3,r0,r0\n" \
+ "shrd %1,%2,%0" \
+ "=r" (r) : "r" (h), "r" (l), "rn" (c))
+#endif /* i860 */
+
+/***************************************
+ ************** I960 *****************
+ ***************************************/
+#if defined(__i960__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("cmpo 1,0\n" \
+ "addc %5,%4,%1\n" \
+ "addc %3,%2,%0" \
+ : "=r" ((USItype)(sh)), \
+ "=&r" ((USItype)(sl)) \
+ : "%dI" ((USItype)(ah)), \
+ "dI" ((USItype)(bh)), \
+ "%dI" ((USItype)(al)), \
+ "dI" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("cmpo 0,0\n" \
+ "subc %5,%4,%1\n" \
+ "subc %3,%2,%0" \
+ : "=r" ((USItype)(sh)), \
+ "=&r" ((USItype)(sl)) \
+ : "dI" ((USItype)(ah)), \
+ "dI" ((USItype)(bh)), \
+ "dI" ((USItype)(al)), \
+ "dI" ((USItype)(bl)))
+#define umul_ppmm(w1, w0, u, v) \
+ ({union {UDItype __ll; \
+ struct {USItype __l, __h; } __i; \
+ } __xx; \
+ __asm__ ("emul %2,%1,%0" \
+ : "=d" (__xx.__ll) \
+ : "%dI" ((USItype)(u)), \
+ "dI" ((USItype)(v))); \
+ (w1) = __xx.__i.__h; (w0) = __xx.__i.__l; })
+#define __umulsidi3(u, v) \
+ ({UDItype __w; \
+ __asm__ ("emul %2,%1,%0" \
+ : "=d" (__w) \
+ : "%dI" ((USItype)(u)), \
+ "dI" ((USItype)(v))); \
+ __w; })
+#define udiv_qrnnd(q, r, nh, nl, d) \
+do { \
+ union {UDItype __ll; \
+ struct {USItype __l, __h; } __i; \
+	      } __nn, __rq; \
+ __nn.__i.__h = (nh); __nn.__i.__l = (nl); \
+ __asm__ ("ediv %d,%n,%0" \
+ : "=d" (__rq.__ll) \
+ : "dI" (__nn.__ll), \
+ "dI" ((USItype)(d))); \
+ (r) = __rq.__i.__l; (q) = __rq.__i.__h; \
+} while (0)
+#if defined(__i960mx) /* what is the proper symbol to test??? */
+#define rshift_rhlc(r, h, l, c) \
+do { \
+ union {UDItype __ll; \
+ struct {USItype __l, __h; } __i; \
+ } __nn; \
+ __nn.__i.__h = (h); __nn.__i.__l = (l); \
+ __asm__ ("shre %2,%1,%0" \
+ : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \
+} while (0)
+#endif /* i960mx */
+#endif /* i960 */
+
+/***************************************
+ ************** 68000 ****************
+ ***************************************/
+#if (defined(__mc68000__) || defined(__mc68020__) || defined(__NeXT__) || defined(mc68020)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("add%.l %5,%1\n" \
+ "addx%.l %3,%0" \
+ : "=d" ((USItype)(sh)), \
+ "=&d" ((USItype)(sl)) \
+ : "%0" ((USItype)(ah)), \
+ "d" ((USItype)(bh)), \
+ "%1" ((USItype)(al)), \
+ "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("sub%.l %5,%1\n" \
+ "subx%.l %3,%0" \
+ : "=d" ((USItype)(sh)), \
+ "=&d" ((USItype)(sl)) \
+ : "0" ((USItype)(ah)), \
+ "d" ((USItype)(bh)), \
+ "1" ((USItype)(al)), \
+ "g" ((USItype)(bl)))
+#if (defined(__mc68020__) || defined(__NeXT__) || defined(mc68020))
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("mulu%.l %3,%1:%0" \
+ : "=d" ((USItype)(w0)), \
+ "=d" ((USItype)(w1)) \
+ : "%0" ((USItype)(u)), \
+ "dmi" ((USItype)(v)))
+#define UMUL_TIME 45
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ __asm__ ("divu%.l %4,%1:%0" \
+ : "=d" ((USItype)(q)), \
+ "=d" ((USItype)(r)) \
+ : "0" ((USItype)(n0)), \
+ "1" ((USItype)(n1)), \
+ "dmi" ((USItype)(d)))
+#define UDIV_TIME 90
+#define sdiv_qrnnd(q, r, n1, n0, d) \
+ __asm__ ("divs%.l %4,%1:%0" \
+ : "=d" ((USItype)(q)), \
+ "=d" ((USItype)(r)) \
+ : "0" ((USItype)(n0)), \
+ "1" ((USItype)(n1)), \
+ "dmi" ((USItype)(d)))
+#else /* not mc68020 */
+#define umul_ppmm(xh, xl, a, b) \
+do { USItype __umul_tmp1, __umul_tmp2; \
+ __asm__ ("| Inlined umul_ppmm\n" \
+ "move%.l %5,%3\n" \
+ "move%.l %2,%0\n" \
+ "move%.w %3,%1\n" \
+ "swap %3\n" \
+ "swap %0\n" \
+ "mulu %2,%1\n" \
+ "mulu %3,%0\n" \
+ "mulu %2,%3\n" \
+ "swap %2\n" \
+ "mulu %5,%2\n" \
+ "add%.l %3,%2\n" \
+ "jcc 1f\n" \
+ "add%.l %#0x10000,%0\n" \
+ "1: move%.l %2,%3\n" \
+ "clr%.w %2\n" \
+ "swap %2\n" \
+ "swap %3\n" \
+ "clr%.w %3\n" \
+ "add%.l %3,%1\n" \
+ "addx%.l %2,%0\n" \
+ "| End inlined umul_ppmm" \
+ : "=&d" ((USItype)(xh)), "=&d" ((USItype)(xl)), \
+ "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \
+ : "%2" ((USItype)(a)), "d" ((USItype)(b))); \
+} while (0)
+#define UMUL_TIME 100
+#define UDIV_TIME 400
+#endif /* not mc68020 */
+#endif /* mc68000 */
+
+/***************************************
+ ************** 88000 ****************
+ ***************************************/
+#if defined(__m88000__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("addu.co %1,%r4,%r5\n" \
+ "addu.ci %0,%r2,%r3" \
+ : "=r" ((USItype)(sh)), \
+ "=&r" ((USItype)(sl)) \
+ : "%rJ" ((USItype)(ah)), \
+ "rJ" ((USItype)(bh)), \
+ "%rJ" ((USItype)(al)), \
+ "rJ" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("subu.co %1,%r4,%r5\n" \
+ "subu.ci %0,%r2,%r3" \
+ : "=r" ((USItype)(sh)), \
+ "=&r" ((USItype)(sl)) \
+ : "rJ" ((USItype)(ah)), \
+ "rJ" ((USItype)(bh)), \
+ "rJ" ((USItype)(al)), \
+ "rJ" ((USItype)(bl)))
+#if defined(__m88110__)
+#define umul_ppmm(wh, wl, u, v) \
+do { \
+ union {UDItype __ll; \
+ struct {USItype __h, __l; } __i; \
+ } __x; \
+ __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \
+ (wh) = __x.__i.__h; \
+ (wl) = __x.__i.__l; \
+} while (0)
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ ({union {UDItype __ll; \
+ struct {USItype __h, __l; } __i; \
+ } __x, __q; \
+ __x.__i.__h = (n1); __x.__i.__l = (n0); \
+ __asm__ ("divu.d %0,%1,%2" \
+ : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \
+	(r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
+#define UMUL_TIME 5
+#define UDIV_TIME 25
+#else
+#define UMUL_TIME 17
+#define UDIV_TIME 150
+#endif /* __m88110__ */
+#endif /* __m88000__ */
+
+/***************************************
+ ************** MIPS *****************
+ ***************************************/
+#if defined(__mips__) && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+do { \
+ UDItype __ll = (UDItype)(u) * (v); \
+ w1 = __ll >> 32; \
+ w0 = __ll; \
+} while (0)
+#define UMUL_TIME 10
+#define UDIV_TIME 100
+#endif /* __mips__ */
+
+/***************************************
+ ************** MIPS/64 **************
+ ***************************************/
+#if (defined(__mips) && __mips >= 3) && W_TYPE_SIZE == 64
+#if defined(__mips_isa_rev) && __mips_isa_rev >= 6 && defined(CONFIG_CC_IS_GCC)
+/*
+ * GCC ends up emitting a __multi3 intrinsic call for MIPS64r6 with the plain C
+ * code below, so we special case MIPS64r6 until the compiler can do better.
+ */
+#define umul_ppmm(w1, w0, u, v) \
+do { \
+ __asm__ ("dmulu %0,%1,%2" \
+ : "=d" ((UDItype)(w0)) \
+ : "d" ((UDItype)(u)), \
+ "d" ((UDItype)(v))); \
+ __asm__ ("dmuhu %0,%1,%2" \
+ : "=d" ((UDItype)(w1)) \
+ : "d" ((UDItype)(u)), \
+ "d" ((UDItype)(v))); \
+} while (0)
+#else
+#define umul_ppmm(w1, w0, u, v) \
+do { \
+ typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
+ __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
+ w1 = __ll >> 64; \
+ w0 = __ll; \
+} while (0)
+#endif
+#define UMUL_TIME 20
+#define UDIV_TIME 140
+#endif /* __mips__ */
+
+/***************************************
+ ************** 32000 ****************
+ ***************************************/
+#if defined(__ns32000__) && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+ ({union {UDItype __ll; \
+ struct {USItype __l, __h; } __i; \
+ } __xx; \
+ __asm__ ("meid %2,%0" \
+ : "=g" (__xx.__ll) \
+ : "%0" ((USItype)(u)), \
+ "g" ((USItype)(v))); \
+ (w1) = __xx.__i.__h; (w0) = __xx.__i.__l; })
+#define __umulsidi3(u, v) \
+ ({UDItype __w; \
+ __asm__ ("meid %2,%0" \
+ : "=g" (__w) \
+ : "%0" ((USItype)(u)), \
+ "g" ((USItype)(v))); \
+ __w; })
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ ({union {UDItype __ll; \
+ struct {USItype __l, __h; } __i; \
+ } __xx; \
+ __xx.__i.__h = (n1); __xx.__i.__l = (n0); \
+ __asm__ ("deid %2,%0" \
+ : "=g" (__xx.__ll) \
+ : "0" (__xx.__ll), \
+ "g" ((USItype)(d))); \
+ (r) = __xx.__i.__l; (q) = __xx.__i.__h; })
+#endif /* __ns32000__ */
+
+/***************************************
+ ************** PPC ******************
+ ***************************************/
+#if (defined(_ARCH_PPC) || defined(_IBMR2)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+do { \
+ if (__builtin_constant_p(bh) && (bh) == 0) \
+ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \
+ : "=r" (sh), \
+ "=&r" (sl) \
+ : "%r" ((USItype)(ah)), \
+ "%r" ((USItype)(al)), \
+ "rI" ((USItype)(bl))); \
+ else if (__builtin_constant_p(bh) && (bh) == ~(USItype) 0) \
+ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \
+ : "=r" (sh), \
+ "=&r" (sl) \
+ : "%r" ((USItype)(ah)), \
+ "%r" ((USItype)(al)), \
+ "rI" ((USItype)(bl))); \
+ else \
+ __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \
+ : "=r" (sh), \
+ "=&r" (sl) \
+ : "%r" ((USItype)(ah)), \
+ "r" ((USItype)(bh)), \
+ "%r" ((USItype)(al)), \
+ "rI" ((USItype)(bl))); \
+} while (0)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+do { \
+ if (__builtin_constant_p(ah) && (ah) == 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \
+ : "=r" (sh), \
+ "=&r" (sl) \
+ : "r" ((USItype)(bh)), \
+ "rI" ((USItype)(al)), \
+ "r" ((USItype)(bl))); \
+ else if (__builtin_constant_p(ah) && (ah) == ~(USItype) 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \
+ : "=r" (sh), \
+ "=&r" (sl) \
+ : "r" ((USItype)(bh)), \
+ "rI" ((USItype)(al)), \
+ "r" ((USItype)(bl))); \
+ else if (__builtin_constant_p(bh) && (bh) == 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \
+ : "=r" (sh), \
+ "=&r" (sl) \
+ : "r" ((USItype)(ah)), \
+ "rI" ((USItype)(al)), \
+ "r" ((USItype)(bl))); \
+ else if (__builtin_constant_p(bh) && (bh) == ~(USItype) 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \
+ : "=r" (sh), \
+ "=&r" (sl) \
+ : "r" ((USItype)(ah)), \
+ "rI" ((USItype)(al)), \
+ "r" ((USItype)(bl))); \
+ else \
+ __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \
+ : "=r" (sh), \
+ "=&r" (sl) \
+ : "r" ((USItype)(ah)), \
+ "r" ((USItype)(bh)), \
+ "rI" ((USItype)(al)), \
+ "r" ((USItype)(bl))); \
+} while (0)
+#if defined(_ARCH_PPC)
+#define umul_ppmm(ph, pl, m0, m1) \
+do { \
+ USItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("mulhwu %0,%1,%2" \
+ : "=r" (ph) \
+ : "%r" (__m0), \
+ "r" (__m1)); \
+ (pl) = __m0 * __m1; \
+} while (0)
+#define UMUL_TIME 15
+#define smul_ppmm(ph, pl, m0, m1) \
+do { \
+ SItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("mulhw %0,%1,%2" \
+ : "=r" ((SItype) ph) \
+ : "%r" (__m0), \
+ "r" (__m1)); \
+ (pl) = __m0 * __m1; \
+} while (0)
+#define SMUL_TIME 14
+#define UDIV_TIME 120
+#else
+#define umul_ppmm(xh, xl, m0, m1) \
+do { \
+ USItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("mul %0,%2,%3" \
+ : "=r" ((USItype)(xh)), \
+ "=q" ((USItype)(xl)) \
+ : "r" (__m0), \
+ "r" (__m1)); \
+ (xh) += ((((SItype) __m0 >> 31) & __m1) \
+ + (((SItype) __m1 >> 31) & __m0)); \
+} while (0)
+#define UMUL_TIME 8
+#define smul_ppmm(xh, xl, m0, m1) \
+ __asm__ ("mul %0,%2,%3" \
+ : "=r" ((SItype)(xh)), \
+ "=q" ((SItype)(xl)) \
+ : "r" (m0), \
+ "r" (m1))
+#define SMUL_TIME 4
+#define sdiv_qrnnd(q, r, nh, nl, d) \
+ __asm__ ("div %0,%2,%4" \
+ : "=r" ((SItype)(q)), "=q" ((SItype)(r)) \
+ : "r" ((SItype)(nh)), "1" ((SItype)(nl)), "r" ((SItype)(d)))
+#define UDIV_TIME 100
+#endif
+#endif /* Power architecture variants. */
+
+/***************************************
+ ************** PYR ******************
+ ***************************************/
+#if defined(__pyr__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("addw %5,%1\n" \
+ "addwc %3,%0" \
+ : "=r" ((USItype)(sh)), \
+ "=&r" ((USItype)(sl)) \
+ : "%0" ((USItype)(ah)), \
+ "g" ((USItype)(bh)), \
+ "%1" ((USItype)(al)), \
+ "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("subw %5,%1\n" \
+ "subwb %3,%0" \
+ : "=r" ((USItype)(sh)), \
+ "=&r" ((USItype)(sl)) \
+ : "0" ((USItype)(ah)), \
+ "g" ((USItype)(bh)), \
+ "1" ((USItype)(al)), \
+ "g" ((USItype)(bl)))
+ /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */
+#define umul_ppmm(w1, w0, u, v) \
+ ({union {UDItype __ll; \
+ struct {USItype __h, __l; } __i; \
+ } __xx; \
+ __asm__ ("movw %1,%R0\n" \
+ "uemul %2,%0" \
+ : "=&r" (__xx.__ll) \
+ : "g" ((USItype) (u)), \
+ "g" ((USItype)(v))); \
+ (w1) = __xx.__i.__h; (w0) = __xx.__i.__l; })
+#endif /* __pyr__ */
+
+/***************************************
+ ************** RT/ROMP **************
+ ***************************************/
+#if defined(__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("a %1,%5\n" \
+ "ae %0,%3" \
+ : "=r" ((USItype)(sh)), \
+ "=&r" ((USItype)(sl)) \
+ : "%0" ((USItype)(ah)), \
+ "r" ((USItype)(bh)), \
+ "%1" ((USItype)(al)), \
+ "r" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("s %1,%5\n" \
+ "se %0,%3" \
+ : "=r" ((USItype)(sh)), \
+ "=&r" ((USItype)(sl)) \
+ : "0" ((USItype)(ah)), \
+ "r" ((USItype)(bh)), \
+ "1" ((USItype)(al)), \
+ "r" ((USItype)(bl)))
+#define umul_ppmm(ph, pl, m0, m1) \
+do { \
+ USItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ( \
+ "s r2,r2\n" \
+ "mts r10,%2\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "cas %0,r2,r0\n" \
+ "mfs r10,%1" \
+ : "=r" ((USItype)(ph)), \
+ "=r" ((USItype)(pl)) \
+ : "%r" (__m0), \
+ "r" (__m1) \
+ : "r2"); \
+ (ph) += ((((SItype) __m0 >> 31) & __m1) \
+ + (((SItype) __m1 >> 31) & __m0)); \
+} while (0)
+#define UMUL_TIME 20
+#define UDIV_TIME 200
+#endif /* RT/ROMP */
+
+/***************************************
+ ************** SH2 ******************
+ ***************************************/
+#if (defined(__sh2__) || defined(__sh3__) || defined(__SH4__)) \
+ && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ( \
+ "dmulu.l %2,%3\n" \
+ "sts macl,%1\n" \
+ "sts mach,%0" \
+ : "=r" ((USItype)(w1)), \
+ "=r" ((USItype)(w0)) \
+ : "r" ((USItype)(u)), \
+ "r" ((USItype)(v)) \
+ : "macl", "mach")
+#define UMUL_TIME 5
+#endif
+
+/***************************************
+ ************** SPARC ****************
+ ***************************************/
+#if defined(__sparc__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("addcc %r4,%5,%1\n" \
+ "addx %r2,%3,%0" \
+ : "=r" ((USItype)(sh)), \
+ "=&r" ((USItype)(sl)) \
+ : "%rJ" ((USItype)(ah)), \
+ "rI" ((USItype)(bh)), \
+ "%rJ" ((USItype)(al)), \
+ "rI" ((USItype)(bl)) \
+ __CLOBBER_CC)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("subcc %r4,%5,%1\n" \
+ "subx %r2,%3,%0" \
+ : "=r" ((USItype)(sh)), \
+ "=&r" ((USItype)(sl)) \
+ : "rJ" ((USItype)(ah)), \
+ "rI" ((USItype)(bh)), \
+ "rJ" ((USItype)(al)), \
+ "rI" ((USItype)(bl)) \
+ __CLOBBER_CC)
+#if defined(__sparc_v8__)
+/* Don't match immediate range because: 1) it is not often useful,
+ 2) the 'I' flag thinks of the range as a 13 bit signed interval,
+ while we want to match a 13 bit interval, sign extended to 32 bits,
+ but INTERPRETED AS UNSIGNED. */
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("umul %2,%3,%1;rd %%y,%0" \
+ : "=r" ((USItype)(w1)), \
+ "=r" ((USItype)(w0)) \
+ : "r" ((USItype)(u)), \
+ "r" ((USItype)(v)))
+#define UMUL_TIME 5
+#ifndef SUPERSPARC /* SuperSPARC's udiv only handles 53 bit dividends */
+#define udiv_qrnnd(q, r, n1, n0, d) \
+do { \
+ USItype __q; \
+ __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
+ : "=r" ((USItype)(__q)) \
+ : "r" ((USItype)(n1)), \
+ "r" ((USItype)(n0)), \
+ "r" ((USItype)(d))); \
+ (r) = (n0) - __q * (d); \
+ (q) = __q; \
+} while (0)
+#define UDIV_TIME 25
+#endif /* SUPERSPARC */
+#else /* ! __sparc_v8__ */
+#if defined(__sparclite__)
+/* This has hardware multiply but not divide. It also has two additional
+ instructions scan (ffs from high bit) and divscc. */
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("umul %2,%3,%1;rd %%y,%0" \
+ : "=r" ((USItype)(w1)), \
+ "=r" ((USItype)(w0)) \
+ : "r" ((USItype)(u)), \
+ "r" ((USItype)(v)))
+#define UMUL_TIME 5
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ __asm__ ("! Inlined udiv_qrnnd\n" \
+ "wr %%g0,%2,%%y ! Not a delayed write for sparclite\n" \
+ "tst %%g0\n" \
+ "divscc %3,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%0\n" \
+ "rd %%y,%1\n" \
+ "bl,a 1f\n" \
+ "add %1,%4,%1\n" \
+ "1: ! End of inline udiv_qrnnd" \
+ : "=r" ((USItype)(q)), \
+ "=r" ((USItype)(r)) \
+ : "r" ((USItype)(n1)), \
+ "r" ((USItype)(n0)), \
+ "rI" ((USItype)(d)) \
+ : "%g1" __AND_CLOBBER_CC)
+#define UDIV_TIME 37
+#endif /* __sparclite__ */
+#endif /* __sparc_v8__ */
+ /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */
+#ifndef umul_ppmm
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("! Inlined umul_ppmm\n" \
+ "wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \
+ "sra %3,31,%%g2 ! Don't move this insn\n" \
+ "and %2,%%g2,%%g2 ! Don't move this insn\n" \
+ "andcc %%g0,0,%%g1 ! Don't move this insn\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,0,%%g1\n" \
+ "add %%g1,%%g2,%0\n" \
+ "rd %%y,%1" \
+ : "=r" ((USItype)(w1)), \
+ "=r" ((USItype)(w0)) \
+ : "%rI" ((USItype)(u)), \
+ "r" ((USItype)(v)) \
+ : "%g1", "%g2" __AND_CLOBBER_CC)
+#define UMUL_TIME 39 /* 39 instructions */
+/* It's quite necessary to add this much assembler for the sparc.
+ The default udiv_qrnnd (in C) is more than 10 times slower! */
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ __asm__ ("! Inlined udiv_qrnnd\n\t" \
+ "mov 32,%%g1\n\t" \
+ "subcc %1,%2,%%g0\n\t" \
+ "1: bcs 5f\n\t" \
+ "addxcc %0,%0,%0 ! shift n1n0 and a q-bit in lsb\n\t" \
+ "sub %1,%2,%1 ! this kills msb of n\n\t" \
+ "addx %1,%1,%1 ! so this can't give carry\n\t" \
+ "subcc %%g1,1,%%g1\n\t" \
+ "2: bne 1b\n\t" \
+ "subcc %1,%2,%%g0\n\t" \
+ "bcs 3f\n\t" \
+ "addxcc %0,%0,%0 ! shift n1n0 and a q-bit in lsb\n\t" \
+ "b 3f\n\t" \
+ "sub %1,%2,%1 ! this kills msb of n\n\t" \
+ "4: sub %1,%2,%1\n\t" \
+ "5: addxcc %1,%1,%1\n\t" \
+ "bcc 2b\n\t" \
+ "subcc %%g1,1,%%g1\n\t" \
+ "! Got carry from n. Subtract next step to cancel this carry.\n\t" \
+ "bne 4b\n\t" \
+ "addcc %0,%0,%0 ! shift n1n0 and a 0-bit in lsb\n\t" \
+ "sub %1,%2,%1\n\t" \
+ "3: xnor %0,0,%0\n\t" \
+ "! End of inline udiv_qrnnd\n" \
+ : "=&r" ((USItype)(q)), \
+ "=&r" ((USItype)(r)) \
+ : "r" ((USItype)(d)), \
+ "1" ((USItype)(n1)), \
+ "0" ((USItype)(n0)) : "%g1", "cc")
+#define UDIV_TIME (3+7*32) /* 7 instructions/iteration. 32 iterations. */
+#endif
+#endif /* __sparc__ */
+
+/***************************************
+ ************** VAX ******************
+ ***************************************/
+#if defined(__vax__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("addl2 %5,%1\n" \
+ "adwc %3,%0" \
+ : "=g" ((USItype)(sh)), \
+ "=&g" ((USItype)(sl)) \
+ : "%0" ((USItype)(ah)), \
+ "g" ((USItype)(bh)), \
+ "%1" ((USItype)(al)), \
+ "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("subl2 %5,%1\n" \
+ "sbwc %3,%0" \
+ : "=g" ((USItype)(sh)), \
+ "=&g" ((USItype)(sl)) \
+ : "0" ((USItype)(ah)), \
+ "g" ((USItype)(bh)), \
+ "1" ((USItype)(al)), \
+ "g" ((USItype)(bl)))
+#define umul_ppmm(xh, xl, m0, m1) \
+do { \
+ union {UDItype __ll; \
+ struct {USItype __l, __h; } __i; \
+ } __xx; \
+ USItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("emul %1,%2,$0,%0" \
+ : "=g" (__xx.__ll) \
+ : "g" (__m0), \
+ "g" (__m1)); \
+ (xh) = __xx.__i.__h; (xl) = __xx.__i.__l; \
+ (xh) += ((((SItype) __m0 >> 31) & __m1) \
+ + (((SItype) __m1 >> 31) & __m0)); \
+} while (0)
+#define sdiv_qrnnd(q, r, n1, n0, d) \
+do { \
+ union {DItype __ll; \
+ struct {SItype __l, __h; } __i; \
+ } __xx; \
+ __xx.__i.__h = n1; __xx.__i.__l = n0; \
+ __asm__ ("ediv %3,%2,%0,%1" \
+ : "=g" (q), "=g" (r) \
+ : "g" (__xx.__ll), "g" (d)); \
+} while (0)
+#endif /* __vax__ */
+
+/***************************************
+ ************** Z8000 ****************
+ ***************************************/
+#if defined(__z8000__) && W_TYPE_SIZE == 16
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \
+ : "=r" ((unsigned int)(sh)), \
+ "=&r" ((unsigned int)(sl)) \
+ : "%0" ((unsigned int)(ah)), \
+ "r" ((unsigned int)(bh)), \
+ "%1" ((unsigned int)(al)), \
+ "rQR" ((unsigned int)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \
+ : "=r" ((unsigned int)(sh)), \
+ "=&r" ((unsigned int)(sl)) \
+ : "0" ((unsigned int)(ah)), \
+ "r" ((unsigned int)(bh)), \
+ "1" ((unsigned int)(al)), \
+ "rQR" ((unsigned int)(bl)))
+#define umul_ppmm(xh, xl, m0, m1) \
+do { \
+ union {long int __ll; \
+ struct {unsigned int __h, __l; } __i; \
+ } __xx; \
+ unsigned int __m0 = (m0), __m1 = (m1); \
+ __asm__ ("mult %S0,%H3" \
+ : "=r" (__xx.__i.__h), \
+ "=r" (__xx.__i.__l) \
+ : "%1" (__m0), \
+ "rQR" (__m1)); \
+ (xh) = __xx.__i.__h; (xl) = __xx.__i.__l; \
+ (xh) += ((((signed int) __m0 >> 15) & __m1) \
+ + (((signed int) __m1 >> 15) & __m0)); \
+} while (0)
+#endif /* __z8000__ */
+
+#endif /* __GNUC__ */
+
+/***************************************
+ *********** Generic Versions ********
+ ***************************************/
+#if !defined(umul_ppmm) && defined(__umulsidi3)
+#define umul_ppmm(ph, pl, m0, m1) \
+{ \
+ UDWtype __ll = __umulsidi3(m0, m1); \
+ ph = (UWtype) (__ll >> W_TYPE_SIZE); \
+ pl = (UWtype) __ll; \
+}
+#endif
+
+#if !defined(__umulsidi3)
+#define __umulsidi3(u, v) \
+ ({UWtype __hi, __lo; \
+ umul_ppmm(__hi, __lo, u, v); \
+ ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
+#endif
+
+ /* If this machine has no inline assembler, use C macros. */
+
+#if !defined(add_ssaaaa)
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+do { \
+ UWtype __x; \
+ __x = (al) + (bl); \
+ (sh) = (ah) + (bh) + (__x < (al)); \
+ (sl) = __x; \
+} while (0)
+#endif
+
+#if !defined(sub_ddmmss)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+do { \
+ UWtype __x; \
+ __x = (al) - (bl); \
+ (sh) = (ah) - (bh) - (__x > (al)); \
+ (sl) = __x; \
+} while (0)
+#endif
+
+#if !defined(umul_ppmm)
+#define umul_ppmm(w1, w0, u, v) \
+do { \
+ UWtype __x0, __x1, __x2, __x3; \
+ UHWtype __ul, __vl, __uh, __vh; \
+ UWtype __u = (u), __v = (v); \
+ \
+ __ul = __ll_lowpart(__u); \
+ __uh = __ll_highpart(__u); \
+ __vl = __ll_lowpart(__v); \
+ __vh = __ll_highpart(__v); \
+ \
+ __x0 = (UWtype) __ul * __vl; \
+ __x1 = (UWtype) __ul * __vh; \
+ __x2 = (UWtype) __uh * __vl; \
+ __x3 = (UWtype) __uh * __vh; \
+ \
+ __x1 += __ll_highpart(__x0);/* this can't give carry */ \
+ __x1 += __x2; /* but this indeed can */ \
+ if (__x1 < __x2) /* did we get it? */ \
+ __x3 += __ll_B; /* yes, add it in the proper pos. */ \
+ \
+ (w1) = __x3 + __ll_highpart(__x1); \
+ (w0) = (__ll_lowpart(__x1) << W_TYPE_SIZE/2) + __ll_lowpart(__x0); \
+} while (0)
+#endif
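+
+/* The generic umul_ppmm() above is schoolbook multiplication on half-words:
+ * with B = 2^(W_TYPE_SIZE/2), u = uh*B + ul and v = vh*B + vl,
+ *
+ *	u * v = uh*vh*B^2 + (uh*vl + ul*vh)*B + ul*vl
+ *
+ * __x3 holds the B^2 term, __x1 and __x2 the two middle terms (summed with
+ * an explicit carry check, since that addition can overflow a word), and
+ * __x0 the low term.
+ */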
+
+#if !defined(smul_ppmm)
+#define smul_ppmm(w1, w0, u, v) \
+do { \
+ UWtype __w1; \
+ UWtype __m0 = (u), __m1 = (v); \
+ umul_ppmm(__w1, w0, __m0, __m1); \
+ (w1) = __w1 - (-(__m0 >> (W_TYPE_SIZE - 1)) & __m1) \
+ - (-(__m1 >> (W_TYPE_SIZE - 1)) & __m0); \
+} while (0)
+#endif
+
+ /* Define this unconditionally, so it can be used for debugging. */
+#define __udiv_qrnnd_c(q, r, n1, n0, d) \
+do { \
+ UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
+ __d1 = __ll_highpart(d); \
+ __d0 = __ll_lowpart(d); \
+ \
+ __r1 = (n1) % __d1; \
+ __q1 = (n1) / __d1; \
+ __m = (UWtype) __q1 * __d0; \
+ __r1 = __r1 * __ll_B | __ll_highpart(n0); \
+ if (__r1 < __m) { \
+ __q1--, __r1 += (d); \
+ if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */ \
+ if (__r1 < __m) \
+ __q1--, __r1 += (d); \
+ } \
+ __r1 -= __m; \
+ \
+ __r0 = __r1 % __d1; \
+ __q0 = __r1 / __d1; \
+ __m = (UWtype) __q0 * __d0; \
+ __r0 = __r0 * __ll_B | __ll_lowpart(n0); \
+ if (__r0 < __m) { \
+ __q0--, __r0 += (d); \
+ if (__r0 >= (d)) \
+ if (__r0 < __m) \
+ __q0--, __r0 += (d); \
+ } \
+ __r0 -= __m; \
+ \
+ (q) = (UWtype) __q1 * __ll_B | __q0; \
+ (r) = __r0; \
+} while (0)
+
+/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
+ __udiv_w_sdiv (defined in libgcc or elsewhere). */
+#if !defined(udiv_qrnnd) && defined(sdiv_qrnnd)
+#define udiv_qrnnd(q, r, nh, nl, d) \
+do { \
+ UWtype __r; \
+ (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \
+ (r) = __r; \
+} while (0)
+#endif
+
+ /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c. */
+#if !defined(udiv_qrnnd)
+#define UDIV_NEEDS_NORMALIZATION 1
+#define udiv_qrnnd __udiv_qrnnd_c
+#endif
+
+#ifndef UDIV_NEEDS_NORMALIZATION
+#define UDIV_NEEDS_NORMALIZATION 0
+#endif
diff --git a/lib/crypto/mpi/mpi-add.c b/lib/crypto/mpi/mpi-add.c
new file mode 100644
index 000000000000..c0375c1672fa
--- /dev/null
+++ b/lib/crypto/mpi/mpi-add.c
@@ -0,0 +1,120 @@
+/* mpi-add.c - MPI functions
+ * Copyright (C) 1994, 1996, 1998, 2001, 2002,
+ * 2003 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ * Actually it's the same code with only minor changes in the
+ * way the data is stored; this is to support the abstraction
+ * of an optional secure memory allocation which may be used
+ * to avoid revealing of sensitive data due to paging etc.
+ */
+
+#include <linux/export.h>
+
+#include "mpi-internal.h"
+
+int mpi_add(MPI w, MPI u, MPI v)
+{
+ mpi_ptr_t wp, up, vp;
+ mpi_size_t usize, vsize, wsize;
+ int usign, vsign, wsign;
+ int err;
+
+ if (u->nlimbs < v->nlimbs) { /* Swap U and V. */
+ usize = v->nlimbs;
+ usign = v->sign;
+ vsize = u->nlimbs;
+ vsign = u->sign;
+ wsize = usize + 1;
+ err = RESIZE_IF_NEEDED(w, wsize);
+ if (err)
+ return err;
+ /* These must be after realloc (u or v may be the same as w). */
+ up = v->d;
+ vp = u->d;
+ } else {
+ usize = u->nlimbs;
+ usign = u->sign;
+ vsize = v->nlimbs;
+ vsign = v->sign;
+ wsize = usize + 1;
+ err = RESIZE_IF_NEEDED(w, wsize);
+ if (err)
+ return err;
+ /* These must be after realloc (u or v may be the same as w). */
+ up = u->d;
+ vp = v->d;
+ }
+ wp = w->d;
+ wsign = 0;
+
+ if (!vsize) { /* simple */
+ MPN_COPY(wp, up, usize);
+ wsize = usize;
+ wsign = usign;
+ } else if (usign != vsign) { /* different sign */
+ /* This test is right since USIZE >= VSIZE */
+ if (usize != vsize) {
+ mpihelp_sub(wp, up, usize, vp, vsize);
+ wsize = usize;
+ MPN_NORMALIZE(wp, wsize);
+ wsign = usign;
+ } else if (mpihelp_cmp(up, vp, usize) < 0) {
+ mpihelp_sub_n(wp, vp, up, usize);
+ wsize = usize;
+ MPN_NORMALIZE(wp, wsize);
+ if (!usign)
+ wsign = 1;
+ } else {
+ mpihelp_sub_n(wp, up, vp, usize);
+ wsize = usize;
+ MPN_NORMALIZE(wp, wsize);
+ if (usign)
+ wsign = 1;
+ }
+ } else { /* U and V have same sign. Add them. */
+ mpi_limb_t cy = mpihelp_add(wp, up, usize, vp, vsize);
+ wp[usize] = cy;
+ wsize = usize + cy;
+ if (usign)
+ wsign = 1;
+ }
+
+ w->nlimbs = wsize;
+ w->sign = wsign;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mpi_add);
+
+int mpi_sub(MPI w, MPI u, MPI v)
+{
+ int err;
+ MPI vv;
+
+ vv = mpi_copy(v);
+ if (!vv)
+ return -ENOMEM;
+
+ vv->sign = !vv->sign;
+ err = mpi_add(w, u, vv);
+ mpi_free(vv);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(mpi_sub);
+
+int mpi_addm(MPI w, MPI u, MPI v, MPI m)
+{
+ return mpi_add(w, u, v) ?:
+ mpi_mod(w, w, m);
+}
+EXPORT_SYMBOL_GPL(mpi_addm);
+
+int mpi_subm(MPI w, MPI u, MPI v, MPI m)
+{
+ return mpi_sub(w, u, v) ?:
+ mpi_mod(w, w, m);
+}
+EXPORT_SYMBOL_GPL(mpi_subm);
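
mpi_add() and mpi_sub() above work on sign-magnitude values, resize the destination as needed, allow the destination to alias an operand, and return only 0 or -ENOMEM. A minimal usage sketch, assuming the usual <linux/mpi.h> declarations for mpi_alloc()/mpi_free() and the mpicoder.c helpers added later in this series; the function name is made up for illustration:

#include <linux/mpi.h>
#include <linux/types.h>

/*
 * Hedged sketch: add two big-endian byte strings as non-negative MPIs
 * and return the big-endian sum (kfree() by the caller), or NULL on
 * allocation failure.
 */
static void *mpi_add_example(const void *a, size_t alen,
			     const void *b, size_t blen,
			     unsigned int *outlen)
{
	MPI u, v, w;
	void *buf = NULL;

	u = mpi_read_raw_data(a, alen);
	v = mpi_read_raw_data(b, blen);
	w = mpi_alloc(1);
	if (!u || !v || !w)
		goto out;

	if (mpi_add(w, u, v))		/* 0 on success, -ENOMEM otherwise */
		goto out;

	buf = mpi_get_buffer(w, outlen, NULL);
out:
	mpi_free(u);			/* mpi_free() tolerates NULL */
	mpi_free(v);
	mpi_free(w);
	return buf;
}
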
diff --git a/lib/crypto/mpi/mpi-bit.c b/lib/crypto/mpi/mpi-bit.c
new file mode 100644
index 000000000000..b3d0e7ddbc03
--- /dev/null
+++ b/lib/crypto/mpi/mpi-bit.c
@@ -0,0 +1,177 @@
+/* mpi-bit.c - MPI bit level functions
+ * Copyright (C) 1998, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include <linux/export.h>
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+#define A_LIMB_1 ((mpi_limb_t) 1)
+
+/****************
+ * Sometimes we have MSL (most significant limbs) which are 0; callers
+ * expect a normalized representation, so this function strips them.
+ */
+void mpi_normalize(MPI a)
+{
+ for (; a->nlimbs && !a->d[a->nlimbs - 1]; a->nlimbs--)
+ ;
+}
+
+/****************
+ * Return the number of bits in A.
+ */
+unsigned mpi_get_nbits(MPI a)
+{
+ unsigned n;
+
+ mpi_normalize(a);
+
+ if (a->nlimbs) {
+ mpi_limb_t alimb = a->d[a->nlimbs - 1];
+ if (alimb)
+ n = count_leading_zeros(alimb);
+ else
+ n = BITS_PER_MPI_LIMB;
+ n = BITS_PER_MPI_LIMB - n + (a->nlimbs - 1) * BITS_PER_MPI_LIMB;
+ } else
+ n = 0;
+ return n;
+}
+EXPORT_SYMBOL_GPL(mpi_get_nbits);
+
+/****************
+ * Test whether bit N is set.
+ */
+int mpi_test_bit(MPI a, unsigned int n)
+{
+ unsigned int limbno, bitno;
+ mpi_limb_t limb;
+
+ limbno = n / BITS_PER_MPI_LIMB;
+ bitno = n % BITS_PER_MPI_LIMB;
+
+ if (limbno >= a->nlimbs)
+ return 0; /* too far left: this is a 0 */
+ limb = a->d[limbno];
+ return (limb & (A_LIMB_1 << bitno)) ? 1 : 0;
+}
+EXPORT_SYMBOL_GPL(mpi_test_bit);
+
+/****************
+ * Set bit N of A.
+ */
+int mpi_set_bit(MPI a, unsigned int n)
+{
+ unsigned int i, limbno, bitno;
+ int err;
+
+ limbno = n / BITS_PER_MPI_LIMB;
+ bitno = n % BITS_PER_MPI_LIMB;
+
+ if (limbno >= a->nlimbs) {
+ for (i = a->nlimbs; i < a->alloced; i++)
+ a->d[i] = 0;
+ err = mpi_resize(a, limbno+1);
+ if (err)
+ return err;
+ a->nlimbs = limbno+1;
+ }
+ a->d[limbno] |= (A_LIMB_1<<bitno);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mpi_set_bit);
+
+/*
+ * Shift A by N bits to the right.
+ */
+int mpi_rshift(MPI x, MPI a, unsigned int n)
+{
+ mpi_size_t xsize;
+ unsigned int i;
+ unsigned int nlimbs = (n/BITS_PER_MPI_LIMB);
+ unsigned int nbits = (n%BITS_PER_MPI_LIMB);
+ int err;
+
+ if (x == a) {
+ /* In-place operation. */
+ if (nlimbs >= x->nlimbs) {
+ x->nlimbs = 0;
+ return 0;
+ }
+
+ if (nlimbs) {
+ for (i = 0; i < x->nlimbs - nlimbs; i++)
+ x->d[i] = x->d[i+nlimbs];
+ x->d[i] = 0;
+ x->nlimbs -= nlimbs;
+ }
+ if (x->nlimbs && nbits)
+ mpihelp_rshift(x->d, x->d, x->nlimbs, nbits);
+ } else if (nlimbs) {
+ /* Copy and shift by more or equal bits than in a limb. */
+ xsize = a->nlimbs;
+ x->sign = a->sign;
+ err = RESIZE_IF_NEEDED(x, xsize);
+ if (err)
+ return err;
+ x->nlimbs = xsize;
+ for (i = 0; i < a->nlimbs; i++)
+ x->d[i] = a->d[i];
+ x->nlimbs = i;
+
+ if (nlimbs >= x->nlimbs) {
+ x->nlimbs = 0;
+ return 0;
+ }
+
+ for (i = 0; i < x->nlimbs - nlimbs; i++)
+ x->d[i] = x->d[i+nlimbs];
+ x->d[i] = 0;
+ x->nlimbs -= nlimbs;
+
+ if (x->nlimbs && nbits)
+ mpihelp_rshift(x->d, x->d, x->nlimbs, nbits);
+ } else {
+ /* Copy and shift by less than bits in a limb. */
+ xsize = a->nlimbs;
+ x->sign = a->sign;
+ err = RESIZE_IF_NEEDED(x, xsize);
+ if (err)
+ return err;
+ x->nlimbs = xsize;
+
+ if (xsize) {
+ if (nbits)
+ mpihelp_rshift(x->d, a->d, x->nlimbs, nbits);
+ else {
+ /* The rshift helper function is not specified for
+ * NBITS==0, thus we do a plain copy here.
+ */
+ for (i = 0; i < x->nlimbs; i++)
+ x->d[i] = a->d[i];
+ }
+ }
+ }
+ MPN_NORMALIZE(x->d, x->nlimbs);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mpi_rshift);
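
mpi_get_nbits() normalizes its argument first, so combining it with mpi_test_bit() gives a safe way to walk a value bit by bit. A tiny hedged sketch (hypothetical helper name) that counts set bits with nothing beyond the functions above:

#include <linux/mpi.h>

/* Hedged sketch: population count built only on the bit helpers above. */
static unsigned int mpi_popcount_example(MPI a)
{
	unsigned int i, set = 0, nbits = mpi_get_nbits(a);

	for (i = 0; i < nbits; i++)
		set += mpi_test_bit(a, i);
	return set;
}
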
diff --git a/lib/crypto/mpi/mpi-cmp.c b/lib/crypto/mpi/mpi-cmp.c
new file mode 100644
index 000000000000..b42929296bce
--- /dev/null
+++ b/lib/crypto/mpi/mpi-cmp.c
@@ -0,0 +1,74 @@
+/* mpi-cmp.c - MPI functions
+ * Copyright (C) 1998, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include <linux/export.h>
+
+#include "mpi-internal.h"
+
+int mpi_cmp_ui(MPI u, unsigned long v)
+{
+ mpi_limb_t limb = v;
+
+ mpi_normalize(u);
+ if (u->nlimbs == 0) {
+ if (v == 0)
+ return 0;
+ else
+ return -1;
+ }
+ if (u->sign)
+ return -1;
+ if (u->nlimbs > 1)
+ return 1;
+
+ if (u->d[0] == limb)
+ return 0;
+ else if (u->d[0] > limb)
+ return 1;
+ else
+ return -1;
+}
+EXPORT_SYMBOL_GPL(mpi_cmp_ui);
+
+int mpi_cmp(MPI u, MPI v)
+{
+ mpi_size_t usize, vsize;
+ int cmp;
+
+ mpi_normalize(u);
+ mpi_normalize(v);
+ usize = u->nlimbs;
+ vsize = v->nlimbs;
+ if (!u->sign && v->sign)
+ return 1;
+ if (u->sign && !v->sign)
+ return -1;
+ if (usize != vsize && !u->sign && !v->sign)
+ return usize - vsize;
+ if (usize != vsize && u->sign && v->sign)
+ return vsize - usize;
+ if (!usize)
+ return 0;
+ cmp = mpihelp_cmp(u->d, v->d, usize);
+ if (u->sign)
+ return -cmp;
+ return cmp;
+}
+EXPORT_SYMBOL_GPL(mpi_cmp);
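
Note that mpi_cmp() only promises a correctly signed result: for operands of different size it returns a limb-count difference rather than exactly -1 or 1. Where a strict three-way contract is wanted, the sign can be normalized, as in this hedged one-liner (hypothetical name):

#include <linux/mpi.h>

/* Hedged sketch: reduce mpi_cmp()'s result to a strict -1/0/1. */
static int mpi_cmp3_example(MPI a, MPI b)
{
	int c = mpi_cmp(a, b);

	return (c > 0) - (c < 0);
}
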
diff --git a/lib/crypto/mpi/mpi-div.c b/lib/crypto/mpi/mpi-div.c
new file mode 100644
index 000000000000..6e5044e72595
--- /dev/null
+++ b/lib/crypto/mpi/mpi-div.c
@@ -0,0 +1,230 @@
+/* mpi-div.c - MPI functions
+ * Copyright (C) 1994, 1996, 1998, 2001, 2002,
+ * 2003 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ * Actually it's the same code with only minor changes in the
+ * way the data is stored; this is to support the abstraction
+ * of an optional secure memory allocation which may be used
+ * to avoid revealing of sensitive data due to paging etc.
+ */
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+int mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den);
+
+int mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor)
+{
+ int divisor_sign = divisor->sign;
+ MPI temp_divisor = NULL;
+ int err;
+
+ /* We need the original value of the divisor after the remainder has been
+ * preliminary calculated. We have to copy it to temporary space if it's
+ * the same variable as REM.
+ */
+ if (rem == divisor) {
+ temp_divisor = mpi_copy(divisor);
+ if (!temp_divisor)
+ return -ENOMEM;
+ divisor = temp_divisor;
+ }
+
+ err = mpi_tdiv_r(rem, dividend, divisor);
+ if (err)
+ goto free_temp_divisor;
+
+ if (((divisor_sign?1:0) ^ (dividend->sign?1:0)) && rem->nlimbs)
+ err = mpi_add(rem, rem, divisor);
+
+free_temp_divisor:
+ mpi_free(temp_divisor);
+
+ return err;
+}
+
+/* If den == quot, den needs temporary storage.
+ * If den == rem, den needs temporary storage.
+ * If num == quot, num needs temporary storage.
+ * If den has temporary storage, it can be normalized while being copied,
+ * i.e. no extra storage should be allocated.
+ */
+
+int mpi_tdiv_r(MPI rem, MPI num, MPI den)
+{
+ return mpi_tdiv_qr(NULL, rem, num, den);
+}
+
+int mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den)
+{
+ mpi_ptr_t np, dp;
+ mpi_ptr_t qp, rp;
+ mpi_size_t nsize = num->nlimbs;
+ mpi_size_t dsize = den->nlimbs;
+ mpi_size_t qsize, rsize;
+ mpi_size_t sign_remainder = num->sign;
+ mpi_size_t sign_quotient = num->sign ^ den->sign;
+ unsigned int normalization_steps;
+ mpi_limb_t q_limb;
+ mpi_ptr_t marker[5];
+ int markidx = 0;
+ int err;
+
+ /* Ensure space is enough for quotient and remainder.
+ * We need space for an extra limb in the remainder, because it's
+ * up-shifted (normalized) below.
+ */
+ rsize = nsize + 1;
+ err = mpi_resize(rem, rsize);
+ if (err)
+ return err;
+
+ qsize = rsize - dsize; /* qsize cannot be bigger than this. */
+ if (qsize <= 0) {
+ if (num != rem) {
+ rem->nlimbs = num->nlimbs;
+ rem->sign = num->sign;
+ MPN_COPY(rem->d, num->d, nsize);
+ }
+ if (quot) {
+ /* This needs to follow the assignment to rem, in case the
+ * numerator and quotient are the same.
+ */
+ quot->nlimbs = 0;
+ quot->sign = 0;
+ }
+ return 0;
+ }
+
+ if (quot) {
+ err = mpi_resize(quot, qsize);
+ if (err)
+ return err;
+ }
+
+ /* Read pointers here, when reallocation is finished. */
+ np = num->d;
+ dp = den->d;
+ rp = rem->d;
+
+ /* Optimize division by a single-limb divisor. */
+ if (dsize == 1) {
+ mpi_limb_t rlimb;
+ if (quot) {
+ qp = quot->d;
+ rlimb = mpihelp_divmod_1(qp, np, nsize, dp[0]);
+ qsize -= qp[qsize - 1] == 0;
+ quot->nlimbs = qsize;
+ quot->sign = sign_quotient;
+ } else
+ rlimb = mpihelp_mod_1(np, nsize, dp[0]);
+ rp[0] = rlimb;
+ rsize = rlimb != 0?1:0;
+ rem->nlimbs = rsize;
+ rem->sign = sign_remainder;
+ return 0;
+ }
+
+ err = -ENOMEM;
+ if (quot) {
+ qp = quot->d;
+ /* Make sure QP and NP point to different objects. Otherwise the
+ * numerator would be gradually overwritten by the quotient limbs.
+ */
+ if (qp == np) { /* Copy NP object to temporary space. */
+ np = marker[markidx++] = mpi_alloc_limb_space(nsize);
+ if (!np)
+ goto out_free_marker;
+ MPN_COPY(np, qp, nsize);
+ }
+ } else /* Put quotient at top of remainder. */
+ qp = rp + dsize;
+
+ normalization_steps = count_leading_zeros(dp[dsize - 1]);
+
+ /* Normalize the denominator, i.e. make its most significant bit set by
+ * shifting it NORMALIZATION_STEPS bits to the left. Also shift the
+ * numerator the same number of steps (to keep the quotient the same!).
+ */
+ if (normalization_steps) {
+ mpi_ptr_t tp;
+ mpi_limb_t nlimb;
+
+ /* Shift up the denominator setting the most significant bit of
+ * the most significant word. Use temporary storage not to clobber
+ * the original contents of the denominator.
+ */
+ tp = marker[markidx++] = mpi_alloc_limb_space(dsize);
+ if (!tp)
+ goto out_free_marker;
+ mpihelp_lshift(tp, dp, dsize, normalization_steps);
+ dp = tp;
+
+ /* Shift up the numerator, possibly introducing a new most
+ * significant word. Move the shifted numerator in the remainder
+ * meanwhile.
+ */
+ nlimb = mpihelp_lshift(rp, np, nsize, normalization_steps);
+ if (nlimb) {
+ rp[nsize] = nlimb;
+ rsize = nsize + 1;
+ } else
+ rsize = nsize;
+ } else {
+ /* The denominator is already normalized, as required. Copy it to
+ * temporary space if it overlaps with the quotient or remainder.
+ */
+ if (dp == rp || (quot && (dp == qp))) {
+ mpi_ptr_t tp;
+
+ tp = marker[markidx++] = mpi_alloc_limb_space(dsize);
+ if (!tp)
+ goto out_free_marker;
+ MPN_COPY(tp, dp, dsize);
+ dp = tp;
+ }
+
+ /* Move the numerator to the remainder. */
+ if (rp != np)
+ MPN_COPY(rp, np, nsize);
+
+ rsize = nsize;
+ }
+
+ q_limb = mpihelp_divrem(qp, 0, rp, rsize, dp, dsize);
+
+ if (quot) {
+ qsize = rsize - dsize;
+ if (q_limb) {
+ qp[qsize] = q_limb;
+ qsize += 1;
+ }
+
+ quot->nlimbs = qsize;
+ quot->sign = sign_quotient;
+ }
+
+ rsize = dsize;
+ MPN_NORMALIZE(rp, rsize);
+
+ if (normalization_steps && rsize) {
+ mpihelp_rshift(rp, rp, rsize, normalization_steps);
+ rsize -= rp[rsize - 1] == 0?1:0;
+ }
+
+ rem->nlimbs = rsize;
+ rem->sign = sign_remainder;
+
+ err = 0;
+
+out_free_marker:
+ while (markidx) {
+ markidx--;
+ mpi_free_limb_space(marker[markidx]);
+ }
+
+ return err;
+}
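
mpi_fdiv_r() above turns the truncated remainder of mpi_tdiv_r() (which carries the sign of the dividend) into the floored one by adding the divisor once when the operand signs differ and the remainder is non-zero; for example, -7 divided by 3 has truncated remainder -1 and floored remainder -1 + 3 = 2. The hedged sketch below, assuming the prototypes used in this series are visible to the caller and that den is non-zero, cross-checks the truncating identity num == quot * den + rem with functions introduced so far (helper name made up):

#include <linux/mpi.h>

/*
 * Hedged sketch: verify num == quot * den + rem for mpi_tdiv_qr().
 * Returns 1 when the identity holds, 0 on mismatch or allocation failure.
 */
static int mpi_tdiv_check_example(MPI num, MPI den)
{
	MPI q = mpi_alloc(1), r = mpi_alloc(1), t = mpi_alloc(1);
	int ok = 0;

	if (!q || !r || !t)
		goto out;
	if (mpi_tdiv_qr(q, r, num, den))	/* truncating division */
		goto out;
	if (mpi_mul(t, q, den))			/* t = quot * den */
		goto out;
	if (mpi_add(t, t, r))			/* t += rem */
		goto out;
	ok = (mpi_cmp(t, num) == 0);
out:
	mpi_free(q);
	mpi_free(r);
	mpi_free(t);
	return ok;
}
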
diff --git a/lib/crypto/mpi/mpi-inline.h b/lib/crypto/mpi/mpi-inline.h
new file mode 100644
index 000000000000..980b6b940953
--- /dev/null
+++ b/lib/crypto/mpi/mpi-inline.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* mpi-inline.h - Internal to the Multi Precision Integers
+ * Copyright (C) 1994, 1996, 1998, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ * Actually it's the same code with only minor changes in the
+ * way the data is stored; this is to support the abstraction
+ * of an optional secure memory allocation which may be used
+ * to avoid revealing of sensitive data due to paging etc.
+ * The GNU MP Library itself is published under the LGPL;
+ * however I decided to publish this code under the plain GPL.
+ */
+
+#ifndef G10_MPI_INLINE_H
+#define G10_MPI_INLINE_H
+
+#ifndef G10_MPI_INLINE_DECL
+#define G10_MPI_INLINE_DECL static inline
+#endif
+
+G10_MPI_INLINE_DECL mpi_limb_t
+mpihelp_add_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+ mpi_size_t s1_size, mpi_limb_t s2_limb)
+{
+ mpi_limb_t x;
+
+ x = *s1_ptr++;
+ s2_limb += x;
+ *res_ptr++ = s2_limb;
+ if (s2_limb < x) { /* sum is less than the left operand: handle carry */
+ while (--s1_size) {
+ x = *s1_ptr++ + 1; /* add carry */
+ *res_ptr++ = x; /* and store */
+ if (x) /* not 0 (no overflow): we can stop */
+ goto leave;
+ }
+ return 1; /* return carry (size of s1 too small) */
+ }
+
+leave:
+ if (res_ptr != s1_ptr) { /* not the same variable */
+ mpi_size_t i; /* copy the rest */
+ for (i = 0; i < s1_size - 1; i++)
+ res_ptr[i] = s1_ptr[i];
+ }
+ return 0; /* no carry */
+}
+
+G10_MPI_INLINE_DECL mpi_limb_t
+mpihelp_add(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size,
+ mpi_ptr_t s2_ptr, mpi_size_t s2_size)
+{
+ mpi_limb_t cy = 0;
+
+ if (s2_size)
+ cy = mpihelp_add_n(res_ptr, s1_ptr, s2_ptr, s2_size);
+
+ if (s1_size - s2_size)
+ cy = mpihelp_add_1(res_ptr + s2_size, s1_ptr + s2_size,
+ s1_size - s2_size, cy);
+ return cy;
+}
+
+G10_MPI_INLINE_DECL mpi_limb_t
+mpihelp_sub_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+ mpi_size_t s1_size, mpi_limb_t s2_limb)
+{
+ mpi_limb_t x;
+
+ x = *s1_ptr++;
+ s2_limb = x - s2_limb;
+ *res_ptr++ = s2_limb;
+ if (s2_limb > x) {
+ while (--s1_size) {
+ x = *s1_ptr++;
+ *res_ptr++ = x - 1;
+ if (x)
+ goto leave;
+ }
+ return 1;
+ }
+
+leave:
+ if (res_ptr != s1_ptr) {
+ mpi_size_t i;
+ for (i = 0; i < s1_size - 1; i++)
+ res_ptr[i] = s1_ptr[i];
+ }
+ return 0;
+}
+
+G10_MPI_INLINE_DECL mpi_limb_t
+mpihelp_sub(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size,
+ mpi_ptr_t s2_ptr, mpi_size_t s2_size)
+{
+ mpi_limb_t cy = 0;
+
+ if (s2_size)
+ cy = mpihelp_sub_n(res_ptr, s1_ptr, s2_ptr, s2_size);
+
+ if (s1_size - s2_size)
+ cy = mpihelp_sub_1(res_ptr + s2_size, s1_ptr + s2_size,
+ s1_size - s2_size, cy);
+ return cy;
+}
+
+#endif /*G10_MPI_INLINE_H */
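
The carry detection above uses the standard unsigned-wraparound test: after computing x + y in limb arithmetic, the sum is smaller than either addend exactly when the addition overflowed. A minimal fixed-width sketch of the same ripple-carry step that mpihelp_add_1() performs, with 64-bit limbs assumed purely for illustration:

#include <linux/types.h>

/*
 * Hedged sketch (64-bit limbs assumed): res = s1 + v over n limbs,
 * returning the carry out of the most significant limb.
 */
static u64 limb_add_1_sketch(u64 *res, const u64 *s1, int n, u64 v)
{
	int i;

	for (i = 0; i < n; i++) {
		u64 x = s1[i] + v;

		res[i] = x;
		v = x < s1[i];	/* wrapped around: carry into next limb */
	}
	return v;
}
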
diff --git a/lib/crypto/mpi/mpi-internal.h b/lib/crypto/mpi/mpi-internal.h
new file mode 100644
index 000000000000..8a4f49e3043c
--- /dev/null
+++ b/lib/crypto/mpi/mpi-internal.h
@@ -0,0 +1,223 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* mpi-internal.h - Internal to the Multi Precision Integers
+ * Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+ * Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ * Actually it's the same code with only minor changes in the
+ * way the data is stored; this is to support the abstraction
+ * of an optional secure memory allocation which may be used
+ * to avoid revealing of sensitive data due to paging etc.
+ * The GNU MP Library itself is published under the LGPL;
+ * however I decided to publish this code under the plain GPL.
+ */
+
+#ifndef G10_MPI_INTERNAL_H
+#define G10_MPI_INTERNAL_H
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/mpi.h>
+#include <linux/errno.h>
+
+#define log_debug printk
+#define log_bug printk
+
+#define assert(x) \
+ do { \
+ if (!(x)) \
+ log_bug("failed assertion\n"); \
+ } while (0)
+
+/* If KARATSUBA_THRESHOLD is not already defined, define it to a
+ * value which is good on most machines. */
+
+/* tested 4, 16, 32 and 64, where 16 gave the best performance when
+ * checking a 768 and a 1024 bit ElGamal signature.
+ * (wk 22.12.97) */
+#ifndef KARATSUBA_THRESHOLD
+#define KARATSUBA_THRESHOLD 16
+#endif
+
+/* The code can't handle KARATSUBA_THRESHOLD smaller than 2. */
+#if KARATSUBA_THRESHOLD < 2
+#undef KARATSUBA_THRESHOLD
+#define KARATSUBA_THRESHOLD 2
+#endif
+
+typedef mpi_limb_t *mpi_ptr_t; /* pointer to a limb */
+typedef int mpi_size_t; /* (must be a signed type) */
+
+static inline int RESIZE_IF_NEEDED(MPI a, unsigned b)
+{
+ if (a->alloced < b)
+ return mpi_resize(a, b);
+ return 0;
+}
+
+/* Copy N limbs from S to D. */
+#define MPN_COPY(d, s, n) \
+ do { \
+ mpi_size_t _i; \
+ for (_i = 0; _i < (n); _i++) \
+ (d)[_i] = (s)[_i]; \
+ } while (0)
+
+#define MPN_COPY_DECR(d, s, n) \
+ do { \
+ mpi_size_t _i; \
+ for (_i = (n)-1; _i >= 0; _i--) \
+ (d)[_i] = (s)[_i]; \
+ } while (0)
+
+/* Zero N limbs at D */
+#define MPN_ZERO(d, n) \
+ do { \
+ int _i; \
+ for (_i = 0; _i < (n); _i++) \
+ (d)[_i] = 0; \
+ } while (0)
+
+#define MPN_NORMALIZE(d, n) \
+ do { \
+ while ((n) > 0) { \
+ if ((d)[(n)-1]) \
+ break; \
+ (n)--; \
+ } \
+ } while (0)
+
+#define MPN_MUL_N_RECURSE(prodp, up, vp, size, tspace) \
+ do { \
+ if ((size) < KARATSUBA_THRESHOLD) \
+ mul_n_basecase(prodp, up, vp, size); \
+ else \
+ mul_n(prodp, up, vp, size, tspace); \
+ } while (0);
+
+/* Divide the two-limb number in (NH,,NL) by D, with DI being the largest
+ * limb not larger than (2**(2*BITS_PER_MP_LIMB))/D - (2**BITS_PER_MP_LIMB).
+ * If this would yield overflow, DI should be the largest possible number
+ * (i.e., only ones). For correct operation, the most significant bit of D
+ * has to be set. Put the quotient in Q and the remainder in R.
+ */
+#define UDIV_QRNND_PREINV(q, r, nh, nl, d, di) \
+ do { \
+ mpi_limb_t _ql __maybe_unused; \
+ mpi_limb_t _q, _r; \
+ mpi_limb_t _xh, _xl; \
+ umul_ppmm(_q, _ql, (nh), (di)); \
+ _q += (nh); /* DI is 2**BITS_PER_MPI_LIMB too small */ \
+ umul_ppmm(_xh, _xl, _q, (d)); \
+ sub_ddmmss(_xh, _r, (nh), (nl), _xh, _xl); \
+ if (_xh) { \
+ sub_ddmmss(_xh, _r, _xh, _r, 0, (d)); \
+ _q++; \
+ if (_xh) { \
+ sub_ddmmss(_xh, _r, _xh, _r, 0, (d)); \
+ _q++; \
+ } \
+ } \
+ if (_r >= (d)) { \
+ _r -= (d); \
+ _q++; \
+ } \
+ (r) = _r; \
+ (q) = _q; \
+ } while (0)
+
+
+/*-- mpiutil.c --*/
+mpi_ptr_t mpi_alloc_limb_space(unsigned nlimbs);
+void mpi_free_limb_space(mpi_ptr_t a);
+void mpi_assign_limb_space(MPI a, mpi_ptr_t ap, unsigned nlimbs);
+
+static inline mpi_limb_t mpihelp_add_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+ mpi_size_t s1_size, mpi_limb_t s2_limb);
+mpi_limb_t mpihelp_add_n(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+ mpi_ptr_t s2_ptr, mpi_size_t size);
+static inline mpi_limb_t mpihelp_add(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size,
+ mpi_ptr_t s2_ptr, mpi_size_t s2_size);
+
+static inline mpi_limb_t mpihelp_sub_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+ mpi_size_t s1_size, mpi_limb_t s2_limb);
+mpi_limb_t mpihelp_sub_n(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+ mpi_ptr_t s2_ptr, mpi_size_t size);
+static inline mpi_limb_t mpihelp_sub(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size,
+ mpi_ptr_t s2_ptr, mpi_size_t s2_size);
+
+/*-- mpih-cmp.c --*/
+int mpihelp_cmp(mpi_ptr_t op1_ptr, mpi_ptr_t op2_ptr, mpi_size_t size);
+
+/*-- mpih-mul.c --*/
+
+struct karatsuba_ctx {
+ struct karatsuba_ctx *next;
+ mpi_ptr_t tspace;
+ mpi_size_t tspace_size;
+ mpi_ptr_t tp;
+ mpi_size_t tp_size;
+};
+
+void mpihelp_release_karatsuba_ctx(struct karatsuba_ctx *ctx);
+
+mpi_limb_t mpihelp_addmul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+ mpi_size_t s1_size, mpi_limb_t s2_limb);
+mpi_limb_t mpihelp_submul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+ mpi_size_t s1_size, mpi_limb_t s2_limb);
+int mpihelp_mul(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize,
+ mpi_ptr_t vp, mpi_size_t vsize, mpi_limb_t *_result);
+void mpih_sqr_n_basecase(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size);
+void mpih_sqr_n(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size,
+ mpi_ptr_t tspace);
+
+int mpihelp_mul_karatsuba_case(mpi_ptr_t prodp,
+ mpi_ptr_t up, mpi_size_t usize,
+ mpi_ptr_t vp, mpi_size_t vsize,
+ struct karatsuba_ctx *ctx);
+
+/*-- generic_mpih-mul1.c --*/
+mpi_limb_t mpihelp_mul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
+ mpi_size_t s1_size, mpi_limb_t s2_limb);
+
+/*-- mpih-div.c --*/
+mpi_limb_t mpihelp_mod_1(mpi_ptr_t dividend_ptr, mpi_size_t dividend_size,
+ mpi_limb_t divisor_limb);
+mpi_limb_t mpihelp_divrem(mpi_ptr_t qp, mpi_size_t qextra_limbs,
+ mpi_ptr_t np, mpi_size_t nsize,
+ mpi_ptr_t dp, mpi_size_t dsize);
+mpi_limb_t mpihelp_divmod_1(mpi_ptr_t quot_ptr,
+ mpi_ptr_t dividend_ptr, mpi_size_t dividend_size,
+ mpi_limb_t divisor_limb);
+
+/*-- generic_mpih-[lr]shift.c --*/
+mpi_limb_t mpihelp_lshift(mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize,
+ unsigned cnt);
+mpi_limb_t mpihelp_rshift(mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize,
+ unsigned cnt);
+
+/* Define stuff for longlong.h. */
+#define W_TYPE_SIZE BITS_PER_MPI_LIMB
+typedef mpi_limb_t UWtype;
+typedef unsigned int UHWtype;
+#if defined(__GNUC__)
+typedef unsigned int UQItype __attribute__ ((mode(QI)));
+typedef int SItype __attribute__ ((mode(SI)));
+typedef unsigned int USItype __attribute__ ((mode(SI)));
+typedef int DItype __attribute__ ((mode(DI)));
+typedef unsigned int UDItype __attribute__ ((mode(DI)));
+#else
+typedef unsigned char UQItype;
+typedef long SItype;
+typedef unsigned long USItype;
+#endif
+
+#ifdef __GNUC__
+#include "mpi-inline.h"
+#endif
+
+#endif /*G10_MPI_INTERNAL_H */
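
UDIV_QRNND_PREINV above is the classical division-by-invariant-divisor scheme. Writing beta = 2**BITS_PER_MPI_LIMB for the word base and d for the normalized divisor, the precomputed reciprocal described in the comment and the quotient estimate formed by the first two macro lines are (a short math restatement, nothing beyond what the comment already says):

\[
d_i \;=\; \Big\lfloor \frac{\beta^2 - \beta d}{d} \Big\rfloor
     \;=\; \Big\lfloor \frac{\beta^2}{d} \Big\rfloor - \beta ,
\qquad
\hat q \;=\; n_h + \Big\lfloor \frac{n_h\, d_i}{\beta} \Big\rfloor
       \;\le\; \Big\lfloor \frac{n_h\,\beta + n_l}{d} \Big\rfloor .
\]

The estimate never exceeds the true quotient (otherwise the purely incremental fix-up after the umul_ppmm()/sub_ddmmss() pair could not work); the tail of the macro bumps _q and pulls the remainder back into [0, d). The callers in mpih-div.c later in this series compute d_i with udiv_qrnnd(di, dummy, -d, 0, d) and special-case a divisor with only its top bit set, where d_i would not fit in a limb.
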
diff --git a/lib/crypto/mpi/mpi-mod.c b/lib/crypto/mpi/mpi-mod.c
new file mode 100644
index 000000000000..d5fdaec3d0b6
--- /dev/null
+++ b/lib/crypto/mpi/mpi-mod.c
@@ -0,0 +1,13 @@
+/* mpi-mod.c - Modular reduction
+ * Copyright (C) 1998, 1999, 2001, 2002, 2003,
+ * 2007 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ */
+
+#include "mpi-internal.h"
+
+int mpi_mod(MPI rem, MPI dividend, MPI divisor)
+{
+ return mpi_fdiv_r(rem, dividend, divisor);
+}
diff --git a/lib/crypto/mpi/mpi-mul.c b/lib/crypto/mpi/mpi-mul.c
new file mode 100644
index 000000000000..d79f186ad90b
--- /dev/null
+++ b/lib/crypto/mpi/mpi-mul.c
@@ -0,0 +1,111 @@
+/* mpi-mul.c - MPI functions
+ * Copyright (C) 1994, 1996, 1998, 2001, 2002,
+ * 2003 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ * Actually it's the same code with only minor changes in the
+ * way the data is stored; this is to support the abstraction
+ * of an optional secure memory allocation which may be used
+ * to avoid revealing of sensitive data due to paging etc.
+ */
+
+#include <linux/export.h>
+
+#include "mpi-internal.h"
+
+int mpi_mul(MPI w, MPI u, MPI v)
+{
+ mpi_size_t usize, vsize, wsize;
+ mpi_ptr_t up, vp, wp;
+ mpi_limb_t cy;
+ int usign, vsign, sign_product;
+ int assign_wp = 0;
+ mpi_ptr_t tmp_limb = NULL;
+ int err = 0;
+
+ if (u->nlimbs < v->nlimbs) {
+ /* Swap U and V. */
+ usize = v->nlimbs;
+ usign = v->sign;
+ up = v->d;
+ vsize = u->nlimbs;
+ vsign = u->sign;
+ vp = u->d;
+ } else {
+ usize = u->nlimbs;
+ usign = u->sign;
+ up = u->d;
+ vsize = v->nlimbs;
+ vsign = v->sign;
+ vp = v->d;
+ }
+ sign_product = usign ^ vsign;
+ wp = w->d;
+
+ /* Ensure W has space enough to store the result. */
+ wsize = usize + vsize;
+ if (w->alloced < wsize) {
+ if (wp == up || wp == vp) {
+ wp = mpi_alloc_limb_space(wsize);
+ if (!wp)
+ return -ENOMEM;
+ assign_wp = 1;
+ } else {
+ err = mpi_resize(w, wsize);
+ if (err)
+ return err;
+ wp = w->d;
+ }
+ } else { /* Make U and V not overlap with W. */
+ if (wp == up) {
+ /* W and U are identical. Allocate temporary space for U. */
+ up = tmp_limb = mpi_alloc_limb_space(usize);
+ if (!up)
+ return -ENOMEM;
+ /* Is V identical too? Keep it identical with U. */
+ if (wp == vp)
+ vp = up;
+ /* Copy to the temporary space. */
+ MPN_COPY(up, wp, usize);
+ } else if (wp == vp) {
+ /* W and V are identical. Allocate temporary space for V. */
+ vp = tmp_limb = mpi_alloc_limb_space(vsize);
+ if (!vp)
+ return -ENOMEM;
+ /* Copy to the temporary space. */
+ MPN_COPY(vp, wp, vsize);
+ }
+ }
+
+ if (!vsize)
+ wsize = 0;
+ else {
+ err = mpihelp_mul(wp, up, usize, vp, vsize, &cy);
+ if (err) {
+ if (assign_wp)
+ mpi_free_limb_space(wp);
+ goto free_tmp_limb;
+ }
+ wsize -= cy ? 0:1;
+ }
+
+ if (assign_wp)
+ mpi_assign_limb_space(w, wp, wsize);
+ w->nlimbs = wsize;
+ w->sign = sign_product;
+
+free_tmp_limb:
+ if (tmp_limb)
+ mpi_free_limb_space(tmp_limb);
+ return err;
+}
+EXPORT_SYMBOL_GPL(mpi_mul);
+
+int mpi_mulm(MPI w, MPI u, MPI v, MPI m)
+{
+ return mpi_mul(w, u, v) ?:
+ mpi_tdiv_r(w, w, m);
+}
+EXPORT_SYMBOL_GPL(mpi_mulm);
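
One asymmetry worth keeping in mind: mpi_mulm() above reduces with mpi_tdiv_r(), the truncating remainder, whereas mpi_addm()/mpi_subm() earlier in the series reduce with mpi_mod(), the floored one, so the product of a negative operand can come back negative. Where a canonical result in [0, m) is wanted for m > 0, a hedged variant (hypothetical name, assuming w does not alias m and that the declarations used elsewhere in the series are visible) is:

#include <linux/mpi.h>

/* Hedged sketch: modular multiply with a floored, non-negative result. */
static int mpi_mulm_canonical_example(MPI w, MPI u, MPI v, MPI m)
{
	return mpi_mul(w, u, v) ?: mpi_mod(w, w, m);
}
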
diff --git a/lib/crypto/mpi/mpi-pow.c b/lib/crypto/mpi/mpi-pow.c
new file mode 100644
index 000000000000..9e695a3bda3a
--- /dev/null
+++ b/lib/crypto/mpi/mpi-pow.c
@@ -0,0 +1,311 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpi-pow.c - MPI functions
+ * Copyright (C) 1994, 1996, 1998, 2000 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ * Actually it's the same code with only minor changes in the
+ * way the data is stored; this is to support the abstraction
+ * of an optional secure memory allocation which may be used
+ * to avoid revealing of sensitive data due to paging etc.
+ * The GNU MP Library itself is published under the LGPL;
+ * however I decided to publish this code under the plain GPL.
+ */
+
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+/****************
+ * RES = BASE ^ EXP mod MOD
+ */
+int mpi_powm(MPI res, MPI base, MPI exp, MPI mod)
+{
+ mpi_ptr_t mp_marker = NULL, bp_marker = NULL, ep_marker = NULL;
+ struct karatsuba_ctx karactx = {};
+ mpi_ptr_t xp_marker = NULL;
+ mpi_ptr_t tspace = NULL;
+ mpi_ptr_t rp, ep, mp, bp;
+ mpi_size_t esize, msize, bsize, rsize;
+ int msign, bsign, rsign;
+ mpi_size_t size;
+ int mod_shift_cnt;
+ int negative_result;
+ int assign_rp = 0;
+ mpi_size_t tsize = 0; /* to avoid compiler warning */
+ /* fixme: we should check that the warning is void */
+ int rc = -ENOMEM;
+
+ esize = exp->nlimbs;
+ msize = mod->nlimbs;
+ size = 2 * msize;
+ msign = mod->sign;
+
+ rp = res->d;
+ ep = exp->d;
+
+ if (!msize)
+ return -EINVAL;
+
+ if (!esize) {
+ /* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0
+ * depending on whether MOD equals 1. */
+ res->nlimbs = (msize == 1 && mod->d[0] == 1) ? 0 : 1;
+ if (res->nlimbs) {
+ if (mpi_resize(res, 1) < 0)
+ goto enomem;
+ rp = res->d;
+ rp[0] = 1;
+ }
+ res->sign = 0;
+ goto leave;
+ }
+
+ /* Normalize MOD (i.e. make its most significant bit set) as required by
+ * mpn_divrem. This will make the intermediate values in the calculation
+ * slightly larger, but the correct result is obtained after a final
+ * reduction using the original MOD value. */
+ mp = mp_marker = mpi_alloc_limb_space(msize);
+ if (!mp)
+ goto enomem;
+ mod_shift_cnt = count_leading_zeros(mod->d[msize - 1]);
+ if (mod_shift_cnt)
+ mpihelp_lshift(mp, mod->d, msize, mod_shift_cnt);
+ else
+ MPN_COPY(mp, mod->d, msize);
+
+ bsize = base->nlimbs;
+ bsign = base->sign;
+ if (bsize > msize) { /* The base is larger than the modulus. Reduce it. */
+ /* Allocate (BSIZE + 1) with space for remainder and quotient.
+ * (The quotient is (bsize - msize + 1) limbs.) */
+ bp = bp_marker = mpi_alloc_limb_space(bsize + 1);
+ if (!bp)
+ goto enomem;
+ MPN_COPY(bp, base->d, bsize);
+ /* We don't care about the quotient, store it above the remainder,
+ * at BP + MSIZE. */
+ mpihelp_divrem(bp + msize, 0, bp, bsize, mp, msize);
+ bsize = msize;
+ /* Canonicalize the base, since we are going to multiply with it
+ * quite a few times. */
+ MPN_NORMALIZE(bp, bsize);
+ } else
+ bp = base->d;
+
+ if (!bsize) {
+ res->nlimbs = 0;
+ res->sign = 0;
+ goto leave;
+ }
+
+ if (res->alloced < size) {
+ /* We have to allocate more space for RES. If any of the input
+ * parameters are identical to RES, defer deallocation of the old
+ * space. */
+ if (rp == ep || rp == mp || rp == bp) {
+ rp = mpi_alloc_limb_space(size);
+ if (!rp)
+ goto enomem;
+ assign_rp = 1;
+ } else {
+ if (mpi_resize(res, size) < 0)
+ goto enomem;
+ rp = res->d;
+ }
+ } else { /* Make BASE, EXP and MOD not overlap with RES. */
+ if (rp == bp) {
+ /* RES and BASE are identical. Allocate temp. space for BASE. */
+ BUG_ON(bp_marker);
+ bp = bp_marker = mpi_alloc_limb_space(bsize);
+ if (!bp)
+ goto enomem;
+ MPN_COPY(bp, rp, bsize);
+ }
+ if (rp == ep) {
+ /* RES and EXP are identical. Allocate temp. space for EXP. */
+ ep = ep_marker = mpi_alloc_limb_space(esize);
+ if (!ep)
+ goto enomem;
+ MPN_COPY(ep, rp, esize);
+ }
+ if (rp == mp) {
+ /* RES and MOD are identical. Allocate temporary space for MOD. */
+ BUG_ON(mp_marker);
+ mp = mp_marker = mpi_alloc_limb_space(msize);
+ if (!mp)
+ goto enomem;
+ MPN_COPY(mp, rp, msize);
+ }
+ }
+
+ MPN_COPY(rp, bp, bsize);
+ rsize = bsize;
+ rsign = bsign;
+
+ {
+ mpi_size_t i;
+ mpi_ptr_t xp;
+ int c;
+ mpi_limb_t e;
+ mpi_limb_t carry_limb;
+
+ xp = xp_marker = mpi_alloc_limb_space(2 * (msize + 1));
+ if (!xp)
+ goto enomem;
+
+ negative_result = (ep[0] & 1) && base->sign;
+
+ i = esize - 1;
+ e = ep[i];
+ c = count_leading_zeros(e);
+ e = (e << c) << 1; /* shift the exp bits to the left, lose msb */
+ c = BITS_PER_MPI_LIMB - 1 - c;
+
+ /* Main loop.
+ *
+ * Make the result be pointed to alternately by XP and RP. This
+ * helps us avoid block copying, which would otherwise be necessary
+ * with the overlap restrictions of mpihelp_divmod. With 50% probability
+ * the result after this loop will be in the area originally pointed
+ * by RP (==RES->d), and with 50% probability in the area originally
+ * pointed to by XP.
+ */
+
+ for (;;) {
+ while (c) {
+ mpi_size_t xsize;
+
+ /*if (mpihelp_mul_n(xp, rp, rp, rsize) < 0) goto enomem */
+ if (rsize < KARATSUBA_THRESHOLD)
+ mpih_sqr_n_basecase(xp, rp, rsize);
+ else {
+ if (!tspace) {
+ tsize = 2 * rsize;
+ tspace =
+ mpi_alloc_limb_space(tsize);
+ if (!tspace)
+ goto enomem;
+ } else if (tsize < (2 * rsize)) {
+ mpi_free_limb_space(tspace);
+ tsize = 2 * rsize;
+ tspace =
+ mpi_alloc_limb_space(tsize);
+ if (!tspace)
+ goto enomem;
+ }
+ mpih_sqr_n(xp, rp, rsize, tspace);
+ }
+
+ xsize = 2 * rsize;
+ if (xsize > msize) {
+ mpihelp_divrem(xp + msize, 0, xp, xsize,
+ mp, msize);
+ xsize = msize;
+ }
+
+ swap(rp, xp);
+ rsize = xsize;
+
+ if ((mpi_limb_signed_t) e < 0) {
+ /*mpihelp_mul( xp, rp, rsize, bp, bsize ); */
+ if (bsize < KARATSUBA_THRESHOLD) {
+ mpi_limb_t tmp;
+ if (mpihelp_mul
+ (xp, rp, rsize, bp, bsize,
+ &tmp) < 0)
+ goto enomem;
+ } else {
+ if (mpihelp_mul_karatsuba_case
+ (xp, rp, rsize, bp, bsize,
+ &karactx) < 0)
+ goto enomem;
+ }
+
+ xsize = rsize + bsize;
+ if (xsize > msize) {
+ mpihelp_divrem(xp + msize, 0,
+ xp, xsize, mp,
+ msize);
+ xsize = msize;
+ }
+
+ swap(rp, xp);
+ rsize = xsize;
+ }
+ e <<= 1;
+ c--;
+ cond_resched();
+ }
+
+ i--;
+ if (i < 0)
+ break;
+ e = ep[i];
+ c = BITS_PER_MPI_LIMB;
+ }
+
+ /* We shifted MOD, the modulo reduction argument, left MOD_SHIFT_CNT
+ * steps. Adjust the result by reducing it with the original MOD.
+ *
+ * Also make sure the result is put in RES->d (where it already
+ * might be, see above).
+ */
+ if (mod_shift_cnt) {
+ carry_limb =
+ mpihelp_lshift(res->d, rp, rsize, mod_shift_cnt);
+ rp = res->d;
+ if (carry_limb) {
+ rp[rsize] = carry_limb;
+ rsize++;
+ }
+ } else {
+ MPN_COPY(res->d, rp, rsize);
+ rp = res->d;
+ }
+
+ if (rsize >= msize) {
+ mpihelp_divrem(rp + msize, 0, rp, rsize, mp, msize);
+ rsize = msize;
+ }
+
+ /* Remove any leading zero words from the result. */
+ if (mod_shift_cnt)
+ mpihelp_rshift(rp, rp, rsize, mod_shift_cnt);
+ MPN_NORMALIZE(rp, rsize);
+ }
+
+ if (negative_result && rsize) {
+ if (mod_shift_cnt)
+ mpihelp_rshift(mp, mp, msize, mod_shift_cnt);
+ mpihelp_sub(rp, mp, msize, rp, rsize);
+ rsize = msize;
+ rsign = msign;
+ MPN_NORMALIZE(rp, rsize);
+ }
+ res->nlimbs = rsize;
+ res->sign = rsign;
+
+leave:
+ rc = 0;
+enomem:
+ mpihelp_release_karatsuba_ctx(&karactx);
+ if (assign_rp)
+ mpi_assign_limb_space(res, rp, size);
+ if (mp_marker)
+ mpi_free_limb_space(mp_marker);
+ if (bp_marker)
+ mpi_free_limb_space(bp_marker);
+ if (ep_marker)
+ mpi_free_limb_space(ep_marker);
+ if (xp_marker)
+ mpi_free_limb_space(xp_marker);
+ if (tspace)
+ mpi_free_limb_space(tspace);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(mpi_powm);
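
Stripped of the limb bookkeeping, the normalization of MOD and the Karatsuba plumbing, the main loop of mpi_powm() is left-to-right binary exponentiation: for every exponent bit it squares the running value and, when the bit is set, multiplies by the reduced base, reducing modulo MOD after each step. A minimal single-word sketch of that control flow, assuming 64-bit words and the GCC/Clang unsigned __int128 extension for the intermediate products (illustration only, not the multi-precision code above):

#include <linux/types.h>

/*
 * Hedged sketch: left-to-right square-and-multiply on plain 64-bit
 * values, mirroring the bit-scanning structure of mpi_powm().
 * mod must be non-zero.
 */
static u64 modexp64_sketch(u64 base, u64 exp, u64 mod)
{
	u64 r = 1;
	int i;

	base %= mod;
	for (i = 63; i >= 0; i--) {
		r = (u64)(((unsigned __int128)r * r) % mod);	/* square */
		if (exp & (1ULL << i))
			r = (u64)(((unsigned __int128)r * base) % mod);
	}
	return r;
}

mpi_powm() itself seeds the accumulator with the reduced base and skips the leading zero bits of the top exponent limb, an equivalent but slightly cheaper arrangement of the same loop.
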
diff --git a/lib/crypto/mpi/mpi-sub-ui.c b/lib/crypto/mpi/mpi-sub-ui.c
new file mode 100644
index 000000000000..0edcdbd24833
--- /dev/null
+++ b/lib/crypto/mpi/mpi-sub-ui.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpi-sub-ui.c - Subtract an unsigned integer from an MPI.
+ *
+ * Copyright 1991, 1993, 1994, 1996, 1999-2002, 2004, 2012, 2013, 2015
+ * Free Software Foundation, Inc.
+ *
+ * This file was based on the GNU MP Library source file:
+ * https://gmplib.org/repo/gmp-6.2/file/510b83519d1c/mpz/aors_ui.h
+ *
+ * The GNU MP Library is free software; you can redistribute it and/or modify
+ * it under the terms of either:
+ *
+ * * the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your
+ * option) any later version.
+ *
+ * or
+ *
+ * * the GNU General Public License as published by the Free Software
+ * Foundation; either version 2 of the License, or (at your option) any
+ * later version.
+ *
+ * or both in parallel, as here.
+ *
+ * The GNU MP Library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received copies of the GNU General Public License and the
+ * GNU Lesser General Public License along with the GNU MP Library. If not,
+ * see https://www.gnu.org/licenses/.
+ */
+
+#include <linux/export.h>
+
+#include "mpi-internal.h"
+
+int mpi_sub_ui(MPI w, MPI u, unsigned long vval)
+{
+ if (u->nlimbs == 0) {
+ if (mpi_resize(w, 1) < 0)
+ return -ENOMEM;
+ w->d[0] = vval;
+ w->nlimbs = (vval != 0);
+ w->sign = (vval != 0);
+ return 0;
+ }
+
+ /* If there is not enough space for W (and a possible carry), increase it. */
+ if (mpi_resize(w, u->nlimbs + 1))
+ return -ENOMEM;
+
+ if (u->sign) {
+ mpi_limb_t cy;
+
+ cy = mpihelp_add_1(w->d, u->d, u->nlimbs, (mpi_limb_t) vval);
+ w->d[u->nlimbs] = cy;
+ w->nlimbs = u->nlimbs + cy;
+ w->sign = 1;
+ } else {
+ /* The signs are different. Need exact comparison to determine
+ * which operand to subtract from which.
+ */
+ if (u->nlimbs == 1 && u->d[0] < vval) {
+ w->d[0] = vval - u->d[0];
+ w->nlimbs = 1;
+ w->sign = 1;
+ } else {
+ mpihelp_sub_1(w->d, u->d, u->nlimbs, (mpi_limb_t) vval);
+ /* Size can decrease by at most one limb. */
+ w->nlimbs = (u->nlimbs - (w->d[u->nlimbs - 1] == 0));
+ w->sign = 0;
+ }
+ }
+
+ mpi_normalize(w);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mpi_sub_ui);
diff --git a/lib/crypto/mpi/mpicoder.c b/lib/crypto/mpi/mpicoder.c
new file mode 100644
index 000000000000..bf716a03c704
--- /dev/null
+++ b/lib/crypto/mpi/mpicoder.c
@@ -0,0 +1,417 @@
+/* mpicoder.c - Coder for the external representation of MPIs
+ * Copyright (C) 1998, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include <linux/bitops.h>
+#include <linux/byteorder/generic.h>
+#include <linux/count_zeros.h>
+#include <linux/export.h>
+#include <linux/scatterlist.h>
+#include <linux/string.h>
+#include "mpi-internal.h"
+
+#define MAX_EXTERN_MPI_BITS 16384
+
+/**
+ * mpi_read_raw_data - Read a raw byte stream as a positive integer
+ * @xbuffer: The data to read
+ * @nbytes: The amount of data to read
+ */
+MPI mpi_read_raw_data(const void *xbuffer, size_t nbytes)
+{
+ const uint8_t *buffer = xbuffer;
+ int i, j;
+ unsigned nbits, nlimbs;
+ mpi_limb_t a;
+ MPI val = NULL;
+
+ while (nbytes > 0 && buffer[0] == 0) {
+ buffer++;
+ nbytes--;
+ }
+
+ nbits = nbytes * 8;
+ if (nbits > MAX_EXTERN_MPI_BITS) {
+ pr_info("MPI: mpi too large (%u bits)\n", nbits);
+ return NULL;
+ }
+ if (nbytes > 0)
+ nbits -= count_leading_zeros(buffer[0]) - (BITS_PER_LONG - 8);
+
+ nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB);
+ val = mpi_alloc(nlimbs);
+ if (!val)
+ return NULL;
+ val->nbits = nbits;
+ val->sign = 0;
+ val->nlimbs = nlimbs;
+
+ if (nbytes > 0) {
+ i = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB;
+ i %= BYTES_PER_MPI_LIMB;
+ for (j = nlimbs; j > 0; j--) {
+ a = 0;
+ for (; i < BYTES_PER_MPI_LIMB; i++) {
+ a <<= 8;
+ a |= *buffer++;
+ }
+ i = 0;
+ val->d[j - 1] = a;
+ }
+ }
+ return val;
+}
+EXPORT_SYMBOL_GPL(mpi_read_raw_data);
+
+MPI mpi_read_from_buffer(const void *xbuffer, unsigned *ret_nread)
+{
+ const uint8_t *buffer = xbuffer;
+ unsigned int nbits, nbytes;
+ MPI val;
+
+ if (*ret_nread < 2)
+ return ERR_PTR(-EINVAL);
+ nbits = buffer[0] << 8 | buffer[1];
+
+ if (nbits > MAX_EXTERN_MPI_BITS) {
+ pr_info("MPI: mpi too large (%u bits)\n", nbits);
+ return ERR_PTR(-EINVAL);
+ }
+
+ nbytes = DIV_ROUND_UP(nbits, 8);
+ if (nbytes + 2 > *ret_nread) {
+ pr_info("MPI: mpi larger than buffer nbytes=%u ret_nread=%u\n",
+ nbytes, *ret_nread);
+ return ERR_PTR(-EINVAL);
+ }
+
+ val = mpi_read_raw_data(buffer + 2, nbytes);
+ if (!val)
+ return ERR_PTR(-ENOMEM);
+
+ *ret_nread = nbytes + 2;
+ return val;
+}
+EXPORT_SYMBOL_GPL(mpi_read_from_buffer);
+
+static int count_lzeros(MPI a)
+{
+ mpi_limb_t alimb;
+ int i, lzeros = 0;
+
+ for (i = a->nlimbs - 1; i >= 0; i--) {
+ alimb = a->d[i];
+ if (alimb == 0) {
+ lzeros += sizeof(mpi_limb_t);
+ } else {
+ lzeros += count_leading_zeros(alimb) / 8;
+ break;
+ }
+ }
+ return lzeros;
+}
+
+/**
+ * mpi_read_buffer() - read MPI to a buffer provided by user (msb first)
+ *
+ * @a: a multi precision integer
+ * @buf: buffer to which the output will be written to. Needs to be at
+ * least mpi_get_size(a) long.
+ * @buf_len: size of the buf.
+ * @nbytes: receives the actual length of the data written on success and
+ * the data to-be-written on -EOVERFLOW in case buf_len was too
+ * small.
+ * @sign: if not NULL, it will be set to the sign of a.
+ *
+ * Return: 0 on success or error code in case of error
+ */
+int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes,
+ int *sign)
+{
+ uint8_t *p;
+#if BYTES_PER_MPI_LIMB == 4
+ __be32 alimb;
+#elif BYTES_PER_MPI_LIMB == 8
+ __be64 alimb;
+#else
+#error please implement for this limb size.
+#endif
+ unsigned int n = mpi_get_size(a);
+ int i, lzeros;
+
+ if (!buf || !nbytes)
+ return -EINVAL;
+
+ if (sign)
+ *sign = a->sign;
+
+ lzeros = count_lzeros(a);
+
+ if (buf_len < n - lzeros) {
+ *nbytes = n - lzeros;
+ return -EOVERFLOW;
+ }
+
+ p = buf;
+ *nbytes = n - lzeros;
+
+ for (i = a->nlimbs - 1 - lzeros / BYTES_PER_MPI_LIMB,
+ lzeros %= BYTES_PER_MPI_LIMB;
+ i >= 0; i--) {
+#if BYTES_PER_MPI_LIMB == 4
+ alimb = cpu_to_be32(a->d[i]);
+#elif BYTES_PER_MPI_LIMB == 8
+ alimb = cpu_to_be64(a->d[i]);
+#else
+#error please implement for this limb size.
+#endif
+ memcpy(p, (u8 *)&alimb + lzeros, BYTES_PER_MPI_LIMB - lzeros);
+ p += BYTES_PER_MPI_LIMB - lzeros;
+ lzeros = 0;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mpi_read_buffer);
+
+/*
+ * mpi_get_buffer() - Returns an allocated buffer with the MPI (msb first).
+ * Caller must free the return string.
+ * This function does return a 0 byte buffer with nbytes set to zero if the
+ * value of A is zero.
+ *
+ * @a: a multi precision integer.
+ * @nbytes: receives the length of this buffer.
+ * @sign: if not NULL, it will be set to the sign of the a.
+ *
+ * Return: Pointer to MPI buffer or NULL on error
+ */
+void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign)
+{
+ uint8_t *buf;
+ unsigned int n;
+ int ret;
+
+ if (!nbytes)
+ return NULL;
+
+ n = mpi_get_size(a);
+
+ if (!n)
+ n++;
+
+ buf = kmalloc(n, GFP_KERNEL);
+
+ if (!buf)
+ return NULL;
+
+ ret = mpi_read_buffer(a, buf, n, nbytes, sign);
+
+ if (ret) {
+ kfree(buf);
+ return NULL;
+ }
+ return buf;
+}
+EXPORT_SYMBOL_GPL(mpi_get_buffer);
+
+/**
+ * mpi_write_to_sgl() - Function exports MPI to an sgl (msb first)
+ *
+ * This function works in the same way as the mpi_read_buffer, but it
+ * takes an sgl instead of u8 * buf.
+ *
+ * @a: a multi precision integer
+ * @sgl: scatterlist to write to. Needs to be at least
+ * mpi_get_size(a) long.
+ * @nbytes: the number of bytes to write. Leading bytes will be
+ * filled with zero.
+ * @sign: if not NULL, it will be set to the sign of a.
+ *
+ * Return: 0 on success or error code in case of error
+ */
+int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned nbytes,
+ int *sign)
+{
+ u8 *p, *p2;
+#if BYTES_PER_MPI_LIMB == 4
+ __be32 alimb;
+#elif BYTES_PER_MPI_LIMB == 8
+ __be64 alimb;
+#else
+#error please implement for this limb size.
+#endif
+ unsigned int n = mpi_get_size(a);
+ struct sg_mapping_iter miter;
+ int i, x, buf_len;
+ int nents;
+
+ if (sign)
+ *sign = a->sign;
+
+ if (nbytes < n)
+ return -EOVERFLOW;
+
+ nents = sg_nents_for_len(sgl, nbytes);
+ if (nents < 0)
+ return -EINVAL;
+
+ sg_miter_start(&miter, sgl, nents, SG_MITER_ATOMIC | SG_MITER_TO_SG);
+ sg_miter_next(&miter);
+ buf_len = miter.length;
+ p2 = miter.addr;
+
+ while (nbytes > n) {
+ i = min_t(unsigned, nbytes - n, buf_len);
+ memset(p2, 0, i);
+ p2 += i;
+ nbytes -= i;
+
+ buf_len -= i;
+ if (!buf_len) {
+ sg_miter_next(&miter);
+ buf_len = miter.length;
+ p2 = miter.addr;
+ }
+ }
+
+ for (i = a->nlimbs - 1; i >= 0; i--) {
+#if BYTES_PER_MPI_LIMB == 4
+ alimb = a->d[i] ? cpu_to_be32(a->d[i]) : 0;
+#elif BYTES_PER_MPI_LIMB == 8
+ alimb = a->d[i] ? cpu_to_be64(a->d[i]) : 0;
+#else
+#error please implement for this limb size.
+#endif
+ p = (u8 *)&alimb;
+
+ for (x = 0; x < sizeof(alimb); x++) {
+ *p2++ = *p++;
+ if (!--buf_len) {
+ sg_miter_next(&miter);
+ buf_len = miter.length;
+ p2 = miter.addr;
+ }
+ }
+ }
+
+ sg_miter_stop(&miter);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mpi_write_to_sgl);
+
+/*
+ * mpi_read_raw_from_sgl() - Function allocates an MPI and populates it with
+ * data from the sgl
+ *
+ * This function works in the same way as the mpi_read_raw_data, but it
+ * takes an sgl instead of a void * buffer, i.e. it allocates
+ * a new MPI and reads the content of the sgl into the MPI.
+ *
+ * @sgl: scatterlist to read from
+ * @nbytes: number of bytes to read
+ *
+ * Return: Pointer to a new MPI or NULL on error
+ */
+MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes)
+{
+ struct sg_mapping_iter miter;
+ unsigned int nbits, nlimbs;
+ int x, j, z, lzeros, ents;
+ unsigned int len;
+ const u8 *buff;
+ mpi_limb_t a;
+ MPI val = NULL;
+
+ ents = sg_nents_for_len(sgl, nbytes);
+ if (ents < 0)
+ return NULL;
+
+ sg_miter_start(&miter, sgl, ents, SG_MITER_ATOMIC | SG_MITER_FROM_SG);
+
+ lzeros = 0;
+ len = 0;
+ while (nbytes > 0) {
+ while (len && !*buff) {
+ lzeros++;
+ len--;
+ buff++;
+ }
+
+ if (len && *buff)
+ break;
+
+ sg_miter_next(&miter);
+ buff = miter.addr;
+ len = miter.length;
+
+ nbytes -= lzeros;
+ lzeros = 0;
+ }
+
+ miter.consumed = lzeros;
+
+ nbytes -= lzeros;
+ nbits = nbytes * 8;
+ if (nbits > MAX_EXTERN_MPI_BITS) {
+ sg_miter_stop(&miter);
+ pr_info("MPI: mpi too large (%u bits)\n", nbits);
+ return NULL;
+ }
+
+ if (nbytes > 0)
+ nbits -= count_leading_zeros(*buff) - (BITS_PER_LONG - 8);
+
+ sg_miter_stop(&miter);
+
+ nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB);
+ val = mpi_alloc(nlimbs);
+ if (!val)
+ return NULL;
+
+ val->nbits = nbits;
+ val->sign = 0;
+ val->nlimbs = nlimbs;
+
+ if (nbytes == 0)
+ return val;
+
+ j = nlimbs - 1;
+ a = 0;
+ z = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB;
+ z %= BYTES_PER_MPI_LIMB;
+
+ while (sg_miter_next(&miter)) {
+ buff = miter.addr;
+ len = min(miter.length, nbytes);
+ nbytes -= len;
+
+ for (x = 0; x < len; x++) {
+ a <<= 8;
+ a |= *buff++;
+ if (((z + x + 1) % BYTES_PER_MPI_LIMB) == 0) {
+ val->d[j--] = a;
+ a = 0;
+ }
+ }
+ z += x;
+ }
+
+ return val;
+}
+EXPORT_SYMBOL_GPL(mpi_read_raw_from_sgl);
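
mpi_read_raw_data() strips leading zero bytes and packs the remaining big-endian bytes into limbs from the least significant end; mpi_read_buffer()/mpi_get_buffer() perform the inverse and likewise emit no leading zeros. A hedged round-trip sketch (hypothetical helper, kernel context assumed):

#include <linux/errno.h>
#include <linux/mpi.h>
#include <linux/slab.h>
#include <linux/string.h>

/*
 * Hedged sketch: raw bytes -> MPI -> raw bytes. Leading zero bytes of
 * the input are not preserved, since an MPI stores a number, not a
 * fixed-width byte string.
 */
static int mpi_roundtrip_example(const u8 *in, size_t len)
{
	unsigned int outlen = 0;
	u8 *out;
	MPI m;
	int ok;

	m = mpi_read_raw_data(in, len);
	if (!m)
		return -ENOMEM;

	out = mpi_get_buffer(m, &outlen, NULL);
	mpi_free(m);
	if (!out)
		return -ENOMEM;

	while (len && !*in) {		/* mirror the zero stripping */
		in++;
		len--;
	}
	ok = outlen == len && (!len || !memcmp(out, in, len));
	kfree(out);
	return ok ? 0 : -EINVAL;
}
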
diff --git a/lib/crypto/mpi/mpih-cmp.c b/lib/crypto/mpi/mpih-cmp.c
new file mode 100644
index 000000000000..f23709114a65
--- /dev/null
+++ b/lib/crypto/mpi/mpih-cmp.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpihelp-sub.c - MPI helper functions
+ * Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+ * Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ * Actually it's the same code with only minor changes in the
+ * way the data is stored; this is to support the abstraction
+ * of an optional secure memory allocation which may be used
+ * to avoid revealing of sensitive data due to paging etc.
+ * The GNU MP Library itself is published under the LGPL;
+ * however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+
+/****************
+ * Compare OP1_PTR/OP1_SIZE with OP2_PTR/OP2_SIZE.
+ * There are no restrictions on the relative sizes of
+ * the two arguments.
+ * Return 1 if OP1 > OP2, 0 if they are equal, and -1 if OP1 < OP2.
+ */
+int mpihelp_cmp(mpi_ptr_t op1_ptr, mpi_ptr_t op2_ptr, mpi_size_t size)
+{
+ mpi_size_t i;
+ mpi_limb_t op1_word, op2_word;
+
+ for (i = size - 1; i >= 0; i--) {
+ op1_word = op1_ptr[i];
+ op2_word = op2_ptr[i];
+ if (op1_word != op2_word)
+ goto diff;
+ }
+ return 0;
+
+diff:
+ /* This can *not* be simplified to
+ * op1_word - op2_word
+ * since that expression might give signed overflow. */
+ return (op1_word > op2_word) ? 1 : -1;
+}
diff --git a/lib/crypto/mpi/mpih-div.c b/lib/crypto/mpi/mpih-div.c
new file mode 100644
index 000000000000..be70ee2e42d3
--- /dev/null
+++ b/lib/crypto/mpi/mpih-div.c
@@ -0,0 +1,517 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpihelp-div.c - MPI helper functions
+ * Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+ * Copyright (C) 1998, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ * Actually it's the same code with only minor changes in the
+ * way the data is stored; this is to support the abstraction
+ * of an optional secure memory allocation which may be used
+ * to avoid revealing of sensitive data due to paging etc.
+ * The GNU MP Library itself is published under the LGPL;
+ * however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 1
+#endif
+#ifndef UDIV_TIME
+#define UDIV_TIME UMUL_TIME
+#endif
+
+
+mpi_limb_t
+mpihelp_mod_1(mpi_ptr_t dividend_ptr, mpi_size_t dividend_size,
+ mpi_limb_t divisor_limb)
+{
+ mpi_size_t i;
+ mpi_limb_t n1, n0, r;
+ mpi_limb_t dummy __maybe_unused;
+
+ /* Botch: Should this be handled at all? Rely on callers? */
+ if (!dividend_size)
+ return 0;
+
+ /* If multiplication is much faster than division, and the
+ * dividend is large, pre-invert the divisor, and use
+ * only multiplications in the inner loop.
+ *
+ * This test should be read:
+ * Does it ever help to use udiv_qrnnd_preinv?
+ * && Does what we save compensate for the inversion overhead?
+ */
+ if (UDIV_TIME > (2 * UMUL_TIME + 6)
+ && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME) {
+ int normalization_steps;
+
+ normalization_steps = count_leading_zeros(divisor_limb);
+ if (normalization_steps) {
+ mpi_limb_t divisor_limb_inverted;
+
+ divisor_limb <<= normalization_steps;
+
+ /* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB. The
+ * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
+ * most significant bit (with weight 2**N) implicit.
+ *
+ * Special case for DIVISOR_LIMB == 100...000.
+ */
+ if (!(divisor_limb << 1))
+ divisor_limb_inverted = ~(mpi_limb_t)0;
+ else
+ udiv_qrnnd(divisor_limb_inverted, dummy,
+ -divisor_limb, 0, divisor_limb);
+
+ n1 = dividend_ptr[dividend_size - 1];
+ r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps);
+
+ /* Possible optimization:
+ * if (r == 0
+ * && divisor_limb > ((n1 << normalization_steps)
+ * | (dividend_ptr[dividend_size - 2] >> ...)))
+ * ...one division less...
+ */
+ for (i = dividend_size - 2; i >= 0; i--) {
+ n0 = dividend_ptr[i];
+ UDIV_QRNND_PREINV(dummy, r, r,
+ ((n1 << normalization_steps)
+ | (n0 >> (BITS_PER_MPI_LIMB - normalization_steps))),
+ divisor_limb, divisor_limb_inverted);
+ n1 = n0;
+ }
+ UDIV_QRNND_PREINV(dummy, r, r,
+ n1 << normalization_steps,
+ divisor_limb, divisor_limb_inverted);
+ return r >> normalization_steps;
+ } else {
+ mpi_limb_t divisor_limb_inverted;
+
+ /* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB. The
+ * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
+ * most significant bit (with weight 2**N) implicit.
+ *
+ * Special case for DIVISOR_LIMB == 100...000.
+ */
+ if (!(divisor_limb << 1))
+ divisor_limb_inverted = ~(mpi_limb_t)0;
+ else
+ udiv_qrnnd(divisor_limb_inverted, dummy,
+ -divisor_limb, 0, divisor_limb);
+
+ i = dividend_size - 1;
+ r = dividend_ptr[i];
+
+ if (r >= divisor_limb)
+ r = 0;
+ else
+ i--;
+
+ for ( ; i >= 0; i--) {
+ n0 = dividend_ptr[i];
+ UDIV_QRNND_PREINV(dummy, r, r,
+ n0, divisor_limb, divisor_limb_inverted);
+ }
+ return r;
+ }
+ } else {
+ if (UDIV_NEEDS_NORMALIZATION) {
+ int normalization_steps;
+
+ normalization_steps = count_leading_zeros(divisor_limb);
+ if (normalization_steps) {
+ divisor_limb <<= normalization_steps;
+
+ n1 = dividend_ptr[dividend_size - 1];
+ r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps);
+
+ /* Possible optimization:
+ * if (r == 0
+ * && divisor_limb > ((n1 << normalization_steps)
+ * | (dividend_ptr[dividend_size - 2] >> ...)))
+ * ...one division less...
+ */
+ for (i = dividend_size - 2; i >= 0; i--) {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd(dummy, r, r,
+ ((n1 << normalization_steps)
+ | (n0 >> (BITS_PER_MPI_LIMB - normalization_steps))),
+ divisor_limb);
+ n1 = n0;
+ }
+ udiv_qrnnd(dummy, r, r,
+ n1 << normalization_steps,
+ divisor_limb);
+ return r >> normalization_steps;
+ }
+ }
+ /* No normalization needed, either because udiv_qrnnd doesn't require
+ * it, or because DIVISOR_LIMB is already normalized.
+ */
+ i = dividend_size - 1;
+ r = dividend_ptr[i];
+
+ if (r >= divisor_limb)
+ r = 0;
+ else
+ i--;
+
+ for (; i >= 0; i--) {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd(dummy, r, r, n0, divisor_limb);
+ }
+ return r;
+ }
+}
+
+/* Divide num (NP/NSIZE) by den (DP/DSIZE) and write
+ * the NSIZE-DSIZE least significant quotient limbs at QP
+ * and the DSIZE long remainder at NP. If QEXTRA_LIMBS is
+ * non-zero, generate that many fraction bits and append them after the
+ * other quotient limbs.
+ * Return the most significant limb of the quotient, this is always 0 or 1.
+ *
+ * Preconditions:
+ * 0. NSIZE >= DSIZE.
+ * 1. The most significant bit of the divisor must be set.
+ * 2. QP must either not overlap with the input operands at all, or
+ * QP + DSIZE >= NP must hold true. (This means that it's
+ * possible to put the quotient in the high part of NUM, right after the
+ *    remainder in NUM.)
+ * 3. NSIZE >= DSIZE, even if QEXTRA_LIMBS is non-zero.
+ */
+
+mpi_limb_t
+mpihelp_divrem(mpi_ptr_t qp, mpi_size_t qextra_limbs,
+ mpi_ptr_t np, mpi_size_t nsize, mpi_ptr_t dp, mpi_size_t dsize)
+{
+ mpi_limb_t most_significant_q_limb = 0;
+
+ switch (dsize) {
+ case 0:
+ /* We are asked to divide by zero, so go ahead and do it! (To make
+ the compiler not remove this statement, return the value.) */
+ /*
+ * existing clients of this function have been modified
+ * not to call it with dsize == 0, so this should not happen
+ */
+ return 1 / dsize;
+
+ case 1:
+ {
+ mpi_size_t i;
+ mpi_limb_t n1;
+ mpi_limb_t d;
+
+ d = dp[0];
+ n1 = np[nsize - 1];
+
+ if (n1 >= d) {
+ n1 -= d;
+ most_significant_q_limb = 1;
+ }
+
+ qp += qextra_limbs;
+ for (i = nsize - 2; i >= 0; i--)
+ udiv_qrnnd(qp[i], n1, n1, np[i], d);
+ qp -= qextra_limbs;
+
+ for (i = qextra_limbs - 1; i >= 0; i--)
+ udiv_qrnnd(qp[i], n1, n1, 0, d);
+
+ np[0] = n1;
+ }
+ break;
+
+ case 2:
+ {
+ mpi_size_t i;
+ mpi_limb_t n1, n0, n2;
+ mpi_limb_t d1, d0;
+
+ np += nsize - 2;
+ d1 = dp[1];
+ d0 = dp[0];
+ n1 = np[1];
+ n0 = np[0];
+
+ if (n1 >= d1 && (n1 > d1 || n0 >= d0)) {
+ sub_ddmmss(n1, n0, n1, n0, d1, d0);
+ most_significant_q_limb = 1;
+ }
+
+ for (i = qextra_limbs + nsize - 2 - 1; i >= 0; i--) {
+ mpi_limb_t q;
+ mpi_limb_t r;
+
+ if (i >= qextra_limbs)
+ np--;
+ else
+ np[0] = 0;
+
+ if (n1 == d1) {
+ /* Q should be either 111..111 or 111..110. Need special
+ * treatment of this rare case as normal division would
+ * give overflow. */
+ q = ~(mpi_limb_t) 0;
+
+ r = n0 + d1;
+ if (r < d1) { /* Carry in the addition? */
+ add_ssaaaa(n1, n0, r - d0,
+ np[0], 0, d0);
+ qp[i] = q;
+ continue;
+ }
+ n1 = d0 - (d0 != 0 ? 1 : 0);
+ n0 = -d0;
+ } else {
+ udiv_qrnnd(q, r, n1, n0, d1);
+ umul_ppmm(n1, n0, d0, q);
+ }
+
+ n2 = np[0];
+q_test:
+ if (n1 > r || (n1 == r && n0 > n2)) {
+ /* The estimated Q was too large. */
+ q--;
+ sub_ddmmss(n1, n0, n1, n0, 0, d0);
+ r += d1;
+ if (r >= d1) /* If not carry, test Q again. */
+ goto q_test;
+ }
+
+ qp[i] = q;
+ sub_ddmmss(n1, n0, r, n2, n1, n0);
+ }
+ np[1] = n1;
+ np[0] = n0;
+ }
+ break;
+
+ default:
+ {
+ mpi_size_t i;
+ mpi_limb_t dX, d1, n0;
+
+ np += nsize - dsize;
+ dX = dp[dsize - 1];
+ d1 = dp[dsize - 2];
+ n0 = np[dsize - 1];
+
+ if (n0 >= dX) {
+ if (n0 > dX
+ || mpihelp_cmp(np, dp, dsize - 1) >= 0) {
+ mpihelp_sub_n(np, np, dp, dsize);
+ n0 = np[dsize - 1];
+ most_significant_q_limb = 1;
+ }
+ }
+
+ for (i = qextra_limbs + nsize - dsize - 1; i >= 0; i--) {
+ mpi_limb_t q;
+ mpi_limb_t n1, n2;
+ mpi_limb_t cy_limb;
+
+ if (i >= qextra_limbs) {
+ np--;
+ n2 = np[dsize];
+ } else {
+ n2 = np[dsize - 1];
+ MPN_COPY_DECR(np + 1, np, dsize - 1);
+ np[0] = 0;
+ }
+
+ if (n0 == dX) {
+ /* This might over-estimate q, but it's probably not worth
+ * the extra code here to find out. */
+ q = ~(mpi_limb_t) 0;
+ } else {
+ mpi_limb_t r;
+
+ udiv_qrnnd(q, r, n0, np[dsize - 1], dX);
+ umul_ppmm(n1, n0, d1, q);
+
+ while (n1 > r
+ || (n1 == r
+ && n0 > np[dsize - 2])) {
+ q--;
+ r += dX;
+ if (r < dX) /* I.e. "carry in previous addition?" */
+ break;
+ n1 -= n0 < d1;
+ n0 -= d1;
+ }
+ }
+
+ /* Possible optimization: We already have (q * n0) and (1 * n1)
+ * after the calculation of q. Taking advantage of that, we
+ * could make this loop make two iterations less. */
+ cy_limb = mpihelp_submul_1(np, dp, dsize, q);
+
+ if (n2 != cy_limb) {
+ mpihelp_add_n(np, np, dp, dsize);
+ q--;
+ }
+
+ qp[i] = q;
+ n0 = np[dsize - 1];
+ }
+ }
+ }
+
+ return most_significant_q_limb;
+}
+
+/****************
+ * Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
+ * Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR.
+ * Return the single-limb remainder.
+ * There are no constraints on the value of the divisor.
+ *
+ * QUOT_PTR and DIVIDEND_PTR might point to the same limb.
+ */
+
+mpi_limb_t
+mpihelp_divmod_1(mpi_ptr_t quot_ptr,
+ mpi_ptr_t dividend_ptr, mpi_size_t dividend_size,
+ mpi_limb_t divisor_limb)
+{
+ mpi_size_t i;
+ mpi_limb_t n1, n0, r;
+ mpi_limb_t dummy __maybe_unused;
+
+ if (!dividend_size)
+ return 0;
+
+ /* If multiplication is much faster than division, and the
+ * dividend is large, pre-invert the divisor, and use
+ * only multiplications in the inner loop.
+ *
+ * This test should be read:
+ * Does it ever help to use udiv_qrnnd_preinv?
+ * && Does what we save compensate for the inversion overhead?
+ */
+ if (UDIV_TIME > (2 * UMUL_TIME + 6)
+ && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME) {
+ int normalization_steps;
+
+ normalization_steps = count_leading_zeros(divisor_limb);
+ if (normalization_steps) {
+ mpi_limb_t divisor_limb_inverted;
+
+ divisor_limb <<= normalization_steps;
+
+ /* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB. The
+ * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
+ * most significant bit (with weight 2**N) implicit.
+ */
+ /* Special case for DIVISOR_LIMB == 100...000. */
+ if (!(divisor_limb << 1))
+ divisor_limb_inverted = ~(mpi_limb_t)0;
+ else
+ udiv_qrnnd(divisor_limb_inverted, dummy,
+ -divisor_limb, 0, divisor_limb);
+
+ n1 = dividend_ptr[dividend_size - 1];
+ r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps);
+
+ /* Possible optimization:
+ * if (r == 0
+ * && divisor_limb > ((n1 << normalization_steps)
+ * | (dividend_ptr[dividend_size - 2] >> ...)))
+ * ...one division less...
+ */
+ for (i = dividend_size - 2; i >= 0; i--) {
+ n0 = dividend_ptr[i];
+ UDIV_QRNND_PREINV(quot_ptr[i + 1], r, r,
+ ((n1 << normalization_steps)
+ | (n0 >> (BITS_PER_MPI_LIMB - normalization_steps))),
+ divisor_limb, divisor_limb_inverted);
+ n1 = n0;
+ }
+ UDIV_QRNND_PREINV(quot_ptr[0], r, r,
+ n1 << normalization_steps,
+ divisor_limb, divisor_limb_inverted);
+ return r >> normalization_steps;
+ } else {
+ mpi_limb_t divisor_limb_inverted;
+
+ /* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB. The
+ * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
+ * most significant bit (with weight 2**N) implicit.
+ */
+ /* Special case for DIVISOR_LIMB == 100...000. */
+ if (!(divisor_limb << 1))
+ divisor_limb_inverted = ~(mpi_limb_t) 0;
+ else
+ udiv_qrnnd(divisor_limb_inverted, dummy,
+ -divisor_limb, 0, divisor_limb);
+
+ i = dividend_size - 1;
+ r = dividend_ptr[i];
+
+ if (r >= divisor_limb)
+ r = 0;
+ else
+ quot_ptr[i--] = 0;
+
+ for ( ; i >= 0; i--) {
+ n0 = dividend_ptr[i];
+ UDIV_QRNND_PREINV(quot_ptr[i], r, r,
+ n0, divisor_limb, divisor_limb_inverted);
+ }
+ return r;
+ }
+ } else {
+ if (UDIV_NEEDS_NORMALIZATION) {
+ int normalization_steps;
+
+ normalization_steps = count_leading_zeros(divisor_limb);
+ if (normalization_steps) {
+ divisor_limb <<= normalization_steps;
+
+ n1 = dividend_ptr[dividend_size - 1];
+ r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps);
+
+ /* Possible optimization:
+ * if (r == 0
+ * && divisor_limb > ((n1 << normalization_steps)
+ * | (dividend_ptr[dividend_size - 2] >> ...)))
+ * ...one division less...
+ */
+ for (i = dividend_size - 2; i >= 0; i--) {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd(quot_ptr[i + 1], r, r,
+ ((n1 << normalization_steps)
+ | (n0 >> (BITS_PER_MPI_LIMB - normalization_steps))),
+ divisor_limb);
+ n1 = n0;
+ }
+ udiv_qrnnd(quot_ptr[0], r, r,
+ n1 << normalization_steps,
+ divisor_limb);
+ return r >> normalization_steps;
+ }
+ }
+ /* No normalization needed, either because udiv_qrnnd doesn't require
+ * it, or because DIVISOR_LIMB is already normalized.
+ */
+ i = dividend_size - 1;
+ r = dividend_ptr[i];
+
+ if (r >= divisor_limb)
+ r = 0;
+ else
+ quot_ptr[i--] = 0;
+
+ for (; i >= 0; i--) {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd(quot_ptr[i], r, r, n0, divisor_limb);
+ }
+ return r;
+ }
+}
diff --git a/lib/crypto/mpi/mpih-mul.c b/lib/crypto/mpi/mpih-mul.c
new file mode 100644
index 000000000000..a93647564054
--- /dev/null
+++ b/lib/crypto/mpi/mpih-mul.c
@@ -0,0 +1,484 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* mpihelp-mul.c - MPI helper functions
+ * Copyright (C) 1994, 1996, 1998, 1999,
+ * 2000 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ * Actually it's the same code with only minor changes in the
+ * way the data is stored; this is to support the abstraction
+ * of an optional secure memory allocation which may be used
+ * to avoid revealing of sensitive data due to paging etc.
+ * The GNU MP Library itself is published under the LGPL;
+ * however I decided to publish this code under the plain GPL.
+ */
+
+#include <linux/string.h>
+#include "mpi-internal.h"
+#include "longlong.h"
+
+#define MPN_MUL_N_RECURSE(prodp, up, vp, size, tspace) \
+ do { \
+ if ((size) < KARATSUBA_THRESHOLD) \
+ mul_n_basecase(prodp, up, vp, size); \
+ else \
+ mul_n(prodp, up, vp, size, tspace); \
+ } while (0)
+
+#define MPN_SQR_N_RECURSE(prodp, up, size, tspace) \
+ do { \
+ if ((size) < KARATSUBA_THRESHOLD) \
+ mpih_sqr_n_basecase(prodp, up, size); \
+ else \
+ mpih_sqr_n(prodp, up, size, tspace); \
+ } while (0)
+
+/* Multiply the natural numbers u (pointed to by UP) and v (pointed to by VP),
+ * both with SIZE limbs, and store the result at PRODP. 2 * SIZE limbs are
+ * always stored. Return the most significant limb.
+ *
+ * Argument constraints:
+ * 1. PRODP != UP and PRODP != VP, i.e. the destination
+ * must be distinct from the multiplier and the multiplicand.
+ *
+ *
+ * Handle simple cases with traditional multiplication.
+ *
+ * This is the most critical code of multiplication. All multiplies rely
+ * on this, both small and huge. Small ones arrive here immediately. Huge
+ * ones arrive here as this is the base case for Karatsuba's recursive
+ * algorithm below.
+ */
+
+static mpi_limb_t
+mul_n_basecase(mpi_ptr_t prodp, mpi_ptr_t up, mpi_ptr_t vp, mpi_size_t size)
+{
+ mpi_size_t i;
+ mpi_limb_t cy;
+ mpi_limb_t v_limb;
+
+ /* Multiply by the first limb in V separately, as the result can be
+ * stored (not added) to PROD. We also avoid a loop for zeroing. */
+ v_limb = vp[0];
+ if (v_limb <= 1) {
+ if (v_limb == 1)
+ MPN_COPY(prodp, up, size);
+ else
+ MPN_ZERO(prodp, size);
+ cy = 0;
+ } else
+ cy = mpihelp_mul_1(prodp, up, size, v_limb);
+
+ prodp[size] = cy;
+ prodp++;
+
+ /* For each iteration in the outer loop, multiply one limb from
+ * U with one limb from V, and add it to PROD. */
+ for (i = 1; i < size; i++) {
+ v_limb = vp[i];
+ if (v_limb <= 1) {
+ cy = 0;
+ if (v_limb == 1)
+ cy = mpihelp_add_n(prodp, prodp, up, size);
+ } else
+ cy = mpihelp_addmul_1(prodp, up, size, v_limb);
+
+ prodp[size] = cy;
+ prodp++;
+ }
+
+ return cy;
+}
+
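As a plain-C companion to mul_n_basecase() above, here is a standalone schoolbook multiplication sketch (illustrative only, not kernel code; 32-bit limbs and invented names). Like the loop above, it consumes one limb of V per outer iteration and accumulates into the product, which is what mpihelp_addmul_1() does one limb vector at a time:

#include <stdint.h>
#include <stdio.h>

/* prod[] must have room for usize + vsize limbs and start zeroed. */
static void mul_basecase(uint32_t *prod, const uint32_t *u, int usize,
			 const uint32_t *v, int vsize)
{
	for (int j = 0; j < vsize; j++) {
		uint64_t carry = 0;

		/* prod[j..j+usize-1] += u[] * v[j], like mpihelp_addmul_1() */
		for (int i = 0; i < usize; i++) {
			uint64_t t = (uint64_t)u[i] * v[j] + prod[i + j] + carry;

			prod[i + j] = (uint32_t)t;
			carry = t >> 32;
		}
		prod[j + usize] = (uint32_t)carry;
	}
}

int main(void)
{
	uint32_t u[2] = { 0xffffffffu, 0xffffffffu };	/* 2^64 - 1 */
	uint32_t v[2] = { 0xffffffffu, 0xffffffffu };	/* 2^64 - 1 */
	uint32_t p[4] = { 0 };

	mul_basecase(p, u, 2, v, 2);
	printf("%08x%08x%08x%08x\n", p[3], p[2], p[1], p[0]);	/* (2^64-1)^2 */
	return 0;
}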
+static void
+mul_n(mpi_ptr_t prodp, mpi_ptr_t up, mpi_ptr_t vp,
+ mpi_size_t size, mpi_ptr_t tspace)
+{
+ if (size & 1) {
+ /* The size is odd, and the code below doesn't handle that.
+ * Multiply the least significant (size - 1) limbs with a recursive
+ * call, and handle the most significant limb of S1 and S2
+ * separately.
+ * A slightly faster way to do this would be to make the Karatsuba
+ * code below behave as if the size were even, and let it check for
+ * odd size in the end. I.e., in essence move this code to the end.
+ * Doing so would save us a recursive call, and potentially make the
+ * stack grow a lot less.
+ */
+ mpi_size_t esize = size - 1; /* even size */
+ mpi_limb_t cy_limb;
+
+ MPN_MUL_N_RECURSE(prodp, up, vp, esize, tspace);
+ cy_limb = mpihelp_addmul_1(prodp + esize, up, esize, vp[esize]);
+ prodp[esize + esize] = cy_limb;
+ cy_limb = mpihelp_addmul_1(prodp + esize, vp, size, up[esize]);
+ prodp[esize + size] = cy_limb;
+ } else {
+ /* Anatolij Alekseevich Karatsuba's divide-and-conquer algorithm.
+ *
+ * Split U in two pieces, U1 and U0, such that
+ * U = U0 + U1*(B**n),
+ * and V in V1 and V0, such that
+ * V = V0 + V1*(B**n).
+ *
+ * UV is then computed recursively using the identity
+ *
+ *   UV = (B^(2n) + B^n)*U1*V1 + B^n*(U1-U0)*(V0-V1) + (B^n + 1)*U0*V0
+ *
+ * Where B = 2**BITS_PER_MP_LIMB.
+ */
+ mpi_size_t hsize = size >> 1;
+ mpi_limb_t cy;
+ int negflg;
+
+ /* Product H. ________________ ________________
+ * |_____U1 x V1____||____U0 x V0_____|
+ * Put result in upper part of PROD and pass low part of TSPACE
+ * as new TSPACE.
+ */
+ MPN_MUL_N_RECURSE(prodp + size, up + hsize, vp + hsize, hsize,
+ tspace);
+
+ /* Product M. ________________
+ * |_(U1-U0)(V0-V1)_|
+ */
+ if (mpihelp_cmp(up + hsize, up, hsize) >= 0) {
+ mpihelp_sub_n(prodp, up + hsize, up, hsize);
+ negflg = 0;
+ } else {
+ mpihelp_sub_n(prodp, up, up + hsize, hsize);
+ negflg = 1;
+ }
+ if (mpihelp_cmp(vp + hsize, vp, hsize) >= 0) {
+ mpihelp_sub_n(prodp + hsize, vp + hsize, vp, hsize);
+ negflg ^= 1;
+ } else {
+ mpihelp_sub_n(prodp + hsize, vp, vp + hsize, hsize);
+ /* No change of NEGFLG. */
+ }
+ /* Read temporary operands from low part of PROD.
+ * Put result in low part of TSPACE using upper part of TSPACE
+ * as new TSPACE.
+ */
+ MPN_MUL_N_RECURSE(tspace, prodp, prodp + hsize, hsize,
+ tspace + size);
+
+ /* Add/copy product H. */
+ MPN_COPY(prodp + hsize, prodp + size, hsize);
+ cy = mpihelp_add_n(prodp + size, prodp + size,
+ prodp + size + hsize, hsize);
+
+ /* Add product M (if NEGFLG M is a negative number) */
+ if (negflg)
+ cy -= mpihelp_sub_n(prodp + hsize, prodp + hsize,
+ tspace, size);
+ else
+ cy += mpihelp_add_n(prodp + hsize, prodp + hsize,
+ tspace, size);
+
+ /* Product L. ________________ ________________
+ * |________________||____U0 x V0_____|
+ * Read temporary operands from low part of PROD.
+ * Put result in low part of TSPACE using upper part of TSPACE
+ * as new TSPACE.
+ */
+ MPN_MUL_N_RECURSE(tspace, up, vp, hsize, tspace + size);
+
+ /* Add/copy Product L (twice) */
+
+ cy += mpihelp_add_n(prodp + hsize, prodp + hsize, tspace, size);
+ if (cy)
+ mpihelp_add_1(prodp + hsize + size,
+ prodp + hsize + size, hsize, cy);
+
+ MPN_COPY(prodp, tspace, hsize);
+ cy = mpihelp_add_n(prodp + hsize, prodp + hsize, tspace + hsize,
+ hsize);
+ if (cy)
+ mpihelp_add_1(prodp + size, prodp + size, size, 1);
+ }
+}
+
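For reference, the Karatsuba identity used by mul_n() above can be checked by expanding the middle term. A sketch of the algebra, writing $U = U_1 B^n + U_0$ and $V = V_1 B^n + V_0$:

\begin{aligned}
&(B^{2n} + B^n)\,U_1 V_1 + B^n (U_1 - U_0)(V_0 - V_1) + (B^n + 1)\,U_0 V_0 \\
&\quad= B^{2n} U_1 V_1 + B^n \bigl(U_1 V_1 + U_1 V_0 - U_1 V_1 - U_0 V_0 + U_0 V_1 + U_0 V_0\bigr) + U_0 V_0 \\
&\quad= B^{2n} U_1 V_1 + B^n (U_1 V_0 + U_0 V_1) + U_0 V_0 = UV,
\end{aligned}

so the three half-size products computed above (H, M and L) suffice where the schoolbook method would need four.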
+void mpih_sqr_n_basecase(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size)
+{
+ mpi_size_t i;
+ mpi_limb_t cy_limb;
+ mpi_limb_t v_limb;
+
+ /* Multiply by the first limb in V separately, as the result can be
+ * stored (not added) to PROD. We also avoid a loop for zeroing. */
+ v_limb = up[0];
+ if (v_limb <= 1) {
+ if (v_limb == 1)
+ MPN_COPY(prodp, up, size);
+ else
+ MPN_ZERO(prodp, size);
+ cy_limb = 0;
+ } else
+ cy_limb = mpihelp_mul_1(prodp, up, size, v_limb);
+
+ prodp[size] = cy_limb;
+ prodp++;
+
+ /* For each iteration in the outer loop, multiply one limb from
+ * U with one limb from V, and add it to PROD. */
+ for (i = 1; i < size; i++) {
+ v_limb = up[i];
+ if (v_limb <= 1) {
+ cy_limb = 0;
+ if (v_limb == 1)
+ cy_limb = mpihelp_add_n(prodp, prodp, up, size);
+ } else
+ cy_limb = mpihelp_addmul_1(prodp, up, size, v_limb);
+
+ prodp[size] = cy_limb;
+ prodp++;
+ }
+}
+
+void
+mpih_sqr_n(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size, mpi_ptr_t tspace)
+{
+ if (size & 1) {
+ /* The size is odd, and the code below doesn't handle that.
+ * Multiply the least significant (size - 1) limbs with a recursive
+ * call, and handle the most significant limb of S1 and S2
+ * separately.
+ * A slightly faster way to do this would be to make the Karatsuba
+ * code below behave as if the size were even, and let it check for
+ * odd size in the end. I.e., in essence move this code to the end.
+ * Doing so would save us a recursive call, and potentially make the
+ * stack grow a lot less.
+ */
+ mpi_size_t esize = size - 1; /* even size */
+ mpi_limb_t cy_limb;
+
+ MPN_SQR_N_RECURSE(prodp, up, esize, tspace);
+ cy_limb = mpihelp_addmul_1(prodp + esize, up, esize, up[esize]);
+ prodp[esize + esize] = cy_limb;
+ cy_limb = mpihelp_addmul_1(prodp + esize, up, size, up[esize]);
+
+ prodp[esize + size] = cy_limb;
+ } else {
+ mpi_size_t hsize = size >> 1;
+ mpi_limb_t cy;
+
+ /* Product H. ________________ ________________
+ * |_____U1 x U1____||____U0 x U0_____|
+ * Put result in upper part of PROD and pass low part of TSPACE
+ * as new TSPACE.
+ */
+ MPN_SQR_N_RECURSE(prodp + size, up + hsize, hsize, tspace);
+
+ /* Product M. ________________
+ * |_(U1-U0)(U0-U1)_|
+ */
+ if (mpihelp_cmp(up + hsize, up, hsize) >= 0)
+ mpihelp_sub_n(prodp, up + hsize, up, hsize);
+ else
+ mpihelp_sub_n(prodp, up, up + hsize, hsize);
+
+ /* Read temporary operands from low part of PROD.
+ * Put result in low part of TSPACE using upper part of TSPACE
+ * as new TSPACE. */
+ MPN_SQR_N_RECURSE(tspace, prodp, hsize, tspace + size);
+
+ /* Add/copy product H */
+ MPN_COPY(prodp + hsize, prodp + size, hsize);
+ cy = mpihelp_add_n(prodp + size, prodp + size,
+ prodp + size + hsize, hsize);
+
+ /* Add product M.  Here M = (U1-U0)(U0-U1) is never positive,
+ * so it is always subtracted. */
+ cy -= mpihelp_sub_n(prodp + hsize, prodp + hsize, tspace, size);
+
+ /* Product L. ________________ ________________
+ * |________________||____U0 x U0_____|
+ * Read temporary operands from low part of PROD.
+ * Put result in low part of TSPACE using upper part of TSPACE
+ * as new TSPACE. */
+ MPN_SQR_N_RECURSE(tspace, up, hsize, tspace + size);
+
+ /* Add/copy Product L (twice). */
+ cy += mpihelp_add_n(prodp + hsize, prodp + hsize, tspace, size);
+ if (cy)
+ mpihelp_add_1(prodp + hsize + size,
+ prodp + hsize + size, hsize, cy);
+
+ MPN_COPY(prodp, tspace, hsize);
+ cy = mpihelp_add_n(prodp + hsize, prodp + hsize, tspace + hsize,
+ hsize);
+ if (cy)
+ mpihelp_add_1(prodp + size, prodp + size, size, 1);
+ }
+}
+
+int
+mpihelp_mul_karatsuba_case(mpi_ptr_t prodp,
+ mpi_ptr_t up, mpi_size_t usize,
+ mpi_ptr_t vp, mpi_size_t vsize,
+ struct karatsuba_ctx *ctx)
+{
+ mpi_limb_t cy;
+
+ if (!ctx->tspace || ctx->tspace_size < vsize) {
+ if (ctx->tspace)
+ mpi_free_limb_space(ctx->tspace);
+ ctx->tspace = mpi_alloc_limb_space(2 * vsize);
+ if (!ctx->tspace)
+ return -ENOMEM;
+ ctx->tspace_size = vsize;
+ }
+
+ MPN_MUL_N_RECURSE(prodp, up, vp, vsize, ctx->tspace);
+
+ prodp += vsize;
+ up += vsize;
+ usize -= vsize;
+ if (usize >= vsize) {
+ if (!ctx->tp || ctx->tp_size < vsize) {
+ if (ctx->tp)
+ mpi_free_limb_space(ctx->tp);
+ ctx->tp = mpi_alloc_limb_space(2 * vsize);
+ if (!ctx->tp) {
+ if (ctx->tspace)
+ mpi_free_limb_space(ctx->tspace);
+ ctx->tspace = NULL;
+ return -ENOMEM;
+ }
+ ctx->tp_size = vsize;
+ }
+
+ do {
+ MPN_MUL_N_RECURSE(ctx->tp, up, vp, vsize, ctx->tspace);
+ cy = mpihelp_add_n(prodp, prodp, ctx->tp, vsize);
+ mpihelp_add_1(prodp + vsize, ctx->tp + vsize, vsize,
+ cy);
+ prodp += vsize;
+ up += vsize;
+ usize -= vsize;
+ } while (usize >= vsize);
+ }
+
+ if (usize) {
+ if (usize < KARATSUBA_THRESHOLD) {
+ mpi_limb_t tmp;
+ if (mpihelp_mul(ctx->tspace, vp, vsize, up, usize, &tmp)
+ < 0)
+ return -ENOMEM;
+ } else {
+ if (!ctx->next) {
+ ctx->next = kzalloc(sizeof *ctx, GFP_KERNEL);
+ if (!ctx->next)
+ return -ENOMEM;
+ }
+ if (mpihelp_mul_karatsuba_case(ctx->tspace,
+ vp, vsize,
+ up, usize,
+ ctx->next) < 0)
+ return -ENOMEM;
+ }
+
+ cy = mpihelp_add_n(prodp, prodp, ctx->tspace, vsize);
+ mpihelp_add_1(prodp + vsize, ctx->tspace + vsize, usize, cy);
+ }
+
+ return 0;
+}
+
+void mpihelp_release_karatsuba_ctx(struct karatsuba_ctx *ctx)
+{
+ struct karatsuba_ctx *ctx2;
+
+ if (ctx->tp)
+ mpi_free_limb_space(ctx->tp);
+ if (ctx->tspace)
+ mpi_free_limb_space(ctx->tspace);
+ for (ctx = ctx->next; ctx; ctx = ctx2) {
+ ctx2 = ctx->next;
+ if (ctx->tp)
+ mpi_free_limb_space(ctx->tp);
+ if (ctx->tspace)
+ mpi_free_limb_space(ctx->tspace);
+ kfree(ctx);
+ }
+}
+
+/* Multiply the natural numbers u (pointed to by UP, with USIZE limbs)
+ * and v (pointed to by VP, with VSIZE limbs), and store the result at
+ * PRODP. USIZE + VSIZE limbs are always stored, even if the most
+ * significant limb of the result is zero. Return the most significant
+ * limb of the result.
+ *
+ * NOTE: The space pointed to by PRODP is overwritten before finished
+ * with U and V, so overlap is an error.
+ *
+ * Argument constraints:
+ * 1. USIZE >= VSIZE.
+ * 2. PRODP != UP and PRODP != VP, i.e. the destination
+ * must be distinct from the multiplier and the multiplicand.
+ */
+
+int
+mpihelp_mul(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize,
+ mpi_ptr_t vp, mpi_size_t vsize, mpi_limb_t *_result)
+{
+ mpi_ptr_t prod_endp = prodp + usize + vsize - 1;
+ mpi_limb_t cy;
+ struct karatsuba_ctx ctx;
+
+ if (vsize < KARATSUBA_THRESHOLD) {
+ mpi_size_t i;
+ mpi_limb_t v_limb;
+
+ if (!vsize) {
+ *_result = 0;
+ return 0;
+ }
+
+ /* Multiply by the first limb in V separately, as the result can be
+ * stored (not added) to PROD. We also avoid a loop for zeroing. */
+ v_limb = vp[0];
+ if (v_limb <= 1) {
+ if (v_limb == 1)
+ MPN_COPY(prodp, up, usize);
+ else
+ MPN_ZERO(prodp, usize);
+ cy = 0;
+ } else
+ cy = mpihelp_mul_1(prodp, up, usize, v_limb);
+
+ prodp[usize] = cy;
+ prodp++;
+
+ /* For each iteration in the outer loop, multiply one limb from
+ * U with one limb from V, and add it to PROD. */
+ for (i = 1; i < vsize; i++) {
+ v_limb = vp[i];
+ if (v_limb <= 1) {
+ cy = 0;
+ if (v_limb == 1)
+ cy = mpihelp_add_n(prodp, prodp, up,
+ usize);
+ } else
+ cy = mpihelp_addmul_1(prodp, up, usize, v_limb);
+
+ prodp[usize] = cy;
+ prodp++;
+ }
+
+ *_result = cy;
+ return 0;
+ }
+
+ memset(&ctx, 0, sizeof ctx);
+ if (mpihelp_mul_karatsuba_case(prodp, up, usize, vp, vsize, &ctx) < 0)
+ return -ENOMEM;
+ mpihelp_release_karatsuba_ctx(&ctx);
+ *_result = *prod_endp;
+ return 0;
+}
diff --git a/lib/crypto/mpi/mpiutil.c b/lib/crypto/mpi/mpiutil.c
new file mode 100644
index 000000000000..7f2db830f404
--- /dev/null
+++ b/lib/crypto/mpi/mpiutil.c
@@ -0,0 +1,152 @@
+/* mpiutil.c - Utility functions for MPI
+ * Copyright (C) 1998, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include <linux/export.h>
+
+#include "mpi-internal.h"
+
+/****************
+ * Note: It was a bad idea to use the number of limbs to allocate
+ * because on an Alpha the limbs are large, but we normally need
+ * integers of n bits - so we should change this to bits (or bytes).
+ *
+ * But mpi_alloc is used in a lot of places :-)
+ */
+MPI mpi_alloc(unsigned nlimbs)
+{
+ MPI a;
+
+ a = kmalloc(sizeof *a, GFP_KERNEL);
+ if (!a)
+ return a;
+
+ if (nlimbs) {
+ a->d = mpi_alloc_limb_space(nlimbs);
+ if (!a->d) {
+ kfree(a);
+ return NULL;
+ }
+ } else {
+ a->d = NULL;
+ }
+
+ a->alloced = nlimbs;
+ a->nlimbs = 0;
+ a->sign = 0;
+ a->flags = 0;
+ a->nbits = 0;
+ return a;
+}
+EXPORT_SYMBOL_GPL(mpi_alloc);
+
+mpi_ptr_t mpi_alloc_limb_space(unsigned nlimbs)
+{
+ size_t len = nlimbs * sizeof(mpi_limb_t);
+
+ if (!len)
+ return NULL;
+
+ return kmalloc(len, GFP_KERNEL);
+}
+
+void mpi_free_limb_space(mpi_ptr_t a)
+{
+ if (!a)
+ return;
+
+ kfree_sensitive(a);
+}
+
+void mpi_assign_limb_space(MPI a, mpi_ptr_t ap, unsigned nlimbs)
+{
+ mpi_free_limb_space(a->d);
+ a->d = ap;
+ a->alloced = nlimbs;
+}
+
+/****************
+ * Resize the limb array of A to NLIMBS. The additional space is
+ * cleared (set to 0).
+ */
+int mpi_resize(MPI a, unsigned nlimbs)
+{
+ void *p;
+
+ if (nlimbs <= a->alloced)
+ return 0; /* no need to do it */
+
+ if (a->d) {
+ p = kcalloc(nlimbs, sizeof(mpi_limb_t), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ memcpy(p, a->d, a->alloced * sizeof(mpi_limb_t));
+ kfree_sensitive(a->d);
+ a->d = p;
+ } else {
+ a->d = kcalloc(nlimbs, sizeof(mpi_limb_t), GFP_KERNEL);
+ if (!a->d)
+ return -ENOMEM;
+ }
+ a->alloced = nlimbs;
+ return 0;
+}
+
+void mpi_free(MPI a)
+{
+ if (!a)
+ return;
+
+ if (a->flags & 4)
+ kfree_sensitive(a->d);
+ else
+ mpi_free_limb_space(a->d);
+
+ if (a->flags & ~7)
+ pr_info("invalid flag value in mpi\n");
+ kfree(a);
+}
+EXPORT_SYMBOL_GPL(mpi_free);
+
+/****************
+ * Note: This copy function should not interpret the MPI
+ * but copy it transparently.
+ */
+MPI mpi_copy(MPI a)
+{
+ int i;
+ MPI b;
+
+ if (a) {
+ b = mpi_alloc(a->nlimbs);
+ if (!b)
+ return NULL;
+ b->nlimbs = a->nlimbs;
+ b->sign = a->sign;
+ b->flags = a->flags;
+ b->flags &= ~(16|32); /* Reset the immutable and constant flags. */
+ for (i = 0; i < b->nlimbs; i++)
+ b->d[i] = a->d[i];
+ } else
+ b = NULL;
+ return b;
+}
+
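The helpers above are normally used together. A minimal sketch of the intended call pattern (kernel context assumed; error handling abbreviated, and the wrapper function name is invented for the example):

/* Sketch only: allocate, grow, duplicate and release an MPI value. */
static int mpi_usage_sketch(void)
{
	MPI a, b;

	a = mpi_alloc(4);		/* room for 4 limbs, nlimbs starts at 0 */
	if (!a)
		return -ENOMEM;

	if (mpi_resize(a, 8)) {		/* grow to 8 limbs, new space zeroed */
		mpi_free(a);
		return -ENOMEM;
	}

	b = mpi_copy(a);		/* transparent copy; immutable/constant flags cleared */
	if (!b) {
		mpi_free(a);
		return -ENOMEM;
	}

	mpi_free(b);
	mpi_free(a);
	return 0;
}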
+MODULE_DESCRIPTION("Multiprecision maths library");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/poly1305-donna32.c b/lib/crypto/poly1305-donna32.c
index 7fb71845cc84..b66131b3f6d4 100644
--- a/lib/crypto/poly1305-donna32.c
+++ b/lib/crypto/poly1305-donna32.c
@@ -6,9 +6,10 @@
* public domain.
*/
-#include <linux/kernel.h>
-#include <asm/unaligned.h>
#include <crypto/internal/poly1305.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/unaligned.h>
void poly1305_core_setkey(struct poly1305_core_key *key,
const u8 raw_key[POLY1305_BLOCK_SIZE])
diff --git a/lib/crypto/poly1305-donna64.c b/lib/crypto/poly1305-donna64.c
index d34cf4053668..8a72a5a84944 100644
--- a/lib/crypto/poly1305-donna64.c
+++ b/lib/crypto/poly1305-donna64.c
@@ -6,11 +6,10 @@
* public domain.
*/
-#include <linux/kernel.h>
-#include <asm/unaligned.h>
#include <crypto/internal/poly1305.h>
-
-typedef __uint128_t u128;
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/unaligned.h>
void poly1305_core_setkey(struct poly1305_core_key *key,
const u8 raw_key[POLY1305_BLOCK_SIZE])
diff --git a/lib/crypto/poly1305.c b/lib/crypto/poly1305.c
index 26d87fc3823e..f313ccc4b4dd 100644
--- a/lib/crypto/poly1305.c
+++ b/lib/crypto/poly1305.c
@@ -8,71 +8,93 @@
*/
#include <crypto/internal/poly1305.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <asm/unaligned.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
-void poly1305_init_generic(struct poly1305_desc_ctx *desc,
- const u8 key[POLY1305_KEY_SIZE])
+#ifdef CONFIG_CRYPTO_LIB_POLY1305_ARCH
+#include "poly1305.h" /* $(SRCARCH)/poly1305.h */
+#else
+#define poly1305_block_init poly1305_block_init_generic
+#define poly1305_blocks poly1305_blocks_generic
+#define poly1305_emit poly1305_emit_generic
+#endif
+
+void poly1305_init(struct poly1305_desc_ctx *desc,
+ const u8 key[POLY1305_KEY_SIZE])
{
- poly1305_core_setkey(&desc->core_r, key);
desc->s[0] = get_unaligned_le32(key + 16);
desc->s[1] = get_unaligned_le32(key + 20);
desc->s[2] = get_unaligned_le32(key + 24);
desc->s[3] = get_unaligned_le32(key + 28);
- poly1305_core_init(&desc->h);
desc->buflen = 0;
- desc->sset = true;
- desc->rset = 2;
+ poly1305_block_init(&desc->state, key);
}
-EXPORT_SYMBOL_GPL(poly1305_init_generic);
+EXPORT_SYMBOL(poly1305_init);
-void poly1305_update_generic(struct poly1305_desc_ctx *desc, const u8 *src,
- unsigned int nbytes)
+void poly1305_update(struct poly1305_desc_ctx *desc,
+ const u8 *src, unsigned int nbytes)
{
- unsigned int bytes;
+ if (desc->buflen + nbytes >= POLY1305_BLOCK_SIZE) {
+ unsigned int bulk_len;
- if (unlikely(desc->buflen)) {
- bytes = min(nbytes, POLY1305_BLOCK_SIZE - desc->buflen);
- memcpy(desc->buf + desc->buflen, src, bytes);
- src += bytes;
- nbytes -= bytes;
- desc->buflen += bytes;
+ if (desc->buflen) {
+ unsigned int l = POLY1305_BLOCK_SIZE - desc->buflen;
- if (desc->buflen == POLY1305_BLOCK_SIZE) {
- poly1305_core_blocks(&desc->h, &desc->core_r, desc->buf,
- 1, 1);
+ memcpy(&desc->buf[desc->buflen], src, l);
+ src += l;
+ nbytes -= l;
+
+ poly1305_blocks(&desc->state, desc->buf,
+ POLY1305_BLOCK_SIZE, 1);
desc->buflen = 0;
}
- }
- if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
- poly1305_core_blocks(&desc->h, &desc->core_r, src,
- nbytes / POLY1305_BLOCK_SIZE, 1);
- src += nbytes - (nbytes % POLY1305_BLOCK_SIZE);
+ bulk_len = round_down(nbytes, POLY1305_BLOCK_SIZE);
nbytes %= POLY1305_BLOCK_SIZE;
- }
- if (unlikely(nbytes)) {
- desc->buflen = nbytes;
- memcpy(desc->buf, src, nbytes);
+ if (bulk_len) {
+ poly1305_blocks(&desc->state, src, bulk_len, 1);
+ src += bulk_len;
+ }
+ }
+ if (nbytes) {
+ memcpy(&desc->buf[desc->buflen], src, nbytes);
+ desc->buflen += nbytes;
}
}
-EXPORT_SYMBOL_GPL(poly1305_update_generic);
+EXPORT_SYMBOL(poly1305_update);
-void poly1305_final_generic(struct poly1305_desc_ctx *desc, u8 *dst)
+void poly1305_final(struct poly1305_desc_ctx *desc, u8 *dst)
{
if (unlikely(desc->buflen)) {
desc->buf[desc->buflen++] = 1;
memset(desc->buf + desc->buflen, 0,
POLY1305_BLOCK_SIZE - desc->buflen);
- poly1305_core_blocks(&desc->h, &desc->core_r, desc->buf, 1, 0);
+ poly1305_blocks(&desc->state, desc->buf, POLY1305_BLOCK_SIZE,
+ 0);
}
- poly1305_core_emit(&desc->h, desc->s, dst);
+ poly1305_emit(&desc->state.h, dst, desc->s);
*desc = (struct poly1305_desc_ctx){};
}
-EXPORT_SYMBOL_GPL(poly1305_final_generic);
+EXPORT_SYMBOL(poly1305_final);
+
+#ifdef poly1305_mod_init_arch
+static int __init poly1305_mod_init(void)
+{
+ poly1305_mod_init_arch();
+ return 0;
+}
+subsys_initcall(poly1305_mod_init);
+
+static void __exit poly1305_mod_exit(void)
+{
+}
+module_exit(poly1305_mod_exit);
+#endif
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_DESCRIPTION("Poly1305 authenticator algorithm, RFC7539");
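For context, the one-shot call pattern for the library interface updated above looks like this. A minimal sketch (kernel context assumed; the public header name and POLY1305_DIGEST_SIZE follow the usual <crypto/poly1305.h> layout, and the wrapper function is invented for the example):

#include <crypto/poly1305.h>

static void poly1305_tag_sketch(const u8 key[POLY1305_KEY_SIZE],
				const u8 *msg, unsigned int len,
				u8 tag[POLY1305_DIGEST_SIZE])
{
	struct poly1305_desc_ctx desc;

	poly1305_init(&desc, key);		/* loads r and s from the 32-byte key */
	poly1305_update(&desc, msg, len);	/* may be called repeatedly */
	poly1305_final(&desc, tag);		/* writes the 16-byte tag and wipes desc */
}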
diff --git a/lib/crypto/polyval.c b/lib/crypto/polyval.c
new file mode 100644
index 000000000000..5796275f574a
--- /dev/null
+++ b/lib/crypto/polyval.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * POLYVAL library functions
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include <crypto/polyval.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+/*
+ * POLYVAL is an almost-XOR-universal hash function. Similar to GHASH, POLYVAL
+ * interprets the message as the coefficients of a polynomial in GF(2^128) and
+ * evaluates that polynomial at a secret point. POLYVAL has a simple
+ * mathematical relationship with GHASH, but it uses a better field convention
+ * which makes it easier and faster to implement.
+ *
+ * POLYVAL is not a cryptographic hash function, and it should be used only by
+ * algorithms that are specifically designed to use it.
+ *
+ * POLYVAL is specified by "AES-GCM-SIV: Nonce Misuse-Resistant Authenticated
+ * Encryption" (https://datatracker.ietf.org/doc/html/rfc8452)
+ *
+ * POLYVAL is also used by HCTR2. See "Length-preserving encryption with HCTR2"
+ * (https://eprint.iacr.org/2021/1441.pdf).
+ *
+ * This file provides a library API for POLYVAL. This API can delegate to
+ * either a generic implementation or an architecture-optimized implementation.
+ *
+ * For the generic implementation, we don't use the traditional table approach
+ * to GF(2^128) multiplication. That approach is not constant-time and requires
+ * a lot of memory. Instead, we use a different approach which emulates
+ * carryless multiplication using standard multiplications by spreading the data
+ * bits apart using "holes". This allows the carries to spill harmlessly. This
+ * approach is borrowed from BoringSSL, which in turn credits BearSSL's
+ * documentation (https://bearssl.org/constanttime.html#ghash-for-gcm) for the
+ * "holes" trick and a presentation by Shay Gueron
+ * (https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf) for the
+ * 256-bit => 128-bit reduction algorithm.
+ */
+
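The "holes" trick described above is easiest to see at a small width. The standalone sketch below (illustrative only, not kernel code) carrylessly multiplies two 16-bit values by spreading the bits one per nibble, exactly as clmul32()/clmul64() below do at full width, and checks the result against a naive shift-and-XOR reference:

#include <stdint.h>
#include <stdio.h>

/* 16 x 16 => 32 bit carryless multiply using the "holes" technique.
 * With one term every 4 bits there are at most 16 / 4 = 4 one bits per
 * column, so the carries of the ordinary multiplications stay inside
 * the holes and are masked off at the end.
 */
static uint32_t clmul16(uint16_t a, uint16_t b)
{
	uint32_t a0 = a & 0x1111, a1 = a & 0x2222, a2 = a & 0x4444, a3 = a & 0x8888;
	uint32_t b0 = b & 0x1111, b1 = b & 0x2222, b2 = b & 0x4444, b3 = b & 0x8888;

	uint32_t c0 = (a0 * b0) ^ (a1 * b3) ^ (a2 * b2) ^ (a3 * b1);
	uint32_t c1 = (a0 * b1) ^ (a1 * b0) ^ (a2 * b3) ^ (a3 * b2);
	uint32_t c2 = (a0 * b2) ^ (a1 * b1) ^ (a2 * b0) ^ (a3 * b3);
	uint32_t c3 = (a0 * b3) ^ (a1 * b2) ^ (a2 * b1) ^ (a3 * b0);

	return (c0 & 0x11111111) ^ (c1 & 0x22222222) ^
	       (c2 & 0x44444444) ^ (c3 & 0x88888888);
}

/* Reference: schoolbook shift-and-XOR carryless multiply. */
static uint32_t clmul16_ref(uint16_t a, uint16_t b)
{
	uint32_t r = 0;

	for (int i = 0; i < 16; i++)
		if (a & (1u << i))
			r ^= (uint32_t)b << i;
	return r;
}

int main(void)
{
	uint16_t a = 0xC35A, b = 0x9D21;

	printf("0x%08x 0x%08x\n", clmul16(a, b), clmul16_ref(a, b));	/* must match */
	return 0;
}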
+#ifdef CONFIG_ARCH_SUPPORTS_INT128
+
+/* Do a 64 x 64 => 128 bit carryless multiplication. */
+static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
+{
+ /*
+ * With 64-bit multiplicands and one term every 4 bits, there would be
+ * up to 64 / 4 = 16 one bits per column when each multiplication is
+ * written out as a series of additions in the schoolbook manner.
+ * Unfortunately, that doesn't work since the value 16 is 1 too large to
+ * fit in 4 bits. Carries would sometimes overflow into the next term.
+ *
+ * Using one term every 5 bits would work. However, that would cost
+ * 5 x 5 = 25 multiplications instead of 4 x 4 = 16.
+ *
+ * Instead, mask off 4 bits from one multiplicand, giving a max of 15
+ * one bits per column. Then handle those 4 bits separately.
+ */
+ u64 a0 = a & 0x1111111111111110;
+ u64 a1 = a & 0x2222222222222220;
+ u64 a2 = a & 0x4444444444444440;
+ u64 a3 = a & 0x8888888888888880;
+
+ u64 b0 = b & 0x1111111111111111;
+ u64 b1 = b & 0x2222222222222222;
+ u64 b2 = b & 0x4444444444444444;
+ u64 b3 = b & 0x8888888888888888;
+
+ /* Multiply the high 60 bits of @a by @b. */
+ u128 c0 = (a0 * (u128)b0) ^ (a1 * (u128)b3) ^
+ (a2 * (u128)b2) ^ (a3 * (u128)b1);
+ u128 c1 = (a0 * (u128)b1) ^ (a1 * (u128)b0) ^
+ (a2 * (u128)b3) ^ (a3 * (u128)b2);
+ u128 c2 = (a0 * (u128)b2) ^ (a1 * (u128)b1) ^
+ (a2 * (u128)b0) ^ (a3 * (u128)b3);
+ u128 c3 = (a0 * (u128)b3) ^ (a1 * (u128)b2) ^
+ (a2 * (u128)b1) ^ (a3 * (u128)b0);
+
+ /* Multiply the low 4 bits of @a by @b. */
+ u64 e0 = -(a & 1) & b;
+ u64 e1 = -((a >> 1) & 1) & b;
+ u64 e2 = -((a >> 2) & 1) & b;
+ u64 e3 = -((a >> 3) & 1) & b;
+ u64 extra_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3);
+ u64 extra_hi = (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61);
+
+ /* Add all the intermediate products together. */
+ *out_lo = (((u64)c0) & 0x1111111111111111) ^
+ (((u64)c1) & 0x2222222222222222) ^
+ (((u64)c2) & 0x4444444444444444) ^
+ (((u64)c3) & 0x8888888888888888) ^ extra_lo;
+ *out_hi = (((u64)(c0 >> 64)) & 0x1111111111111111) ^
+ (((u64)(c1 >> 64)) & 0x2222222222222222) ^
+ (((u64)(c2 >> 64)) & 0x4444444444444444) ^
+ (((u64)(c3 >> 64)) & 0x8888888888888888) ^ extra_hi;
+}
+
+#else /* CONFIG_ARCH_SUPPORTS_INT128 */
+
+/* Do a 32 x 32 => 64 bit carryless multiplication. */
+static u64 clmul32(u32 a, u32 b)
+{
+ /*
+ * With 32-bit multiplicands and one term every 4 bits, there are up to
+ * 32 / 4 = 8 one bits per column when each multiplication is written
+ * out as a series of additions in the schoolbook manner. The value 8
+ * fits in 4 bits, so the carries don't overflow into the next term.
+ */
+ u32 a0 = a & 0x11111111;
+ u32 a1 = a & 0x22222222;
+ u32 a2 = a & 0x44444444;
+ u32 a3 = a & 0x88888888;
+
+ u32 b0 = b & 0x11111111;
+ u32 b1 = b & 0x22222222;
+ u32 b2 = b & 0x44444444;
+ u32 b3 = b & 0x88888888;
+
+ u64 c0 = (a0 * (u64)b0) ^ (a1 * (u64)b3) ^
+ (a2 * (u64)b2) ^ (a3 * (u64)b1);
+ u64 c1 = (a0 * (u64)b1) ^ (a1 * (u64)b0) ^
+ (a2 * (u64)b3) ^ (a3 * (u64)b2);
+ u64 c2 = (a0 * (u64)b2) ^ (a1 * (u64)b1) ^
+ (a2 * (u64)b0) ^ (a3 * (u64)b3);
+ u64 c3 = (a0 * (u64)b3) ^ (a1 * (u64)b2) ^
+ (a2 * (u64)b1) ^ (a3 * (u64)b0);
+
+ /* Add all the intermediate products together. */
+ return (c0 & 0x1111111111111111) ^
+ (c1 & 0x2222222222222222) ^
+ (c2 & 0x4444444444444444) ^
+ (c3 & 0x8888888888888888);
+}
+
+/* Do a 64 x 64 => 128 bit carryless multiplication. */
+static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
+{
+ u32 a_lo = (u32)a;
+ u32 a_hi = a >> 32;
+ u32 b_lo = (u32)b;
+ u32 b_hi = b >> 32;
+
+ /* Karatsuba multiplication */
+ u64 lo = clmul32(a_lo, b_lo);
+ u64 hi = clmul32(a_hi, b_hi);
+ u64 mi = clmul32(a_lo ^ a_hi, b_lo ^ b_hi) ^ lo ^ hi;
+
+ *out_lo = lo ^ (mi << 32);
+ *out_hi = hi ^ (mi >> 32);
+}
+#endif /* !CONFIG_ARCH_SUPPORTS_INT128 */
+
+/* Compute @a = @a * @b * x^-128 in the POLYVAL field. */
+static void __maybe_unused
+polyval_mul_generic(struct polyval_elem *a, const struct polyval_elem *b)
+{
+ u64 c0, c1, c2, c3, mi0, mi1;
+
+ /*
+ * Carryless-multiply @a by @b using Karatsuba multiplication. Store
+ * the 256-bit product in @c0 (low) through @c3 (high).
+ */
+ clmul64(le64_to_cpu(a->lo), le64_to_cpu(b->lo), &c0, &c1);
+ clmul64(le64_to_cpu(a->hi), le64_to_cpu(b->hi), &c2, &c3);
+ clmul64(le64_to_cpu(a->lo ^ a->hi), le64_to_cpu(b->lo ^ b->hi),
+ &mi0, &mi1);
+ mi0 ^= c0 ^ c2;
+ mi1 ^= c1 ^ c3;
+ c1 ^= mi0;
+ c2 ^= mi1;
+
+ /*
+ * Cancel out the low 128 bits of the product by adding multiples of
+ * G(x) = x^128 + x^127 + x^126 + x^121 + 1. Do this in two steps, each
+ * of which cancels out 64 bits. Note that we break G(x) into three
+ * parts: 1, x^64 * (x^63 + x^62 + x^57), and x^128 * 1.
+ */
+
+ /*
+ * First, add G(x) times c0 as follows:
+ *
+ * (c0, c1, c2) = (0,
+ * c1 + (c0 * (x^63 + x^62 + x^57) mod x^64),
+ * c2 + c0 + floor((c0 * (x^63 + x^62 + x^57)) / x^64))
+ */
+ c1 ^= (c0 << 63) ^ (c0 << 62) ^ (c0 << 57);
+ c2 ^= c0 ^ (c0 >> 1) ^ (c0 >> 2) ^ (c0 >> 7);
+
+ /*
+ * Second, add G(x) times the new c1:
+ *
+ * (c1, c2, c3) = (0,
+ * c2 + (c1 * (x^63 + x^62 + x^57) mod x^64),
+ * c3 + c1 + floor((c1 * (x^63 + x^62 + x^57)) / x^64))
+ */
+ c2 ^= (c1 << 63) ^ (c1 << 62) ^ (c1 << 57);
+ c3 ^= c1 ^ (c1 >> 1) ^ (c1 >> 2) ^ (c1 >> 7);
+
+ /* Return (c2, c3). This implicitly multiplies by x^-128. */
+ a->lo = cpu_to_le64(c2);
+ a->hi = cpu_to_le64(c3);
+}
+
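In polynomial notation, the two folding steps in polyval_mul_generic() above implement a Montgomery-style reduction. A sketch of the algebra, using the chunk names from the comments (each $c_i$ is a polynomial of degree < 64, primes mark the values after folding):

\begin{aligned}
C(x) &= c_3 x^{192} + c_2 x^{128} + c_1 x^{64} + c_0,
\qquad G(x) = x^{128} + x^{127} + x^{126} + x^{121} + 1,\\
C(x) + c_0\,G(x) + c_1'\,x^{64} G(x) &= c_3'\,x^{192} + c_2'\,x^{128}
\qquad\text{(the low 128 bits cancel)},\\
a \cdot b \cdot x^{-128} &\equiv c_3'\,x^{64} + c_2' \pmod{G(x)},
\end{aligned}

which is exactly the pair (c2, c3) returned at the end of the function.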
+static void __maybe_unused
+polyval_blocks_generic(struct polyval_elem *acc, const struct polyval_elem *key,
+ const u8 *data, size_t nblocks)
+{
+ do {
+ acc->lo ^= get_unaligned((__le64 *)data);
+ acc->hi ^= get_unaligned((__le64 *)(data + 8));
+ polyval_mul_generic(acc, key);
+ data += POLYVAL_BLOCK_SIZE;
+ } while (--nblocks);
+}
+
+/* Include the arch-optimized implementation of POLYVAL, if one is available. */
+#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+#include "polyval.h" /* $(SRCARCH)/polyval.h */
+void polyval_preparekey(struct polyval_key *key,
+ const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+ polyval_preparekey_arch(key, raw_key);
+}
+EXPORT_SYMBOL_GPL(polyval_preparekey);
+#endif /* Else, polyval_preparekey() is an inline function. */
+
+/*
+ * polyval_mul_generic() and polyval_blocks_generic() take the key as a
+ * polyval_elem rather than a polyval_key, so that arch-optimized
+ * implementations with a different key format can use it as a fallback (if they
+ * have H^1 stored somewhere in their struct). Thus, the following dispatch
+ * code is needed to pass the appropriate key argument.
+ */
+
+static void polyval_mul(struct polyval_ctx *ctx)
+{
+#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+ polyval_mul_arch(&ctx->acc, ctx->key);
+#else
+ polyval_mul_generic(&ctx->acc, &ctx->key->h);
+#endif
+}
+
+static void polyval_blocks(struct polyval_ctx *ctx,
+ const u8 *data, size_t nblocks)
+{
+#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+ polyval_blocks_arch(&ctx->acc, ctx->key, data, nblocks);
+#else
+ polyval_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks);
+#endif
+}
+
+void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len)
+{
+ if (unlikely(ctx->partial)) {
+ size_t n = min(len, POLYVAL_BLOCK_SIZE - ctx->partial);
+
+ len -= n;
+ while (n--)
+ ctx->acc.bytes[ctx->partial++] ^= *data++;
+ if (ctx->partial < POLYVAL_BLOCK_SIZE)
+ return;
+ polyval_mul(ctx);
+ }
+ if (len >= POLYVAL_BLOCK_SIZE) {
+ size_t nblocks = len / POLYVAL_BLOCK_SIZE;
+
+ polyval_blocks(ctx, data, nblocks);
+ data += len & ~(POLYVAL_BLOCK_SIZE - 1);
+ len &= POLYVAL_BLOCK_SIZE - 1;
+ }
+ for (size_t i = 0; i < len; i++)
+ ctx->acc.bytes[i] ^= data[i];
+ ctx->partial = len;
+}
+EXPORT_SYMBOL_GPL(polyval_update);
+
+void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE])
+{
+ if (unlikely(ctx->partial))
+ polyval_mul(ctx);
+ memcpy(out, &ctx->acc, POLYVAL_BLOCK_SIZE);
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(polyval_final);
+
+#ifdef polyval_mod_init_arch
+static int __init polyval_mod_init(void)
+{
+ polyval_mod_init_arch();
+ return 0;
+}
+subsys_initcall(polyval_mod_init);
+
+static void __exit polyval_mod_exit(void)
+{
+}
+module_exit(polyval_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("POLYVAL almost-XOR-universal hash function");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/powerpc/chacha-p10le-8x.S b/lib/crypto/powerpc/chacha-p10le-8x.S
new file mode 100644
index 000000000000..b29562bd5d40
--- /dev/null
+++ b/lib/crypto/powerpc/chacha-p10le-8x.S
@@ -0,0 +1,840 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# Accelerated chacha20 implementation for ppc64le.
+#
+# Copyright 2023- IBM Corp. All rights reserved
+#
+#===================================================================================
+# Written by Danny Tsen <dtsen@us.ibm.com>
+#
+# do rounds, 8 quarter rounds
+# 1. a += b; d ^= a; d <<<= 16;
+# 2. c += d; b ^= c; b <<<= 12;
+# 3. a += b; d ^= a; d <<<= 8;
+# 4. c += d; b ^= c; b <<<= 7
+#
+# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16
+# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12
+# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8
+# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7
+#
+# 4 blocks (a b c d)
+#
+# a0 b0 c0 d0
+# a1 b1 c1 d1
+# ...
+# a4 b4 c4 d4
+# ...
+# a8 b8 c8 d8
+# ...
+# a12 b12 c12 d12
+# a13 ...
+# a14 ...
+# a15 b15 c15 d15
+#
+# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+# Diagnal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
+#
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+#include <linux/linkage.h>
+
+.machine "any"
+.text
+
+.macro SAVE_GPR GPR OFFSET FRAME
+ std \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro SAVE_VRS VRS OFFSET FRAME
+ li 16, \OFFSET
+ stvx \VRS, 16, \FRAME
+.endm
+
+.macro SAVE_VSX VSX OFFSET FRAME
+ li 16, \OFFSET
+ stxvx \VSX, 16, \FRAME
+.endm
+
+.macro RESTORE_GPR GPR OFFSET FRAME
+ ld \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro RESTORE_VRS VRS OFFSET FRAME
+ li 16, \OFFSET
+ lvx \VRS, 16, \FRAME
+.endm
+
+.macro RESTORE_VSX VSX OFFSET FRAME
+ li 16, \OFFSET
+ lxvx \VSX, 16, \FRAME
+.endm
+
+.macro SAVE_REGS
+ mflr 0
+ std 0, 16(1)
+ stdu 1,-752(1)
+
+ SAVE_GPR 14, 112, 1
+ SAVE_GPR 15, 120, 1
+ SAVE_GPR 16, 128, 1
+ SAVE_GPR 17, 136, 1
+ SAVE_GPR 18, 144, 1
+ SAVE_GPR 19, 152, 1
+ SAVE_GPR 20, 160, 1
+ SAVE_GPR 21, 168, 1
+ SAVE_GPR 22, 176, 1
+ SAVE_GPR 23, 184, 1
+ SAVE_GPR 24, 192, 1
+ SAVE_GPR 25, 200, 1
+ SAVE_GPR 26, 208, 1
+ SAVE_GPR 27, 216, 1
+ SAVE_GPR 28, 224, 1
+ SAVE_GPR 29, 232, 1
+ SAVE_GPR 30, 240, 1
+ SAVE_GPR 31, 248, 1
+
+ addi 9, 1, 256
+ SAVE_VRS 20, 0, 9
+ SAVE_VRS 21, 16, 9
+ SAVE_VRS 22, 32, 9
+ SAVE_VRS 23, 48, 9
+ SAVE_VRS 24, 64, 9
+ SAVE_VRS 25, 80, 9
+ SAVE_VRS 26, 96, 9
+ SAVE_VRS 27, 112, 9
+ SAVE_VRS 28, 128, 9
+ SAVE_VRS 29, 144, 9
+ SAVE_VRS 30, 160, 9
+ SAVE_VRS 31, 176, 9
+
+ SAVE_VSX 14, 192, 9
+ SAVE_VSX 15, 208, 9
+ SAVE_VSX 16, 224, 9
+ SAVE_VSX 17, 240, 9
+ SAVE_VSX 18, 256, 9
+ SAVE_VSX 19, 272, 9
+ SAVE_VSX 20, 288, 9
+ SAVE_VSX 21, 304, 9
+ SAVE_VSX 22, 320, 9
+ SAVE_VSX 23, 336, 9
+ SAVE_VSX 24, 352, 9
+ SAVE_VSX 25, 368, 9
+ SAVE_VSX 26, 384, 9
+ SAVE_VSX 27, 400, 9
+ SAVE_VSX 28, 416, 9
+ SAVE_VSX 29, 432, 9
+ SAVE_VSX 30, 448, 9
+ SAVE_VSX 31, 464, 9
+.endm # SAVE_REGS
+
+.macro RESTORE_REGS
+ addi 9, 1, 256
+ RESTORE_VRS 20, 0, 9
+ RESTORE_VRS 21, 16, 9
+ RESTORE_VRS 22, 32, 9
+ RESTORE_VRS 23, 48, 9
+ RESTORE_VRS 24, 64, 9
+ RESTORE_VRS 25, 80, 9
+ RESTORE_VRS 26, 96, 9
+ RESTORE_VRS 27, 112, 9
+ RESTORE_VRS 28, 128, 9
+ RESTORE_VRS 29, 144, 9
+ RESTORE_VRS 30, 160, 9
+ RESTORE_VRS 31, 176, 9
+
+ RESTORE_VSX 14, 192, 9
+ RESTORE_VSX 15, 208, 9
+ RESTORE_VSX 16, 224, 9
+ RESTORE_VSX 17, 240, 9
+ RESTORE_VSX 18, 256, 9
+ RESTORE_VSX 19, 272, 9
+ RESTORE_VSX 20, 288, 9
+ RESTORE_VSX 21, 304, 9
+ RESTORE_VSX 22, 320, 9
+ RESTORE_VSX 23, 336, 9
+ RESTORE_VSX 24, 352, 9
+ RESTORE_VSX 25, 368, 9
+ RESTORE_VSX 26, 384, 9
+ RESTORE_VSX 27, 400, 9
+ RESTORE_VSX 28, 416, 9
+ RESTORE_VSX 29, 432, 9
+ RESTORE_VSX 30, 448, 9
+ RESTORE_VSX 31, 464, 9
+
+ RESTORE_GPR 14, 112, 1
+ RESTORE_GPR 15, 120, 1
+ RESTORE_GPR 16, 128, 1
+ RESTORE_GPR 17, 136, 1
+ RESTORE_GPR 18, 144, 1
+ RESTORE_GPR 19, 152, 1
+ RESTORE_GPR 20, 160, 1
+ RESTORE_GPR 21, 168, 1
+ RESTORE_GPR 22, 176, 1
+ RESTORE_GPR 23, 184, 1
+ RESTORE_GPR 24, 192, 1
+ RESTORE_GPR 25, 200, 1
+ RESTORE_GPR 26, 208, 1
+ RESTORE_GPR 27, 216, 1
+ RESTORE_GPR 28, 224, 1
+ RESTORE_GPR 29, 232, 1
+ RESTORE_GPR 30, 240, 1
+ RESTORE_GPR 31, 248, 1
+
+ addi 1, 1, 752
+ ld 0, 16(1)
+ mtlr 0
+.endm # RESTORE_REGS
+
+.macro QT_loop_8x
+ # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 20, 20
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vadduwm 16, 16, 20
+ vadduwm 17, 17, 21
+ vadduwm 18, 18, 22
+ vadduwm 19, 19, 23
+
+ vpermxor 12, 12, 0, 25
+ vpermxor 13, 13, 1, 25
+ vpermxor 14, 14, 2, 25
+ vpermxor 15, 15, 3, 25
+ vpermxor 28, 28, 16, 25
+ vpermxor 29, 29, 17, 25
+ vpermxor 30, 30, 18, 25
+ vpermxor 31, 31, 19, 25
+ xxlor 32+25, 0, 0
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vadduwm 24, 24, 28
+ vadduwm 25, 25, 29
+ vadduwm 26, 26, 30
+ vadduwm 27, 27, 31
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vxor 20, 20, 24
+ vxor 21, 21, 25
+ vxor 22, 22, 26
+ vxor 23, 23, 27
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 21, 21
+ vrlw 4, 4, 25 #
+ vrlw 5, 5, 25
+ vrlw 6, 6, 25
+ vrlw 7, 7, 25
+ vrlw 20, 20, 25 #
+ vrlw 21, 21, 25
+ vrlw 22, 22, 25
+ vrlw 23, 23, 25
+ xxlor 32+25, 0, 0
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vadduwm 16, 16, 20
+ vadduwm 17, 17, 21
+ vadduwm 18, 18, 22
+ vadduwm 19, 19, 23
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 22, 22
+ vpermxor 12, 12, 0, 25
+ vpermxor 13, 13, 1, 25
+ vpermxor 14, 14, 2, 25
+ vpermxor 15, 15, 3, 25
+ vpermxor 28, 28, 16, 25
+ vpermxor 29, 29, 17, 25
+ vpermxor 30, 30, 18, 25
+ vpermxor 31, 31, 19, 25
+ xxlor 32+25, 0, 0
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vadduwm 24, 24, 28
+ vadduwm 25, 25, 29
+ vadduwm 26, 26, 30
+ vadduwm 27, 27, 31
+ xxlor 0, 32+28, 32+28
+ xxlor 32+28, 23, 23
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vxor 20, 20, 24
+ vxor 21, 21, 25
+ vxor 22, 22, 26
+ vxor 23, 23, 27
+ vrlw 4, 4, 28 #
+ vrlw 5, 5, 28
+ vrlw 6, 6, 28
+ vrlw 7, 7, 28
+ vrlw 20, 20, 28 #
+ vrlw 21, 21, 28
+ vrlw 22, 22, 28
+ vrlw 23, 23, 28
+ xxlor 32+28, 0, 0
+
+ # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 20, 20
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vadduwm 16, 16, 21
+ vadduwm 17, 17, 22
+ vadduwm 18, 18, 23
+ vadduwm 19, 19, 20
+
+ vpermxor 15, 15, 0, 25
+ vpermxor 12, 12, 1, 25
+ vpermxor 13, 13, 2, 25
+ vpermxor 14, 14, 3, 25
+ vpermxor 31, 31, 16, 25
+ vpermxor 28, 28, 17, 25
+ vpermxor 29, 29, 18, 25
+ vpermxor 30, 30, 19, 25
+
+ xxlor 32+25, 0, 0
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vadduwm 26, 26, 31
+ vadduwm 27, 27, 28
+ vadduwm 24, 24, 29
+ vadduwm 25, 25, 30
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vxor 21, 21, 26
+ vxor 22, 22, 27
+ vxor 23, 23, 24
+ vxor 20, 20, 25
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 21, 21
+ vrlw 5, 5, 25
+ vrlw 6, 6, 25
+ vrlw 7, 7, 25
+ vrlw 4, 4, 25
+ vrlw 21, 21, 25
+ vrlw 22, 22, 25
+ vrlw 23, 23, 25
+ vrlw 20, 20, 25
+ xxlor 32+25, 0, 0
+
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vadduwm 16, 16, 21
+ vadduwm 17, 17, 22
+ vadduwm 18, 18, 23
+ vadduwm 19, 19, 20
+
+ xxlor 0, 32+25, 32+25
+ xxlor 32+25, 22, 22
+ vpermxor 15, 15, 0, 25
+ vpermxor 12, 12, 1, 25
+ vpermxor 13, 13, 2, 25
+ vpermxor 14, 14, 3, 25
+ vpermxor 31, 31, 16, 25
+ vpermxor 28, 28, 17, 25
+ vpermxor 29, 29, 18, 25
+ vpermxor 30, 30, 19, 25
+ xxlor 32+25, 0, 0
+
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vadduwm 26, 26, 31
+ vadduwm 27, 27, 28
+ vadduwm 24, 24, 29
+ vadduwm 25, 25, 30
+
+ xxlor 0, 32+28, 32+28
+ xxlor 32+28, 23, 23
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vxor 21, 21, 26
+ vxor 22, 22, 27
+ vxor 23, 23, 24
+ vxor 20, 20, 25
+ vrlw 5, 5, 28
+ vrlw 6, 6, 28
+ vrlw 7, 7, 28
+ vrlw 4, 4, 28
+ vrlw 21, 21, 28
+ vrlw 22, 22, 28
+ vrlw 23, 23, 28
+ vrlw 20, 20, 28
+ xxlor 32+28, 0, 0
+.endm
+
+.macro QT_loop_4x
+ # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vpermxor 12, 12, 0, 20
+ vpermxor 13, 13, 1, 20
+ vpermxor 14, 14, 2, 20
+ vpermxor 15, 15, 3, 20
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vrlw 4, 4, 21
+ vrlw 5, 5, 21
+ vrlw 6, 6, 21
+ vrlw 7, 7, 21
+ vadduwm 0, 0, 4
+ vadduwm 1, 1, 5
+ vadduwm 2, 2, 6
+ vadduwm 3, 3, 7
+ vpermxor 12, 12, 0, 22
+ vpermxor 13, 13, 1, 22
+ vpermxor 14, 14, 2, 22
+ vpermxor 15, 15, 3, 22
+ vadduwm 8, 8, 12
+ vadduwm 9, 9, 13
+ vadduwm 10, 10, 14
+ vadduwm 11, 11, 15
+ vxor 4, 4, 8
+ vxor 5, 5, 9
+ vxor 6, 6, 10
+ vxor 7, 7, 11
+ vrlw 4, 4, 23
+ vrlw 5, 5, 23
+ vrlw 6, 6, 23
+ vrlw 7, 7, 23
+
+ # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vpermxor 15, 15, 0, 20
+ vpermxor 12, 12, 1, 20
+ vpermxor 13, 13, 2, 20
+ vpermxor 14, 14, 3, 20
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vrlw 5, 5, 21
+ vrlw 6, 6, 21
+ vrlw 7, 7, 21
+ vrlw 4, 4, 21
+ vadduwm 0, 0, 5
+ vadduwm 1, 1, 6
+ vadduwm 2, 2, 7
+ vadduwm 3, 3, 4
+ vpermxor 15, 15, 0, 22
+ vpermxor 12, 12, 1, 22
+ vpermxor 13, 13, 2, 22
+ vpermxor 14, 14, 3, 22
+ vadduwm 10, 10, 15
+ vadduwm 11, 11, 12
+ vadduwm 8, 8, 13
+ vadduwm 9, 9, 14
+ vxor 5, 5, 10
+ vxor 6, 6, 11
+ vxor 7, 7, 8
+ vxor 4, 4, 9
+ vrlw 5, 5, 23
+ vrlw 6, 6, 23
+ vrlw 7, 7, 23
+ vrlw 4, 4, 23
+.endm
+
+# Transpose
+.macro TP_4x a0 a1 a2 a3
+ xxmrghw 10, 32+\a0, 32+\a1 # a0, a1, b0, b1
+ xxmrghw 11, 32+\a2, 32+\a3 # a2, a3, b2, b3
+ xxmrglw 12, 32+\a0, 32+\a1 # c0, c1, d0, d1
+ xxmrglw 13, 32+\a2, 32+\a3 # c2, c3, d2, d3
+ xxpermdi 32+\a0, 10, 11, 0 # a0, a1, a2, a3
+ xxpermdi 32+\a1, 10, 11, 3 # b0, b1, b2, b3
+ xxpermdi 32+\a2, 12, 13, 0 # c0, c1, c2, c3
+ xxpermdi 32+\a3, 12, 13, 3 # d0, d1, d2, d3
+.endm
+
+# key stream = working state + state
+.macro Add_state S
+ vadduwm \S+0, \S+0, 16-\S
+ vadduwm \S+4, \S+4, 17-\S
+ vadduwm \S+8, \S+8, 18-\S
+ vadduwm \S+12, \S+12, 19-\S
+
+ vadduwm \S+1, \S+1, 16-\S
+ vadduwm \S+5, \S+5, 17-\S
+ vadduwm \S+9, \S+9, 18-\S
+ vadduwm \S+13, \S+13, 19-\S
+
+ vadduwm \S+2, \S+2, 16-\S
+ vadduwm \S+6, \S+6, 17-\S
+ vadduwm \S+10, \S+10, 18-\S
+ vadduwm \S+14, \S+14, 19-\S
+
+ vadduwm \S+3, \S+3, 16-\S
+ vadduwm \S+7, \S+7, 17-\S
+ vadduwm \S+11, \S+11, 18-\S
+ vadduwm \S+15, \S+15, 19-\S
+.endm
+
+#
+# write 256 bytes
+#
+.macro Write_256 S
+ add 9, 14, 5
+ add 16, 14, 4
+ lxvw4x 0, 0, 9
+ lxvw4x 1, 17, 9
+ lxvw4x 2, 18, 9
+ lxvw4x 3, 19, 9
+ lxvw4x 4, 20, 9
+ lxvw4x 5, 21, 9
+ lxvw4x 6, 22, 9
+ lxvw4x 7, 23, 9
+ lxvw4x 8, 24, 9
+ lxvw4x 9, 25, 9
+ lxvw4x 10, 26, 9
+ lxvw4x 11, 27, 9
+ lxvw4x 12, 28, 9
+ lxvw4x 13, 29, 9
+ lxvw4x 14, 30, 9
+ lxvw4x 15, 31, 9
+
+ xxlxor \S+32, \S+32, 0
+ xxlxor \S+36, \S+36, 1
+ xxlxor \S+40, \S+40, 2
+ xxlxor \S+44, \S+44, 3
+ xxlxor \S+33, \S+33, 4
+ xxlxor \S+37, \S+37, 5
+ xxlxor \S+41, \S+41, 6
+ xxlxor \S+45, \S+45, 7
+ xxlxor \S+34, \S+34, 8
+ xxlxor \S+38, \S+38, 9
+ xxlxor \S+42, \S+42, 10
+ xxlxor \S+46, \S+46, 11
+ xxlxor \S+35, \S+35, 12
+ xxlxor \S+39, \S+39, 13
+ xxlxor \S+43, \S+43, 14
+ xxlxor \S+47, \S+47, 15
+
+ stxvw4x \S+32, 0, 16
+ stxvw4x \S+36, 17, 16
+ stxvw4x \S+40, 18, 16
+ stxvw4x \S+44, 19, 16
+
+ stxvw4x \S+33, 20, 16
+ stxvw4x \S+37, 21, 16
+ stxvw4x \S+41, 22, 16
+ stxvw4x \S+45, 23, 16
+
+ stxvw4x \S+34, 24, 16
+ stxvw4x \S+38, 25, 16
+ stxvw4x \S+42, 26, 16
+ stxvw4x \S+46, 27, 16
+
+ stxvw4x \S+35, 28, 16
+ stxvw4x \S+39, 29, 16
+ stxvw4x \S+43, 30, 16
+ stxvw4x \S+47, 31, 16
+
+.endm
+
+#
+# void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, const u8 *src,
+# unsigned int len, int nrounds);
+#
+SYM_FUNC_START(chacha_p10le_8x)
+.align 5
+ cmpdi 6, 0
+ ble Out_no_chacha
+
+ SAVE_REGS
+
+ # r17 - r31 mainly for Write_256 macro.
+ li 17, 16
+ li 18, 32
+ li 19, 48
+ li 20, 64
+ li 21, 80
+ li 22, 96
+ li 23, 112
+ li 24, 128
+ li 25, 144
+ li 26, 160
+ li 27, 176
+ li 28, 192
+ li 29, 208
+ li 30, 224
+ li 31, 240
+
+ mr 15, 6 # len
+ li 14, 0 # offset to inp and outp
+
+ lxvw4x 48, 0, 3 # vr16, constants
+ lxvw4x 49, 17, 3 # vr17, key 1
+ lxvw4x 50, 18, 3 # vr18, key 2
+ lxvw4x 51, 19, 3 # vr19, counter, nonce
+
+ # create (0, 1, 2, 3) counters
+ vspltisw 0, 0
+ vspltisw 1, 1
+ vspltisw 2, 2
+ vspltisw 3, 3
+ vmrghw 4, 0, 1
+ vmrglw 5, 2, 3
+ vsldoi 30, 4, 5, 8 # vr30 counter, 4 (0, 1, 2, 3)
+
+ vspltisw 21, 12
+ vspltisw 23, 7
+
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxvw4x 32+20, 0, 11
+ lxvw4x 32+22, 17, 11
+
+ sradi 8, 7, 1
+
+ mtctr 8
+
+ # save constants to vsx
+ xxlor 16, 48, 48
+ xxlor 17, 49, 49
+ xxlor 18, 50, 50
+ xxlor 19, 51, 51
+
+ vspltisw 25, 4
+ vspltisw 26, 8
+
+ xxlor 25, 32+26, 32+26
+ xxlor 24, 32+25, 32+25
+
+ vadduwm 31, 30, 25 # counter = (0, 1, 2, 3) + (4, 4, 4, 4)
+ xxlor 30, 32+30, 32+30
+ xxlor 31, 32+31, 32+31
+
+ xxlor 20, 32+20, 32+20
+ xxlor 21, 32+21, 32+21
+ xxlor 22, 32+22, 32+22
+ xxlor 23, 32+23, 32+23
+
+ cmpdi 6, 512
+ blt Loop_last
+
+Loop_8x:
+ xxspltw 32+0, 16, 0
+ xxspltw 32+1, 16, 1
+ xxspltw 32+2, 16, 2
+ xxspltw 32+3, 16, 3
+
+ xxspltw 32+4, 17, 0
+ xxspltw 32+5, 17, 1
+ xxspltw 32+6, 17, 2
+ xxspltw 32+7, 17, 3
+ xxspltw 32+8, 18, 0
+ xxspltw 32+9, 18, 1
+ xxspltw 32+10, 18, 2
+ xxspltw 32+11, 18, 3
+ xxspltw 32+12, 19, 0
+ xxspltw 32+13, 19, 1
+ xxspltw 32+14, 19, 2
+ xxspltw 32+15, 19, 3
+ vadduwm 12, 12, 30 # increase counter
+
+ xxspltw 32+16, 16, 0
+ xxspltw 32+17, 16, 1
+ xxspltw 32+18, 16, 2
+ xxspltw 32+19, 16, 3
+
+ xxspltw 32+20, 17, 0
+ xxspltw 32+21, 17, 1
+ xxspltw 32+22, 17, 2
+ xxspltw 32+23, 17, 3
+ xxspltw 32+24, 18, 0
+ xxspltw 32+25, 18, 1
+ xxspltw 32+26, 18, 2
+ xxspltw 32+27, 18, 3
+ xxspltw 32+28, 19, 0
+ xxspltw 32+29, 19, 1
+ vadduwm 28, 28, 31 # increase counter
+ xxspltw 32+30, 19, 2
+ xxspltw 32+31, 19, 3
+
+.align 5
+quarter_loop_8x:
+ QT_loop_8x
+
+ bdnz quarter_loop_8x
+
+ xxlor 0, 32+30, 32+30
+ xxlor 32+30, 30, 30
+ vadduwm 12, 12, 30
+ xxlor 32+30, 0, 0
+ TP_4x 0, 1, 2, 3
+ TP_4x 4, 5, 6, 7
+ TP_4x 8, 9, 10, 11
+ TP_4x 12, 13, 14, 15
+
+ xxlor 0, 48, 48
+ xxlor 1, 49, 49
+ xxlor 2, 50, 50
+ xxlor 3, 51, 51
+ xxlor 48, 16, 16
+ xxlor 49, 17, 17
+ xxlor 50, 18, 18
+ xxlor 51, 19, 19
+ Add_state 0
+ xxlor 48, 0, 0
+ xxlor 49, 1, 1
+ xxlor 50, 2, 2
+ xxlor 51, 3, 3
+ Write_256 0
+ addi 14, 14, 256 # offset +=256
+ addi 15, 15, -256 # len -=256
+
+ xxlor 5, 32+31, 32+31
+ xxlor 32+31, 31, 31
+ vadduwm 28, 28, 31
+ xxlor 32+31, 5, 5
+ TP_4x 16+0, 16+1, 16+2, 16+3
+ TP_4x 16+4, 16+5, 16+6, 16+7
+ TP_4x 16+8, 16+9, 16+10, 16+11
+ TP_4x 16+12, 16+13, 16+14, 16+15
+
+ xxlor 32, 16, 16
+ xxlor 33, 17, 17
+ xxlor 34, 18, 18
+ xxlor 35, 19, 19
+ Add_state 16
+ Write_256 16
+ addi 14, 14, 256 # offset +=256
+ addi 15, 15, -256 # len -= 256
+
+ xxlor 32+24, 24, 24
+ xxlor 32+25, 25, 25
+ xxlor 32+30, 30, 30
+ vadduwm 30, 30, 25
+ vadduwm 31, 30, 24
+ xxlor 30, 32+30, 32+30
+ xxlor 31, 32+31, 32+31
+
+ cmpdi 15, 0
+ beq Out_loop
+
+ cmpdi 15, 512
+ blt Loop_last
+
+ mtctr 8
+ b Loop_8x
+
+Loop_last:
+ lxvw4x 48, 0, 3 # vr16, constants
+ lxvw4x 49, 17, 3 # vr17, key 1
+ lxvw4x 50, 18, 3 # vr18, key 2
+ lxvw4x 51, 19, 3 # vr19, counter, nonce
+
+ vspltisw 21, 12
+ vspltisw 23, 7
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxvw4x 32+20, 0, 11
+ lxvw4x 32+22, 17, 11
+
+ sradi 8, 7, 1
+ mtctr 8
+
+Loop_4x:
+ vspltw 0, 16, 0
+ vspltw 1, 16, 1
+ vspltw 2, 16, 2
+ vspltw 3, 16, 3
+
+ vspltw 4, 17, 0
+ vspltw 5, 17, 1
+ vspltw 6, 17, 2
+ vspltw 7, 17, 3
+ vspltw 8, 18, 0
+ vspltw 9, 18, 1
+ vspltw 10, 18, 2
+ vspltw 11, 18, 3
+ vspltw 12, 19, 0
+ vadduwm 12, 12, 30 # increase counter
+ vspltw 13, 19, 1
+ vspltw 14, 19, 2
+ vspltw 15, 19, 3
+
+.align 5
+quarter_loop:
+ QT_loop_4x
+
+ bdnz quarter_loop
+
+ vadduwm 12, 12, 30
+ TP_4x 0, 1, 2, 3
+ TP_4x 4, 5, 6, 7
+ TP_4x 8, 9, 10, 11
+ TP_4x 12, 13, 14, 15
+
+ Add_state 0
+ Write_256 0
+ addi 14, 14, 256 # offset += 256
+ addi 15, 15, -256 # len -= 256
+
+ # Update state counter
+ vspltisw 25, 4
+ vadduwm 30, 30, 25
+
+ cmpdi 15, 0
+ beq Out_loop
+ cmpdi 15, 256
+ blt Out_loop
+
+ mtctr 8
+ b Loop_4x
+
+Out_loop:
+ RESTORE_REGS
+ blr
+
+Out_no_chacha:
+ li 3, 0
+ blr
+SYM_FUNC_END(chacha_p10le_8x)
+
+SYM_DATA_START_LOCAL(PERMX)
+.align 5
+permx:
+.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
+.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
+SYM_DATA_END(PERMX)
diff --git a/lib/crypto/powerpc/chacha.h b/lib/crypto/powerpc/chacha.h
new file mode 100644
index 000000000000..1df6e1ce31c4
--- /dev/null
+++ b/lib/crypto/powerpc/chacha.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ChaCha stream cipher (P10 accelerated)
+ *
+ * Copyright 2023- IBM Corp. All rights reserved.
+ */
+
+#include <crypto/internal/simd.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
+#include <linux/sizes.h>
+#include <asm/simd.h>
+#include <asm/switch_to.h>
+
+asmlinkage void chacha_p10le_8x(const struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int len, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
+
+static void vsx_begin(void)
+{
+ preempt_disable();
+ enable_kernel_vsx();
+}
+
+static void vsx_end(void)
+{
+ disable_kernel_vsx();
+ preempt_enable();
+}
+
+static void chacha_p10_do_8x(struct chacha_state *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ unsigned int l = bytes & ~0x0FF;
+
+ if (l > 0) {
+ chacha_p10le_8x(state, dst, src, l, nrounds);
+ bytes -= l;
+ src += l;
+ dst += l;
+ state->x[12] += l / CHACHA_BLOCK_SIZE;
+ }
+
+ if (bytes > 0)
+ chacha_crypt_generic(state, dst, src, bytes, nrounds);
+}
+
+#define hchacha_block_arch hchacha_block_generic /* not implemented yet */
+
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int bytes, int nrounds)
+{
+ if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE ||
+ !crypto_simd_usable())
+ return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ vsx_begin();
+ chacha_p10_do_8x(state, dst, src, todo, nrounds);
+ vsx_end();
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
+}
+
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
+{
+ if (cpu_has_feature(CPU_FTR_ARCH_31))
+ static_branch_enable(&have_p10);
+}
diff --git a/lib/crypto/powerpc/curve25519-ppc64le_asm.S b/lib/crypto/powerpc/curve25519-ppc64le_asm.S
new file mode 100644
index 000000000000..06c1febe24b9
--- /dev/null
+++ b/lib/crypto/powerpc/curve25519-ppc64le_asm.S
@@ -0,0 +1,671 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# This code is taken from CRYPTOGAMs[1] and is included here using the option
+# in the license to distribute the code under the GPL. Therefore this program
+# is free software; you can redistribute it and/or modify it under the terms of
+# the GNU General Public License version 2 as published by the Free Software
+# Foundation.
+#
+# [1] https://github.com/dot-asm/cryptogams/
+
+# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain copyright notices,
+# this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# * Neither the name of the CRYPTOGAMS nor the names of its
+# copyright holder and contributors may be used to endorse or
+# promote products derived from this software without specific
+# prior written permission.
+#
+# ALTERNATIVELY, provided that this notice is retained in full, this
+# product may be distributed under the terms of the GNU General Public
+# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+# those given above.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+#
+# ====================================================================
+# Written and Modified by Danny Tsen <dtsen@us.ibm.com>
+# - Added x25519_fe51_sqr_times, x25519_fe51_frombytes, x25519_fe51_tobytes
+# and x25519_cswap
+#
+# Copyright 2024- IBM Corp.
+#
+# X25519 lower-level primitives for PPC64.
+#
+
+#include <linux/linkage.h>
+
+.text
+
+.align 5
+SYM_FUNC_START(x25519_fe51_mul)
+
+ stdu 1,-144(1)
+ std 21,56(1)
+ std 22,64(1)
+ std 23,72(1)
+ std 24,80(1)
+ std 25,88(1)
+ std 26,96(1)
+ std 27,104(1)
+ std 28,112(1)
+ std 29,120(1)
+ std 30,128(1)
+ std 31,136(1)
+
+ ld 6,0(5)
+ ld 7,0(4)
+ ld 8,8(4)
+ ld 9,16(4)
+ ld 10,24(4)
+ ld 11,32(4)
+
+ mulld 22,7,6
+ mulhdu 23,7,6
+
+ mulld 24,8,6
+ mulhdu 25,8,6
+
+ mulld 30,11,6
+ mulhdu 31,11,6
+ ld 4,8(5)
+ mulli 11,11,19
+
+ mulld 26,9,6
+ mulhdu 27,9,6
+
+ mulld 28,10,6
+ mulhdu 29,10,6
+ mulld 12,11,4
+ mulhdu 21,11,4
+ addc 22,22,12
+ adde 23,23,21
+
+ mulld 12,7,4
+ mulhdu 21,7,4
+ addc 24,24,12
+ adde 25,25,21
+
+ mulld 12,10,4
+ mulhdu 21,10,4
+ ld 6,16(5)
+ mulli 10,10,19
+ addc 30,30,12
+ adde 31,31,21
+
+ mulld 12,8,4
+ mulhdu 21,8,4
+ addc 26,26,12
+ adde 27,27,21
+
+ mulld 12,9,4
+ mulhdu 21,9,4
+ addc 28,28,12
+ adde 29,29,21
+ mulld 12,10,6
+ mulhdu 21,10,6
+ addc 22,22,12
+ adde 23,23,21
+
+ mulld 12,11,6
+ mulhdu 21,11,6
+ addc 24,24,12
+ adde 25,25,21
+
+ mulld 12,9,6
+ mulhdu 21,9,6
+ ld 4,24(5)
+ mulli 9,9,19
+ addc 30,30,12
+ adde 31,31,21
+
+ mulld 12,7,6
+ mulhdu 21,7,6
+ addc 26,26,12
+ adde 27,27,21
+
+ mulld 12,8,6
+ mulhdu 21,8,6
+ addc 28,28,12
+ adde 29,29,21
+ mulld 12,9,4
+ mulhdu 21,9,4
+ addc 22,22,12
+ adde 23,23,21
+
+ mulld 12,10,4
+ mulhdu 21,10,4
+ addc 24,24,12
+ adde 25,25,21
+
+ mulld 12,8,4
+ mulhdu 21,8,4
+ ld 6,32(5)
+ mulli 8,8,19
+ addc 30,30,12
+ adde 31,31,21
+
+ mulld 12,11,4
+ mulhdu 21,11,4
+ addc 26,26,12
+ adde 27,27,21
+
+ mulld 12,7,4
+ mulhdu 21,7,4
+ addc 28,28,12
+ adde 29,29,21
+ mulld 12,8,6
+ mulhdu 21,8,6
+ addc 22,22,12
+ adde 23,23,21
+
+ mulld 12,9,6
+ mulhdu 21,9,6
+ addc 24,24,12
+ adde 25,25,21
+
+ mulld 12,10,6
+ mulhdu 21,10,6
+ addc 26,26,12
+ adde 27,27,21
+
+ mulld 12,11,6
+ mulhdu 21,11,6
+ addc 28,28,12
+ adde 29,29,21
+
+ mulld 12,7,6
+ mulhdu 21,7,6
+ addc 30,30,12
+ adde 31,31,21
+
+.Lfe51_reduce:
+ li 0,-1
+ srdi 0,0,13
+
+ srdi 12,26,51
+ and 9,26,0
+ insrdi 12,27,51,0
+ srdi 21,22,51
+ and 7,22,0
+ insrdi 21,23,51,0
+ addc 28,28,12
+ addze 29,29
+ addc 24,24,21
+ addze 25,25
+
+ srdi 12,28,51
+ and 10,28,0
+ insrdi 12,29,51,0
+ srdi 21,24,51
+ and 8,24,0
+ insrdi 21,25,51,0
+ addc 30,30,12
+ addze 31,31
+ add 9,9,21
+
+ srdi 12,30,51
+ and 11,30,0
+ insrdi 12,31,51,0
+ mulli 12,12,19
+
+ add 7,7,12
+
+ srdi 21,9,51
+ and 9,9,0
+ add 10,10,21
+
+ srdi 12,7,51
+ and 7,7,0
+ add 8,8,12
+
+ std 9,16(3)
+ std 10,24(3)
+ std 11,32(3)
+ std 7,0(3)
+ std 8,8(3)
+
+ ld 21,56(1)
+ ld 22,64(1)
+ ld 23,72(1)
+ ld 24,80(1)
+ ld 25,88(1)
+ ld 26,96(1)
+ ld 27,104(1)
+ ld 28,112(1)
+ ld 29,120(1)
+ ld 30,128(1)
+ ld 31,136(1)
+ addi 1,1,144
+ blr
+SYM_FUNC_END(x25519_fe51_mul)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_sqr)
+
+ stdu 1,-144(1)
+ std 21,56(1)
+ std 22,64(1)
+ std 23,72(1)
+ std 24,80(1)
+ std 25,88(1)
+ std 26,96(1)
+ std 27,104(1)
+ std 28,112(1)
+ std 29,120(1)
+ std 30,128(1)
+ std 31,136(1)
+
+ ld 7,0(4)
+ ld 8,8(4)
+ ld 9,16(4)
+ ld 10,24(4)
+ ld 11,32(4)
+
+ add 6,7,7
+ mulli 21,11,19
+
+ mulld 22,7,7
+ mulhdu 23,7,7
+ mulld 24,8,6
+ mulhdu 25,8,6
+ mulld 26,9,6
+ mulhdu 27,9,6
+ mulld 28,10,6
+ mulhdu 29,10,6
+ mulld 30,11,6
+ mulhdu 31,11,6
+ add 6,8,8
+ mulld 12,11,21
+ mulhdu 11,11,21
+ addc 28,28,12
+ adde 29,29,11
+
+ mulli 5,10,19
+
+ mulld 12,8,8
+ mulhdu 11,8,8
+ addc 26,26,12
+ adde 27,27,11
+ mulld 12,9,6
+ mulhdu 11,9,6
+ addc 28,28,12
+ adde 29,29,11
+ mulld 12,10,6
+ mulhdu 11,10,6
+ addc 30,30,12
+ adde 31,31,11
+ mulld 12,21,6
+ mulhdu 11,21,6
+ add 6,10,10
+ addc 22,22,12
+ adde 23,23,11
+ mulld 12,10,5
+ mulhdu 10,10,5
+ addc 24,24,12
+ adde 25,25,10
+ mulld 12,6,21
+ mulhdu 10,6,21
+ add 6,9,9
+ addc 26,26,12
+ adde 27,27,10
+
+ mulld 12,9,9
+ mulhdu 10,9,9
+ addc 30,30,12
+ adde 31,31,10
+ mulld 12,5,6
+ mulhdu 10,5,6
+ addc 22,22,12
+ adde 23,23,10
+ mulld 12,21,6
+ mulhdu 10,21,6
+ addc 24,24,12
+ adde 25,25,10
+
+ b .Lfe51_reduce
+SYM_FUNC_END(x25519_fe51_sqr)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_mul121666)
+
+ stdu 1,-144(1)
+ std 21,56(1)
+ std 22,64(1)
+ std 23,72(1)
+ std 24,80(1)
+ std 25,88(1)
+ std 26,96(1)
+ std 27,104(1)
+ std 28,112(1)
+ std 29,120(1)
+ std 30,128(1)
+ std 31,136(1)
+
+ lis 6,1
+ ori 6,6,56130
+ ld 7,0(4)
+ ld 8,8(4)
+ ld 9,16(4)
+ ld 10,24(4)
+ ld 11,32(4)
+
+ mulld 22,7,6
+ mulhdu 23,7,6
+ mulld 24,8,6
+ mulhdu 25,8,6
+ mulld 26,9,6
+ mulhdu 27,9,6
+ mulld 28,10,6
+ mulhdu 29,10,6
+ mulld 30,11,6
+ mulhdu 31,11,6
+
+ b .Lfe51_reduce
+SYM_FUNC_END(x25519_fe51_mul121666)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_sqr_times)
+
+ stdu 1,-144(1)
+ std 21,56(1)
+ std 22,64(1)
+ std 23,72(1)
+ std 24,80(1)
+ std 25,88(1)
+ std 26,96(1)
+ std 27,104(1)
+ std 28,112(1)
+ std 29,120(1)
+ std 30,128(1)
+ std 31,136(1)
+
+ ld 7,0(4)
+ ld 8,8(4)
+ ld 9,16(4)
+ ld 10,24(4)
+ ld 11,32(4)
+
+ mtctr 5
+
+.Lsqr_times_loop:
+ add 6,7,7
+ mulli 21,11,19
+
+ mulld 22,7,7
+ mulhdu 23,7,7
+ mulld 24,8,6
+ mulhdu 25,8,6
+ mulld 26,9,6
+ mulhdu 27,9,6
+ mulld 28,10,6
+ mulhdu 29,10,6
+ mulld 30,11,6
+ mulhdu 31,11,6
+ add 6,8,8
+ mulld 12,11,21
+ mulhdu 11,11,21
+ addc 28,28,12
+ adde 29,29,11
+
+ mulli 5,10,19
+
+ mulld 12,8,8
+ mulhdu 11,8,8
+ addc 26,26,12
+ adde 27,27,11
+ mulld 12,9,6
+ mulhdu 11,9,6
+ addc 28,28,12
+ adde 29,29,11
+ mulld 12,10,6
+ mulhdu 11,10,6
+ addc 30,30,12
+ adde 31,31,11
+ mulld 12,21,6
+ mulhdu 11,21,6
+ add 6,10,10
+ addc 22,22,12
+ adde 23,23,11
+ mulld 12,10,5
+ mulhdu 10,10,5
+ addc 24,24,12
+ adde 25,25,10
+ mulld 12,6,21
+ mulhdu 10,6,21
+ add 6,9,9
+ addc 26,26,12
+ adde 27,27,10
+
+ mulld 12,9,9
+ mulhdu 10,9,9
+ addc 30,30,12
+ adde 31,31,10
+ mulld 12,5,6
+ mulhdu 10,5,6
+ addc 22,22,12
+ adde 23,23,10
+ mulld 12,21,6
+ mulhdu 10,21,6
+ addc 24,24,12
+ adde 25,25,10
+
+ # fe51_reduce
+ li 0,-1
+ srdi 0,0,13
+
+ srdi 12,26,51
+ and 9,26,0
+ insrdi 12,27,51,0
+ srdi 21,22,51
+ and 7,22,0
+ insrdi 21,23,51,0
+ addc 28,28,12
+ addze 29,29
+ addc 24,24,21
+ addze 25,25
+
+ srdi 12,28,51
+ and 10,28,0
+ insrdi 12,29,51,0
+ srdi 21,24,51
+ and 8,24,0
+ insrdi 21,25,51,0
+ addc 30,30,12
+ addze 31,31
+ add 9,9,21
+
+ srdi 12,30,51
+ and 11,30,0
+ insrdi 12,31,51,0
+ mulli 12,12,19
+
+ add 7,7,12
+
+ srdi 21,9,51
+ and 9,9,0
+ add 10,10,21
+
+ srdi 12,7,51
+ and 7,7,0
+ add 8,8,12
+
+ bdnz .Lsqr_times_loop
+
+ std 9,16(3)
+ std 10,24(3)
+ std 11,32(3)
+ std 7,0(3)
+ std 8,8(3)
+
+ ld 21,56(1)
+ ld 22,64(1)
+ ld 23,72(1)
+ ld 24,80(1)
+ ld 25,88(1)
+ ld 26,96(1)
+ ld 27,104(1)
+ ld 28,112(1)
+ ld 29,120(1)
+ ld 30,128(1)
+ ld 31,136(1)
+ addi 1,1,144
+ blr
+SYM_FUNC_END(x25519_fe51_sqr_times)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_frombytes)
+
+ li 12, -1
+ srdi 12, 12, 13 # 0x7ffffffffffff
+
+ ld 5, 0(4)
+ ld 6, 8(4)
+ ld 7, 16(4)
+ ld 8, 24(4)
+
+ srdi 10, 5, 51
+ and 5, 5, 12 # h0
+
+ sldi 11, 6, 13
+ or 11, 10, 11 # h1t
+ srdi 10, 6, 38
+ and 6, 11, 12 # h1
+
+ sldi 11, 7, 26
+ or 10, 10, 11 # h2t
+
+ srdi 11, 7, 25
+ and 7, 10, 12 # h2
+ sldi 10, 8, 39
+ or 11, 11, 10 # h3t
+
+ srdi 9, 8, 12
+ and 8, 11, 12 # h3
+ and 9, 9, 12 # h4
+
+ std 5, 0(3)
+ std 6, 8(3)
+ std 7, 16(3)
+ std 8, 24(3)
+ std 9, 32(3)
+
+ blr
+SYM_FUNC_END(x25519_fe51_frombytes)
+
+.align 5
+SYM_FUNC_START(x25519_fe51_tobytes)
+
+ ld 5, 0(4)
+ ld 6, 8(4)
+ ld 7, 16(4)
+ ld 8, 24(4)
+ ld 9, 32(4)
+
+ li 12, -1
+ srdi 12, 12, 13 # 0x7ffffffffffff
+
+	# Full reduction
+ addi 10, 5, 19
+ srdi 10, 10, 51
+ add 10, 10, 6
+ srdi 10, 10, 51
+ add 10, 10, 7
+ srdi 10, 10, 51
+ add 10, 10, 8
+ srdi 10, 10, 51
+ add 10, 10, 9
+ srdi 10, 10, 51
+
+ mulli 10, 10, 19
+ add 5, 5, 10
+ srdi 11, 5, 51
+ add 6, 6, 11
+ srdi 11, 6, 51
+ add 7, 7, 11
+ srdi 11, 7, 51
+ add 8, 8, 11
+ srdi 11, 8, 51
+ add 9, 9, 11
+
+ and 5, 5, 12
+ and 6, 6, 12
+ and 7, 7, 12
+ and 8, 8, 12
+ and 9, 9, 12
+
+ sldi 10, 6, 51
+ or 5, 5, 10 # s0
+
+ srdi 11, 6, 13
+ sldi 10, 7, 38
+ or 6, 11, 10 # s1
+
+ srdi 11, 7, 26
+ sldi 10, 8, 25
+ or 7, 11, 10 # s2
+
+ srdi 11, 8, 39
+ sldi 10, 9, 12
+	or	8, 11, 10	# s3
+
+ std 5, 0(3)
+ std 6, 8(3)
+ std 7, 16(3)
+ std 8, 24(3)
+
+ blr
+SYM_FUNC_END(x25519_fe51_tobytes)
+
+.align 5
+SYM_FUNC_START(x25519_cswap)
+
+ li 7, 5
+ neg 6, 5
+ mtctr 7
+
+.Lswap_loop:
+ ld 8, 0(3)
+ ld 9, 0(4)
+ xor 10, 8, 9
+ and 10, 10, 6
+ xor 11, 8, 10
+ xor 12, 9, 10
+ std 11, 0(3)
+ addi 3, 3, 8
+ std 12, 0(4)
+ addi 4, 4, 8
+ bdnz .Lswap_loop
+
+ blr
+SYM_FUNC_END(x25519_cswap)
diff --git a/lib/crypto/powerpc/curve25519.h b/lib/crypto/powerpc/curve25519.h
new file mode 100644
index 000000000000..dee6234c48e9
--- /dev/null
+++ b/lib/crypto/powerpc/curve25519.h
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2024- IBM Corp.
+ *
+ * X25519 scalar multiplication with 51-bit limbs for PPC64le.
+ * Based on RFC 7748 and the AArch64-optimized implementation of X25519
+ * (Algorithm 1: scalar multiplication of a variable point).
+ */
+
+#include <linux/types.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+#include <linux/cpufeature.h>
+#include <linux/processor.h>
+
+typedef uint64_t fe51[5];
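+/*
+ * A fe51 value f represents the field element
+ * f[0] + f[1]*2^51 + f[2]*2^102 + f[3]*2^153 + f[4]*2^204 (mod 2^255 - 19).
+ */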
+
+asmlinkage void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
+asmlinkage void x25519_fe51_sqr(fe51 h, const fe51 f);
+asmlinkage void x25519_fe51_mul121666(fe51 h, fe51 f);
+asmlinkage void x25519_fe51_sqr_times(fe51 h, const fe51 f, int n);
+asmlinkage void x25519_fe51_frombytes(fe51 h, const uint8_t *s);
+asmlinkage void x25519_fe51_tobytes(uint8_t *s, const fe51 h);
+asmlinkage void x25519_cswap(fe51 p, fe51 q, unsigned int bit);
+
+#define fmul x25519_fe51_mul
+#define fsqr x25519_fe51_sqr
+#define fmul121666 x25519_fe51_mul121666
+#define fe51_tobytes x25519_fe51_tobytes
+
+static void fadd(fe51 h, const fe51 f, const fe51 g)
+{
+ h[0] = f[0] + g[0];
+ h[1] = f[1] + g[1];
+ h[2] = f[2] + g[2];
+ h[3] = f[3] + g[3];
+ h[4] = f[4] + g[4];
+}
+
+/*
+ * Prime = 2 ** 255 - 19, 255 bits
+ * (0x7fffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffed)
+ *
+ * Prime in 5 51-bit limbs
+ */
+static fe51 prime51 = { 0x7ffffffffffed, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff};
+
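+/*
+ * Subtraction with 2*p added to f limb-wise first, so that every limb of the
+ * result stays non-negative; the extra multiple of p vanishes modulo p.
+ */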
+static void fsub(fe51 h, const fe51 f, const fe51 g)
+{
+ h[0] = (f[0] + ((prime51[0] * 2))) - g[0];
+ h[1] = (f[1] + ((prime51[1] * 2))) - g[1];
+ h[2] = (f[2] + ((prime51[2] * 2))) - g[2];
+ h[3] = (f[3] + ((prime51[3] * 2))) - g[3];
+ h[4] = (f[4] + ((prime51[4] * 2))) - g[4];
+}
+
+static void fe51_frombytes(fe51 h, const uint8_t *s)
+{
+ /*
+	 * Make sure the input is 64-bit aligned.
+ */
+ unsigned char sbuf[32+8];
+ unsigned char *sb = PTR_ALIGN((void *)sbuf, 8);
+
+ memcpy(sb, s, 32);
+ x25519_fe51_frombytes(h, sb);
+}
+
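+/*
+ * Field inversion o = i^(p-2) = i^(2^255 - 21) (Fermat's little theorem),
+ * using the standard curve25519 square-and-multiply chain.
+ */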
+static void finv(fe51 o, const fe51 i)
+{
+ fe51 a0, b, c, t00;
+
+ fsqr(a0, i);
+ x25519_fe51_sqr_times(t00, a0, 2);
+
+ fmul(b, t00, i);
+ fmul(a0, b, a0);
+
+ fsqr(t00, a0);
+
+ fmul(b, t00, b);
+ x25519_fe51_sqr_times(t00, b, 5);
+
+ fmul(b, t00, b);
+ x25519_fe51_sqr_times(t00, b, 10);
+
+ fmul(c, t00, b);
+ x25519_fe51_sqr_times(t00, c, 20);
+
+ fmul(t00, t00, c);
+ x25519_fe51_sqr_times(t00, t00, 10);
+
+ fmul(b, t00, b);
+ x25519_fe51_sqr_times(t00, b, 50);
+
+ fmul(c, t00, b);
+ x25519_fe51_sqr_times(t00, c, 100);
+
+ fmul(t00, t00, c);
+ x25519_fe51_sqr_times(t00, t00, 50);
+
+ fmul(t00, t00, b);
+ x25519_fe51_sqr_times(t00, t00, 5);
+
+ fmul(o, t00, a0);
+}
+
+static void curve25519_fe51(uint8_t out[32], const uint8_t scalar[32],
+ const uint8_t point[32])
+{
+ fe51 x1, x2, z2, x3, z3;
+ uint8_t s[32];
+ unsigned int swap = 0;
+ int i;
+
+ memcpy(s, scalar, 32);
+ s[0] &= 0xf8;
+ s[31] &= 0x7f;
+ s[31] |= 0x40;
+ fe51_frombytes(x1, point);
+
+ z2[0] = z2[1] = z2[2] = z2[3] = z2[4] = 0;
+ x3[0] = x1[0];
+ x3[1] = x1[1];
+ x3[2] = x1[2];
+ x3[3] = x1[3];
+ x3[4] = x1[4];
+
+ x2[0] = z3[0] = 1;
+ x2[1] = z3[1] = 0;
+ x2[2] = z3[2] = 0;
+ x2[3] = z3[3] = 0;
+ x2[4] = z3[4] = 0;
+
+ for (i = 254; i >= 0; --i) {
+ unsigned int k_t = 1 & (s[i / 8] >> (i & 7));
+ fe51 a, b, c, d, e;
+ fe51 da, cb, aa, bb;
+ fe51 dacb_p, dacb_m;
+
+ swap ^= k_t;
+ x25519_cswap(x2, x3, swap);
+ x25519_cswap(z2, z3, swap);
+ swap = k_t;
+
+ fsub(b, x2, z2); // B = x_2 - z_2
+ fadd(a, x2, z2); // A = x_2 + z_2
+ fsub(d, x3, z3); // D = x_3 - z_3
+ fadd(c, x3, z3); // C = x_3 + z_3
+
+ fsqr(bb, b); // BB = B^2
+ fsqr(aa, a); // AA = A^2
+ fmul(da, d, a); // DA = D * A
+ fmul(cb, c, b); // CB = C * B
+
+ fsub(e, aa, bb); // E = AA - BB
+ fmul(x2, aa, bb); // x2 = AA * BB
+ fadd(dacb_p, da, cb); // DA + CB
+ fsub(dacb_m, da, cb); // DA - CB
+
+ fmul121666(z3, e); // 121666 * E
+ fsqr(z2, dacb_m); // (DA - CB)^2
+ fsqr(x3, dacb_p); // x3 = (DA + CB)^2
+ fadd(b, bb, z3); // BB + 121666 * E
+ fmul(z3, x1, z2); // z3 = x1 * (DA - CB)^2
+		fmul(z2, e, b);       // z2 = E * (BB + 121666 * E)
+ }
+
+ finv(z2, z2);
+ fmul(x2, x2, z2);
+ fe51_tobytes(out, x2);
+}
+
+static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE])
+{
+ curve25519_fe51(mypublic, secret, basepoint);
+}
+
+static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
+{
+ curve25519_fe51(pub, secret, curve25519_base_point);
+}
diff --git a/lib/crypto/powerpc/md5-asm.S b/lib/crypto/powerpc/md5-asm.S
new file mode 100644
index 000000000000..fa6bc440cf4a
--- /dev/null
+++ b/lib/crypto/powerpc/md5-asm.S
@@ -0,0 +1,235 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Fast MD5 implementation for PPC
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+
+#define rHP r3
+#define rWP r4
+
+#define rH0 r0
+#define rH1 r6
+#define rH2 r7
+#define rH3 r5
+
+#define rW00 r8
+#define rW01 r9
+#define rW02 r10
+#define rW03 r11
+#define rW04 r12
+#define rW05 r14
+#define rW06 r15
+#define rW07 r16
+#define rW08 r17
+#define rW09 r18
+#define rW10 r19
+#define rW11 r20
+#define rW12 r21
+#define rW13 r22
+#define rW14 r23
+#define rW15 r24
+
+#define rT0 r25
+#define rT1 r26
+
+#define INITIALIZE \
+ PPC_STLU r1,-INT_FRAME_SIZE(r1); \
+ SAVE_GPRS(14, 26, r1) /* push registers onto stack */
+
+#define FINALIZE \
+ REST_GPRS(14, 26, r1); /* pop registers from stack */ \
+ addi r1,r1,INT_FRAME_SIZE
+
+#ifdef __BIG_ENDIAN__
+#define LOAD_DATA(reg, off) \
+ lwbrx reg,0,rWP; /* load data */
+#define INC_PTR \
+ addi rWP,rWP,4; /* increment per word */
+#define NEXT_BLOCK /* nothing to do */
+#else
+#define LOAD_DATA(reg, off) \
+ lwz reg,off(rWP); /* load data */
+#define INC_PTR /* nothing to do */
+#define NEXT_BLOCK \
+ addi rWP,rWP,64; /* increment per block */
+#endif
+
+#define R_00_15(a, b, c, d, w0, w1, p, q, off, k0h, k0l, k1h, k1l) \
+ LOAD_DATA(w0, off) /* W */ \
+ and rT0,b,c; /* 1: f = b and c */ \
+ INC_PTR /* ptr++ */ \
+ andc rT1,d,b; /* 1: f' = ~b and d */ \
+ LOAD_DATA(w1, off+4) /* W */ \
+ or rT0,rT0,rT1; /* 1: f = f or f' */ \
+ addi w0,w0,k0l; /* 1: wk = w + k */ \
+ add a,a,rT0; /* 1: a = a + f */ \
+ addis w0,w0,k0h; /* 1: wk = w + k' */ \
+ addis w1,w1,k1h; /* 2: wk = w + k */ \
+ add a,a,w0; /* 1: a = a + wk */ \
+ addi w1,w1,k1l; /* 2: wk = w + k' */ \
+ rotrwi a,a,p; /* 1: a = a rotl x */ \
+ add d,d,w1; /* 2: a = a + wk */ \
+ add a,a,b; /* 1: a = a + b */ \
+ and rT0,a,b; /* 2: f = b and c */ \
+ andc rT1,c,a; /* 2: f' = ~b and d */ \
+ or rT0,rT0,rT1; /* 2: f = f or f' */ \
+ add d,d,rT0; /* 2: a = a + f */ \
+ INC_PTR /* ptr++ */ \
+ rotrwi d,d,q; /* 2: a = a rotl x */ \
+ add d,d,a; /* 2: a = a + b */
+
+#define R_16_31(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
+ andc rT0,c,d; /* 1: f = c and ~d */ \
+ and rT1,b,d; /* 1: f' = b and d */ \
+ addi w0,w0,k0l; /* 1: wk = w + k */ \
+ or rT0,rT0,rT1; /* 1: f = f or f' */ \
+ addis w0,w0,k0h; /* 1: wk = w + k' */ \
+ add a,a,rT0; /* 1: a = a + f */ \
+ addi w1,w1,k1l; /* 2: wk = w + k */ \
+ add a,a,w0; /* 1: a = a + wk */ \
+ addis w1,w1,k1h; /* 2: wk = w + k' */ \
+ andc rT0,b,c; /* 2: f = c and ~d */ \
+ rotrwi a,a,p; /* 1: a = a rotl x */ \
+ add a,a,b; /* 1: a = a + b */ \
+ add d,d,w1; /* 2: a = a + wk */ \
+ and rT1,a,c; /* 2: f' = b and d */ \
+ or rT0,rT0,rT1; /* 2: f = f or f' */ \
+ add d,d,rT0; /* 2: a = a + f */ \
+ rotrwi d,d,q; /* 2: a = a rotl x */ \
+	add	d,d,a		/* 2: a = a + b */
+
+#define R_32_47(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
+ xor rT0,b,c; /* 1: f' = b xor c */ \
+ addi w0,w0,k0l; /* 1: wk = w + k */ \
+ xor rT1,rT0,d; /* 1: f = f xor f' */ \
+ addis w0,w0,k0h; /* 1: wk = w + k' */ \
+ add a,a,rT1; /* 1: a = a + f */ \
+ addi w1,w1,k1l; /* 2: wk = w + k */ \
+ add a,a,w0; /* 1: a = a + wk */ \
+ addis w1,w1,k1h; /* 2: wk = w + k' */ \
+ rotrwi a,a,p; /* 1: a = a rotl x */ \
+ add d,d,w1; /* 2: a = a + wk */ \
+ add a,a,b; /* 1: a = a + b */ \
+ xor rT1,rT0,a; /* 2: f = b xor f' */ \
+ add d,d,rT1; /* 2: a = a + f */ \
+ rotrwi d,d,q; /* 2: a = a rotl x */ \
+ add d,d,a; /* 2: a = a + b */
+
+#define R_48_63(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \
+ addi w0,w0,k0l; /* 1: w = w + k */ \
+ orc rT0,b,d; /* 1: f = b or ~d */ \
+ addis w0,w0,k0h; /* 1: w = w + k' */ \
+ xor rT0,rT0,c; /* 1: f = f xor c */ \
+ add a,a,w0; /* 1: a = a + wk */ \
+ addi w1,w1,k1l; /* 2: w = w + k */ \
+ add a,a,rT0; /* 1: a = a + f */ \
+ addis w1,w1,k1h; /* 2: w = w + k' */ \
+ rotrwi a,a,p; /* 1: a = a rotl x */ \
+ add a,a,b; /* 1: a = a + b */ \
+ orc rT0,a,c; /* 2: f = b or ~d */ \
+ add d,d,w1; /* 2: a = a + wk */ \
+ xor rT0,rT0,b; /* 2: f = f xor c */ \
+ add d,d,rT0; /* 2: a = a + f */ \
+ rotrwi d,d,q; /* 2: a = a rotl x */ \
+ add d,d,a; /* 2: a = a + b */
+
+_GLOBAL(ppc_md5_transform)
+ INITIALIZE
+
+ mtctr r5
+ lwz rH0,0(rHP)
+ lwz rH1,4(rHP)
+ lwz rH2,8(rHP)
+ lwz rH3,12(rHP)
+
+ppc_md5_main:
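+	/*
+	 * Each 32-bit MD5 round constant is applied as an addis/addi pair
+	 * (k0h/k0l, k1h/k1l in the macros above).  addi sign-extends its
+	 * immediate, so when the low 16 bits have bit 15 set the low half is
+	 * written as a negative number and the high half is pre-incremented,
+	 * e.g. 0xd76b0000 + (-23432) = 0xd76aa478.
+	 */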
+ R_00_15(rH0, rH1, rH2, rH3, rW00, rW01, 25, 20, 0,
+ 0xd76b, -23432, 0xe8c8, -18602)
+ R_00_15(rH2, rH3, rH0, rH1, rW02, rW03, 15, 10, 8,
+ 0x2420, 0x70db, 0xc1be, -12562)
+ R_00_15(rH0, rH1, rH2, rH3, rW04, rW05, 25, 20, 16,
+ 0xf57c, 0x0faf, 0x4788, -14806)
+ R_00_15(rH2, rH3, rH0, rH1, rW06, rW07, 15, 10, 24,
+ 0xa830, 0x4613, 0xfd47, -27391)
+ R_00_15(rH0, rH1, rH2, rH3, rW08, rW09, 25, 20, 32,
+ 0x6981, -26408, 0x8b45, -2129)
+ R_00_15(rH2, rH3, rH0, rH1, rW10, rW11, 15, 10, 40,
+ 0xffff, 0x5bb1, 0x895d, -10306)
+ R_00_15(rH0, rH1, rH2, rH3, rW12, rW13, 25, 20, 48,
+ 0x6b90, 0x1122, 0xfd98, 0x7193)
+ R_00_15(rH2, rH3, rH0, rH1, rW14, rW15, 15, 10, 56,
+ 0xa679, 0x438e, 0x49b4, 0x0821)
+
+ R_16_31(rH0, rH1, rH2, rH3, rW01, rW06, 27, 23,
+ 0x0d56, 0x6e0c, 0x1810, 0x6d2d)
+ R_16_31(rH2, rH3, rH0, rH1, rW11, rW00, 18, 12,
+ 0x9d02, -32109, 0x124c, 0x2332)
+ R_16_31(rH0, rH1, rH2, rH3, rW05, rW10, 27, 23,
+ 0x8ea7, 0x4a33, 0x0245, -18270)
+ R_16_31(rH2, rH3, rH0, rH1, rW15, rW04, 18, 12,
+ 0x8eee, -8608, 0xf258, -5095)
+ R_16_31(rH0, rH1, rH2, rH3, rW09, rW14, 27, 23,
+ 0x969d, -10697, 0x1cbe, -15288)
+ R_16_31(rH2, rH3, rH0, rH1, rW03, rW08, 18, 12,
+ 0x3317, 0x3e99, 0xdbd9, 0x7c15)
+ R_16_31(rH0, rH1, rH2, rH3, rW13, rW02, 27, 23,
+ 0xac4b, 0x7772, 0xd8cf, 0x331d)
+ R_16_31(rH2, rH3, rH0, rH1, rW07, rW12, 18, 12,
+ 0x6a28, 0x6dd8, 0x219a, 0x3b68)
+
+ R_32_47(rH0, rH1, rH2, rH3, rW05, rW08, 28, 21,
+ 0x29cb, 0x28e5, 0x4218, -7788)
+ R_32_47(rH2, rH3, rH0, rH1, rW11, rW14, 16, 9,
+ 0x473f, 0x06d1, 0x3aae, 0x3036)
+ R_32_47(rH0, rH1, rH2, rH3, rW01, rW04, 28, 21,
+ 0xaea1, -15134, 0x640b, -11295)
+ R_32_47(rH2, rH3, rH0, rH1, rW07, rW10, 16, 9,
+ 0x8f4c, 0x4887, 0xbc7c, -22499)
+ R_32_47(rH0, rH1, rH2, rH3, rW13, rW00, 28, 21,
+ 0x7eb8, -27199, 0x00ea, 0x6050)
+ R_32_47(rH2, rH3, rH0, rH1, rW03, rW06, 16, 9,
+ 0xe01a, 0x22fe, 0x4447, 0x69c5)
+ R_32_47(rH0, rH1, rH2, rH3, rW09, rW12, 28, 21,
+ 0xb7f3, 0x0253, 0x59b1, 0x4d5b)
+ R_32_47(rH2, rH3, rH0, rH1, rW15, rW02, 16, 9,
+ 0x4701, -27017, 0xc7bd, -19859)
+
+ R_48_63(rH0, rH1, rH2, rH3, rW00, rW07, 26, 22,
+ 0x0988, -1462, 0x4c70, -19401)
+ R_48_63(rH2, rH3, rH0, rH1, rW14, rW05, 17, 11,
+ 0xadaf, -5221, 0xfc99, 0x66f7)
+ R_48_63(rH0, rH1, rH2, rH3, rW12, rW03, 26, 22,
+ 0x7e80, -16418, 0xba1e, -25587)
+ R_48_63(rH2, rH3, rH0, rH1, rW10, rW01, 17, 11,
+ 0x4130, 0x380d, 0xe0c5, 0x738d)
+ lwz rW00,0(rHP)
+ R_48_63(rH0, rH1, rH2, rH3, rW08, rW15, 26, 22,
+ 0xe837, -30770, 0xde8a, 0x69e8)
+ lwz rW14,4(rHP)
+ R_48_63(rH2, rH3, rH0, rH1, rW06, rW13, 17, 11,
+ 0x9e79, 0x260f, 0x256d, -27941)
+ lwz rW12,8(rHP)
+ R_48_63(rH0, rH1, rH2, rH3, rW04, rW11, 26, 22,
+ 0xab75, -20775, 0x4f9e, -28397)
+ lwz rW10,12(rHP)
+ R_48_63(rH2, rH3, rH0, rH1, rW02, rW09, 17, 11,
+ 0x662b, 0x7c56, 0x11b2, 0x0358)
+
+ add rH0,rH0,rW00
+ stw rH0,0(rHP)
+ add rH1,rH1,rW14
+ stw rH1,4(rHP)
+ add rH2,rH2,rW12
+ stw rH2,8(rHP)
+ add rH3,rH3,rW10
+ stw rH3,12(rHP)
+ NEXT_BLOCK
+
+ bdnz ppc_md5_main
+
+ FINALIZE
+ blr
diff --git a/lib/crypto/powerpc/md5.h b/lib/crypto/powerpc/md5.h
new file mode 100644
index 000000000000..540b08e34d1d
--- /dev/null
+++ b/lib/crypto/powerpc/md5.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * MD5 optimized for PowerPC
+ */
+
+void ppc_md5_transform(u32 *state, const u8 *data, size_t nblocks);
+
+static void md5_blocks(struct md5_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ ppc_md5_transform(state->h, data, nblocks);
+}
diff --git a/lib/crypto/powerpc/poly1305-p10le_64.S b/lib/crypto/powerpc/poly1305-p10le_64.S
new file mode 100644
index 000000000000..a3c1987f1ecd
--- /dev/null
+++ b/lib/crypto/powerpc/poly1305-p10le_64.S
@@ -0,0 +1,1075 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#
+# Accelerated poly1305 implementation for ppc64le.
+#
+# Copyright 2023- IBM Corp. All rights reserved
+#
+#===================================================================================
+# Written by Danny Tsen <dtsen@us.ibm.com>
+#
+# Poly1305 - this version mainly uses vector/VSX/scalar instructions
+# - 26-bit limbs
+# - handles multiple 64-byte blocks
+#
+# Block size 16 bytes
+# key = (r, s)
+# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF
+# p = 2^130 - 5
+# a += m
+# a = (r + a) % p
+# a += s
+#
+# Improve performance by breaking the polynomial down into a sum of products:
+# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
+#
+# 07/22/21 - this revision is based on the above sum of products.  Set up r^4, r^3, r^2, r and s3, s2, s1, s0
+# in 9 vectors for the multiplications.
+#
+# setup r^4, r^3, r^2, r vectors
+# vs [r^1, r^3, r^2, r^4]
+# vs0 = [r0,.....]
+# vs1 = [r1,.....]
+# vs2 = [r2,.....]
+# vs3 = [r3,.....]
+# vs4 = [r4,.....]
+# vs5 = [r1*5,...]
+# vs6 = [r2*5,...]
+# vs7 = [r3*5,...]
+# vs8 = [r4*5,...]
+#
+# Each word in a vector holds one "r/s" member of the corresponding [a * r/s] product.
+#
+# r0, r4*5, r3*5, r2*5, r1*5;
+# r1, r0, r4*5, r3*5, r2*5;
+# r2, r1, r0, r4*5, r3*5;
+# r3, r2, r1, r0, r4*5;
+# r4, r3, r2, r1, r0 ;
+#
+#
+# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
+# k = 32 bytes key
+# r3 = k (r, s)
+# r4 = mlen
+# r5 = m
+#
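+# As an illustration of the sum-of-products form above, one pass over four
+# blocks is Horner evaluation with the accumulator h folded into the first
+# block:
+#   h <- ((((h + m1)*r + m2)*r + m3)*r + m4)*r
+#      = (h + m1)*r^4 + m2*r^3 + m3*r^2 + m4*r      (mod p)
+# which is why r^4, r^3, r^2, r (and their *5 companions for the reduction)
+# are computed once and reused for every 64-byte chunk.
+#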
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+#include <linux/linkage.h>
+
+.machine "any"
+
+.text
+
+.macro SAVE_GPR GPR OFFSET FRAME
+ std \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro SAVE_VRS VRS OFFSET FRAME
+ li 16, \OFFSET
+ stvx \VRS, 16, \FRAME
+.endm
+
+.macro SAVE_VSX VSX OFFSET FRAME
+ li 16, \OFFSET
+ stxvx \VSX, 16, \FRAME
+.endm
+
+.macro RESTORE_GPR GPR OFFSET FRAME
+ ld \GPR,\OFFSET(\FRAME)
+.endm
+
+.macro RESTORE_VRS VRS OFFSET FRAME
+ li 16, \OFFSET
+ lvx \VRS, 16, \FRAME
+.endm
+
+.macro RESTORE_VSX VSX OFFSET FRAME
+ li 16, \OFFSET
+ lxvx \VSX, 16, \FRAME
+.endm
+
+.macro SAVE_REGS
+ mflr 0
+ std 0, 16(1)
+ stdu 1,-752(1)
+
+ SAVE_GPR 14, 112, 1
+ SAVE_GPR 15, 120, 1
+ SAVE_GPR 16, 128, 1
+ SAVE_GPR 17, 136, 1
+ SAVE_GPR 18, 144, 1
+ SAVE_GPR 19, 152, 1
+ SAVE_GPR 20, 160, 1
+ SAVE_GPR 21, 168, 1
+ SAVE_GPR 22, 176, 1
+ SAVE_GPR 23, 184, 1
+ SAVE_GPR 24, 192, 1
+ SAVE_GPR 25, 200, 1
+ SAVE_GPR 26, 208, 1
+ SAVE_GPR 27, 216, 1
+ SAVE_GPR 28, 224, 1
+ SAVE_GPR 29, 232, 1
+ SAVE_GPR 30, 240, 1
+ SAVE_GPR 31, 248, 1
+
+ addi 9, 1, 256
+ SAVE_VRS 20, 0, 9
+ SAVE_VRS 21, 16, 9
+ SAVE_VRS 22, 32, 9
+ SAVE_VRS 23, 48, 9
+ SAVE_VRS 24, 64, 9
+ SAVE_VRS 25, 80, 9
+ SAVE_VRS 26, 96, 9
+ SAVE_VRS 27, 112, 9
+ SAVE_VRS 28, 128, 9
+ SAVE_VRS 29, 144, 9
+ SAVE_VRS 30, 160, 9
+ SAVE_VRS 31, 176, 9
+
+ SAVE_VSX 14, 192, 9
+ SAVE_VSX 15, 208, 9
+ SAVE_VSX 16, 224, 9
+ SAVE_VSX 17, 240, 9
+ SAVE_VSX 18, 256, 9
+ SAVE_VSX 19, 272, 9
+ SAVE_VSX 20, 288, 9
+ SAVE_VSX 21, 304, 9
+ SAVE_VSX 22, 320, 9
+ SAVE_VSX 23, 336, 9
+ SAVE_VSX 24, 352, 9
+ SAVE_VSX 25, 368, 9
+ SAVE_VSX 26, 384, 9
+ SAVE_VSX 27, 400, 9
+ SAVE_VSX 28, 416, 9
+ SAVE_VSX 29, 432, 9
+ SAVE_VSX 30, 448, 9
+ SAVE_VSX 31, 464, 9
+.endm # SAVE_REGS
+
+.macro RESTORE_REGS
+ addi 9, 1, 256
+ RESTORE_VRS 20, 0, 9
+ RESTORE_VRS 21, 16, 9
+ RESTORE_VRS 22, 32, 9
+ RESTORE_VRS 23, 48, 9
+ RESTORE_VRS 24, 64, 9
+ RESTORE_VRS 25, 80, 9
+ RESTORE_VRS 26, 96, 9
+ RESTORE_VRS 27, 112, 9
+ RESTORE_VRS 28, 128, 9
+ RESTORE_VRS 29, 144, 9
+ RESTORE_VRS 30, 160, 9
+ RESTORE_VRS 31, 176, 9
+
+ RESTORE_VSX 14, 192, 9
+ RESTORE_VSX 15, 208, 9
+ RESTORE_VSX 16, 224, 9
+ RESTORE_VSX 17, 240, 9
+ RESTORE_VSX 18, 256, 9
+ RESTORE_VSX 19, 272, 9
+ RESTORE_VSX 20, 288, 9
+ RESTORE_VSX 21, 304, 9
+ RESTORE_VSX 22, 320, 9
+ RESTORE_VSX 23, 336, 9
+ RESTORE_VSX 24, 352, 9
+ RESTORE_VSX 25, 368, 9
+ RESTORE_VSX 26, 384, 9
+ RESTORE_VSX 27, 400, 9
+ RESTORE_VSX 28, 416, 9
+ RESTORE_VSX 29, 432, 9
+ RESTORE_VSX 30, 448, 9
+ RESTORE_VSX 31, 464, 9
+
+ RESTORE_GPR 14, 112, 1
+ RESTORE_GPR 15, 120, 1
+ RESTORE_GPR 16, 128, 1
+ RESTORE_GPR 17, 136, 1
+ RESTORE_GPR 18, 144, 1
+ RESTORE_GPR 19, 152, 1
+ RESTORE_GPR 20, 160, 1
+ RESTORE_GPR 21, 168, 1
+ RESTORE_GPR 22, 176, 1
+ RESTORE_GPR 23, 184, 1
+ RESTORE_GPR 24, 192, 1
+ RESTORE_GPR 25, 200, 1
+ RESTORE_GPR 26, 208, 1
+ RESTORE_GPR 27, 216, 1
+ RESTORE_GPR 28, 224, 1
+ RESTORE_GPR 29, 232, 1
+ RESTORE_GPR 30, 240, 1
+ RESTORE_GPR 31, 248, 1
+
+ addi 1, 1, 752
+ ld 0, 16(1)
+ mtlr 0
+.endm # RESTORE_REGS
+
+#
+# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5;
+# p[1] = a0*r1 + a1*r0 + a2*r4*5 + a3*r3*5 + a4*r2*5;
+# p[2] = a0*r2 + a1*r1 + a2*r0 + a3*r4*5 + a4*r3*5;
+# p[3] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5;
+# p[4] = a0*r4 + a1*r3 + a2*r2 + a3*r1 + a4*r0 ;
+#
+# [r^2, r^3, r^1, r^4]
+# [m3, m2, m4, m1]
+#
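+# The *5 factors come from the reduction modulo p = 2^130 - 5: with 26-bit
+# limbs, a partial product such as a1*r4 sits at 2^130 and therefore folds
+# back into limb 0 as a1*(r4*5).
+#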
+# multiply odd and even words
+.macro mul_odd
+ vmulouw 14, 4, 26
+ vmulouw 10, 5, 3
+ vmulouw 11, 6, 2
+ vmulouw 12, 7, 1
+ vmulouw 13, 8, 0
+ vmulouw 15, 4, 27
+ vaddudm 14, 14, 10
+ vaddudm 14, 14, 11
+ vmulouw 10, 5, 26
+ vmulouw 11, 6, 3
+ vaddudm 14, 14, 12
+ vaddudm 14, 14, 13 # x0
+ vaddudm 15, 15, 10
+ vaddudm 15, 15, 11
+ vmulouw 12, 7, 2
+ vmulouw 13, 8, 1
+ vaddudm 15, 15, 12
+ vaddudm 15, 15, 13 # x1
+ vmulouw 16, 4, 28
+ vmulouw 10, 5, 27
+ vmulouw 11, 6, 26
+ vaddudm 16, 16, 10
+ vaddudm 16, 16, 11
+ vmulouw 12, 7, 3
+ vmulouw 13, 8, 2
+ vaddudm 16, 16, 12
+ vaddudm 16, 16, 13 # x2
+ vmulouw 17, 4, 29
+ vmulouw 10, 5, 28
+ vmulouw 11, 6, 27
+ vaddudm 17, 17, 10
+ vaddudm 17, 17, 11
+ vmulouw 12, 7, 26
+ vmulouw 13, 8, 3
+ vaddudm 17, 17, 12
+ vaddudm 17, 17, 13 # x3
+ vmulouw 18, 4, 30
+ vmulouw 10, 5, 29
+ vmulouw 11, 6, 28
+ vaddudm 18, 18, 10
+ vaddudm 18, 18, 11
+ vmulouw 12, 7, 27
+ vmulouw 13, 8, 26
+ vaddudm 18, 18, 12
+ vaddudm 18, 18, 13 # x4
+.endm
+
+.macro mul_even
+ vmuleuw 9, 4, 26
+ vmuleuw 10, 5, 3
+ vmuleuw 11, 6, 2
+ vmuleuw 12, 7, 1
+ vmuleuw 13, 8, 0
+ vaddudm 14, 14, 9
+ vaddudm 14, 14, 10
+ vaddudm 14, 14, 11
+ vaddudm 14, 14, 12
+ vaddudm 14, 14, 13 # x0
+
+ vmuleuw 9, 4, 27
+ vmuleuw 10, 5, 26
+ vmuleuw 11, 6, 3
+ vmuleuw 12, 7, 2
+ vmuleuw 13, 8, 1
+ vaddudm 15, 15, 9
+ vaddudm 15, 15, 10
+ vaddudm 15, 15, 11
+ vaddudm 15, 15, 12
+ vaddudm 15, 15, 13 # x1
+
+ vmuleuw 9, 4, 28
+ vmuleuw 10, 5, 27
+ vmuleuw 11, 6, 26
+ vmuleuw 12, 7, 3
+ vmuleuw 13, 8, 2
+ vaddudm 16, 16, 9
+ vaddudm 16, 16, 10
+ vaddudm 16, 16, 11
+ vaddudm 16, 16, 12
+ vaddudm 16, 16, 13 # x2
+
+ vmuleuw 9, 4, 29
+ vmuleuw 10, 5, 28
+ vmuleuw 11, 6, 27
+ vmuleuw 12, 7, 26
+ vmuleuw 13, 8, 3
+ vaddudm 17, 17, 9
+ vaddudm 17, 17, 10
+ vaddudm 17, 17, 11
+ vaddudm 17, 17, 12
+ vaddudm 17, 17, 13 # x3
+
+ vmuleuw 9, 4, 30
+ vmuleuw 10, 5, 29
+ vmuleuw 11, 6, 28
+ vmuleuw 12, 7, 27
+ vmuleuw 13, 8, 26
+ vaddudm 18, 18, 9
+ vaddudm 18, 18, 10
+ vaddudm 18, 18, 11
+ vaddudm 18, 18, 12
+ vaddudm 18, 18, 13 # x4
+.endm
+
+#
+# poly1305_setup_r
+#
+# setup r^4, r^3, r^2, r vectors
+# [r, r^3, r^2, r^4]
+# vs0 = [r0,...]
+# vs1 = [r1,...]
+# vs2 = [r2,...]
+# vs3 = [r3,...]
+# vs4 = [r4,...]
+# vs5 = [r4*5,...]
+# vs6 = [r3*5,...]
+# vs7 = [r2*5,...]
+# vs8 = [r1*5,...]
+#
+# r0, r4*5, r3*5, r2*5, r1*5;
+# r1, r0, r4*5, r3*5, r2*5;
+# r2, r1, r0, r4*5, r3*5;
+# r3, r2, r1, r0, r4*5;
+# r4, r3, r2, r1, r0 ;
+#
+.macro poly1305_setup_r
+
+ # save r
+ xxlor 26, 58, 58
+ xxlor 27, 59, 59
+ xxlor 28, 60, 60
+ xxlor 29, 61, 61
+ xxlor 30, 62, 62
+
+ xxlxor 31, 31, 31
+
+# [r, r^3, r^2, r^4]
+ # compute r^2
+ vmr 4, 26
+ vmr 5, 27
+ vmr 6, 28
+ vmr 7, 29
+ vmr 8, 30
+ bl do_mul # r^2 r^1
+ xxpermdi 58, 58, 36, 0x3 # r0
+ xxpermdi 59, 59, 37, 0x3 # r1
+ xxpermdi 60, 60, 38, 0x3 # r2
+ xxpermdi 61, 61, 39, 0x3 # r3
+ xxpermdi 62, 62, 40, 0x3 # r4
+ xxpermdi 36, 36, 36, 0x3
+ xxpermdi 37, 37, 37, 0x3
+ xxpermdi 38, 38, 38, 0x3
+ xxpermdi 39, 39, 39, 0x3
+ xxpermdi 40, 40, 40, 0x3
+ vspltisb 13, 2
+ vsld 9, 27, 13
+ vsld 10, 28, 13
+ vsld 11, 29, 13
+ vsld 12, 30, 13
+ vaddudm 0, 9, 27
+ vaddudm 1, 10, 28
+ vaddudm 2, 11, 29
+ vaddudm 3, 12, 30
+
+ bl do_mul # r^4 r^3
+ vmrgow 26, 26, 4
+ vmrgow 27, 27, 5
+ vmrgow 28, 28, 6
+ vmrgow 29, 29, 7
+ vmrgow 30, 30, 8
+ vspltisb 13, 2
+ vsld 9, 27, 13
+ vsld 10, 28, 13
+ vsld 11, 29, 13
+ vsld 12, 30, 13
+ vaddudm 0, 9, 27
+ vaddudm 1, 10, 28
+ vaddudm 2, 11, 29
+ vaddudm 3, 12, 30
+
+ # r^2 r^4
+ xxlor 0, 58, 58
+ xxlor 1, 59, 59
+ xxlor 2, 60, 60
+ xxlor 3, 61, 61
+ xxlor 4, 62, 62
+ xxlor 5, 32, 32
+ xxlor 6, 33, 33
+ xxlor 7, 34, 34
+ xxlor 8, 35, 35
+
+ vspltw 9, 26, 3
+ vspltw 10, 26, 2
+ vmrgow 26, 10, 9
+ vspltw 9, 27, 3
+ vspltw 10, 27, 2
+ vmrgow 27, 10, 9
+ vspltw 9, 28, 3
+ vspltw 10, 28, 2
+ vmrgow 28, 10, 9
+ vspltw 9, 29, 3
+ vspltw 10, 29, 2
+ vmrgow 29, 10, 9
+ vspltw 9, 30, 3
+ vspltw 10, 30, 2
+ vmrgow 30, 10, 9
+
+ vsld 9, 27, 13
+ vsld 10, 28, 13
+ vsld 11, 29, 13
+ vsld 12, 30, 13
+ vaddudm 0, 9, 27
+ vaddudm 1, 10, 28
+ vaddudm 2, 11, 29
+ vaddudm 3, 12, 30
+.endm
+
+SYM_FUNC_START_LOCAL(do_mul)
+ mul_odd
+
+ # do reduction ( h %= p )
+ # carry reduction
+ vspltisb 9, 2
+ vsrd 10, 14, 31
+ vsrd 11, 17, 31
+ vand 7, 17, 25
+ vand 4, 14, 25
+ vaddudm 18, 18, 11
+ vsrd 12, 18, 31
+ vaddudm 15, 15, 10
+
+ vsrd 11, 15, 31
+ vand 8, 18, 25
+ vand 5, 15, 25
+ vaddudm 4, 4, 12
+ vsld 10, 12, 9
+ vaddudm 6, 16, 11
+
+ vsrd 13, 6, 31
+ vand 6, 6, 25
+ vaddudm 4, 4, 10
+ vsrd 10, 4, 31
+ vaddudm 7, 7, 13
+
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 5, 5, 10
+ vaddudm 8, 8, 11
+ blr
+SYM_FUNC_END(do_mul)
+
+#
+# init key
+#
+.macro do_poly1305_init
+ addis 10, 2, rmask@toc@ha
+ addi 10, 10, rmask@toc@l
+
+ ld 11, 0(10)
+ ld 12, 8(10)
+
+ li 14, 16
+ li 15, 32
+ addis 10, 2, cnum@toc@ha
+ addi 10, 10, cnum@toc@l
+ lvx 25, 0, 10 # v25 - mask
+ lvx 31, 14, 10 # v31 = 1a
+ lvx 19, 15, 10 # v19 = 1 << 24
+ lxv 24, 48(10) # vs24
+ lxv 25, 64(10) # vs25
+
+ # initialize
+ # load key from r3 to vectors
+ ld 9, 24(3)
+ ld 10, 32(3)
+ and. 9, 9, 11
+ and. 10, 10, 12
+
+	# break r into five 26-bit limbs
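+	# (r = r0 + r1*2^26 + r2*2^52 + r3*2^78 + r4*2^104, each limb < 2^26)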
+ extrdi 14, 9, 26, 38
+ extrdi 15, 9, 26, 12
+ extrdi 16, 9, 12, 0
+ mtvsrdd 58, 0, 14
+ insrdi 16, 10, 14, 38
+ mtvsrdd 59, 0, 15
+ extrdi 17, 10, 26, 24
+ mtvsrdd 60, 0, 16
+ extrdi 18, 10, 24, 0
+ mtvsrdd 61, 0, 17
+ mtvsrdd 62, 0, 18
+
+ # r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5
+ li 9, 5
+ mtvsrdd 36, 0, 9
+ vmulouw 0, 27, 4 # v0 = rr0
+ vmulouw 1, 28, 4 # v1 = rr1
+ vmulouw 2, 29, 4 # v2 = rr2
+ vmulouw 3, 30, 4 # v3 = rr3
+.endm
+
+#
+# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m)
+# k = 32 bytes key
+# r3 = k (r, s)
+# r4 = mlen
+# r5 = m
+#
+SYM_FUNC_START(poly1305_p10le_4blocks)
+.align 5
+ cmpdi 5, 64
+ blt Out_no_poly1305
+
+ SAVE_REGS
+
+ do_poly1305_init
+
+ li 21, 0 # counter to message
+
+ poly1305_setup_r
+
+ # load previous H state
+	# break/convert h into 26-bit limbs
+ ld 9, 0(3)
+ ld 10, 8(3)
+ ld 19, 16(3)
+ sldi 19, 19, 24
+ mtvsrdd 41, 0, 19
+ extrdi 14, 9, 26, 38
+ extrdi 15, 9, 26, 12
+ extrdi 16, 9, 12, 0
+ mtvsrdd 36, 0, 14
+ insrdi 16, 10, 14, 38
+ mtvsrdd 37, 0, 15
+ extrdi 17, 10, 26, 24
+ mtvsrdd 38, 0, 16
+ extrdi 18, 10, 24, 0
+ mtvsrdd 39, 0, 17
+ mtvsrdd 40, 0, 18
+ vor 8, 8, 9
+
+ # input m1 m2
+ add 20, 4, 21
+ xxlor 49, 24, 24
+ xxlor 50, 25, 25
+ lxvw4x 43, 0, 20
+ addi 17, 20, 16
+ lxvw4x 44, 0, 17
+ vperm 14, 11, 12, 17
+ vperm 15, 11, 12, 18
+ vand 9, 14, 25 # a0
+ vsrd 10, 14, 31 # >> 26
+ vsrd 11, 10, 31 # 12 bits left
+ vand 10, 10, 25 # a1
+ vspltisb 13, 12
+ vand 16, 15, 25
+ vsld 12, 16, 13
+ vor 11, 11, 12
+ vand 11, 11, 25 # a2
+ vspltisb 13, 14
+ vsrd 12, 15, 13 # >> 14
+ vsrd 13, 12, 31 # >> 26, a4
+ vand 12, 12, 25 # a3
+
+ vaddudm 20, 4, 9
+ vaddudm 21, 5, 10
+ vaddudm 22, 6, 11
+ vaddudm 23, 7, 12
+ vaddudm 24, 8, 13
+
+ # m3 m4
+ addi 17, 17, 16
+ lxvw4x 43, 0, 17
+ addi 17, 17, 16
+ lxvw4x 44, 0, 17
+ vperm 14, 11, 12, 17
+ vperm 15, 11, 12, 18
+ vand 9, 14, 25 # a0
+ vsrd 10, 14, 31 # >> 26
+ vsrd 11, 10, 31 # 12 bits left
+ vand 10, 10, 25 # a1
+ vspltisb 13, 12
+ vand 16, 15, 25
+ vsld 12, 16, 13
+ vspltisb 13, 14
+ vor 11, 11, 12
+ vand 11, 11, 25 # a2
+ vsrd 12, 15, 13 # >> 14
+ vsrd 13, 12, 31 # >> 26, a4
+ vand 12, 12, 25 # a3
+
+ # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
+ vmrgow 4, 9, 20
+ vmrgow 5, 10, 21
+ vmrgow 6, 11, 22
+ vmrgow 7, 12, 23
+ vmrgow 8, 13, 24
+ vaddudm 8, 8, 19
+
+ addi 5, 5, -64 # len -= 64
+ addi 21, 21, 64 # offset += 64
+
+ li 9, 64
+ divdu 31, 5, 9
+
+ cmpdi 31, 0
+ ble Skip_block_loop
+
+ mtctr 31
+
+# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r
+# Rewrite the polynomial sum of products as follows:
+# h1 = (h0 + m1) * r^2, h2 = (h0 + m2) * r^2
+# h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> (h0 + m1) * r^4 + m3 * r^2, (h0 + m2) * r^4 + m4 * r^2
+# .... Repeat
+# h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 -->
+# h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r
+#
+loop_4blocks:
+
+ # Multiply odd words and even words
+ mul_odd
+ mul_even
+ # carry reduction
+ vspltisb 9, 2
+ vsrd 10, 14, 31
+ vsrd 11, 17, 31
+ vand 7, 17, 25
+ vand 4, 14, 25
+ vaddudm 18, 18, 11
+ vsrd 12, 18, 31
+ vaddudm 15, 15, 10
+
+ vsrd 11, 15, 31
+ vand 8, 18, 25
+ vand 5, 15, 25
+ vaddudm 4, 4, 12
+ vsld 10, 12, 9
+ vaddudm 6, 16, 11
+
+ vsrd 13, 6, 31
+ vand 6, 6, 25
+ vaddudm 4, 4, 10
+ vsrd 10, 4, 31
+ vaddudm 7, 7, 13
+
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 5, 5, 10
+ vaddudm 8, 8, 11
+
+ # input m1 m2 m3 m4
+ add 20, 4, 21
+ xxlor 49, 24, 24
+ xxlor 50, 25, 25
+ lxvw4x 43, 0, 20
+ addi 17, 20, 16
+ lxvw4x 44, 0, 17
+ vperm 14, 11, 12, 17
+ vperm 15, 11, 12, 18
+ addi 17, 17, 16
+ lxvw4x 43, 0, 17
+ addi 17, 17, 16
+ lxvw4x 44, 0, 17
+ vperm 17, 11, 12, 17
+ vperm 18, 11, 12, 18
+
+ vand 20, 14, 25 # a0
+ vand 9, 17, 25 # a0
+ vsrd 21, 14, 31 # >> 26
+ vsrd 22, 21, 31 # 12 bits left
+ vsrd 10, 17, 31 # >> 26
+ vsrd 11, 10, 31 # 12 bits left
+
+ vand 21, 21, 25 # a1
+ vand 10, 10, 25 # a1
+
+ vspltisb 13, 12
+ vand 16, 15, 25
+ vsld 23, 16, 13
+ vor 22, 22, 23
+ vand 22, 22, 25 # a2
+ vand 16, 18, 25
+ vsld 12, 16, 13
+ vor 11, 11, 12
+ vand 11, 11, 25 # a2
+ vspltisb 13, 14
+ vsrd 23, 15, 13 # >> 14
+ vsrd 24, 23, 31 # >> 26, a4
+ vand 23, 23, 25 # a3
+ vsrd 12, 18, 13 # >> 14
+ vsrd 13, 12, 31 # >> 26, a4
+ vand 12, 12, 25 # a3
+
+ vaddudm 4, 4, 20
+ vaddudm 5, 5, 21
+ vaddudm 6, 6, 22
+ vaddudm 7, 7, 23
+ vaddudm 8, 8, 24
+
+ # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1]
+ vmrgow 4, 9, 4
+ vmrgow 5, 10, 5
+ vmrgow 6, 11, 6
+ vmrgow 7, 12, 7
+ vmrgow 8, 13, 8
+ vaddudm 8, 8, 19
+
+ addi 5, 5, -64 # len -= 64
+ addi 21, 21, 64 # offset += 64
+
+ bdnz loop_4blocks
+
+Skip_block_loop:
+ xxlor 58, 0, 0
+ xxlor 59, 1, 1
+ xxlor 60, 2, 2
+ xxlor 61, 3, 3
+ xxlor 62, 4, 4
+ xxlor 32, 5, 5
+ xxlor 33, 6, 6
+ xxlor 34, 7, 7
+ xxlor 35, 8, 8
+
+ # Multiply odd words and even words
+ mul_odd
+ mul_even
+
+ # Sum the products.
+ xxpermdi 41, 31, 46, 0
+ xxpermdi 42, 31, 47, 0
+ vaddudm 4, 14, 9
+ xxpermdi 36, 31, 36, 3
+ vaddudm 5, 15, 10
+ xxpermdi 37, 31, 37, 3
+ xxpermdi 43, 31, 48, 0
+ vaddudm 6, 16, 11
+ xxpermdi 38, 31, 38, 3
+ xxpermdi 44, 31, 49, 0
+ vaddudm 7, 17, 12
+ xxpermdi 39, 31, 39, 3
+ xxpermdi 45, 31, 50, 0
+ vaddudm 8, 18, 13
+ xxpermdi 40, 31, 40, 3
+
+ # carry reduction
+ vspltisb 9, 2
+ vsrd 10, 4, 31
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 8, 8, 11
+ vsrd 12, 8, 31
+ vaddudm 5, 5, 10
+
+ vsrd 11, 5, 31
+ vand 8, 8, 25
+ vand 5, 5, 25
+ vaddudm 4, 4, 12
+ vsld 10, 12, 9
+ vaddudm 6, 6, 11
+
+ vsrd 13, 6, 31
+ vand 6, 6, 25
+ vaddudm 4, 4, 10
+ vsrd 10, 4, 31
+ vaddudm 7, 7, 13
+
+ vsrd 11, 7, 31
+ vand 7, 7, 25
+ vand 4, 4, 25
+ vaddudm 5, 5, 10
+ vsrd 10, 5, 31
+ vand 5, 5, 25
+ vaddudm 6, 6, 10
+ vaddudm 8, 8, 11
+
+ b do_final_update
+
+do_final_update:
+ # combine 26 bit limbs
+ # v4, v5, v6, v7 and v8 are 26 bit vectors
+ vsld 5, 5, 31
+ vor 20, 4, 5
+ vspltisb 11, 12
+ vsrd 12, 6, 11
+ vsld 6, 6, 31
+ vsld 6, 6, 31
+ vor 20, 20, 6
+ vspltisb 11, 14
+ vsld 7, 7, 11
+ vor 21, 7, 12
+ mfvsrld 16, 40 # save last 2 bytes
+ vsld 8, 8, 11
+ vsld 8, 8, 31
+ vor 21, 21, 8
+ mfvsrld 17, 52
+ mfvsrld 19, 53
+ srdi 16, 16, 24
+
+ std 17, 0(3)
+ std 19, 8(3)
+ stw 16, 16(3)
+
+Out_loop:
+ li 3, 0
+
+ RESTORE_REGS
+
+ blr
+
+Out_no_poly1305:
+ li 3, 0
+ blr
+SYM_FUNC_END(poly1305_p10le_4blocks)
+
+#
+# =======================================================================
+# The following functions implement Poly1305 using 64 x 64 bit multiplication.
+#
+SYM_FUNC_START_LOCAL(Poly1305_init_64)
+ # mask 0x0FFFFFFC0FFFFFFC
+ # mask 0x0FFFFFFC0FFFFFFF
+ addis 10, 2, rmask@toc@ha
+ addi 10, 10, rmask@toc@l
+ ld 11, 0(10)
+ ld 12, 8(10)
+
+ # initialize
+ # load key from r3
+ ld 9, 24(3)
+ ld 10, 32(3)
+	and.	9, 9, 11	# clamp mask r0
+	and.	10, 10, 12	# clamp mask r1
+
+ srdi 21, 10, 2
+	add	19, 21, 10	# s1: r19 = r1 + (r1 >> 2)
+
+ # setup r and s
+ li 25, 0
+ mtvsrdd 32+0, 9, 19 # r0, s1
+ mtvsrdd 32+1, 10, 9 # r1, r0
+ mtvsrdd 32+2, 19, 25 # s1
+ mtvsrdd 32+3, 9, 25 # r0
+
+ blr
+SYM_FUNC_END(Poly1305_init_64)
+
+# Poly1305_mult
+# v6 = (h0, h1), v8 = h2
+# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0
+#
+# Output: v7, v10, v11
+#
+SYM_FUNC_START_LOCAL(Poly1305_mult)
+ #
+ # d0 = h0 * r0 + h1 * s1
+ vmsumudm 7, 6, 0, 9 # h0 * r0, h1 * s1
+
+ # d1 = h0 * r1 + h1 * r0 + h2 * s1
+ vmsumudm 11, 6, 1, 9 # h0 * r1, h1 * r0
+ vmsumudm 10, 8, 2, 11 # d1 += h2 * s1
+
+	# d2 = h2 * r0
+ vmsumudm 11, 8, 3, 9 # d2 = h2 * r0
+ blr
+SYM_FUNC_END(Poly1305_mult)
+
+#
+# carry reduction
+# h %= p
+#
+# Input: v7, v10, v11
+# Output: r27, r28, r29
+#
+SYM_FUNC_START_LOCAL(Carry_reduction)
+ mfvsrld 27, 32+7
+ mfvsrld 28, 32+10
+ mfvsrld 29, 32+11
+ mfvsrd 20, 32+7 # h0.h
+ mfvsrd 21, 32+10 # h1.h
+
+ addc 28, 28, 20
+ adde 29, 29, 21
+ srdi 22, 29, 0x2
+ sldi 23, 22, 0x2
+	add	23, 23, 22	# (h2 >> 2) * 5
+ addc 27, 27, 23 # h0
+ addze 28, 28 # h1
+ andi. 29, 29, 0x3 # h2
+ blr
+SYM_FUNC_END(Carry_reduction)
+
+#
+# poly1305 multiplication
+# h *= r, h %= p
+# d0 = h0 * r0 + h1 * s1
+# d1 = h0 * r1 + h1 * r0 + h2 * s1
+# d2 = h2 * r0
+#
+#
+# unsigned int poly1305_64s(unsigned char *state, const byte *src, size_t len, highbit)
+# - no highbit if final leftover block (highbit = 0)
+#
+SYM_FUNC_START(poly1305_64s)
+ cmpdi 5, 0
+ ble Out_no_poly1305_64
+
+ mflr 0
+ std 0, 16(1)
+ stdu 1,-400(1)
+
+ SAVE_GPR 14, 112, 1
+ SAVE_GPR 15, 120, 1
+ SAVE_GPR 16, 128, 1
+ SAVE_GPR 17, 136, 1
+ SAVE_GPR 18, 144, 1
+ SAVE_GPR 19, 152, 1
+ SAVE_GPR 20, 160, 1
+ SAVE_GPR 21, 168, 1
+ SAVE_GPR 22, 176, 1
+ SAVE_GPR 23, 184, 1
+ SAVE_GPR 24, 192, 1
+ SAVE_GPR 25, 200, 1
+ SAVE_GPR 26, 208, 1
+ SAVE_GPR 27, 216, 1
+ SAVE_GPR 28, 224, 1
+ SAVE_GPR 29, 232, 1
+ SAVE_GPR 30, 240, 1
+ SAVE_GPR 31, 248, 1
+
+ # Init poly1305
+ bl Poly1305_init_64
+
+ li 25, 0 # offset to inp and outp
+
+ add 11, 25, 4
+
+ # load h
+ # h0, h1, h2?
+ ld 27, 0(3)
+ ld 28, 8(3)
+ lwz 29, 16(3)
+
+ li 30, 16
+ divdu 31, 5, 30
+
+ mtctr 31
+
+ mr 24, 6 # highbit
+
+Loop_block_64:
+ vxor 9, 9, 9
+
+ ld 20, 0(11)
+ ld 21, 8(11)
+ addi 11, 11, 16
+
+ addc 27, 27, 20
+ adde 28, 28, 21
+ adde 29, 29, 24
+
+ li 22, 0
+ mtvsrdd 32+6, 27, 28 # h0, h1
+ mtvsrdd 32+8, 29, 22 # h2
+
+ bl Poly1305_mult
+
+ bl Carry_reduction
+
+ bdnz Loop_block_64
+
+ std 27, 0(3)
+ std 28, 8(3)
+ stw 29, 16(3)
+
+ li 3, 0
+
+ RESTORE_GPR 14, 112, 1
+ RESTORE_GPR 15, 120, 1
+ RESTORE_GPR 16, 128, 1
+ RESTORE_GPR 17, 136, 1
+ RESTORE_GPR 18, 144, 1
+ RESTORE_GPR 19, 152, 1
+ RESTORE_GPR 20, 160, 1
+ RESTORE_GPR 21, 168, 1
+ RESTORE_GPR 22, 176, 1
+ RESTORE_GPR 23, 184, 1
+ RESTORE_GPR 24, 192, 1
+ RESTORE_GPR 25, 200, 1
+ RESTORE_GPR 26, 208, 1
+ RESTORE_GPR 27, 216, 1
+ RESTORE_GPR 28, 224, 1
+ RESTORE_GPR 29, 232, 1
+ RESTORE_GPR 30, 240, 1
+ RESTORE_GPR 31, 248, 1
+
+ addi 1, 1, 400
+ ld 0, 16(1)
+ mtlr 0
+
+ blr
+
+Out_no_poly1305_64:
+ li 3, 0
+ blr
+SYM_FUNC_END(poly1305_64s)
+
+#
+# Input: r3 = h, r4 = s, r5 = mac
+# mac = h + s
+#
+SYM_FUNC_START(poly1305_emit_64)
+ ld 10, 0(3)
+ ld 11, 8(3)
+ ld 12, 16(3)
+
+ # compare modulus
+ # h + 5 + (-p)
+ mr 6, 10
+ mr 7, 11
+ mr 8, 12
+ addic. 6, 6, 5
+ addze 7, 7
+ addze 8, 8
+ srdi 9, 8, 2 # overflow?
+ cmpdi 9, 0
+ beq Skip_h64
+ mr 10, 6
+ mr 11, 7
+ mr 12, 8
+
+Skip_h64:
+ ld 6, 0(4)
+ ld 7, 8(4)
+ addc 10, 10, 6
+ adde 11, 11, 7
+ addze 12, 12
+
+ std 10, 0(5)
+ std 11, 8(5)
+ blr
+SYM_FUNC_END(poly1305_emit_64)
+
+SYM_DATA_START_LOCAL(RMASK)
+.align 5
+rmask:
+.byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f
+cnum:
+.long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000
+.long 0x1a, 0x00, 0x1a, 0x00
+.long 0x01000000, 0x01000000, 0x01000000, 0x01000000
+.long 0x00010203, 0x04050607, 0x10111213, 0x14151617
+.long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f
+SYM_DATA_END(RMASK)
diff --git a/lib/crypto/powerpc/poly1305.h b/lib/crypto/powerpc/poly1305.h
new file mode 100644
index 000000000000..b8ed098a0e95
--- /dev/null
+++ b/lib/crypto/powerpc/poly1305.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Poly1305 authenticator algorithm, RFC7539.
+ *
+ * Copyright 2023- IBM Corp. All rights reserved.
+ */
+#include <asm/switch_to.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/unaligned.h>
+
+asmlinkage void poly1305_p10le_4blocks(struct poly1305_block_state *state, const u8 *m, u32 mlen);
+asmlinkage void poly1305_64s(struct poly1305_block_state *state, const u8 *m, u32 mlen, int highbit);
+asmlinkage void poly1305_emit_64(const struct poly1305_state *state, const u32 nonce[4], u8 digest[POLY1305_DIGEST_SIZE]);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10);
+
+static void vsx_begin(void)
+{
+ preempt_disable();
+ enable_kernel_vsx();
+}
+
+static void vsx_end(void)
+{
+ disable_kernel_vsx();
+ preempt_enable();
+}
+
+static void poly1305_block_init(struct poly1305_block_state *dctx,
+ const u8 raw_key[POLY1305_BLOCK_SIZE])
+{
+ if (!static_key_enabled(&have_p10))
+ return poly1305_block_init_generic(dctx, raw_key);
+
+ dctx->h = (struct poly1305_state){};
+ dctx->core_r.key.r64[0] = get_unaligned_le64(raw_key + 0);
+ dctx->core_r.key.r64[1] = get_unaligned_le64(raw_key + 8);
+}
+
+static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src,
+ unsigned int len, u32 padbit)
+{
+ if (!static_key_enabled(&have_p10))
+ return poly1305_blocks_generic(state, src, len, padbit);
+ vsx_begin();
+ if (len >= POLY1305_BLOCK_SIZE * 4) {
+ poly1305_p10le_4blocks(state, src, len);
+ src += len - (len % (POLY1305_BLOCK_SIZE * 4));
+ len %= POLY1305_BLOCK_SIZE * 4;
+ }
+ while (len >= POLY1305_BLOCK_SIZE) {
+ poly1305_64s(state, src, POLY1305_BLOCK_SIZE, padbit);
+ len -= POLY1305_BLOCK_SIZE;
+ src += POLY1305_BLOCK_SIZE;
+ }
+ vsx_end();
+}
+
+static void poly1305_emit(const struct poly1305_state *state,
+ u8 digest[POLY1305_DIGEST_SIZE], const u32 nonce[4])
+{
+ if (!static_key_enabled(&have_p10))
+ return poly1305_emit_generic(state, digest, nonce);
+ poly1305_emit_64(state, nonce, digest);
+}
+
+#define poly1305_mod_init_arch poly1305_mod_init_arch
+static void poly1305_mod_init_arch(void)
+{
+ if (cpu_has_feature(CPU_FTR_ARCH_31))
+ static_branch_enable(&have_p10);
+}
diff --git a/lib/crypto/powerpc/sha1-powerpc-asm.S b/lib/crypto/powerpc/sha1-powerpc-asm.S
new file mode 100644
index 000000000000..f0d5ed557ab1
--- /dev/null
+++ b/lib/crypto/powerpc/sha1-powerpc-asm.S
@@ -0,0 +1,188 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * SHA-1 implementation for PowerPC.
+ *
+ * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/asm-compat.h>
+
+#ifdef __BIG_ENDIAN__
+#define LWZ(rt, d, ra) \
+ lwz rt,d(ra)
+#else
+#define LWZ(rt, d, ra) \
+ li rt,d; \
+ lwbrx rt,rt,ra
+#endif
+
+/*
+ * We roll the registers for T, A, B, C, D, E around on each
+ * iteration; T on iteration t is A on iteration t+1, and so on.
+ * We use registers 7 - 12 for this.
+ */
+#define RT(t) ((((t)+5)%6)+7)
+#define RA(t) ((((t)+4)%6)+7)
+#define RB(t) ((((t)+3)%6)+7)
+#define RC(t) ((((t)+2)%6)+7)
+#define RD(t) ((((t)+1)%6)+7)
+#define RE(t) ((((t)+0)%6)+7)
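+/* For example, RT(0) and RA(1) are both r12: round t's T is round t+1's A. */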
+
+/* We use registers 16 - 31 for the W values */
+#define W(t) (((t)%16)+16)
+
+#define LOADW(t) \
+ LWZ(W(t),(t)*4,r4)
+
+#define STEPD0_LOAD(t) \
+ andc r0,RD(t),RB(t); \
+ and r6,RB(t),RC(t); \
+ rotlwi RT(t),RA(t),5; \
+ or r6,r6,r0; \
+ add r0,RE(t),r15; \
+ add RT(t),RT(t),r6; \
+ add r14,r0,W(t); \
+ LWZ(W((t)+4),((t)+4)*4,r4); \
+ rotlwi RB(t),RB(t),30; \
+ add RT(t),RT(t),r14
+
+#define STEPD0_UPDATE(t) \
+ and r6,RB(t),RC(t); \
+ andc r0,RD(t),RB(t); \
+ rotlwi RT(t),RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ or r6,r6,r0; \
+ add r0,RE(t),r15; \
+ xor r5,W((t)+4-3),W((t)+4-8); \
+ add RT(t),RT(t),r6; \
+ xor W((t)+4),W((t)+4-16),W((t)+4-14); \
+ add r0,r0,W(t); \
+ xor W((t)+4),W((t)+4),r5; \
+ add RT(t),RT(t),r0; \
+ rotlwi W((t)+4),W((t)+4),1
+
+#define STEPD1(t) \
+ xor r6,RB(t),RC(t); \
+ rotlwi RT(t),RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ xor r6,r6,RD(t); \
+ add r0,RE(t),r15; \
+ add RT(t),RT(t),r6; \
+ add r0,r0,W(t); \
+ add RT(t),RT(t),r0
+
+#define STEPD1_UPDATE(t) \
+ xor r6,RB(t),RC(t); \
+ rotlwi RT(t),RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ xor r6,r6,RD(t); \
+ add r0,RE(t),r15; \
+ xor r5,W((t)+4-3),W((t)+4-8); \
+ add RT(t),RT(t),r6; \
+ xor W((t)+4),W((t)+4-16),W((t)+4-14); \
+ add r0,r0,W(t); \
+ xor W((t)+4),W((t)+4),r5; \
+ add RT(t),RT(t),r0; \
+ rotlwi W((t)+4),W((t)+4),1
+
+#define STEPD2_UPDATE(t) \
+ and r6,RB(t),RC(t); \
+ and r0,RB(t),RD(t); \
+ rotlwi RT(t),RA(t),5; \
+ or r6,r6,r0; \
+ rotlwi RB(t),RB(t),30; \
+ and r0,RC(t),RD(t); \
+ xor r5,W((t)+4-3),W((t)+4-8); \
+ or r6,r6,r0; \
+ xor W((t)+4),W((t)+4-16),W((t)+4-14); \
+ add r0,RE(t),r15; \
+ add RT(t),RT(t),r6; \
+ add r0,r0,W(t); \
+ xor W((t)+4),W((t)+4),r5; \
+ add RT(t),RT(t),r0; \
+ rotlwi W((t)+4),W((t)+4),1
+
+#define STEP0LD4(t) \
+ STEPD0_LOAD(t); \
+ STEPD0_LOAD((t)+1); \
+ STEPD0_LOAD((t)+2); \
+ STEPD0_LOAD((t)+3)
+
+#define STEPUP4(t, fn) \
+ STEP##fn##_UPDATE(t); \
+ STEP##fn##_UPDATE((t)+1); \
+ STEP##fn##_UPDATE((t)+2); \
+ STEP##fn##_UPDATE((t)+3)
+
+#define STEPUP20(t, fn) \
+ STEPUP4(t, fn); \
+ STEPUP4((t)+4, fn); \
+ STEPUP4((t)+8, fn); \
+ STEPUP4((t)+12, fn); \
+ STEPUP4((t)+16, fn)
+
+_GLOBAL(powerpc_sha_transform)
+ PPC_STLU r1,-INT_FRAME_SIZE(r1)
+ SAVE_GPRS(14, 31, r1)
+
+ /* Load up A - E */
+ lwz RA(0),0(r3) /* A */
+ lwz RB(0),4(r3) /* B */
+ lwz RC(0),8(r3) /* C */
+ lwz RD(0),12(r3) /* D */
+ lwz RE(0),16(r3) /* E */
+
+ LOADW(0)
+ LOADW(1)
+ LOADW(2)
+ LOADW(3)
+
+ lis r15,0x5a82 /* K0-19 */
+ ori r15,r15,0x7999
+ STEP0LD4(0)
+ STEP0LD4(4)
+ STEP0LD4(8)
+ STEPUP4(12, D0)
+ STEPUP4(16, D0)
+
+ lis r15,0x6ed9 /* K20-39 */
+ ori r15,r15,0xeba1
+ STEPUP20(20, D1)
+
+ lis r15,0x8f1b /* K40-59 */
+ ori r15,r15,0xbcdc
+ STEPUP20(40, D2)
+
+ lis r15,0xca62 /* K60-79 */
+ ori r15,r15,0xc1d6
+ STEPUP4(60, D1)
+ STEPUP4(64, D1)
+ STEPUP4(68, D1)
+ STEPUP4(72, D1)
+ lwz r20,16(r3)
+ STEPD1(76)
+ lwz r19,12(r3)
+ STEPD1(77)
+ lwz r18,8(r3)
+ STEPD1(78)
+ lwz r17,4(r3)
+ STEPD1(79)
+
+ lwz r16,0(r3)
+ add r20,RE(80),r20
+ add RD(0),RD(80),r19
+ add RC(0),RC(80),r18
+ add RB(0),RB(80),r17
+ add RA(0),RA(80),r16
+ mr RE(0),r20
+ stw RA(0),0(r3)
+ stw RB(0),4(r3)
+ stw RC(0),8(r3)
+ stw RD(0),12(r3)
+ stw RE(0),16(r3)
+
+ REST_GPRS(14, 31, r1)
+ addi r1,r1,INT_FRAME_SIZE
+ blr
diff --git a/lib/crypto/powerpc/sha1-spe-asm.S b/lib/crypto/powerpc/sha1-spe-asm.S
new file mode 100644
index 000000000000..0f447523be5e
--- /dev/null
+++ b/lib/crypto/powerpc/sha1-spe-asm.S
@@ -0,0 +1,294 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Fast SHA-1 implementation for SPE instruction set (PPC)
+ *
+ * This code makes use of the SPE SIMD instruction set as defined in
+ * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
+ * Implementation is based on optimization guide notes from
+ * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+#define rHP r3 /* pointer to hash value */
+#define rWP r4 /* pointer to input */
+#define rKP r5 /* pointer to constants */
+
+#define rW0 r14 /* 64 bit round words */
+#define rW1 r15
+#define rW2 r16
+#define rW3 r17
+#define rW4 r18
+#define rW5 r19
+#define rW6 r20
+#define rW7 r21
+
+#define rH0 r6 /* 32 bit hash values */
+#define rH1 r7
+#define rH2 r8
+#define rH3 r9
+#define rH4 r10
+
+#define rT0 r22 /* 64 bit temporary */
+#define rT1 r0 /* 32 bit temporaries */
+#define rT2 r11
+#define rT3 r12
+
+#define rK r23 /* 64 bit constant in volatile register */
+
+#define LOAD_K01
+
+#define LOAD_K11 \
+ evlwwsplat rK,0(rKP);
+
+#define LOAD_K21 \
+ evlwwsplat rK,4(rKP);
+
+#define LOAD_K31 \
+ evlwwsplat rK,8(rKP);
+
+#define LOAD_K41 \
+ evlwwsplat rK,12(rKP);
+
+#define INITIALIZE \
+ stwu r1,-128(r1); /* create stack frame */ \
+ evstdw r14,8(r1); /* We must save non volatile */ \
+ evstdw r15,16(r1); /* registers. Take the chance */ \
+ evstdw r16,24(r1); /* and save the SPE part too */ \
+ evstdw r17,32(r1); \
+ evstdw r18,40(r1); \
+ evstdw r19,48(r1); \
+ evstdw r20,56(r1); \
+ evstdw r21,64(r1); \
+ evstdw r22,72(r1); \
+ evstdw r23,80(r1);
+
+
+#define FINALIZE \
+ evldw r14,8(r1); /* restore SPE registers */ \
+ evldw r15,16(r1); \
+ evldw r16,24(r1); \
+ evldw r17,32(r1); \
+ evldw r18,40(r1); \
+ evldw r19,48(r1); \
+ evldw r20,56(r1); \
+ evldw r21,64(r1); \
+ evldw r22,72(r1); \
+ evldw r23,80(r1); \
+ xor r0,r0,r0; \
+ stw r0,8(r1); /* Delete sensitive data */ \
+ stw r0,16(r1); /* that we might have pushed */ \
+ stw r0,24(r1); /* from other context that runs */ \
+ stw r0,32(r1); /* the same code. Assume that */ \
+ stw r0,40(r1); /* the lower part of the GPRs */ \
+ stw r0,48(r1); /* were already overwritten on */ \
+ stw r0,56(r1); /* the way down to here */ \
+ stw r0,64(r1); \
+ stw r0,72(r1); \
+ stw r0,80(r1); \
+ addi r1,r1,128; /* cleanup stack frame */
+
+#ifdef __BIG_ENDIAN__
+#define LOAD_DATA(reg, off) \
+ lwz reg,off(rWP); /* load data */
+#define NEXT_BLOCK \
+ addi rWP,rWP,64; /* increment per block */
+#else
+#define LOAD_DATA(reg, off) \
+ lwbrx reg,0,rWP; /* load data */ \
+ addi rWP,rWP,4; /* increment per word */
+#define NEXT_BLOCK /* nothing to do */
+#endif
+
+#define R_00_15(a, b, c, d, e, w0, w1, k, off) \
+ LOAD_DATA(w0, off) /* 1: W */ \
+ and rT2,b,c; /* 1: F' = B and C */ \
+ LOAD_K##k##1 \
+ andc rT1,d,b; /* 1: F" = ~B and D */ \
+ rotrwi rT0,a,27; /* 1: A' = A rotl 5 */ \
+ or rT2,rT2,rT1; /* 1: F = F' or F" */ \
+ add e,e,rT0; /* 1: E = E + A' */ \
+ rotrwi b,b,2; /* 1: B = B rotl 30 */ \
+ add e,e,w0; /* 1: E = E + W */ \
+ LOAD_DATA(w1, off+4) /* 2: W */ \
+ add e,e,rT2; /* 1: E = E + F */ \
+ and rT1,a,b; /* 2: F' = B and C */ \
+ add e,e,rK; /* 1: E = E + K */ \
+ andc rT2,c,a; /* 2: F" = ~B and D */ \
+ add d,d,rK; /* 2: E = E + K */ \
+ or rT2,rT2,rT1; /* 2: F = F' or F" */ \
+ rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
+ add d,d,w1; /* 2: E = E + W */ \
+ rotrwi a,a,2; /* 2: B = B rotl 30 */ \
+ add d,d,rT0; /* 2: E = E + A' */ \
+ evmergelo w1,w1,w0; /* mix W[0]/W[1] */ \
+ add d,d,rT2 /* 2: E = E + F */
+
+#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+ and rT2,b,c; /* 1: F' = B and C */ \
+ evmergelohi rT0,w7,w6; /* W[-3] */ \
+ andc rT1,d,b; /* 1: F" = ~B and D */ \
+ evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
+ or rT1,rT1,rT2; /* 1: F = F' or F" */ \
+ evxor w0,w0,w4; /* W = W xor W[-8] */ \
+ add e,e,rT1; /* 1: E = E + F */ \
+ evxor w0,w0,w1; /* W = W xor W[-14] */ \
+ rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
+ evrlwi w0,w0,1; /* W = W rotl 1 */ \
+ add e,e,rT2; /* 1: E = E + A' */ \
+ evaddw rT0,w0,rK; /* WK = W + K */ \
+ rotrwi b,b,2; /* 1: B = B rotl 30 */ \
+ LOAD_K##k##1 \
+ evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
+ add e,e,rT0; /* 1: E = E + WK */ \
+ add d,d,rT1; /* 2: E = E + WK */ \
+ and rT2,a,b; /* 2: F' = B and C */ \
+ andc rT1,c,a; /* 2: F" = ~B and D */ \
+ rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
+ or rT1,rT1,rT2; /* 2: F = F' or F" */ \
+ add d,d,rT0; /* 2: E = E + A' */ \
+ rotrwi a,a,2; /* 2: B = B rotl 30 */ \
+ add d,d,rT1 /* 2: E = E + F */
+
+#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+ evmergelohi rT0,w7,w6; /* W[-3] */ \
+ xor rT2,b,c; /* 1: F' = B xor C */ \
+ evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
+ xor rT2,rT2,d; /* 1: F = F' xor D */ \
+ evxor w0,w0,w4; /* W = W xor W[-8] */ \
+ add e,e,rT2; /* 1: E = E + F */ \
+ evxor w0,w0,w1; /* W = W xor W[-14] */ \
+ rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
+ evrlwi w0,w0,1; /* W = W rotl 1 */ \
+ add e,e,rT2; /* 1: E = E + A' */ \
+ evaddw rT0,w0,rK; /* WK = W + K */ \
+ rotrwi b,b,2; /* 1: B = B rotl 30 */ \
+ LOAD_K##k##1 \
+ evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
+ add e,e,rT0; /* 1: E = E + WK */ \
+ xor rT2,a,b; /* 2: F' = B xor C */ \
+ add d,d,rT1; /* 2: E = E + WK */ \
+ xor rT2,rT2,c; /* 2: F = F' xor D */ \
+ rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
+ add d,d,rT2; /* 2: E = E + F */ \
+ rotrwi a,a,2; /* 2: B = B rotl 30 */ \
+ add d,d,rT0 /* 2: E = E + A' */
+
+#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+ and rT2,b,c; /* 1: F' = B and C */ \
+ evmergelohi rT0,w7,w6; /* W[-3] */ \
+ or rT1,b,c; /* 1: F" = B or C */ \
+ evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \
+ and rT1,d,rT1; /* 1: F" = F" and D */ \
+ evxor w0,w0,w4; /* W = W xor W[-8] */ \
+ or rT2,rT2,rT1; /* 1: F = F' or F" */ \
+ evxor w0,w0,w1; /* W = W xor W[-14] */ \
+ add e,e,rT2; /* 1: E = E + F */ \
+ evrlwi w0,w0,1; /* W = W rotl 1 */ \
+ rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \
+ evaddw rT0,w0,rK; /* WK = W + K */ \
+ add e,e,rT2; /* 1: E = E + A' */ \
+ LOAD_K##k##1 \
+ evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \
+ rotrwi b,b,2; /* 1: B = B rotl 30 */ \
+ add e,e,rT0; /* 1: E = E + WK */ \
+ and rT2,a,b; /* 2: F' = B and C */ \
+ or rT0,a,b; /* 2: F" = B or C */ \
+ add d,d,rT1; /* 2: E = E + WK */ \
+ and rT0,c,rT0; /* 2: F" = F" and D */ \
+ rotrwi a,a,2; /* 2: B = B rotl 30 */ \
+ or rT2,rT2,rT0; /* 2: F = F' or F" */ \
+ rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \
+ add d,d,rT2; /* 2: E = E + F */ \
+ add d,d,rT0 /* 2: E = E + A' */
+
+#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
+ R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
+
+_GLOBAL(ppc_spe_sha1_transform)
+ INITIALIZE
+
+ lwz rH0,0(rHP)
+ lwz rH1,4(rHP)
+ mtctr r5
+ lwz rH2,8(rHP)
+ lis rKP,PPC_SPE_SHA1_K@h
+ lwz rH3,12(rHP)
+ ori rKP,rKP,PPC_SPE_SHA1_K@l
+ lwz rH4,16(rHP)
+
+ppc_spe_sha1_main:
+ R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
+ R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
+ R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
+ R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
+ R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
+ R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
+ R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
+ R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)
+
+ R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
+ R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)
+
+ R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
+ R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
+ R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
+ R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
+ R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
+ R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
+ R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
+ R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
+ R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
+ R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)
+
+ R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
+ R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
+ R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
+ R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
+ R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
+ R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
+ R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
+ R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
+ R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
+ R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)
+
+ R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
+ R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
+ R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
+ R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
+ R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
+ R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
+ R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
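+	/* reload the previous hash words, interleaved with the final rounds */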
+ lwz rT3,0(rHP)
+ R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
+ lwz rW1,4(rHP)
+ R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
+ lwz rW2,8(rHP)
+ R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
+ lwz rW3,12(rHP)
+ NEXT_BLOCK
+ lwz rW4,16(rHP)
+
+ add rH0,rH0,rT3
+ stw rH0,0(rHP)
+ add rH1,rH1,rW1
+ stw rH1,4(rHP)
+ add rH2,rH2,rW2
+ stw rH2,8(rHP)
+ add rH3,rH3,rW3
+ stw rH3,12(rHP)
+ add rH4,rH4,rW4
+ stw rH4,16(rHP)
+
+ bdnz ppc_spe_sha1_main
+
+ FINALIZE
+ blr
+
+.data
+.align 4
+PPC_SPE_SHA1_K:
+ .long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6
diff --git a/lib/crypto/powerpc/sha1.h b/lib/crypto/powerpc/sha1.h
new file mode 100644
index 000000000000..e2c010f0370b
--- /dev/null
+++ b/lib/crypto/powerpc/sha1.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-1 optimized for PowerPC
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+
+#include <asm/switch_to.h>
+#include <linux/preempt.h>
+
+#ifdef CONFIG_SPE
+/*
+ * MAX_BYTES defines the number of bytes that are allowed to be processed
+ * between preempt_disable() and preempt_enable(). SHA1 takes ~1000
+ * operations per 64 bytes. e500 cores can issue two arithmetic instructions
+ * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2).
+ * Thus 2KB of input data will need an estimated maximum of 18,000 cycles.
+ * Headroom for cache misses included. Even with the low-end model clocked
+ * at 667 MHz this equates to a critical time window of less than 27us.
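+ * (2048 bytes = 32 blocks; 32 * ~1000 ops / 2 ops per cycle ~= 16,000 cycles,
+ * ~18,000 with headroom; 18,000 cycles / 667 MHz ~= 27 us.)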
+ *
+ */
+#define MAX_BYTES 2048
+
+asmlinkage void ppc_spe_sha1_transform(struct sha1_block_state *state,
+ const u8 *data, u32 nblocks);
+
+static void spe_begin(void)
+{
+ /* We just start SPE operations and will save SPE registers later. */
+ preempt_disable();
+ enable_kernel_spe();
+}
+
+static void spe_end(void)
+{
+ disable_kernel_spe();
+ /* reenable preemption */
+ preempt_enable();
+}
+
+static void sha1_blocks(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ do {
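+		/* cut input data into smaller blocks */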
+ u32 unit = min_t(size_t, nblocks, MAX_BYTES / SHA1_BLOCK_SIZE);
+
+ spe_begin();
+ ppc_spe_sha1_transform(state, data, unit);
+ spe_end();
+
+ data += unit * SHA1_BLOCK_SIZE;
+ nblocks -= unit;
+ } while (nblocks);
+}
+#else /* CONFIG_SPE */
+asmlinkage void powerpc_sha_transform(struct sha1_block_state *state,
+ const u8 data[SHA1_BLOCK_SIZE]);
+
+static void sha1_blocks(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ do {
+ powerpc_sha_transform(state, data);
+ data += SHA1_BLOCK_SIZE;
+ } while (--nblocks);
+}
+#endif /* !CONFIG_SPE */
diff --git a/lib/crypto/powerpc/sha256-spe-asm.S b/lib/crypto/powerpc/sha256-spe-asm.S
new file mode 100644
index 000000000000..cd99d71dae34
--- /dev/null
+++ b/lib/crypto/powerpc/sha256-spe-asm.S
@@ -0,0 +1,318 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Fast SHA-256 implementation for SPE instruction set (PPC)
+ *
+ * This code makes use of the SPE SIMD instruction set as defined in
+ * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
+ * Implementation is based on optimization guide notes from
+ * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+#define rHP r3 /* pointer to hash values in memory */
+#define rKP r24 /* pointer to round constants */
+#define rWP r4 /* pointer to input data */
+
+#define rH0 r5 /* 8 32 bit hash values in 8 registers */
+#define rH1 r6
+#define rH2 r7
+#define rH3 r8
+#define rH4 r9
+#define rH5 r10
+#define rH6 r11
+#define rH7 r12
+
+#define rW0 r14 /* 64 bit registers. 16 words in 8 registers */
+#define rW1 r15
+#define rW2 r16
+#define rW3 r17
+#define rW4 r18
+#define rW5 r19
+#define rW6 r20
+#define rW7 r21
+
+#define rT0 r22 /* 64 bit temporaries */
+#define rT1 r23
+#define rT2 r0 /* 32 bit temporaries */
+#define rT3 r25
+
+#define CMP_KN_LOOP
+#define CMP_KC_LOOP \
+ cmpwi rT1,0;
+
+#define INITIALIZE \
+ stwu r1,-128(r1); /* create stack frame */ \
+ evstdw r14,8(r1); /* We must save non volatile */ \
+ evstdw r15,16(r1); /* registers. Take the chance */ \
+ evstdw r16,24(r1); /* and save the SPE part too */ \
+ evstdw r17,32(r1); \
+ evstdw r18,40(r1); \
+ evstdw r19,48(r1); \
+ evstdw r20,56(r1); \
+ evstdw r21,64(r1); \
+ evstdw r22,72(r1); \
+ evstdw r23,80(r1); \
+ stw r24,88(r1); /* save normal registers */ \
+ stw r25,92(r1);
+
+
+#define FINALIZE \
+ evldw r14,8(r1); /* restore SPE registers */ \
+ evldw r15,16(r1); \
+ evldw r16,24(r1); \
+ evldw r17,32(r1); \
+ evldw r18,40(r1); \
+ evldw r19,48(r1); \
+ evldw r20,56(r1); \
+ evldw r21,64(r1); \
+ evldw r22,72(r1); \
+ evldw r23,80(r1); \
+ lwz r24,88(r1); /* restore normal registers */ \
+ lwz r25,92(r1); \
+ xor r0,r0,r0; \
+ stw r0,8(r1); /* Delete sensitive data */ \
+ stw r0,16(r1); /* that we might have pushed */ \
+ stw r0,24(r1); /* from other context that runs */ \
+ stw r0,32(r1); /* the same code. Assume that */ \
+ stw r0,40(r1); /* the lower part of the GPRs */ \
+ stw r0,48(r1); /* was already overwritten on */ \
+ stw r0,56(r1); /* the way down to here */ \
+ stw r0,64(r1); \
+ stw r0,72(r1); \
+ stw r0,80(r1); \
+ addi r1,r1,128; /* cleanup stack frame */
+
+#ifdef __BIG_ENDIAN__
+#define LOAD_DATA(reg, off) \
+ lwz reg,off(rWP); /* load data */
+#define NEXT_BLOCK \
+ addi rWP,rWP,64; /* increment per block */
+#else
+#define LOAD_DATA(reg, off) \
+ lwbrx reg,0,rWP; /* load data */ \
+ addi rWP,rWP,4; /* increment per word */
+#define NEXT_BLOCK /* nothing to do */
+#endif
+
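+/*
+ * R_LOAD_W: two SHA-256 rounds that also load the next two message words
+ * W[i] and W[i+1] from the input block.
+ */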
+#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \
+ LOAD_DATA(w, off) /* 1: W */ \
+ rotrwi rT0,e,6; /* 1: S1 = e rotr 6 */ \
+ rotrwi rT1,e,11; /* 1: S1' = e rotr 11 */ \
+ rotrwi rT2,e,25; /* 1: S1" = e rotr 25 */ \
+ xor rT0,rT0,rT1; /* 1: S1 = S1 xor S1' */ \
+ and rT3,e,f; /* 1: ch = e and f */ \
+ xor rT0,rT0,rT2; /* 1: S1 = S1 xor S1" */ \
+ andc rT1,g,e; /* 1: ch' = ~e and g */ \
+ lwz rT2,off(rKP); /* 1: K */ \
+ xor rT3,rT3,rT1; /* 1: ch = ch xor ch' */ \
+ add h,h,rT0; /* 1: temp1 = h + S1 */ \
+ add rT3,rT3,w; /* 1: temp1' = ch + w */ \
+ rotrwi rT0,a,2; /* 1: S0 = a rotr 2 */ \
+ add h,h,rT3; /* 1: temp1 = temp1 + temp1' */ \
+ rotrwi rT1,a,13; /* 1: S0' = a rotr 13 */ \
+ add h,h,rT2; /* 1: temp1 = temp1 + K */ \
+ rotrwi rT3,a,22; /* 1: S0" = a rotr 22 */ \
+ xor rT0,rT0,rT1; /* 1: S0 = S0 xor S0' */ \
+ add d,d,h; /* 1: d = d + temp1 */ \
+ xor rT3,rT0,rT3; /* 1: S0 = S0 xor S0" */ \
+ evmergelo w,w,w; /* shift W */ \
+ or rT2,a,b; /* 1: maj = a or b */ \
+ and rT1,a,b; /* 1: maj' = a and b */ \
+ and rT2,rT2,c; /* 1: maj = maj and c */ \
+ LOAD_DATA(w, off+4) /* 2: W */ \
+ or rT2,rT1,rT2; /* 1: maj = maj or maj' */ \
+ rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \
+ add rT3,rT3,rT2; /* 1: temp2 = S0 + maj */ \
+ rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \
+ add h,h,rT3; /* 1: h = temp1 + temp2 */ \
+ rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \
+ xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \
+ and rT3,d,e; /* 2: ch = e and f */ \
+ xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \
+ andc rT1,f,d; /* 2: ch' = ~e and g */ \
+ lwz rT2,off+4(rKP); /* 2: K */ \
+ xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \
+ add g,g,rT0; /* 2: temp1 = h + S1 */ \
+ add rT3,rT3,w; /* 2: temp1' = ch + w */ \
+ rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \
+ add g,g,rT3; /* 2: temp1 = temp1 + temp1' */ \
+ rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \
+ add g,g,rT2; /* 2: temp1 = temp1 + K */ \
+ rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \
+ xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \
+ or rT2,h,a; /* 2: maj = a or b */ \
+ xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \
+ and rT1,h,a; /* 2: maj' = a and b */ \
+ and rT2,rT2,b; /* 2: maj = maj and c */ \
+ add c,c,g; /* 2: d = d + temp1 */ \
+ or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \
+ add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \
+ add g,g,rT3 /* 2: h = temp1 + temp2 */
+
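+/*
+ * R_CALC_W: two SHA-256 rounds with on-the-fly message schedule
+ * W = W[-16] + s0(W[-15]) + W[-7] + s1(W[-2]), two words per call using
+ * the 64-bit SPE registers.
+ */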
+#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \
+ rotrwi rT2,e,6; /* 1: S1 = e rotr 6 */ \
+ evmergelohi rT0,w0,w1; /* w[-15] */ \
+ rotrwi rT3,e,11; /* 1: S1' = e rotr 11 */ \
+ evsrwiu rT1,rT0,3; /* s0 = w[-15] >> 3 */ \
+ xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \
+ evrlwi rT0,rT0,25; /* s0' = w[-15] rotr 7 */ \
+ rotrwi rT3,e,25; /* 1: S1' = e rotr 25 */ \
+ evxor rT1,rT1,rT0; /* s0 = s0 xor s0' */ \
+ xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \
+ evrlwi rT0,rT0,21; /* s0' = w[-15] rotr 18 */ \
+ add h,h,rT2; /* 1: temp1 = h + S1 */ \
+ evxor rT0,rT0,rT1; /* s0 = s0 xor s0' */ \
+ and rT2,e,f; /* 1: ch = e and f */ \
+ evaddw w0,w0,rT0; /* w = w[-16] + s0 */ \
+ andc rT3,g,e; /* 1: ch' = ~e and g */ \
+ evsrwiu rT0,w7,10; /* s1 = w[-2] >> 10 */ \
+ xor rT2,rT2,rT3; /* 1: ch = ch xor ch' */ \
+ evrlwi rT1,w7,15; /* s1' = w[-2] rotr 17 */ \
+ add h,h,rT2; /* 1: temp1 = temp1 + ch */ \
+ evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \
+ rotrwi rT2,a,2; /* 1: S0 = a rotr 2 */ \
+ evrlwi rT1,w7,13; /* s1' = w[-2] rotr 19 */ \
+ rotrwi rT3,a,13; /* 1: S0' = a rotr 13 */ \
+ evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \
+ xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \
+ evldw rT1,off(rKP); /* k */ \
+ rotrwi rT3,a,22; /* 1: S0' = a rotr 22 */ \
+ evaddw w0,w0,rT0; /* w = w + s1 */ \
+ xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \
+ evmergelohi rT0,w4,w5; /* w[-7] */ \
+ and rT3,a,b; /* 1: maj = a and b */ \
+ evaddw w0,w0,rT0; /* w = w + w[-7] */ \
+ CMP_K##k##_LOOP \
+ add rT2,rT2,rT3; /* 1: temp2 = S0 + maj */ \
+ evaddw rT1,rT1,w0; /* wk = w + k */ \
+ xor rT3,a,b; /* 1: maj = a xor b */ \
+ evmergehi rT0,rT1,rT1; /* wk1/wk2 */ \
+ and rT3,rT3,c; /* 1: maj = maj and c */ \
+ add h,h,rT0; /* 1: temp1 = temp1 + wk */ \
+ add rT2,rT2,rT3; /* 1: temp2 = temp2 + maj */ \
+ add g,g,rT1; /* 2: temp1 = temp1 + wk */ \
+ add d,d,h; /* 1: d = d + temp1 */ \
+ rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \
+ add h,h,rT2; /* 1: h = temp1 + temp2 */ \
+ rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \
+	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25 */	\
+ xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \
+ and rT3,d,e; /* 2: ch = e and f */ \
+ xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \
+ andc rT1,f,d; /* 2: ch' = ~e and g */ \
+ add g,g,rT0; /* 2: temp1 = h + S1 */ \
+ xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \
+ rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \
+ add g,g,rT3; /* 2: temp1 = temp1 + ch */ \
+ rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \
+ rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \
+ xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \
+ or rT2,h,a; /* 2: maj = a or b */ \
+ and rT1,h,a; /* 2: maj' = a and b */ \
+ and rT2,rT2,b; /* 2: maj = maj and c */ \
+ xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \
+ or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \
+ add c,c,g; /* 2: d = d + temp1 */ \
+ add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \
+ add g,g,rT3 /* 2: h = temp1 + temp2 */
+
+_GLOBAL(ppc_spe_sha256_transform)
+ INITIALIZE
+
+ mtctr r5
+ lwz rH0,0(rHP)
+ lwz rH1,4(rHP)
+ lwz rH2,8(rHP)
+ lwz rH3,12(rHP)
+ lwz rH4,16(rHP)
+ lwz rH5,20(rHP)
+ lwz rH6,24(rHP)
+ lwz rH7,28(rHP)
+
+ppc_spe_sha256_main:
+ lis rKP,PPC_SPE_SHA256_K@ha
+ addi rKP,rKP,PPC_SPE_SHA256_K@l
+
+ R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0)
+ R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8)
+ R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16)
+ R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24)
+ R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32)
+ R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40)
+ R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48)
+ R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56)
+ppc_spe_sha256_16_rounds:
+ addi rKP,rKP,64
+ R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
+ rW0, rW1, rW4, rW5, rW7, N, 0)
+ R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
+ rW1, rW2, rW5, rW6, rW0, N, 8)
+ R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
+ rW2, rW3, rW6, rW7, rW1, N, 16)
+ R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
+ rW3, rW4, rW7, rW0, rW2, N, 24)
+ R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
+ rW4, rW5, rW0, rW1, rW3, N, 32)
+ R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
+ rW5, rW6, rW1, rW2, rW4, N, 40)
+ R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
+ rW6, rW7, rW2, rW3, rW5, N, 48)
+ R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
+ rW7, rW0, rW3, rW4, rW6, C, 56)
+ bt gt,ppc_spe_sha256_16_rounds
+
+ lwz rW0,0(rHP)
+ NEXT_BLOCK
+ lwz rW1,4(rHP)
+ lwz rW2,8(rHP)
+ lwz rW3,12(rHP)
+ lwz rW4,16(rHP)
+ lwz rW5,20(rHP)
+ lwz rW6,24(rHP)
+ lwz rW7,28(rHP)
+
+ add rH0,rH0,rW0
+ stw rH0,0(rHP)
+ add rH1,rH1,rW1
+ stw rH1,4(rHP)
+ add rH2,rH2,rW2
+ stw rH2,8(rHP)
+ add rH3,rH3,rW3
+ stw rH3,12(rHP)
+ add rH4,rH4,rW4
+ stw rH4,16(rHP)
+ add rH5,rH5,rW5
+ stw rH5,20(rHP)
+ add rH6,rH6,rW6
+ stw rH6,24(rHP)
+ add rH7,rH7,rW7
+ stw rH7,28(rHP)
+
+ bdnz ppc_spe_sha256_main
+
+ FINALIZE
+ blr
+
+.data
+.align 5
+PPC_SPE_SHA256_K:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
diff --git a/lib/crypto/powerpc/sha256.h b/lib/crypto/powerpc/sha256.h
new file mode 100644
index 000000000000..50d355441c7e
--- /dev/null
+++ b/lib/crypto/powerpc/sha256.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 Secure Hash Algorithm, SPE optimized
+ *
+ * Based on the generic implementation. The assembler module takes care
+ * of the SPE registers so it can run from interrupt context.
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+
+#include <asm/switch_to.h>
+#include <linux/preempt.h>
+
+/*
+ * MAX_BYTES defines the number of bytes that are allowed to be processed
+ * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000
+ * operations per 64 bytes. e500 cores can issue two arithmetic instructions
+ * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2).
+ * Thus 1KB of input data will need an estimated maximum of 18,000 cycles.
+ * Headroom for cache misses included. Even with the low-end model clocked
+ * at 667 MHz this equates to a critical time window of less than 27us.
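+ * (1024 bytes = 16 blocks; 16 * ~2000 ops / 2 ops per cycle ~= 16,000 cycles,
+ * ~18,000 with headroom; 18,000 cycles / 667 MHz ~= 27 us.)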
+ *
+ */
+#define MAX_BYTES 1024
+
+extern void ppc_spe_sha256_transform(struct sha256_block_state *state,
+ const u8 *src, u32 blocks);
+
+static void spe_begin(void)
+{
+ /* We just start SPE operations and will save SPE registers later. */
+ preempt_disable();
+ enable_kernel_spe();
+}
+
+static void spe_end(void)
+{
+ disable_kernel_spe();
+ /* reenable preemption */
+ preempt_enable();
+}
+
+static void sha256_blocks(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ do {
+ /* cut input data into smaller blocks */
+ u32 unit = min_t(size_t, nblocks,
+ MAX_BYTES / SHA256_BLOCK_SIZE);
+
+ spe_begin();
+ ppc_spe_sha256_transform(state, data, unit);
+ spe_end();
+
+ data += unit * SHA256_BLOCK_SIZE;
+ nblocks -= unit;
+ } while (nblocks);
+}
diff --git a/lib/crypto/riscv/chacha-riscv64-zvkb.S b/lib/crypto/riscv/chacha-riscv64-zvkb.S
new file mode 100644
index 000000000000..b777d0b4e379
--- /dev/null
+++ b/lib/crypto/riscv/chacha-riscv64-zvkb.S
@@ -0,0 +1,297 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The generated code of this file depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128
+// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
+
+#include <linux/linkage.h>
+
+.text
+.option arch, +zvkb
+
+#define STATEP a0
+#define INP a1
+#define OUTP a2
+#define NBLOCKS a3
+#define NROUNDS a4
+
+#define CONSTS0 a5
+#define CONSTS1 a6
+#define CONSTS2 a7
+#define CONSTS3 t0
+#define TMP t1
+#define VL t2
+#define STRIDE t3
+#define ROUND_CTR t4
+#define KEY0 s0
+#define KEY1 s1
+#define KEY2 s2
+#define KEY3 s3
+#define KEY4 s4
+#define KEY5 s5
+#define KEY6 s6
+#define KEY7 s7
+#define COUNTER s8
+#define NONCE0 s9
+#define NONCE1 s10
+#define NONCE2 s11
+
+.macro chacha_round a0, b0, c0, d0, a1, b1, c1, d1, \
+ a2, b2, c2, d2, a3, b3, c3, d3
+ // a += b; d ^= a; d = rol(d, 16);
+ vadd.vv \a0, \a0, \b0
+ vadd.vv \a1, \a1, \b1
+ vadd.vv \a2, \a2, \b2
+ vadd.vv \a3, \a3, \b3
+ vxor.vv \d0, \d0, \a0
+ vxor.vv \d1, \d1, \a1
+ vxor.vv \d2, \d2, \a2
+ vxor.vv \d3, \d3, \a3
+ vror.vi \d0, \d0, 32 - 16
+ vror.vi \d1, \d1, 32 - 16
+ vror.vi \d2, \d2, 32 - 16
+ vror.vi \d3, \d3, 32 - 16
+
+ // c += d; b ^= c; b = rol(b, 12);
+ vadd.vv \c0, \c0, \d0
+ vadd.vv \c1, \c1, \d1
+ vadd.vv \c2, \c2, \d2
+ vadd.vv \c3, \c3, \d3
+ vxor.vv \b0, \b0, \c0
+ vxor.vv \b1, \b1, \c1
+ vxor.vv \b2, \b2, \c2
+ vxor.vv \b3, \b3, \c3
+ vror.vi \b0, \b0, 32 - 12
+ vror.vi \b1, \b1, 32 - 12
+ vror.vi \b2, \b2, 32 - 12
+ vror.vi \b3, \b3, 32 - 12
+
+ // a += b; d ^= a; d = rol(d, 8);
+ vadd.vv \a0, \a0, \b0
+ vadd.vv \a1, \a1, \b1
+ vadd.vv \a2, \a2, \b2
+ vadd.vv \a3, \a3, \b3
+ vxor.vv \d0, \d0, \a0
+ vxor.vv \d1, \d1, \a1
+ vxor.vv \d2, \d2, \a2
+ vxor.vv \d3, \d3, \a3
+ vror.vi \d0, \d0, 32 - 8
+ vror.vi \d1, \d1, 32 - 8
+ vror.vi \d2, \d2, 32 - 8
+ vror.vi \d3, \d3, 32 - 8
+
+ // c += d; b ^= c; b = rol(b, 7);
+ vadd.vv \c0, \c0, \d0
+ vadd.vv \c1, \c1, \d1
+ vadd.vv \c2, \c2, \d2
+ vadd.vv \c3, \c3, \d3
+ vxor.vv \b0, \b0, \c0
+ vxor.vv \b1, \b1, \c1
+ vxor.vv \b2, \b2, \c2
+ vxor.vv \b3, \b3, \c3
+ vror.vi \b0, \b0, 32 - 7
+ vror.vi \b1, \b1, 32 - 7
+ vror.vi \b2, \b2, 32 - 7
+ vror.vi \b3, \b3, 32 - 7
+.endm
+
+// void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out,
+// size_t nblocks, int nrounds);
+//
+// |nblocks| is the number of 64-byte blocks to process, and must be nonzero.
+//
+// |state| gives the ChaCha state matrix, including the 32-bit counter in
+// state->x[12] following the RFC7539 convention; note that this differs from
+// the original Salsa20 paper which uses a 64-bit counter in state->x[12..13].
+// The updated 32-bit counter is written back to state->x[12] before returning.
+SYM_FUNC_START(chacha_zvkb)
+ addi sp, sp, -96
+ sd s0, 0(sp)
+ sd s1, 8(sp)
+ sd s2, 16(sp)
+ sd s3, 24(sp)
+ sd s4, 32(sp)
+ sd s5, 40(sp)
+ sd s6, 48(sp)
+ sd s7, 56(sp)
+ sd s8, 64(sp)
+ sd s9, 72(sp)
+ sd s10, 80(sp)
+ sd s11, 88(sp)
+
+ li STRIDE, 64
+
+ // Set up the initial state matrix in scalar registers.
+ lw CONSTS0, 0(STATEP)
+ lw CONSTS1, 4(STATEP)
+ lw CONSTS2, 8(STATEP)
+ lw CONSTS3, 12(STATEP)
+ lw KEY0, 16(STATEP)
+ lw KEY1, 20(STATEP)
+ lw KEY2, 24(STATEP)
+ lw KEY3, 28(STATEP)
+ lw KEY4, 32(STATEP)
+ lw KEY5, 36(STATEP)
+ lw KEY6, 40(STATEP)
+ lw KEY7, 44(STATEP)
+ lw COUNTER, 48(STATEP)
+ lw NONCE0, 52(STATEP)
+ lw NONCE1, 56(STATEP)
+ lw NONCE2, 60(STATEP)
+
+.Lblock_loop:
+ // Set vl to the number of blocks to process in this iteration.
+ vsetvli VL, NBLOCKS, e32, m1, ta, ma
+
+ // Set up the initial state matrix for the next VL blocks in v0-v15.
+ // v{i} holds the i'th 32-bit word of the state matrix for all blocks.
+ // Note that only the counter word, at index 12, differs across blocks.
+ vmv.v.x v0, CONSTS0
+ vmv.v.x v1, CONSTS1
+ vmv.v.x v2, CONSTS2
+ vmv.v.x v3, CONSTS3
+ vmv.v.x v4, KEY0
+ vmv.v.x v5, KEY1
+ vmv.v.x v6, KEY2
+ vmv.v.x v7, KEY3
+ vmv.v.x v8, KEY4
+ vmv.v.x v9, KEY5
+ vmv.v.x v10, KEY6
+ vmv.v.x v11, KEY7
+ vid.v v12
+ vadd.vx v12, v12, COUNTER
+ vmv.v.x v13, NONCE0
+ vmv.v.x v14, NONCE1
+ vmv.v.x v15, NONCE2
+
+ // Load the first half of the input data for each block into v16-v23.
+ // v{16+i} holds the i'th 32-bit word for all blocks.
+ vlsseg8e32.v v16, (INP), STRIDE
+
+ mv ROUND_CTR, NROUNDS
+.Lnext_doubleround:
+ addi ROUND_CTR, ROUND_CTR, -2
+ // column round
+ chacha_round v0, v4, v8, v12, v1, v5, v9, v13, \
+ v2, v6, v10, v14, v3, v7, v11, v15
+ // diagonal round
+ chacha_round v0, v5, v10, v15, v1, v6, v11, v12, \
+ v2, v7, v8, v13, v3, v4, v9, v14
+ bnez ROUND_CTR, .Lnext_doubleround
+
+ // Load the second half of the input data for each block into v24-v31.
+ // v{24+i} holds the {8+i}'th 32-bit word for all blocks.
+ addi TMP, INP, 32
+ vlsseg8e32.v v24, (TMP), STRIDE
+
+ // Finalize the first half of the keystream for each block.
+ vadd.vx v0, v0, CONSTS0
+ vadd.vx v1, v1, CONSTS1
+ vadd.vx v2, v2, CONSTS2
+ vadd.vx v3, v3, CONSTS3
+ vadd.vx v4, v4, KEY0
+ vadd.vx v5, v5, KEY1
+ vadd.vx v6, v6, KEY2
+ vadd.vx v7, v7, KEY3
+
+ // Encrypt/decrypt the first half of the data for each block.
+ vxor.vv v16, v16, v0
+ vxor.vv v17, v17, v1
+ vxor.vv v18, v18, v2
+ vxor.vv v19, v19, v3
+ vxor.vv v20, v20, v4
+ vxor.vv v21, v21, v5
+ vxor.vv v22, v22, v6
+ vxor.vv v23, v23, v7
+
+ // Store the first half of the output data for each block.
+ vssseg8e32.v v16, (OUTP), STRIDE
+
+ // Finalize the second half of the keystream for each block.
+ vadd.vx v8, v8, KEY4
+ vadd.vx v9, v9, KEY5
+ vadd.vx v10, v10, KEY6
+ vadd.vx v11, v11, KEY7
+ vid.v v0
+ vadd.vx v12, v12, COUNTER
+ vadd.vx v13, v13, NONCE0
+ vadd.vx v14, v14, NONCE1
+ vadd.vx v15, v15, NONCE2
+ vadd.vv v12, v12, v0
+
+ // Encrypt/decrypt the second half of the data for each block.
+ vxor.vv v24, v24, v8
+ vxor.vv v25, v25, v9
+ vxor.vv v26, v26, v10
+ vxor.vv v27, v27, v11
+ vxor.vv v29, v29, v13
+ vxor.vv v28, v28, v12
+ vxor.vv v30, v30, v14
+ vxor.vv v31, v31, v15
+
+ // Store the second half of the output data for each block.
+ addi TMP, OUTP, 32
+ vssseg8e32.v v24, (TMP), STRIDE
+
+ // Update the counter, the remaining number of blocks, and the input and
+ // output pointers according to the number of blocks processed (VL).
+ add COUNTER, COUNTER, VL
+ sub NBLOCKS, NBLOCKS, VL
+ slli TMP, VL, 6
+ add OUTP, OUTP, TMP
+ add INP, INP, TMP
+ bnez NBLOCKS, .Lblock_loop
+
+ sw COUNTER, 48(STATEP)
+ ld s0, 0(sp)
+ ld s1, 8(sp)
+ ld s2, 16(sp)
+ ld s3, 24(sp)
+ ld s4, 32(sp)
+ ld s5, 40(sp)
+ ld s6, 48(sp)
+ ld s7, 56(sp)
+ ld s8, 64(sp)
+ ld s9, 72(sp)
+ ld s10, 80(sp)
+ ld s11, 88(sp)
+ addi sp, sp, 96
+ ret
+SYM_FUNC_END(chacha_zvkb)
diff --git a/lib/crypto/riscv/chacha.h b/lib/crypto/riscv/chacha.h
new file mode 100644
index 000000000000..5c000c6aef4b
--- /dev/null
+++ b/lib/crypto/riscv/chacha.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * ChaCha stream cipher (RISC-V optimized)
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+#include <crypto/internal/simd.h>
+#include <linux/linkage.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_zvkb);
+
+asmlinkage void chacha_zvkb(struct chacha_state *state, const u8 *in, u8 *out,
+ size_t nblocks, int nrounds);
+
+#define hchacha_block_arch hchacha_block_generic /* not implemented yet */
+
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int bytes, int nrounds)
+{
+ u8 block_buffer[CHACHA_BLOCK_SIZE];
+ unsigned int full_blocks = bytes / CHACHA_BLOCK_SIZE;
+ unsigned int tail_bytes = bytes % CHACHA_BLOCK_SIZE;
+
+ if (!static_branch_likely(&use_zvkb) || !crypto_simd_usable())
+ return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+ kernel_vector_begin();
+ if (full_blocks) {
+ chacha_zvkb(state, src, dst, full_blocks, nrounds);
+ src += full_blocks * CHACHA_BLOCK_SIZE;
+ dst += full_blocks * CHACHA_BLOCK_SIZE;
+ }
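+	/*
+	 * chacha_zvkb() only operates on whole 64-byte blocks, so handle a
+	 * partial final block through a bounce buffer.
+	 */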
+ if (tail_bytes) {
+ memcpy(block_buffer, src, tail_bytes);
+ chacha_zvkb(state, block_buffer, block_buffer, 1, nrounds);
+ memcpy(dst, block_buffer, tail_bytes);
+ }
+ kernel_vector_end();
+}
+
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
+{
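+	/* chacha_zvkb() requires Zvkb and a vector length of at least 128 bits. */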
+ if (riscv_isa_extension_available(NULL, ZVKB) &&
+ riscv_vector_vlen() >= 128)
+ static_branch_enable(&use_zvkb);
+}
diff --git a/lib/crypto/riscv/poly1305-riscv.pl b/lib/crypto/riscv/poly1305-riscv.pl
new file mode 100644
index 000000000000..e25e6338a9ac
--- /dev/null
+++ b/lib/crypto/riscv/poly1305-riscv.pl
@@ -0,0 +1,847 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for use with OpenSSL.
+# ====================================================================
+#
+# Poly1305 hash for RISC-V.
+#
+# February 2019
+#
+# In essence it's a pretty straightforward transliteration of the MIPS
+# module [without the big-endian option].
+#
+# 1.8 cycles per byte on U74, >100% faster than compiler-generated
+# code. 1.9 cpb on C910, ~75% improvement. 3.3 cpb on Spacemit X60, ~69%
+# improvement.
+#
+# June 2024.
+#
+# Add CHERI support.
+#
+######################################################################
+#
+($zero,$ra,$sp,$gp,$tp)=map("x$_",(0..4));
+($t0,$t1,$t2,$t3,$t4,$t5,$t6)=map("x$_",(5..7,28..31));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(10..17));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("x$_",(8,9,18..27));
+#
+######################################################################
+
+$flavour = shift || "64";
+
+for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }
+open STDOUT,">$output";
+
+$code.=<<___;
+#ifdef __KERNEL__
+# ifdef __riscv_zicfilp
+# undef __riscv_zicfilp // calls are expected to be direct
+# endif
+#endif
+
+#if defined(__CHERI_PURE_CAPABILITY__) && !defined(__riscv_misaligned_fast)
+# define __riscv_misaligned_fast 1
+#endif
+___
+
+if ($flavour =~ /64/) {{{
+######################################################################
+# 64-bit code path...
+#
+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$t0,$t1,$t2);
+
+$code.=<<___;
+#if __riscv_xlen == 64
+# if __SIZEOF_POINTER__ == 16
+# define PUSH csc
+# define POP clc
+# else
+# define PUSH sd
+# define POP ld
+# endif
+#else
+# error "unsupported __riscv_xlen"
+#endif
+
+.option pic
+.text
+
+.globl poly1305_init
+.type poly1305_init,\@function
+poly1305_init:
+#ifdef __riscv_zicfilp
+ lpad 0
+#endif
+ sd $zero,0($ctx)
+ sd $zero,8($ctx)
+ sd $zero,16($ctx)
+
+ beqz $inp,.Lno_key
+
+#ifndef __riscv_misaligned_fast
+ andi $tmp0,$inp,7 # $inp % 8
+ andi $inp,$inp,-8 # align $inp
+ slli $tmp0,$tmp0,3 # byte to bit offset
+#endif
+ ld $in0,0($inp)
+ ld $in1,8($inp)
+#ifndef __riscv_misaligned_fast
+ beqz $tmp0,.Laligned_key
+
+ ld $tmp2,16($inp)
+ neg $tmp1,$tmp0 # implicit &63 in sll
+ srl $in0,$in0,$tmp0
+ sll $tmp3,$in1,$tmp1
+ srl $in1,$in1,$tmp0
+ sll $tmp2,$tmp2,$tmp1
+ or $in0,$in0,$tmp3
+ or $in1,$in1,$tmp2
+
+.Laligned_key:
+#endif
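+	# clamp r: r &= 0x0ffffffc0ffffffc0ffffffc0fffffff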
+ li $tmp0,1
+ slli $tmp0,$tmp0,32 # 0x0000000100000000
+ addi $tmp0,$tmp0,-63 # 0x00000000ffffffc1
+ slli $tmp0,$tmp0,28 # 0x0ffffffc10000000
+ addi $tmp0,$tmp0,-1 # 0x0ffffffc0fffffff
+
+ and $in0,$in0,$tmp0
+ addi $tmp0,$tmp0,-3 # 0x0ffffffc0ffffffc
+ and $in1,$in1,$tmp0
+
+ sd $in0,24($ctx)
+ srli $tmp0,$in1,2
+ sd $in1,32($ctx)
+ add $tmp0,$tmp0,$in1 # s1 = r1 + (r1 >> 2)
+ sd $tmp0,40($ctx)
+
+.Lno_key:
+ li $a0,0 # return 0
+ ret
+.size poly1305_init,.-poly1305_init
+___
+{
+my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
+ ($s0,$s1,$s2,$s3,$t3,$t4,$in0,$in1,$t2);
+my ($shr,$shl) = ($t5,$t6);	# used in the misaligned-input path
+
+$code.=<<___;
+.globl poly1305_blocks
+.type poly1305_blocks,\@function
+poly1305_blocks:
+#ifdef __riscv_zicfilp
+ lpad 0
+#endif
+ andi $len,$len,-16 # complete blocks only
+ beqz $len,.Lno_data
+
+ caddi $sp,$sp,-4*__SIZEOF_POINTER__
+ PUSH $s0,3*__SIZEOF_POINTER__($sp)
+ PUSH $s1,2*__SIZEOF_POINTER__($sp)
+ PUSH $s2,1*__SIZEOF_POINTER__($sp)
+ PUSH $s3,0*__SIZEOF_POINTER__($sp)
+
+#ifndef __riscv_misaligned_fast
+ andi $shr,$inp,7
+ andi $inp,$inp,-8 # align $inp
+ slli $shr,$shr,3 # byte to bit offset
+ neg $shl,$shr # implicit &63 in sll
+#endif
+
+ ld $h0,0($ctx) # load hash value
+ ld $h1,8($ctx)
+ ld $h2,16($ctx)
+
+ ld $r0,24($ctx) # load key
+ ld $r1,32($ctx)
+ ld $rs1,40($ctx)
+
+ add $len,$len,$inp # end of buffer
+
+.Loop:
+ ld $in0,0($inp) # load input
+ ld $in1,8($inp)
+#ifndef __riscv_misaligned_fast
+ beqz $shr,.Laligned_inp
+
+ ld $tmp2,16($inp)
+ srl $in0,$in0,$shr
+ sll $tmp3,$in1,$shl
+ srl $in1,$in1,$shr
+ sll $tmp2,$tmp2,$shl
+ or $in0,$in0,$tmp3
+ or $in1,$in1,$tmp2
+
+.Laligned_inp:
+#endif
+ caddi $inp,$inp,16
+
+ andi $tmp0,$h2,-4 # modulo-scheduled reduction
+ srli $tmp1,$h2,2
+ andi $h2,$h2,3
+
+ add $d0,$h0,$in0 # accumulate input
+ add $tmp1,$tmp1,$tmp0
+ sltu $tmp0,$d0,$h0
+ add $d0,$d0,$tmp1 # ... and residue
+ sltu $tmp1,$d0,$tmp1
+ add $d1,$h1,$in1
+ add $tmp0,$tmp0,$tmp1
+ sltu $tmp1,$d1,$h1
+ add $d1,$d1,$tmp0
+
+ add $d2,$h2,$padbit
+ sltu $tmp0,$d1,$tmp0
+ mulhu $h1,$r0,$d0 # h0*r0
+ mul $h0,$r0,$d0
+
+ add $d2,$d2,$tmp1
+ add $d2,$d2,$tmp0
+ mulhu $tmp1,$rs1,$d1 # h1*5*r1
+ mul $tmp0,$rs1,$d1
+
+ mulhu $h2,$r1,$d0 # h0*r1
+ mul $tmp2,$r1,$d0
+ add $h0,$h0,$tmp0
+ add $h1,$h1,$tmp1
+ sltu $tmp0,$h0,$tmp0
+
+ add $h1,$h1,$tmp0
+ add $h1,$h1,$tmp2
+ mulhu $tmp1,$r0,$d1 # h1*r0
+ mul $tmp0,$r0,$d1
+
+ sltu $tmp2,$h1,$tmp2
+ add $h2,$h2,$tmp2
+ mul $tmp2,$rs1,$d2 # h2*5*r1
+
+ add $h1,$h1,$tmp0
+ add $h2,$h2,$tmp1
+ mul $tmp3,$r0,$d2 # h2*r0
+ sltu $tmp0,$h1,$tmp0
+ add $h2,$h2,$tmp0
+
+ add $h1,$h1,$tmp2
+ sltu $tmp2,$h1,$tmp2
+ add $h2,$h2,$tmp2
+ add $h2,$h2,$tmp3
+
+ bne $inp,$len,.Loop
+
+ sd $h0,0($ctx) # store hash value
+ sd $h1,8($ctx)
+ sd $h2,16($ctx)
+
+ POP $s0,3*__SIZEOF_POINTER__($sp) # epilogue
+ POP $s1,2*__SIZEOF_POINTER__($sp)
+ POP $s2,1*__SIZEOF_POINTER__($sp)
+ POP $s3,0*__SIZEOF_POINTER__($sp)
+ caddi $sp,$sp,4*__SIZEOF_POINTER__
+
+.Lno_data:
+ ret
+.size poly1305_blocks,.-poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
+
+$code.=<<___;
+.globl poly1305_emit
+.type poly1305_emit,\@function
+poly1305_emit:
+#ifdef __riscv_zicfilp
+ lpad 0
+#endif
+ ld $tmp2,16($ctx)
+ ld $tmp0,0($ctx)
+ ld $tmp1,8($ctx)
+
+ andi $in0,$tmp2,-4 # final reduction
+ srl $in1,$tmp2,2
+ andi $tmp2,$tmp2,3
+ add $in0,$in0,$in1
+
+ add $tmp0,$tmp0,$in0
+ sltu $in1,$tmp0,$in0
+ addi $in0,$tmp0,5 # compare to modulus
+ add $tmp1,$tmp1,$in1
+ sltiu $tmp3,$in0,5
+ sltu $tmp4,$tmp1,$in1
+ add $in1,$tmp1,$tmp3
+ add $tmp2,$tmp2,$tmp4
+ sltu $tmp3,$in1,$tmp3
+ add $tmp2,$tmp2,$tmp3
+
+ srli $tmp2,$tmp2,2 # see if it carried/borrowed
+ neg $tmp2,$tmp2
+
+ xor $in0,$in0,$tmp0
+ xor $in1,$in1,$tmp1
+ and $in0,$in0,$tmp2
+ and $in1,$in1,$tmp2
+ xor $in0,$in0,$tmp0
+ xor $in1,$in1,$tmp1
+
+ lwu $tmp0,0($nonce) # load nonce
+ lwu $tmp1,4($nonce)
+ lwu $tmp2,8($nonce)
+ lwu $tmp3,12($nonce)
+ slli $tmp1,$tmp1,32
+ slli $tmp3,$tmp3,32
+ or $tmp0,$tmp0,$tmp1
+ or $tmp2,$tmp2,$tmp3
+
+ add $in0,$in0,$tmp0 # accumulate nonce
+ add $in1,$in1,$tmp2
+ sltu $tmp0,$in0,$tmp0
+ add $in1,$in1,$tmp0
+
+#ifdef __riscv_misaligned_fast
+ sd $in0,0($mac) # write mac value
+ sd $in1,8($mac)
+#else
+ srli $tmp0,$in0,8 # write mac value
+ srli $tmp1,$in0,16
+ srli $tmp2,$in0,24
+ sb $in0,0($mac)
+ srli $tmp3,$in0,32
+ sb $tmp0,1($mac)
+ srli $tmp0,$in0,40
+ sb $tmp1,2($mac)
+ srli $tmp1,$in0,48
+ sb $tmp2,3($mac)
+ srli $tmp2,$in0,56
+ sb $tmp3,4($mac)
+ srli $tmp3,$in1,8
+ sb $tmp0,5($mac)
+ srli $tmp0,$in1,16
+ sb $tmp1,6($mac)
+ srli $tmp1,$in1,24
+ sb $tmp2,7($mac)
+
+ sb $in1,8($mac)
+ srli $tmp2,$in1,32
+ sb $tmp3,9($mac)
+ srli $tmp3,$in1,40
+ sb $tmp0,10($mac)
+ srli $tmp0,$in1,48
+ sb $tmp1,11($mac)
+ srli $tmp1,$in1,56
+ sb $tmp2,12($mac)
+ sb $tmp3,13($mac)
+ sb $tmp0,14($mac)
+ sb $tmp1,15($mac)
+#endif
+
+ ret
+.size poly1305_emit,.-poly1305_emit
+.string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
+___
+}
+}}} else {{{
+######################################################################
+# 32-bit code path
+#
+
+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
+ ($a4,$a5,$a6,$a7,$t0,$t1,$t2,$t3);
+
+$code.=<<___;
+#if __riscv_xlen == 32
+# if __SIZEOF_POINTER__ == 8
+# define PUSH csc
+# define POP clc
+# else
+# define PUSH sw
+# define POP lw
+# endif
+# define MULX(hi,lo,a,b) mulhu hi,a,b; mul lo,a,b
+# define srliw srli
+# define srlw srl
+# define sllw sll
+# define addw add
+# define addiw addi
+# define mulw mul
+#elif __riscv_xlen == 64
+# if __SIZEOF_POINTER__ == 16
+# define PUSH csc
+# define POP clc
+# else
+# define PUSH sd
+# define POP ld
+# endif
+# define MULX(hi,lo,a,b) slli b,b,32; srli b,b,32; mul hi,a,b; addiw lo,hi,0; srai hi,hi,32
+#else
+# error "unsupported __riscv_xlen"
+#endif
+
+.option pic
+.text
+
+.globl poly1305_init
+.type poly1305_init,\@function
+poly1305_init:
+#ifdef __riscv_zicfilp
+ lpad 0
+#endif
+ sw $zero,0($ctx)
+ sw $zero,4($ctx)
+ sw $zero,8($ctx)
+ sw $zero,12($ctx)
+ sw $zero,16($ctx)
+
+ beqz $inp,.Lno_key
+
+#ifndef __riscv_misaligned_fast
+ andi $tmp0,$inp,3 # $inp % 4
+ sub $inp,$inp,$tmp0 # align $inp
+ sll $tmp0,$tmp0,3 # byte to bit offset
+#endif
+ lw $in0,0($inp)
+ lw $in1,4($inp)
+ lw $in2,8($inp)
+ lw $in3,12($inp)
+#ifndef __riscv_misaligned_fast
+ beqz $tmp0,.Laligned_key
+
+ lw $tmp2,16($inp)
+ sub $tmp1,$zero,$tmp0
+ srlw $in0,$in0,$tmp0
+ sllw $tmp3,$in1,$tmp1
+ srlw $in1,$in1,$tmp0
+ or $in0,$in0,$tmp3
+ sllw $tmp3,$in2,$tmp1
+ srlw $in2,$in2,$tmp0
+ or $in1,$in1,$tmp3
+ sllw $tmp3,$in3,$tmp1
+ srlw $in3,$in3,$tmp0
+ or $in2,$in2,$tmp3
+ sllw $tmp2,$tmp2,$tmp1
+ or $in3,$in3,$tmp2
+.Laligned_key:
+#endif
+
+ lui $tmp0,0x10000
+ addi $tmp0,$tmp0,-1 # 0x0fffffff
+ and $in0,$in0,$tmp0
+ addi $tmp0,$tmp0,-3 # 0x0ffffffc
+ and $in1,$in1,$tmp0
+ and $in2,$in2,$tmp0
+ and $in3,$in3,$tmp0
+
+ sw $in0,20($ctx)
+ sw $in1,24($ctx)
+ sw $in2,28($ctx)
+ sw $in3,32($ctx)
+
+ srlw $tmp1,$in1,2
+ srlw $tmp2,$in2,2
+ srlw $tmp3,$in3,2
+ addw $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
+ addw $in2,$in2,$tmp2
+ addw $in3,$in3,$tmp3
+ sw $in1,36($ctx)
+ sw $in2,40($ctx)
+ sw $in3,44($ctx)
+.Lno_key:
+ li $a0,0
+ ret
+.size poly1305_init,.-poly1305_init
+___
+{
+my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
+ ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $t0,$t1,$t2);
+my ($d0,$d1,$d2,$d3) =
+ ($a4,$a5,$a6,$a7);
+my $shr = $ra;			# used in the misaligned-input path
+
+$code.=<<___;
+.globl poly1305_blocks
+.type poly1305_blocks,\@function
+poly1305_blocks:
+#ifdef __riscv_zicfilp
+ lpad 0
+#endif
+ andi $len,$len,-16 # complete blocks only
+ beqz $len,.Labort
+
+#ifdef __riscv_zcmp
+ cm.push {ra,s0-s8}, -48
+#else
+ caddi $sp,$sp,-__SIZEOF_POINTER__*12
+ PUSH $ra, __SIZEOF_POINTER__*11($sp)
+ PUSH $s0, __SIZEOF_POINTER__*10($sp)
+ PUSH $s1, __SIZEOF_POINTER__*9($sp)
+ PUSH $s2, __SIZEOF_POINTER__*8($sp)
+ PUSH $s3, __SIZEOF_POINTER__*7($sp)
+ PUSH $s4, __SIZEOF_POINTER__*6($sp)
+ PUSH $s5, __SIZEOF_POINTER__*5($sp)
+ PUSH $s6, __SIZEOF_POINTER__*4($sp)
+ PUSH $s7, __SIZEOF_POINTER__*3($sp)
+ PUSH $s8, __SIZEOF_POINTER__*2($sp)
+#endif
+
+#ifndef __riscv_misaligned_fast
+ andi $shr,$inp,3
+ andi $inp,$inp,-4 # align $inp
+ slli $shr,$shr,3 # byte to bit offset
+#endif
+
+ lw $h0,0($ctx) # load hash value
+ lw $h1,4($ctx)
+ lw $h2,8($ctx)
+ lw $h3,12($ctx)
+ lw $h4,16($ctx)
+
+ lw $r0,20($ctx) # load key
+ lw $r1,24($ctx)
+ lw $r2,28($ctx)
+ lw $r3,32($ctx)
+ lw $rs1,36($ctx)
+ lw $rs2,40($ctx)
+ lw $rs3,44($ctx)
+
+ add $len,$len,$inp # end of buffer
+
+.Loop:
+ lw $d0,0($inp) # load input
+ lw $d1,4($inp)
+ lw $d2,8($inp)
+ lw $d3,12($inp)
+#ifndef __riscv_misaligned_fast
+ beqz $shr,.Laligned_inp
+
+ lw $t4,16($inp)
+ sub $t5,$zero,$shr
+ srlw $d0,$d0,$shr
+ sllw $t3,$d1,$t5
+ srlw $d1,$d1,$shr
+ or $d0,$d0,$t3
+ sllw $t3,$d2,$t5
+ srlw $d2,$d2,$shr
+ or $d1,$d1,$t3
+ sllw $t3,$d3,$t5
+ srlw $d3,$d3,$shr
+ or $d2,$d2,$t3
+ sllw $t4,$t4,$t5
+ or $d3,$d3,$t4
+
+.Laligned_inp:
+#endif
+ srliw $t3,$h4,2 # modulo-scheduled reduction
+ andi $t4,$h4,-4
+ andi $h4,$h4,3
+
+ addw $d0,$d0,$h0 # accumulate input
+ addw $t4,$t4,$t3
+ sltu $h0,$d0,$h0
+ addw $d0,$d0,$t4 # ... and residue
+ sltu $t4,$d0,$t4
+
+ addw $d1,$d1,$h1
+ addw $h0,$h0,$t4 # carry
+ sltu $h1,$d1,$h1
+ addw $d1,$d1,$h0
+ sltu $h0,$d1,$h0
+
+ addw $d2,$d2,$h2
+ addw $h1,$h1,$h0 # carry
+ sltu $h2,$d2,$h2
+ addw $d2,$d2,$h1
+ sltu $h1,$d2,$h1
+
+ addw $d3,$d3,$h3
+ addw $h2,$h2,$h1 # carry
+ sltu $h3,$d3,$h3
+ addw $d3,$d3,$h2
+
+ MULX ($h1,$h0,$r0,$d0) # d0*r0
+
+ sltu $h2,$d3,$h2
+ addw $h3,$h3,$h2 # carry
+
+ MULX ($t4,$t3,$rs3,$d1) # d1*s3
+
+ addw $h4,$h4,$padbit
+ caddi $inp,$inp,16
+ addw $h4,$h4,$h3
+
+ MULX ($t6,$a3,$rs2,$d2) # d2*s2
+ addw $h0,$h0,$t3
+ addw $h1,$h1,$t4
+ sltu $t3,$h0,$t3
+ addw $h1,$h1,$t3
+
+ MULX ($t4,$t3,$rs1,$d3) # d3*s1
+ addw $h0,$h0,$a3
+ addw $h1,$h1,$t6
+ sltu $a3,$h0,$a3
+ addw $h1,$h1,$a3
+
+
+ MULX ($h2,$a3,$r1,$d0) # d0*r1
+ addw $h0,$h0,$t3
+ addw $h1,$h1,$t4
+ sltu $t3,$h0,$t3
+ addw $h1,$h1,$t3
+
+ MULX ($t4,$t3,$r0,$d1) # d1*r0
+ addw $h1,$h1,$a3
+ sltu $a3,$h1,$a3
+ addw $h2,$h2,$a3
+
+ MULX ($t6,$a3,$rs3,$d2) # d2*s3
+ addw $h1,$h1,$t3
+ addw $h2,$h2,$t4
+ sltu $t3,$h1,$t3
+ addw $h2,$h2,$t3
+
+ MULX ($t4,$t3,$rs2,$d3) # d3*s2
+ addw $h1,$h1,$a3
+ addw $h2,$h2,$t6
+ sltu $a3,$h1,$a3
+ addw $h2,$h2,$a3
+
+ mulw $a3,$rs1,$h4 # h4*s1
+ addw $h1,$h1,$t3
+ addw $h2,$h2,$t4
+ sltu $t3,$h1,$t3
+ addw $h2,$h2,$t3
+
+
+ MULX ($h3,$t3,$r2,$d0) # d0*r2
+ addw $h1,$h1,$a3
+ sltu $a3,$h1,$a3
+ addw $h2,$h2,$a3
+
+ MULX ($t6,$a3,$r1,$d1) # d1*r1
+ addw $h2,$h2,$t3
+ sltu $t3,$h2,$t3
+ addw $h3,$h3,$t3
+
+ MULX ($t4,$t3,$r0,$d2) # d2*r0
+ addw $h2,$h2,$a3
+ addw $h3,$h3,$t6
+ sltu $a3,$h2,$a3
+ addw $h3,$h3,$a3
+
+ MULX ($t6,$a3,$rs3,$d3) # d3*s3
+ addw $h2,$h2,$t3
+ addw $h3,$h3,$t4
+ sltu $t3,$h2,$t3
+ addw $h3,$h3,$t3
+
+ mulw $t3,$rs2,$h4 # h4*s2
+ addw $h2,$h2,$a3
+ addw $h3,$h3,$t6
+ sltu $a3,$h2,$a3
+ addw $h3,$h3,$a3
+
+
+ MULX ($t6,$a3,$r3,$d0) # d0*r3
+ addw $h2,$h2,$t3
+ sltu $t3,$h2,$t3
+ addw $h3,$h3,$t3
+
+ MULX ($t4,$t3,$r2,$d1) # d1*r2
+ addw $h3,$h3,$a3
+ sltu $a3,$h3,$a3
+ addw $t6,$t6,$a3
+
+ MULX ($a3,$d3,$r0,$d3) # d3*r0
+ addw $h3,$h3,$t3
+ addw $t6,$t6,$t4
+ sltu $t3,$h3,$t3
+ addw $t6,$t6,$t3
+
+ MULX ($t4,$t3,$r1,$d2) # d2*r1
+ addw $h3,$h3,$d3
+ addw $t6,$t6,$a3
+ sltu $d3,$h3,$d3
+ addw $t6,$t6,$d3
+
+ mulw $a3,$rs3,$h4 # h4*s3
+ addw $h3,$h3,$t3
+ addw $t6,$t6,$t4
+ sltu $t3,$h3,$t3
+ addw $t6,$t6,$t3
+
+
+ mulw $h4,$r0,$h4 # h4*r0
+ addw $h3,$h3,$a3
+ sltu $a3,$h3,$a3
+ addw $t6,$t6,$a3
+ addw $h4,$t6,$h4
+
+ li $padbit,1 # if we loop, padbit is 1
+
+ bne $inp,$len,.Loop
+
+ sw $h0,0($ctx) # store hash value
+ sw $h1,4($ctx)
+ sw $h2,8($ctx)
+ sw $h3,12($ctx)
+ sw $h4,16($ctx)
+
+#ifdef __riscv_zcmp
+ cm.popret {ra,s0-s8}, 48
+#else
+ POP $ra, __SIZEOF_POINTER__*11($sp)
+ POP $s0, __SIZEOF_POINTER__*10($sp)
+ POP $s1, __SIZEOF_POINTER__*9($sp)
+ POP $s2, __SIZEOF_POINTER__*8($sp)
+ POP $s3, __SIZEOF_POINTER__*7($sp)
+ POP $s4, __SIZEOF_POINTER__*6($sp)
+ POP $s5, __SIZEOF_POINTER__*5($sp)
+ POP $s6, __SIZEOF_POINTER__*4($sp)
+ POP $s7, __SIZEOF_POINTER__*3($sp)
+ POP $s8, __SIZEOF_POINTER__*2($sp)
+ caddi $sp,$sp,__SIZEOF_POINTER__*12
+#endif
+.Labort:
+ ret
+.size poly1305_blocks,.-poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
+
+$code.=<<___;
+.globl poly1305_emit
+.type poly1305_emit,\@function
+poly1305_emit:
+#ifdef __riscv_zicfilp
+ lpad 0
+#endif
+ lw $tmp4,16($ctx)
+ lw $tmp0,0($ctx)
+ lw $tmp1,4($ctx)
+ lw $tmp2,8($ctx)
+ lw $tmp3,12($ctx)
+
+ srliw $ctx,$tmp4,2 # final reduction
+ andi $in0,$tmp4,-4
+ andi $tmp4,$tmp4,3
+ addw $ctx,$ctx,$in0
+
+ addw $tmp0,$tmp0,$ctx
+ sltu $ctx,$tmp0,$ctx
+ addiw $in0,$tmp0,5 # compare to modulus
+ addw $tmp1,$tmp1,$ctx
+ sltiu $in1,$in0,5
+ sltu $ctx,$tmp1,$ctx
+ addw $in1,$in1,$tmp1
+ addw $tmp2,$tmp2,$ctx
+ sltu $in2,$in1,$tmp1
+ sltu $ctx,$tmp2,$ctx
+ addw $in2,$in2,$tmp2
+ addw $tmp3,$tmp3,$ctx
+ sltu $in3,$in2,$tmp2
+ sltu $ctx,$tmp3,$ctx
+ addw $in3,$in3,$tmp3
+ addw $tmp4,$tmp4,$ctx
+ sltu $ctx,$in3,$tmp3
+ addw $ctx,$ctx,$tmp4
+
+ srl $ctx,$ctx,2 # see if it carried/borrowed
+ sub $ctx,$zero,$ctx
+
+ xor $in0,$in0,$tmp0
+ xor $in1,$in1,$tmp1
+ xor $in2,$in2,$tmp2
+ xor $in3,$in3,$tmp3
+ and $in0,$in0,$ctx
+ and $in1,$in1,$ctx
+ and $in2,$in2,$ctx
+ and $in3,$in3,$ctx
+ xor $in0,$in0,$tmp0
+ xor $in1,$in1,$tmp1
+ xor $in2,$in2,$tmp2
+ xor $in3,$in3,$tmp3
+
+ lw $tmp0,0($nonce) # load nonce
+ lw $tmp1,4($nonce)
+ lw $tmp2,8($nonce)
+ lw $tmp3,12($nonce)
+
+ addw $in0,$in0,$tmp0 # accumulate nonce
+ sltu $ctx,$in0,$tmp0
+
+ addw $in1,$in1,$tmp1
+ sltu $tmp1,$in1,$tmp1
+ addw $in1,$in1,$ctx
+ sltu $ctx,$in1,$ctx
+ addw $ctx,$ctx,$tmp1
+
+ addw $in2,$in2,$tmp2
+ sltu $tmp2,$in2,$tmp2
+ addw $in2,$in2,$ctx
+ sltu $ctx,$in2,$ctx
+ addw $ctx,$ctx,$tmp2
+
+ addw $in3,$in3,$tmp3
+ addw $in3,$in3,$ctx
+
+#ifdef __riscv_misaligned_fast
+ sw $in0,0($mac) # write mac value
+ sw $in1,4($mac)
+ sw $in2,8($mac)
+ sw $in3,12($mac)
+#else
+ srl $tmp0,$in0,8 # write mac value
+ srl $tmp1,$in0,16
+ srl $tmp2,$in0,24
+ sb $in0, 0($mac)
+ sb $tmp0,1($mac)
+ srl $tmp0,$in1,8
+ sb $tmp1,2($mac)
+ srl $tmp1,$in1,16
+ sb $tmp2,3($mac)
+ srl $tmp2,$in1,24
+ sb $in1, 4($mac)
+ sb $tmp0,5($mac)
+ srl $tmp0,$in2,8
+ sb $tmp1,6($mac)
+ srl $tmp1,$in2,16
+ sb $tmp2,7($mac)
+ srl $tmp2,$in2,24
+ sb $in2, 8($mac)
+ sb $tmp0,9($mac)
+ srl $tmp0,$in3,8
+ sb $tmp1,10($mac)
+ srl $tmp1,$in3,16
+ sb $tmp2,11($mac)
+ srl $tmp2,$in3,24
+ sb $in3, 12($mac)
+ sb $tmp0,13($mac)
+ sb $tmp1,14($mac)
+ sb $tmp2,15($mac)
+#endif
+
+ ret
+.size poly1305_emit,.-poly1305_emit
+.string "Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
+___
+}
+}}}
+
+foreach (split("\n", $code)) {
+ if ($flavour =~ /^cheri/) {
+ s/\(x([0-9]+)\)/(c$1)/ and s/\b([ls][bhwd]u?)\b/c$1/;
+ s/\b(PUSH|POP)(\s+)x([0-9]+)/$1$2c$3/ or
+ s/\b(ret|jal)\b/c$1/;
+ s/\bcaddi?\b/cincoffset/ and s/\bx([0-9]+,)/c$1/g or
+ m/\bcmove\b/ and s/\bx([0-9]+)/c$1/g;
+ } else {
+ s/\bcaddi?\b/add/ or
+ s/\bcmove\b/mv/;
+ }
+ print $_, "\n";
+}
+
+close STDOUT;
diff --git a/lib/crypto/riscv/poly1305.h b/lib/crypto/riscv/poly1305.h
new file mode 100644
index 000000000000..88f3df44e355
--- /dev/null
+++ b/lib/crypto/riscv/poly1305.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for RISC-V
+ *
+ * Copyright (C) 2025 Institute of Software, CAS.
+ */
+
+asmlinkage void poly1305_block_init(struct poly1305_block_state *state,
+ const u8 raw_key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks(struct poly1305_block_state *state,
+ const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit(const struct poly1305_state *state,
+ u8 digest[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
diff --git a/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S b/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S
new file mode 100644
index 000000000000..1618d1220a6e
--- /dev/null
+++ b/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S
@@ -0,0 +1,225 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
+// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The generated code of this file depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128
+// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknha' or 'Zvknhb')
+// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
+
+#include <linux/linkage.h>
+
+.text
+.option arch, +zvknha, +zvkb
+
+#define STATEP a0
+#define DATA a1
+#define NUM_BLOCKS a2
+
+#define STATEP_C a3
+
+#define MASK v0
+#define INDICES v1
+#define W0 v2
+#define W1 v3
+#define W2 v4
+#define W3 v5
+#define VTMP v6
+#define FEBA v7
+#define HGDC v8
+#define K0 v10
+#define K1 v11
+#define K2 v12
+#define K3 v13
+#define K4 v14
+#define K5 v15
+#define K6 v16
+#define K7 v17
+#define K8 v18
+#define K9 v19
+#define K10 v20
+#define K11 v21
+#define K12 v22
+#define K13 v23
+#define K14 v24
+#define K15 v25
+#define PREV_FEBA v26
+#define PREV_HGDC v27
+
+// Do 4 rounds of SHA-256. w0 contains the current 4 message schedule words.
+//
+// If not all the message schedule words have been computed yet, then this also
+// computes 4 more message schedule words. w1-w3 contain the next 3 groups of 4
+// message schedule words; this macro computes the group after w3 and writes it
+// to w0. This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3,
+// w0), so the caller must cycle through the registers accordingly.
+.macro sha256_4rounds last, k, w0, w1, w2, w3
+ vadd.vv VTMP, \k, \w0
+ vsha2cl.vv HGDC, FEBA, VTMP
+ vsha2ch.vv FEBA, HGDC, VTMP
+.if !\last
+ vmerge.vvm VTMP, \w2, \w1, MASK
+ vsha2ms.vv \w0, VTMP, \w3
+.endif
+.endm
+
+.macro sha256_16rounds last, k0, k1, k2, k3
+ sha256_4rounds \last, \k0, W0, W1, W2, W3
+ sha256_4rounds \last, \k1, W1, W2, W3, W0
+ sha256_4rounds \last, \k2, W2, W3, W0, W1
+ sha256_4rounds \last, \k3, W3, W0, W1, W2
+.endm
+
+// void sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state,
+// const u8 *data, size_t nblocks);
+SYM_FUNC_START(sha256_transform_zvknha_or_zvknhb_zvkb)
+
+ // Load the round constants into K0-K15.
+ vsetivli zero, 4, e32, m1, ta, ma
+ la t0, K256
+ vle32.v K0, (t0)
+ addi t0, t0, 16
+ vle32.v K1, (t0)
+ addi t0, t0, 16
+ vle32.v K2, (t0)
+ addi t0, t0, 16
+ vle32.v K3, (t0)
+ addi t0, t0, 16
+ vle32.v K4, (t0)
+ addi t0, t0, 16
+ vle32.v K5, (t0)
+ addi t0, t0, 16
+ vle32.v K6, (t0)
+ addi t0, t0, 16
+ vle32.v K7, (t0)
+ addi t0, t0, 16
+ vle32.v K8, (t0)
+ addi t0, t0, 16
+ vle32.v K9, (t0)
+ addi t0, t0, 16
+ vle32.v K10, (t0)
+ addi t0, t0, 16
+ vle32.v K11, (t0)
+ addi t0, t0, 16
+ vle32.v K12, (t0)
+ addi t0, t0, 16
+ vle32.v K13, (t0)
+ addi t0, t0, 16
+ vle32.v K14, (t0)
+ addi t0, t0, 16
+ vle32.v K15, (t0)
+
+ // Setup mask for the vmerge to replace the first word (idx==0) in
+ // message scheduling. There are 4 words, so an 8-bit mask suffices.
+ vsetivli zero, 1, e8, m1, ta, ma
+ vmv.v.i MASK, 0x01
+
+ // Load the state. The state is stored as {a,b,c,d,e,f,g,h}, but we
+ // need {f,e,b,a},{h,g,d,c}. The dst vtype is e32m1 and the index vtype
+ // is e8mf4. We use index-load with the i8 indices {20, 16, 4, 0},
+ // loaded using the 32-bit little endian value 0x00041014.
+ li t0, 0x00041014
+ vsetivli zero, 1, e32, m1, ta, ma
+ vmv.v.x INDICES, t0
+ addi STATEP_C, STATEP, 8
+ vsetivli zero, 4, e32, m1, ta, ma
+ vluxei8.v FEBA, (STATEP), INDICES
+ vluxei8.v HGDC, (STATEP_C), INDICES
+
+.Lnext_block:
+ addi NUM_BLOCKS, NUM_BLOCKS, -1
+
+ // Save the previous state, as it's needed later.
+ vmv.v.v PREV_FEBA, FEBA
+ vmv.v.v PREV_HGDC, HGDC
+
+ // Load the next 512-bit message block and endian-swap each 32-bit word.
+ vle32.v W0, (DATA)
+ vrev8.v W0, W0
+ addi DATA, DATA, 16
+ vle32.v W1, (DATA)
+ vrev8.v W1, W1
+ addi DATA, DATA, 16
+ vle32.v W2, (DATA)
+ vrev8.v W2, W2
+ addi DATA, DATA, 16
+ vle32.v W3, (DATA)
+ vrev8.v W3, W3
+ addi DATA, DATA, 16
+
+ // Do the 64 rounds of SHA-256.
+ sha256_16rounds 0, K0, K1, K2, K3
+ sha256_16rounds 0, K4, K5, K6, K7
+ sha256_16rounds 0, K8, K9, K10, K11
+ sha256_16rounds 1, K12, K13, K14, K15
+
+ // Add the previous state.
+ vadd.vv FEBA, FEBA, PREV_FEBA
+ vadd.vv HGDC, HGDC, PREV_HGDC
+
+ // Repeat if more blocks remain.
+ bnez NUM_BLOCKS, .Lnext_block
+
+ // Store the new state and return.
+ vsuxei8.v FEBA, (STATEP), INDICES
+ vsuxei8.v HGDC, (STATEP_C), INDICES
+ ret
+SYM_FUNC_END(sha256_transform_zvknha_or_zvknhb_zvkb)
+
+.section ".rodata"
+.p2align 2
+.type K256, @object
+K256:
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+.size K256, . - K256
diff --git a/lib/crypto/riscv/sha256.h b/lib/crypto/riscv/sha256.h
new file mode 100644
index 000000000000..1def18b0a4fb
--- /dev/null
+++ b/lib/crypto/riscv/sha256.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 (RISC-V accelerated)
+ *
+ * Copyright (C) 2022 VRULL GmbH
+ * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions);
+
+asmlinkage void
+sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha256_blocks(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks)
+{
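+	/*
+	 * Use the vector implementation only when the required extensions are
+	 * present and the vector unit is currently usable; otherwise fall back
+	 * to the generic code.
+	 */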
+ if (static_branch_likely(&have_extensions) && likely(may_use_simd())) {
+ kernel_vector_begin();
+ sha256_transform_zvknha_or_zvknhb_zvkb(state, data, nblocks);
+ kernel_vector_end();
+ } else {
+ sha256_blocks_generic(state, data, nblocks);
+ }
+}
+
+#define sha256_mod_init_arch sha256_mod_init_arch
+static void sha256_mod_init_arch(void)
+{
+ /* Both zvknha and zvknhb provide the SHA-256 instructions. */
+ if ((riscv_isa_extension_available(NULL, ZVKNHA) ||
+ riscv_isa_extension_available(NULL, ZVKNHB)) &&
+ riscv_isa_extension_available(NULL, ZVKB) &&
+ riscv_vector_vlen() >= 128)
+ static_branch_enable(&have_extensions);
+}
diff --git a/lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S b/lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S
new file mode 100644
index 000000000000..b41eebf60546
--- /dev/null
+++ b/lib/crypto/riscv/sha512-riscv64-zvknhb-zvkb.S
@@ -0,0 +1,203 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
+// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The generated code of this file depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128
+// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknhb')
+// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
+
+#include <linux/linkage.h>
+
+.text
+.option arch, +zvknhb, +zvkb
+
+#define STATEP a0
+#define DATA a1
+#define NUM_BLOCKS a2
+
+#define STATEP_C a3
+#define K a4
+
+#define MASK v0
+#define INDICES v1
+#define W0 v10 // LMUL=2
+#define W1 v12 // LMUL=2
+#define W2 v14 // LMUL=2
+#define W3 v16 // LMUL=2
+#define VTMP v20 // LMUL=2
+#define FEBA v22 // LMUL=2
+#define HGDC v24 // LMUL=2
+#define PREV_FEBA v26 // LMUL=2
+#define PREV_HGDC v28 // LMUL=2
+
+// Do 4 rounds of SHA-512. w0 contains the current 4 message schedule words.
+//
+// If not all the message schedule words have been computed yet, then this also
+// computes 4 more message schedule words. w1-w3 contain the next 3 groups of 4
+// message schedule words; this macro computes the group after w3 and writes it
+// to w0. This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3,
+// w0), so the caller must cycle through the registers accordingly.
+.macro sha512_4rounds last, w0, w1, w2, w3
+ vle64.v VTMP, (K)
+ addi K, K, 32
+ vadd.vv VTMP, VTMP, \w0
+ vsha2cl.vv HGDC, FEBA, VTMP
+ vsha2ch.vv FEBA, HGDC, VTMP
+.if !\last
+ vmerge.vvm VTMP, \w2, \w1, MASK
+ vsha2ms.vv \w0, VTMP, \w3
+.endif
+.endm
+
+.macro sha512_16rounds last
+ sha512_4rounds \last, W0, W1, W2, W3
+ sha512_4rounds \last, W1, W2, W3, W0
+ sha512_4rounds \last, W2, W3, W0, W1
+ sha512_4rounds \last, W3, W0, W1, W2
+.endm
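For reference, the recurrence that vmerge + vsha2ms evaluate four words at a time in the macro above is the standard SHA-512 message schedule. A scalar sketch of that recurrence (illustrative only; the helper names are not part of this patch, and the kernel's own rol64/ror64 helpers would normally be used):

#include <stdint.h>

static inline uint64_t ror64(uint64_t x, int n)
{
	return (x >> n) | (x << (64 - n));
}

/* W[0..15] hold the big-endian-loaded message block; this fills W[16..79]. */
static void sha512_message_schedule(uint64_t W[80])
{
	for (int i = 16; i < 80; i++) {
		uint64_t s0 = ror64(W[i - 15], 1) ^ ror64(W[i - 15], 8) ^
			      (W[i - 15] >> 7);
		uint64_t s1 = ror64(W[i - 2], 19) ^ ror64(W[i - 2], 61) ^
			      (W[i - 2] >> 6);

		W[i] = W[i - 16] + s0 + W[i - 7] + s1;
	}
}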
+
+// void sha512_transform_zvknhb_zvkb(struct sha512_block_state *state,
+// const u8 *data, size_t nblocks);
+SYM_FUNC_START(sha512_transform_zvknhb_zvkb)
+
+	// Set up the mask for the vmerge to replace the first word (idx==0)
+	// in message scheduling. There are 4 words, so an 8-bit mask suffices.
+ vsetivli zero, 1, e8, m1, ta, ma
+ vmv.v.i MASK, 0x01
+
+ // Load the state. The state is stored as {a,b,c,d,e,f,g,h}, but we
+ // need {f,e,b,a},{h,g,d,c}. The dst vtype is e64m2 and the index vtype
+ // is e8mf4. We use index-load with the i8 indices {40, 32, 8, 0},
+ // loaded using the 32-bit little endian value 0x00082028.
+ li t0, 0x00082028
+ vsetivli zero, 1, e32, m1, ta, ma
+ vmv.v.x INDICES, t0
+ addi STATEP_C, STATEP, 16
+ vsetivli zero, 4, e64, m2, ta, ma
+ vluxei8.v FEBA, (STATEP), INDICES
+ vluxei8.v HGDC, (STATEP_C), INDICES
+
+.Lnext_block:
+ la K, K512
+ addi NUM_BLOCKS, NUM_BLOCKS, -1
+
+ // Save the previous state, as it's needed later.
+ vmv.v.v PREV_FEBA, FEBA
+ vmv.v.v PREV_HGDC, HGDC
+
+ // Load the next 1024-bit message block and endian-swap each 64-bit word
+ vle64.v W0, (DATA)
+ vrev8.v W0, W0
+ addi DATA, DATA, 32
+ vle64.v W1, (DATA)
+ vrev8.v W1, W1
+ addi DATA, DATA, 32
+ vle64.v W2, (DATA)
+ vrev8.v W2, W2
+ addi DATA, DATA, 32
+ vle64.v W3, (DATA)
+ vrev8.v W3, W3
+ addi DATA, DATA, 32
+
+ // Do the 80 rounds of SHA-512.
+ sha512_16rounds 0
+ sha512_16rounds 0
+ sha512_16rounds 0
+ sha512_16rounds 0
+ sha512_16rounds 1
+
+ // Add the previous state.
+ vadd.vv FEBA, FEBA, PREV_FEBA
+ vadd.vv HGDC, HGDC, PREV_HGDC
+
+ // Repeat if more blocks remain.
+ bnez NUM_BLOCKS, .Lnext_block
+
+ // Store the new state and return.
+ vsuxei8.v FEBA, (STATEP), INDICES
+ vsuxei8.v HGDC, (STATEP_C), INDICES
+ ret
+SYM_FUNC_END(sha512_transform_zvknhb_zvkb)
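A worked check of the index-load constant used at the top of this function: the state stores h[0..7] = {a,...,h} as 8-byte words, so {f,e,b,a} live at byte offsets {40,32,8,0}, and packing those byte indices little-endian into one 32-bit word gives 0x00082028; the {h,g,d,c} load reuses the same indices relative to STATEP+16, the offset of c. A small, purely illustrative check (not part of the patch):

#include <assert.h>
#include <stdint.h>

static void check_feba_indices(void)
{
	/* Byte offsets of f (h[5]), e (h[4]), b (h[1]), a (h[0]). */
	const uint8_t idx[4] = { 40, 32, 8, 0 };
	uint32_t packed = 0;

	for (int i = 0; i < 4; i++)
		packed |= (uint32_t)idx[i] << (8 * i);

	assert(packed == 0x00082028);
}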
+
+.section ".rodata"
+.p2align 3
+.type K512, @object
+K512:
+ .dword 0x428a2f98d728ae22, 0x7137449123ef65cd
+ .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+ .dword 0x3956c25bf348b538, 0x59f111f1b605d019
+ .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+ .dword 0xd807aa98a3030242, 0x12835b0145706fbe
+ .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+ .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
+ .dword 0x9bdc06a725c71235, 0xc19bf174cf692694
+ .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
+ .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+ .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
+ .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+ .dword 0x983e5152ee66dfab, 0xa831c66d2db43210
+ .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4
+ .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725
+ .dword 0x06ca6351e003826f, 0x142929670a0e6e70
+ .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926
+ .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+ .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8
+ .dword 0x81c2c92e47edaee6, 0x92722c851482353b
+ .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001
+ .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30
+ .dword 0xd192e819d6ef5218, 0xd69906245565a910
+ .dword 0xf40e35855771202a, 0x106aa07032bbd1b8
+ .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
+ .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+ .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
+ .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+ .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60
+ .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec
+ .dword 0x90befffa23631e28, 0xa4506cebde82bde9
+ .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b
+ .dword 0xca273eceea26619c, 0xd186b8c721c0c207
+ .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+ .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6
+ .dword 0x113f9804bef90dae, 0x1b710b35131c471b
+ .dword 0x28db77f523047d84, 0x32caab7b40c72493
+ .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+ .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
+ .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+.size K512, . - K512
diff --git a/lib/crypto/riscv/sha512.h b/lib/crypto/riscv/sha512.h
new file mode 100644
index 000000000000..145bdab1214e
--- /dev/null
+++ b/lib/crypto/riscv/sha512.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-512 and SHA-384 using the RISC-V vector crypto extensions
+ *
+ * Copyright (C) 2023 VRULL GmbH
+ * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions);
+
+asmlinkage void sha512_transform_zvknhb_zvkb(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha512_blocks(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_extensions) && likely(may_use_simd())) {
+ kernel_vector_begin();
+ sha512_transform_zvknhb_zvkb(state, data, nblocks);
+ kernel_vector_end();
+ } else {
+ sha512_blocks_generic(state, data, nblocks);
+ }
+}
+
+#define sha512_mod_init_arch sha512_mod_init_arch
+static void sha512_mod_init_arch(void)
+{
+ if (riscv_isa_extension_available(NULL, ZVKNHB) &&
+ riscv_isa_extension_available(NULL, ZVKB) &&
+ riscv_vector_vlen() >= 128)
+ static_branch_enable(&have_extensions);
+}
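The "#define sha512_mod_init_arch sha512_mod_init_arch" line above is the usual kernel idiom for optional hooks: defining the macro to its own name lets the generic code detect the hook with #ifdef while still calling it as an ordinary function (the generic side of this can be seen in the sha1.c and sha256.c hunks below). A minimal illustration with a hypothetical hook name:

/* Hypothetical example of the self-named-macro hook idiom. */
#define my_arch_init my_arch_init
static void my_arch_init(void)
{
	/* arch-specific probing would go here */
}

static void generic_init(void)
{
#ifdef my_arch_init
	my_arch_init();		/* compiled only when the arch provides it */
#endif
}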
diff --git a/lib/crypto/s390/chacha-s390.S b/lib/crypto/s390/chacha-s390.S
new file mode 100644
index 000000000000..63f3102678c0
--- /dev/null
+++ b/lib/crypto/s390/chacha-s390.S
@@ -0,0 +1,908 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Original implementation written by Andy Polyakov, @dot-asm.
+ * This is an adaptation of the original code for kernel use.
+ *
+ * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+#include <asm/nospec-insn.h>
+#include <asm/fpu-insn.h>
+
+#define SP %r15
+#define FRAME (16 * 8 + 4 * 8)
+
+ .data
+ .balign 32
+
+SYM_DATA_START_LOCAL(sigma)
+ .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
+ .long 1,0,0,0
+ .long 2,0,0,0
+ .long 3,0,0,0
+ .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap
+
+ .long 0,1,2,3
+ .long 0x61707865,0x61707865,0x61707865,0x61707865 # smashed sigma
+ .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e
+ .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32
+ .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574
+SYM_DATA_END(sigma)
+
+ .previous
+
+ GEN_BR_THUNK %r14
+
+ .text
+
+#############################################################################
+# void chacha20_vx_4x(u8 *out, const u8 *inp, size_t len,
+#		      const u32 *key, const u32 *counter)
+
+#define OUT %r2
+#define INP %r3
+#define LEN %r4
+#define KEY %r5
+#define COUNTER %r6
+
+#define BEPERM %v31
+#define CTR %v26
+
+#define K0 %v16
+#define K1 %v17
+#define K2 %v18
+#define K3 %v19
+
+#define XA0 %v0
+#define XA1 %v1
+#define XA2 %v2
+#define XA3 %v3
+
+#define XB0 %v4
+#define XB1 %v5
+#define XB2 %v6
+#define XB3 %v7
+
+#define XC0 %v8
+#define XC1 %v9
+#define XC2 %v10
+#define XC3 %v11
+
+#define XD0 %v12
+#define XD1 %v13
+#define XD2 %v14
+#define XD3 %v15
+
+#define XT0 %v27
+#define XT1 %v28
+#define XT2 %v29
+#define XT3 %v30
+
+SYM_FUNC_START(chacha20_vx_4x)
+ stmg %r6,%r7,6*8(SP)
+
+ larl %r7,sigma
+ lhi %r0,10
+ lhi %r1,0
+
+ VL K0,0,,%r7 # load sigma
+ VL K1,0,,KEY # load key
+ VL K2,16,,KEY
+ VL K3,0,,COUNTER # load counter
+
+ VL BEPERM,0x40,,%r7
+ VL CTR,0x50,,%r7
+
+ VLM XA0,XA3,0x60,%r7,4 # load [smashed] sigma
+
+ VREPF XB0,K1,0 # smash the key
+ VREPF XB1,K1,1
+ VREPF XB2,K1,2
+ VREPF XB3,K1,3
+
+ VREPF XD0,K3,0
+ VREPF XD1,K3,1
+ VREPF XD2,K3,2
+ VREPF XD3,K3,3
+ VAF XD0,XD0,CTR
+
+ VREPF XC0,K2,0
+ VREPF XC1,K2,1
+ VREPF XC2,K2,2
+ VREPF XC3,K2,3
+
+.Loop_4x:
+ VAF XA0,XA0,XB0
+ VX XD0,XD0,XA0
+ VERLLF XD0,XD0,16
+
+ VAF XA1,XA1,XB1
+ VX XD1,XD1,XA1
+ VERLLF XD1,XD1,16
+
+ VAF XA2,XA2,XB2
+ VX XD2,XD2,XA2
+ VERLLF XD2,XD2,16
+
+ VAF XA3,XA3,XB3
+ VX XD3,XD3,XA3
+ VERLLF XD3,XD3,16
+
+ VAF XC0,XC0,XD0
+ VX XB0,XB0,XC0
+ VERLLF XB0,XB0,12
+
+ VAF XC1,XC1,XD1
+ VX XB1,XB1,XC1
+ VERLLF XB1,XB1,12
+
+ VAF XC2,XC2,XD2
+ VX XB2,XB2,XC2
+ VERLLF XB2,XB2,12
+
+ VAF XC3,XC3,XD3
+ VX XB3,XB3,XC3
+ VERLLF XB3,XB3,12
+
+ VAF XA0,XA0,XB0
+ VX XD0,XD0,XA0
+ VERLLF XD0,XD0,8
+
+ VAF XA1,XA1,XB1
+ VX XD1,XD1,XA1
+ VERLLF XD1,XD1,8
+
+ VAF XA2,XA2,XB2
+ VX XD2,XD2,XA2
+ VERLLF XD2,XD2,8
+
+ VAF XA3,XA3,XB3
+ VX XD3,XD3,XA3
+ VERLLF XD3,XD3,8
+
+ VAF XC0,XC0,XD0
+ VX XB0,XB0,XC0
+ VERLLF XB0,XB0,7
+
+ VAF XC1,XC1,XD1
+ VX XB1,XB1,XC1
+ VERLLF XB1,XB1,7
+
+ VAF XC2,XC2,XD2
+ VX XB2,XB2,XC2
+ VERLLF XB2,XB2,7
+
+ VAF XC3,XC3,XD3
+ VX XB3,XB3,XC3
+ VERLLF XB3,XB3,7
+
+ VAF XA0,XA0,XB1
+ VX XD3,XD3,XA0
+ VERLLF XD3,XD3,16
+
+ VAF XA1,XA1,XB2
+ VX XD0,XD0,XA1
+ VERLLF XD0,XD0,16
+
+ VAF XA2,XA2,XB3
+ VX XD1,XD1,XA2
+ VERLLF XD1,XD1,16
+
+ VAF XA3,XA3,XB0
+ VX XD2,XD2,XA3
+ VERLLF XD2,XD2,16
+
+ VAF XC2,XC2,XD3
+ VX XB1,XB1,XC2
+ VERLLF XB1,XB1,12
+
+ VAF XC3,XC3,XD0
+ VX XB2,XB2,XC3
+ VERLLF XB2,XB2,12
+
+ VAF XC0,XC0,XD1
+ VX XB3,XB3,XC0
+ VERLLF XB3,XB3,12
+
+ VAF XC1,XC1,XD2
+ VX XB0,XB0,XC1
+ VERLLF XB0,XB0,12
+
+ VAF XA0,XA0,XB1
+ VX XD3,XD3,XA0
+ VERLLF XD3,XD3,8
+
+ VAF XA1,XA1,XB2
+ VX XD0,XD0,XA1
+ VERLLF XD0,XD0,8
+
+ VAF XA2,XA2,XB3
+ VX XD1,XD1,XA2
+ VERLLF XD1,XD1,8
+
+ VAF XA3,XA3,XB0
+ VX XD2,XD2,XA3
+ VERLLF XD2,XD2,8
+
+ VAF XC2,XC2,XD3
+ VX XB1,XB1,XC2
+ VERLLF XB1,XB1,7
+
+ VAF XC3,XC3,XD0
+ VX XB2,XB2,XC3
+ VERLLF XB2,XB2,7
+
+ VAF XC0,XC0,XD1
+ VX XB3,XB3,XC0
+ VERLLF XB3,XB3,7
+
+ VAF XC1,XC1,XD2
+ VX XB0,XB0,XC1
+ VERLLF XB0,XB0,7
+ brct %r0,.Loop_4x
+
+ VAF XD0,XD0,CTR
+
+ VMRHF XT0,XA0,XA1 # transpose data
+ VMRHF XT1,XA2,XA3
+ VMRLF XT2,XA0,XA1
+ VMRLF XT3,XA2,XA3
+ VPDI XA0,XT0,XT1,0b0000
+ VPDI XA1,XT0,XT1,0b0101
+ VPDI XA2,XT2,XT3,0b0000
+ VPDI XA3,XT2,XT3,0b0101
+
+ VMRHF XT0,XB0,XB1
+ VMRHF XT1,XB2,XB3
+ VMRLF XT2,XB0,XB1
+ VMRLF XT3,XB2,XB3
+ VPDI XB0,XT0,XT1,0b0000
+ VPDI XB1,XT0,XT1,0b0101
+ VPDI XB2,XT2,XT3,0b0000
+ VPDI XB3,XT2,XT3,0b0101
+
+ VMRHF XT0,XC0,XC1
+ VMRHF XT1,XC2,XC3
+ VMRLF XT2,XC0,XC1
+ VMRLF XT3,XC2,XC3
+ VPDI XC0,XT0,XT1,0b0000
+ VPDI XC1,XT0,XT1,0b0101
+ VPDI XC2,XT2,XT3,0b0000
+ VPDI XC3,XT2,XT3,0b0101
+
+ VMRHF XT0,XD0,XD1
+ VMRHF XT1,XD2,XD3
+ VMRLF XT2,XD0,XD1
+ VMRLF XT3,XD2,XD3
+ VPDI XD0,XT0,XT1,0b0000
+ VPDI XD1,XT0,XT1,0b0101
+ VPDI XD2,XT2,XT3,0b0000
+ VPDI XD3,XT2,XT3,0b0101
+
+ VAF XA0,XA0,K0
+ VAF XB0,XB0,K1
+ VAF XC0,XC0,K2
+ VAF XD0,XD0,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ VLM XT0,XT3,0,INP,0
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+
+ VAF XA0,XA1,K0
+ VAF XB0,XB1,K1
+ VAF XC0,XC1,K2
+ VAF XD0,XD1,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_4x
+
+ VLM XT0,XT3,0,INP,0
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_4x
+
+ VAF XA0,XA2,K0
+ VAF XB0,XB2,K1
+ VAF XC0,XC2,K2
+ VAF XD0,XD2,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_4x
+
+ VLM XT0,XT3,0,INP,0
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_4x
+
+ VAF XA0,XA3,K0
+ VAF XB0,XB3,K1
+ VAF XC0,XC3,K2
+ VAF XD0,XD3,K3
+
+ VPERM XA0,XA0,XA0,BEPERM
+ VPERM XB0,XB0,XB0,BEPERM
+ VPERM XC0,XC0,XC0,BEPERM
+ VPERM XD0,XD0,XD0,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_4x
+
+ VLM XT0,XT3,0,INP,0
+
+ VX XT0,XT0,XA0
+ VX XT1,XT1,XB0
+ VX XT2,XT2,XC0
+ VX XT3,XT3,XD0
+
+ VSTM XT0,XT3,0,OUT,0
+
+.Ldone_4x:
+ lmg %r6,%r7,6*8(SP)
+ BR_EX %r14
+
+.Ltail_4x:
+ VLR XT0,XC0
+ VLR XT1,XD0
+
+ VST XA0,8*8+0x00,,SP
+ VST XB0,8*8+0x10,,SP
+ VST XT0,8*8+0x20,,SP
+ VST XT1,8*8+0x30,,SP
+
+ lghi %r1,0
+
+.Loop_tail_4x:
+ llgc %r5,0(%r1,INP)
+ llgc %r6,8*8(%r1,SP)
+ xr %r6,%r5
+ stc %r6,0(%r1,OUT)
+ la %r1,1(%r1)
+ brct LEN,.Loop_tail_4x
+
+ lmg %r6,%r7,6*8(SP)
+ BR_EX %r14
+SYM_FUNC_END(chacha20_vx_4x)
+
+#undef OUT
+#undef INP
+#undef LEN
+#undef KEY
+#undef COUNTER
+
+#undef BEPERM
+
+#undef K0
+#undef K1
+#undef K2
+#undef K3
+
+
+#############################################################################
+# void chacha20_vx(u8 *out, const u8 *inp, size_t len,
+#		   const u32 *key, const u32 *counter)
+
+#define OUT %r2
+#define INP %r3
+#define LEN %r4
+#define KEY %r5
+#define COUNTER %r6
+
+#define BEPERM %v31
+
+#define K0 %v27
+#define K1 %v24
+#define K2 %v25
+#define K3 %v26
+
+#define A0 %v0
+#define B0 %v1
+#define C0 %v2
+#define D0 %v3
+
+#define A1 %v4
+#define B1 %v5
+#define C1 %v6
+#define D1 %v7
+
+#define A2 %v8
+#define B2 %v9
+#define C2 %v10
+#define D2 %v11
+
+#define A3 %v12
+#define B3 %v13
+#define C3 %v14
+#define D3 %v15
+
+#define A4 %v16
+#define B4 %v17
+#define C4 %v18
+#define D4 %v19
+
+#define A5 %v20
+#define B5 %v21
+#define C5 %v22
+#define D5 %v23
+
+#define T0 %v27
+#define T1 %v28
+#define T2 %v29
+#define T3 %v30
+
+SYM_FUNC_START(chacha20_vx)
+ clgfi LEN,256
+ jle chacha20_vx_4x
+ stmg %r6,%r7,6*8(SP)
+
+ lghi %r1,-FRAME
+ lgr %r0,SP
+ la SP,0(%r1,SP)
+ stg %r0,0(SP) # back-chain
+
+ larl %r7,sigma
+ lhi %r0,10
+
+ VLM K1,K2,0,KEY,0 # load key
+ VL K3,0,,COUNTER # load counter
+
+ VLM K0,BEPERM,0,%r7,4 # load sigma, increments, ...
+
+.Loop_outer_vx:
+ VLR A0,K0
+ VLR B0,K1
+ VLR A1,K0
+ VLR B1,K1
+ VLR A2,K0
+ VLR B2,K1
+ VLR A3,K0
+ VLR B3,K1
+ VLR A4,K0
+ VLR B4,K1
+ VLR A5,K0
+ VLR B5,K1
+
+ VLR D0,K3
+ VAF D1,K3,T1 # K[3]+1
+ VAF D2,K3,T2 # K[3]+2
+ VAF D3,K3,T3 # K[3]+3
+ VAF D4,D2,T2 # K[3]+4
+ VAF D5,D2,T3 # K[3]+5
+
+ VLR C0,K2
+ VLR C1,K2
+ VLR C2,K2
+ VLR C3,K2
+ VLR C4,K2
+ VLR C5,K2
+
+ VLR T1,D1
+ VLR T2,D2
+ VLR T3,D3
+
+.Loop_vx:
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,16
+ VERLLF D1,D1,16
+ VERLLF D2,D2,16
+ VERLLF D3,D3,16
+ VERLLF D4,D4,16
+ VERLLF D5,D5,16
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,12
+ VERLLF B1,B1,12
+ VERLLF B2,B2,12
+ VERLLF B3,B3,12
+ VERLLF B4,B4,12
+ VERLLF B5,B5,12
+
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,8
+ VERLLF D1,D1,8
+ VERLLF D2,D2,8
+ VERLLF D3,D3,8
+ VERLLF D4,D4,8
+ VERLLF D5,D5,8
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,7
+ VERLLF B1,B1,7
+ VERLLF B2,B2,7
+ VERLLF B3,B3,7
+ VERLLF B4,B4,7
+ VERLLF B5,B5,7
+
+ VSLDB C0,C0,C0,8
+ VSLDB C1,C1,C1,8
+ VSLDB C2,C2,C2,8
+ VSLDB C3,C3,C3,8
+ VSLDB C4,C4,C4,8
+ VSLDB C5,C5,C5,8
+ VSLDB B0,B0,B0,4
+ VSLDB B1,B1,B1,4
+ VSLDB B2,B2,B2,4
+ VSLDB B3,B3,B3,4
+ VSLDB B4,B4,B4,4
+ VSLDB B5,B5,B5,4
+ VSLDB D0,D0,D0,12
+ VSLDB D1,D1,D1,12
+ VSLDB D2,D2,D2,12
+ VSLDB D3,D3,D3,12
+ VSLDB D4,D4,D4,12
+ VSLDB D5,D5,D5,12
+
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,16
+ VERLLF D1,D1,16
+ VERLLF D2,D2,16
+ VERLLF D3,D3,16
+ VERLLF D4,D4,16
+ VERLLF D5,D5,16
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,12
+ VERLLF B1,B1,12
+ VERLLF B2,B2,12
+ VERLLF B3,B3,12
+ VERLLF B4,B4,12
+ VERLLF B5,B5,12
+
+ VAF A0,A0,B0
+ VAF A1,A1,B1
+ VAF A2,A2,B2
+ VAF A3,A3,B3
+ VAF A4,A4,B4
+ VAF A5,A5,B5
+ VX D0,D0,A0
+ VX D1,D1,A1
+ VX D2,D2,A2
+ VX D3,D3,A3
+ VX D4,D4,A4
+ VX D5,D5,A5
+ VERLLF D0,D0,8
+ VERLLF D1,D1,8
+ VERLLF D2,D2,8
+ VERLLF D3,D3,8
+ VERLLF D4,D4,8
+ VERLLF D5,D5,8
+
+ VAF C0,C0,D0
+ VAF C1,C1,D1
+ VAF C2,C2,D2
+ VAF C3,C3,D3
+ VAF C4,C4,D4
+ VAF C5,C5,D5
+ VX B0,B0,C0
+ VX B1,B1,C1
+ VX B2,B2,C2
+ VX B3,B3,C3
+ VX B4,B4,C4
+ VX B5,B5,C5
+ VERLLF B0,B0,7
+ VERLLF B1,B1,7
+ VERLLF B2,B2,7
+ VERLLF B3,B3,7
+ VERLLF B4,B4,7
+ VERLLF B5,B5,7
+
+ VSLDB C0,C0,C0,8
+ VSLDB C1,C1,C1,8
+ VSLDB C2,C2,C2,8
+ VSLDB C3,C3,C3,8
+ VSLDB C4,C4,C4,8
+ VSLDB C5,C5,C5,8
+ VSLDB B0,B0,B0,12
+ VSLDB B1,B1,B1,12
+ VSLDB B2,B2,B2,12
+ VSLDB B3,B3,B3,12
+ VSLDB B4,B4,B4,12
+ VSLDB B5,B5,B5,12
+ VSLDB D0,D0,D0,4
+ VSLDB D1,D1,D1,4
+ VSLDB D2,D2,D2,4
+ VSLDB D3,D3,D3,4
+ VSLDB D4,D4,D4,4
+ VSLDB D5,D5,D5,4
+ brct %r0,.Loop_vx
+
+ VAF A0,A0,K0
+ VAF B0,B0,K1
+ VAF C0,C0,K2
+ VAF D0,D0,K3
+ VAF A1,A1,K0
+ VAF D1,D1,T1 # +K[3]+1
+
+ VPERM A0,A0,A0,BEPERM
+ VPERM B0,B0,B0,BEPERM
+ VPERM C0,C0,C0,BEPERM
+ VPERM D0,D0,D0,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VAF D2,D2,T2 # +K[3]+2
+ VAF D3,D3,T3 # +K[3]+3
+ VLM T0,T3,0,INP,0
+
+ VX A0,A0,T0
+ VX B0,B0,T1
+ VX C0,C0,T2
+ VX D0,D0,T3
+
+ VLM K0,T3,0,%r7,4 # re-load sigma and increments
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF B1,B1,K1
+ VAF C1,C1,K2
+
+ VPERM A0,A1,A1,BEPERM
+ VPERM B0,B1,B1,BEPERM
+ VPERM C0,C1,C1,BEPERM
+ VPERM D0,D1,D1,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A2,A2,K0
+ VAF B2,B2,K1
+ VAF C2,C2,K2
+
+ VPERM A0,A2,A2,BEPERM
+ VPERM B0,B2,B2,BEPERM
+ VPERM C0,C2,C2,BEPERM
+ VPERM D0,D2,D2,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A3,A3,K0
+ VAF B3,B3,K1
+ VAF C3,C3,K2
+ VAF D2,K3,T3 # K[3]+3
+
+ VPERM A0,A3,A3,BEPERM
+ VPERM B0,B3,B3,BEPERM
+ VPERM C0,C3,C3,BEPERM
+ VPERM D0,D3,D3,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VAF D3,D2,T1 # K[3]+4
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A4,A4,K0
+ VAF B4,B4,K1
+ VAF C4,C4,K2
+ VAF D4,D4,D3 # +K[3]+4
+ VAF D3,D3,T1 # K[3]+5
+ VAF K3,D2,T3 # K[3]+=6
+
+ VPERM A0,A4,A4,BEPERM
+ VPERM B0,B4,B4,BEPERM
+ VPERM C0,C4,C4,BEPERM
+ VPERM D0,D4,D4,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ aghi LEN,-0x40
+ je .Ldone_vx
+
+ VAF A5,A5,K0
+ VAF B5,B5,K1
+ VAF C5,C5,K2
+ VAF D5,D5,D3 # +K[3]+5
+
+ VPERM A0,A5,A5,BEPERM
+ VPERM B0,B5,B5,BEPERM
+ VPERM C0,C5,C5,BEPERM
+ VPERM D0,D5,D5,BEPERM
+
+ clgfi LEN,0x40
+ jl .Ltail_vx
+
+ VLM A1,D1,0,INP,0
+
+ VX A0,A0,A1
+ VX B0,B0,B1
+ VX C0,C0,C1
+ VX D0,D0,D1
+
+ VSTM A0,D0,0,OUT,0
+
+ la INP,0x40(INP)
+ la OUT,0x40(OUT)
+ lhi %r0,10
+ aghi LEN,-0x40
+ jne .Loop_outer_vx
+
+.Ldone_vx:
+ lmg %r6,%r7,FRAME+6*8(SP)
+ la SP,FRAME(SP)
+ BR_EX %r14
+
+.Ltail_vx:
+ VSTM A0,D0,8*8,SP,3
+ lghi %r1,0
+
+.Loop_tail_vx:
+ llgc %r5,0(%r1,INP)
+ llgc %r6,8*8(%r1,SP)
+ xr %r6,%r5
+ stc %r6,0(%r1,OUT)
+ la %r1,1(%r1)
+ brct LEN,.Loop_tail_vx
+
+ lmg %r6,%r7,FRAME+6*8(SP)
+ la SP,FRAME(SP)
+ BR_EX %r14
+SYM_FUNC_END(chacha20_vx)
+
+.previous
diff --git a/lib/crypto/s390/chacha-s390.h b/lib/crypto/s390/chacha-s390.h
new file mode 100644
index 000000000000..733744ce30f5
--- /dev/null
+++ b/lib/crypto/s390/chacha-s390.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * s390 ChaCha stream cipher.
+ *
+ * Copyright IBM Corp. 2021
+ */
+
+#ifndef _CHACHA_S390_H
+#define _CHACHA_S390_H
+
+void chacha20_vx(u8 *out, const u8 *inp, size_t len, const u32 *key,
+ const u32 *counter);
+
+#endif /* _CHACHA_S390_H */
diff --git a/lib/crypto/s390/chacha.h b/lib/crypto/s390/chacha.h
new file mode 100644
index 000000000000..fd9c4a422365
--- /dev/null
+++ b/lib/crypto/s390/chacha.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ChaCha stream cipher (s390 optimized)
+ *
+ * Copyright IBM Corp. 2021
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+#include <asm/fpu.h>
+#include "chacha-s390.h"
+
+#define hchacha_block_arch hchacha_block_generic /* not implemented yet */
+
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int bytes, int nrounds)
+{
+	/* The s390 ChaCha20 implementation has 20 rounds hard-coded;
+	 * it cannot handle one block of data or less, but otherwise
+	 * it can handle data of arbitrary size.
+	 */
+ if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !cpu_has_vx()) {
+ chacha_crypt_generic(state, dst, src, bytes, nrounds);
+ } else {
+ DECLARE_KERNEL_FPU_ONSTACK32(vxstate);
+
+ kernel_fpu_begin(&vxstate, KERNEL_VXR);
+ chacha20_vx(dst, src, bytes, &state->x[4], &state->x[12]);
+ kernel_fpu_end(&vxstate, KERNEL_VXR);
+
+ state->x[12] += round_up(bytes, CHACHA_BLOCK_SIZE) /
+ CHACHA_BLOCK_SIZE;
+ }
+}
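The counter update at the end of chacha_crypt_arch() accounts for the assembly always generating whole 64-byte keystream blocks, so even a partially consumed final block must advance the block counter; round_up(bytes, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE is simply ceil(bytes / 64). A small illustrative check of that arithmetic (not part of the patch; BLOCK stands in for CHACHA_BLOCK_SIZE):

#include <assert.h>
#include <stdint.h>

#define BLOCK 64u	/* stands in for CHACHA_BLOCK_SIZE */

static uint32_t blocks_consumed(uint32_t bytes)
{
	return (bytes + BLOCK - 1) / BLOCK;	/* == round_up(bytes, 64) / 64 */
}

static void check_counter_update(void)
{
	assert(blocks_consumed(65)   == 2);	/* partial last block counts */
	assert(blocks_consumed(1000) == 16);
	assert(blocks_consumed(1024) == 16);
}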
diff --git a/lib/crypto/s390/sha1.h b/lib/crypto/s390/sha1.h
new file mode 100644
index 000000000000..73d94476a157
--- /dev/null
+++ b/lib/crypto/s390/sha1.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-1 optimized using the CP Assist for Cryptographic Functions (CPACF)
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/cpacf.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha1);
+
+static void sha1_blocks(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_cpacf_sha1))
+ cpacf_kimd(CPACF_KIMD_SHA_1, state, data,
+ nblocks * SHA1_BLOCK_SIZE);
+ else
+ sha1_blocks_generic(state, data, nblocks);
+}
+
+#define sha1_mod_init_arch sha1_mod_init_arch
+static void sha1_mod_init_arch(void)
+{
+ if (cpu_have_feature(S390_CPU_FEATURE_MSA) &&
+ cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_1))
+ static_branch_enable(&have_cpacf_sha1);
+}
diff --git a/lib/crypto/s390/sha256.h b/lib/crypto/s390/sha256.h
new file mode 100644
index 000000000000..acd483508789
--- /dev/null
+++ b/lib/crypto/s390/sha256.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 optimized using the CP Assist for Cryptographic Functions (CPACF)
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/cpacf.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha256);
+
+static void sha256_blocks(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_cpacf_sha256))
+ cpacf_kimd(CPACF_KIMD_SHA_256, state, data,
+ nblocks * SHA256_BLOCK_SIZE);
+ else
+ sha256_blocks_generic(state, data, nblocks);
+}
+
+#define sha256_mod_init_arch sha256_mod_init_arch
+static void sha256_mod_init_arch(void)
+{
+ if (cpu_have_feature(S390_CPU_FEATURE_MSA) &&
+ cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256))
+ static_branch_enable(&have_cpacf_sha256);
+}
diff --git a/lib/crypto/s390/sha3.h b/lib/crypto/s390/sha3.h
new file mode 100644
index 000000000000..85471404775a
--- /dev/null
+++ b/lib/crypto/s390/sha3.h
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-3 optimized using the CP Assist for Cryptographic Functions (CPACF)
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/cpacf.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3_init_optim);
+
+static void sha3_absorb_blocks(struct sha3_state *state, const u8 *data,
+ size_t nblocks, size_t block_size)
+{
+ if (static_branch_likely(&have_sha3)) {
+ /*
+ * Note that KIMD assumes little-endian order of the state
+ * words. sha3_state already uses that order, though, so
+ * there's no need for a byteswap.
+ */
+ switch (block_size) {
+ case SHA3_224_BLOCK_SIZE:
+ cpacf_kimd(CPACF_KIMD_SHA3_224, state,
+ data, nblocks * block_size);
+ return;
+ case SHA3_256_BLOCK_SIZE:
+ /*
+ * This case handles both SHA3-256 and SHAKE256, since
+ * they have the same block size.
+ */
+ cpacf_kimd(CPACF_KIMD_SHA3_256, state,
+ data, nblocks * block_size);
+ return;
+ case SHA3_384_BLOCK_SIZE:
+ cpacf_kimd(CPACF_KIMD_SHA3_384, state,
+ data, nblocks * block_size);
+ return;
+ case SHA3_512_BLOCK_SIZE:
+ cpacf_kimd(CPACF_KIMD_SHA3_512, state,
+ data, nblocks * block_size);
+ return;
+ }
+ }
+ sha3_absorb_blocks_generic(state, data, nblocks, block_size);
+}
+
+static void sha3_keccakf(struct sha3_state *state)
+{
+ if (static_branch_likely(&have_sha3)) {
+ /*
+		 * Passing zeroes into any of CPACF_KIMD_SHA3_* gives the plain
+		 * Keccak-f permutation: absorbing an all-zero block XORs
+		 * nothing into the state, so only the permutation remains.
+		 * Use SHA3-512 since it has the smallest block size.
+ */
+ static const u8 zeroes[SHA3_512_BLOCK_SIZE];
+
+ cpacf_kimd(CPACF_KIMD_SHA3_512, state, zeroes, sizeof(zeroes));
+ } else {
+ sha3_keccakf_generic(state);
+ }
+}
+
+static inline bool s390_sha3(int func, const u8 *in, size_t in_len,
+ u8 *out, size_t out_len)
+{
+ struct sha3_state state;
+
+ if (!static_branch_likely(&have_sha3))
+ return false;
+
+ if (static_branch_likely(&have_sha3_init_optim))
+ func |= CPACF_KLMD_NIP | CPACF_KLMD_DUFOP;
+ else
+ memset(&state, 0, sizeof(state));
+
+ cpacf_klmd(func, &state, in, in_len);
+
+ if (static_branch_likely(&have_sha3_init_optim))
+ kmsan_unpoison_memory(&state, out_len);
+
+ memcpy(out, &state, out_len);
+ memzero_explicit(&state, sizeof(state));
+ return true;
+}
+
+#define sha3_224_arch sha3_224_arch
+static bool sha3_224_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_224_DIGEST_SIZE])
+{
+ return s390_sha3(CPACF_KLMD_SHA3_224, in, in_len,
+ out, SHA3_224_DIGEST_SIZE);
+}
+
+#define sha3_256_arch sha3_256_arch
+static bool sha3_256_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_256_DIGEST_SIZE])
+{
+ return s390_sha3(CPACF_KLMD_SHA3_256, in, in_len,
+ out, SHA3_256_DIGEST_SIZE);
+}
+
+#define sha3_384_arch sha3_384_arch
+static bool sha3_384_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_384_DIGEST_SIZE])
+{
+ return s390_sha3(CPACF_KLMD_SHA3_384, in, in_len,
+ out, SHA3_384_DIGEST_SIZE);
+}
+
+#define sha3_512_arch sha3_512_arch
+static bool sha3_512_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_512_DIGEST_SIZE])
+{
+ return s390_sha3(CPACF_KLMD_SHA3_512, in, in_len,
+ out, SHA3_512_DIGEST_SIZE);
+}
+
+#define sha3_mod_init_arch sha3_mod_init_arch
+static void sha3_mod_init_arch(void)
+{
+ int num_present = 0;
+ int num_possible = 0;
+
+ if (!cpu_have_feature(S390_CPU_FEATURE_MSA))
+ return;
+ /*
+ * Since all the SHA-3 functions are in Message-Security-Assist
+ * Extension 6, just treat them as all or nothing. This way we need
+ * only one static_key.
+ */
+#define QUERY(opcode, func) \
+ ({ num_present += !!cpacf_query_func(opcode, func); num_possible++; })
+ QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_224);
+ QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_256);
+ QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_384);
+ QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_512);
+ QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_224);
+ QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_256);
+ QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_384);
+ QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_512);
+#undef QUERY
+
+ if (num_present == num_possible) {
+ static_branch_enable(&have_sha3);
+ if (test_facility(86))
+ static_branch_enable(&have_sha3_init_optim);
+ } else if (num_present != 0) {
+ pr_warn("Unsupported combination of SHA-3 facilities\n");
+ }
+}
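The bool-returning sha3_*_arch() hooks above let the generic SHA-3 code try the CPACF one-shot path first and fall back when it is unavailable. A hedged sketch of that calling pattern (the fallback shown is only a placeholder comment, not a real kernel function):

static void sha3_256_example(const u8 *in, size_t in_len,
			     u8 out[SHA3_256_DIGEST_SIZE])
{
	if (sha3_256_arch(in, in_len, out))
		return;			/* CPACF one-shot path succeeded */
	/* ... otherwise the generic absorb/squeeze path would run here ... */
}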
diff --git a/lib/crypto/s390/sha512.h b/lib/crypto/s390/sha512.h
new file mode 100644
index 000000000000..46699d43df7e
--- /dev/null
+++ b/lib/crypto/s390/sha512.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-512 optimized using the CP Assist for Cryptographic Functions (CPACF)
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/cpacf.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha512);
+
+static void sha512_blocks(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_cpacf_sha512))
+ cpacf_kimd(CPACF_KIMD_SHA_512, state, data,
+ nblocks * SHA512_BLOCK_SIZE);
+ else
+ sha512_blocks_generic(state, data, nblocks);
+}
+
+#define sha512_mod_init_arch sha512_mod_init_arch
+static void sha512_mod_init_arch(void)
+{
+ if (cpu_have_feature(S390_CPU_FEATURE_MSA) &&
+ cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_512))
+ static_branch_enable(&have_cpacf_sha512);
+}
diff --git a/lib/crypto/sha1.c b/lib/crypto/sha1.c
index 1aebe7be9401..52788278cd17 100644
--- a/lib/crypto/sha1.c
+++ b/lib/crypto/sha1.c
@@ -1,18 +1,22 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * SHA1 routine optimized to do word accesses rather than byte accesses,
- * and to avoid unnecessary copies into the context array.
- *
- * This was based on the git SHA1 implementation.
+ * SHA-1 and HMAC-SHA1 library functions
*/
-#include <linux/kernel.h>
+#include <crypto/hmac.h>
+#include <crypto/sha1.h>
+#include <linux/bitops.h>
#include <linux/export.h>
+#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/bitops.h>
#include <linux/string.h>
-#include <crypto/sha1.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
+#include <linux/wordpart.h>
+#include "fips.h"
+
+static const struct sha1_block_state sha1_iv = {
+ .h = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
+};
/*
* If you have 32 registers or more, the compiler can (and should)
@@ -124,10 +128,10 @@ void sha1_transform(__u32 *digest, const char *data, __u32 *array)
EXPORT_SYMBOL(sha1_transform);
/**
- * sha1_init - initialize the vectors for a SHA1 digest
+ * sha1_init_raw - initialize the vectors for a SHA1 digest
* @buf: vector to initialize
*/
-void sha1_init(__u32 *buf)
+void sha1_init_raw(__u32 *buf)
{
buf[0] = 0x67452301;
buf[1] = 0xefcdab89;
@@ -135,6 +139,227 @@ void sha1_init(__u32 *buf)
buf[3] = 0x10325476;
buf[4] = 0xc3d2e1f0;
}
-EXPORT_SYMBOL(sha1_init);
+EXPORT_SYMBOL(sha1_init_raw);
+
+static void __maybe_unused sha1_blocks_generic(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ u32 workspace[SHA1_WORKSPACE_WORDS];
+
+ do {
+ sha1_transform(state->h, data, workspace);
+ data += SHA1_BLOCK_SIZE;
+ } while (--nblocks);
+
+ memzero_explicit(workspace, sizeof(workspace));
+}
+
+#ifdef CONFIG_CRYPTO_LIB_SHA1_ARCH
+#include "sha1.h" /* $(SRCARCH)/sha1.h */
+#else
+#define sha1_blocks sha1_blocks_generic
+#endif
+
+void sha1_init(struct sha1_ctx *ctx)
+{
+ ctx->state = sha1_iv;
+ ctx->bytecount = 0;
+}
+EXPORT_SYMBOL_GPL(sha1_init);
+
+void sha1_update(struct sha1_ctx *ctx, const u8 *data, size_t len)
+{
+ size_t partial = ctx->bytecount % SHA1_BLOCK_SIZE;
+
+ ctx->bytecount += len;
+
+ if (partial + len >= SHA1_BLOCK_SIZE) {
+ size_t nblocks;
+
+ if (partial) {
+ size_t l = SHA1_BLOCK_SIZE - partial;
+
+ memcpy(&ctx->buf[partial], data, l);
+ data += l;
+ len -= l;
+
+ sha1_blocks(&ctx->state, ctx->buf, 1);
+ }
+
+ nblocks = len / SHA1_BLOCK_SIZE;
+ len %= SHA1_BLOCK_SIZE;
+
+ if (nblocks) {
+ sha1_blocks(&ctx->state, data, nblocks);
+ data += nblocks * SHA1_BLOCK_SIZE;
+ }
+ partial = 0;
+ }
+ if (len)
+ memcpy(&ctx->buf[partial], data, len);
+}
+EXPORT_SYMBOL_GPL(sha1_update);
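The update path above buffers at most one partial block: it tops up any previously buffered bytes, hashes whole blocks straight from the caller's buffer, and stashes the remainder. A worked trace of the bookkeeping with illustrative numbers (not part of the patch):

#include <assert.h>
#include <stddef.h>

static void trace_update_bookkeeping(void)
{
	/* 10 bytes already buffered, then an update of 150 more bytes. */
	size_t partial = 10, len = 150;
	size_t l = 64 - partial;		/* top-up for the buffered block */

	assert(l == 54);
	assert((len - l) / 64 == 1);		/* one full block from the input */
	assert((len - l) % 64 == 32);		/* 32 bytes remain buffered */
}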
+
+static void __sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE])
+{
+ u64 bitcount = ctx->bytecount << 3;
+ size_t partial = ctx->bytecount % SHA1_BLOCK_SIZE;
+
+ ctx->buf[partial++] = 0x80;
+ if (partial > SHA1_BLOCK_SIZE - 8) {
+ memset(&ctx->buf[partial], 0, SHA1_BLOCK_SIZE - partial);
+ sha1_blocks(&ctx->state, ctx->buf, 1);
+ partial = 0;
+ }
+ memset(&ctx->buf[partial], 0, SHA1_BLOCK_SIZE - 8 - partial);
+ *(__be64 *)&ctx->buf[SHA1_BLOCK_SIZE - 8] = cpu_to_be64(bitcount);
+ sha1_blocks(&ctx->state, ctx->buf, 1);
+
+ for (size_t i = 0; i < SHA1_DIGEST_SIZE; i += 4)
+ put_unaligned_be32(ctx->state.h[i / 4], out + i);
+}
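The finalization above implements the usual SHA-1 padding: a 0x80 byte, zero fill, then the 64-bit big-endian bit count, spilling into a second block only when fewer than 8 bytes remain after the 0x80. A hedged helper (illustrative only, with SHA1_BLOCK_SIZE redefined locally for self-containment) that computes how many padding bytes that produces:

#include <stddef.h>

#define SHA1_BLOCK_SIZE 64

static size_t sha1_padding_len(size_t msg_len)
{
	size_t partial = msg_len % SHA1_BLOCK_SIZE;

	/* 1 byte of 0x80 + zeroes + 8 bytes of bit count */
	if (partial < SHA1_BLOCK_SIZE - 8)
		return SHA1_BLOCK_SIZE - partial;	/* fits in this block */
	return 2 * SHA1_BLOCK_SIZE - partial;		/* needs an extra block */
}
/* e.g. sha1_padding_len(0) == 64 and sha1_padding_len(56) == 72. */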
+
+void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE])
+{
+ __sha1_final(ctx, out);
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(sha1_final);
+
+void sha1(const u8 *data, size_t len, u8 out[SHA1_DIGEST_SIZE])
+{
+ struct sha1_ctx ctx;
+
+ sha1_init(&ctx);
+ sha1_update(&ctx, data, len);
+ sha1_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha1);
+
+static void __hmac_sha1_preparekey(struct sha1_block_state *istate,
+ struct sha1_block_state *ostate,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ union {
+ u8 b[SHA1_BLOCK_SIZE];
+ unsigned long w[SHA1_BLOCK_SIZE / sizeof(unsigned long)];
+ } derived_key = { 0 };
+
+ if (unlikely(raw_key_len > SHA1_BLOCK_SIZE))
+ sha1(raw_key, raw_key_len, derived_key.b);
+ else
+ memcpy(derived_key.b, raw_key, raw_key_len);
+
+ for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+ derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
+ *istate = sha1_iv;
+ sha1_blocks(istate, derived_key.b, 1);
+
+ for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+ derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
+ HMAC_IPAD_VALUE);
+ *ostate = sha1_iv;
+ sha1_blocks(ostate, derived_key.b, 1);
+
+ memzero_explicit(&derived_key, sizeof(derived_key));
+}
+
+void hmac_sha1_preparekey(struct hmac_sha1_key *key,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_sha1_preparekey(&key->istate, &key->ostate,
+ raw_key, raw_key_len);
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_preparekey);
+
+void hmac_sha1_init(struct hmac_sha1_ctx *ctx, const struct hmac_sha1_key *key)
+{
+ ctx->sha_ctx.state = key->istate;
+ ctx->sha_ctx.bytecount = SHA1_BLOCK_SIZE;
+ ctx->ostate = key->ostate;
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_init);
+
+void hmac_sha1_init_usingrawkey(struct hmac_sha1_ctx *ctx,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_sha1_preparekey(&ctx->sha_ctx.state, &ctx->ostate,
+ raw_key, raw_key_len);
+ ctx->sha_ctx.bytecount = SHA1_BLOCK_SIZE;
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_init_usingrawkey);
+
+void hmac_sha1_final(struct hmac_sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE])
+{
+ /* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */
+ __sha1_final(&ctx->sha_ctx, ctx->sha_ctx.buf);
+ memset(&ctx->sha_ctx.buf[SHA1_DIGEST_SIZE], 0,
+ SHA1_BLOCK_SIZE - SHA1_DIGEST_SIZE);
+ ctx->sha_ctx.buf[SHA1_DIGEST_SIZE] = 0x80;
+ *(__be32 *)&ctx->sha_ctx.buf[SHA1_BLOCK_SIZE - 4] =
+ cpu_to_be32(8 * (SHA1_BLOCK_SIZE + SHA1_DIGEST_SIZE));
+
+ /* Compute the outer hash, which gives the HMAC value. */
+ sha1_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1);
+ for (size_t i = 0; i < SHA1_DIGEST_SIZE; i += 4)
+ put_unaligned_be32(ctx->ostate.h[i / 4], out + i);
+
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_final);
+
+void hmac_sha1(const struct hmac_sha1_key *key,
+ const u8 *data, size_t data_len, u8 out[SHA1_DIGEST_SIZE])
+{
+ struct hmac_sha1_ctx ctx;
+
+ hmac_sha1_init(&ctx, key);
+ hmac_sha1_update(&ctx, data, data_len);
+ hmac_sha1_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha1);
+
+void hmac_sha1_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+ const u8 *data, size_t data_len,
+ u8 out[SHA1_DIGEST_SIZE])
+{
+ struct hmac_sha1_ctx ctx;
+
+ hmac_sha1_init_usingrawkey(&ctx, raw_key, raw_key_len);
+ hmac_sha1_update(&ctx, data, data_len);
+ hmac_sha1_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_usingrawkey);
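Taken together, the functions above expose HMAC-SHA1 both with a prepared key and with a raw key. Preparing the key once hashes the fixed ipad and opad blocks up front, which amortizes that work across many MACs. A hedged usage sketch (the caller name is illustrative; the declarations are assumed to come from <crypto/sha1.h>, which this file includes):

#include <crypto/sha1.h>

static void example_mac_many(const u8 *raw_key, size_t raw_key_len,
			     const u8 *msgs[], const size_t lens[], int n)
{
	struct hmac_sha1_key key;
	u8 mac[SHA1_DIGEST_SIZE];

	hmac_sha1_preparekey(&key, raw_key, raw_key_len);
	for (int i = 0; i < n; i++)
		hmac_sha1(&key, msgs[i], lens[i], mac);
}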
+
+#if defined(sha1_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
+static int __init sha1_mod_init(void)
+{
+#ifdef sha1_mod_init_arch
+ sha1_mod_init_arch();
+#endif
+ if (fips_enabled) {
+ /*
+ * FIPS cryptographic algorithm self-test. As per the FIPS
+ * Implementation Guidance, testing HMAC-SHA1 satisfies the test
+ * requirement for SHA-1 too.
+ */
+ u8 mac[SHA1_DIGEST_SIZE];
+
+ hmac_sha1_usingrawkey(fips_test_key, sizeof(fips_test_key),
+ fips_test_data, sizeof(fips_test_data),
+ mac);
+ if (memcmp(fips_test_hmac_sha1_value, mac, sizeof(mac)) != 0)
+ panic("sha1: FIPS self-test failed\n");
+ }
+ return 0;
+}
+subsys_initcall(sha1_mod_init);
+
+static void __exit sha1_mod_exit(void)
+{
+}
+module_exit(sha1_mod_exit);
+#endif
+MODULE_DESCRIPTION("SHA-1 and HMAC-SHA1 library functions");
MODULE_LICENSE("GPL");
diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c
index 72a4b0b1df28..5d6b77e7e141 100644
--- a/lib/crypto/sha256.c
+++ b/lib/crypto/sha256.c
@@ -1,56 +1,65 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * SHA-256, as specified in
- * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
- *
- * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>.
+ * SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions
*
* Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
* Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
* Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
* Copyright (c) 2014 Red Hat Inc.
+ * Copyright 2025 Google LLC
*/
-#include <linux/bitops.h>
+#include <crypto/hmac.h>
+#include <crypto/sha2.h>
#include <linux/export.h>
+#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>
-#include <crypto/sha2.h>
-#include <asm/unaligned.h>
-
-static const u32 SHA256_K[] = {
- 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
- 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
- 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
- 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
- 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
- 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
- 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
- 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
- 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
- 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
- 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
- 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
- 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
- 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
- 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
- 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
+#include <linux/unaligned.h>
+#include <linux/wordpart.h>
+#include "fips.h"
+
+static const struct sha256_block_state sha224_iv = {
+ .h = {
+ SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3,
+ SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7,
+ },
};
-static inline u32 Ch(u32 x, u32 y, u32 z)
-{
- return z ^ (x & (y ^ z));
-}
+static const struct sha256_ctx initial_sha256_ctx = {
+ .ctx = {
+ .state = {
+ .h = {
+ SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
+ SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
+ },
+ },
+ .bytecount = 0,
+ },
+};
-static inline u32 Maj(u32 x, u32 y, u32 z)
-{
- return (x & y) | (z & (x | y));
-}
+#define sha256_iv (initial_sha256_ctx.ctx.state)
+
+static const u32 sha256_K[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
+ 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
+ 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
+ 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
+ 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
+ 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
+};
-#define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22))
-#define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25))
-#define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3))
-#define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10))
+#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define Maj(x, y, z) (((x) & (y)) | ((z) & ((x) | (y))))
+#define e0(x) (ror32((x), 2) ^ ror32((x), 13) ^ ror32((x), 22))
+#define e1(x) (ror32((x), 6) ^ ror32((x), 11) ^ ror32((x), 25))
+#define s0(x) (ror32((x), 7) ^ ror32((x), 18) ^ ((x) >> 3))
+#define s1(x) (ror32((x), 17) ^ ror32((x), 19) ^ ((x) >> 10))
static inline void LOAD_OP(int I, u32 *W, const u8 *input)
{
@@ -59,18 +68,20 @@ static inline void LOAD_OP(int I, u32 *W, const u8 *input)
static inline void BLEND_OP(int I, u32 *W)
{
- W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16];
+ W[I] = s1(W[I - 2]) + W[I - 7] + s0(W[I - 15]) + W[I - 16];
}
-#define SHA256_ROUND(i, a, b, c, d, e, f, g, h) do { \
- u32 t1, t2; \
- t1 = h + e1(e) + Ch(e, f, g) + SHA256_K[i] + W[i]; \
- t2 = e0(a) + Maj(a, b, c); \
- d += t1; \
- h = t1 + t2; \
-} while (0)
-
-static void sha256_transform(u32 *state, const u8 *input, u32 *W)
+#define SHA256_ROUND(i, a, b, c, d, e, f, g, h) \
+ do { \
+ u32 t1, t2; \
+ t1 = h + e1(e) + Ch(e, f, g) + sha256_K[i] + W[i]; \
+ t2 = e0(a) + Maj(a, b, c); \
+ d += t1; \
+ h = t1 + t2; \
+ } while (0)
+
+static void sha256_block_generic(struct sha256_block_state *state,
+ const u8 *input, u32 W[64])
{
u32 a, b, c, d, e, f, g, h;
int i;
@@ -100,8 +111,14 @@ static void sha256_transform(u32 *state, const u8 *input, u32 *W)
}
/* load the state into our registers */
- a = state[0]; b = state[1]; c = state[2]; d = state[3];
- e = state[4]; f = state[5]; g = state[6]; h = state[7];
+ a = state->h[0];
+ b = state->h[1];
+ c = state->h[2];
+ d = state->h[3];
+ e = state->h[4];
+ f = state->h[5];
+ g = state->h[6];
+ h = state->h[7];
/* now iterate */
for (i = 0; i < 64; i += 8) {
@@ -115,95 +132,384 @@ static void sha256_transform(u32 *state, const u8 *input, u32 *W)
SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a);
}
- state[0] += a; state[1] += b; state[2] += c; state[3] += d;
- state[4] += e; state[5] += f; state[6] += g; state[7] += h;
+ state->h[0] += a;
+ state->h[1] += b;
+ state->h[2] += c;
+ state->h[3] += d;
+ state->h[4] += e;
+ state->h[5] += f;
+ state->h[6] += g;
+ state->h[7] += h;
}
-void sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len)
+static void __maybe_unused
+sha256_blocks_generic(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks)
{
- unsigned int partial, done;
- const u8 *src;
u32 W[64];
- partial = sctx->count & 0x3f;
- sctx->count += len;
- done = 0;
- src = data;
+ do {
+ sha256_block_generic(state, data, W);
+ data += SHA256_BLOCK_SIZE;
+ } while (--nblocks);
+
+ memzero_explicit(W, sizeof(W));
+}
+
+#if defined(CONFIG_CRYPTO_LIB_SHA256_ARCH) && !defined(__DISABLE_EXPORTS)
+#include "sha256.h" /* $(SRCARCH)/sha256.h */
+#else
+#define sha256_blocks sha256_blocks_generic
+#endif
+
+static void __sha256_init(struct __sha256_ctx *ctx,
+ const struct sha256_block_state *iv,
+ u64 initial_bytecount)
+{
+ ctx->state = *iv;
+ ctx->bytecount = initial_bytecount;
+}
+
+void sha224_init(struct sha224_ctx *ctx)
+{
+ __sha256_init(&ctx->ctx, &sha224_iv, 0);
+}
+EXPORT_SYMBOL_GPL(sha224_init);
+
+void sha256_init(struct sha256_ctx *ctx)
+{
+ __sha256_init(&ctx->ctx, &sha256_iv, 0);
+}
+EXPORT_SYMBOL_GPL(sha256_init);
+
+void __sha256_update(struct __sha256_ctx *ctx, const u8 *data, size_t len)
+{
+ size_t partial = ctx->bytecount % SHA256_BLOCK_SIZE;
+
+ ctx->bytecount += len;
+
+ if (partial + len >= SHA256_BLOCK_SIZE) {
+ size_t nblocks;
- if ((partial + len) > 63) {
if (partial) {
- done = -partial;
- memcpy(sctx->buf + partial, data, done + 64);
- src = sctx->buf;
+ size_t l = SHA256_BLOCK_SIZE - partial;
+
+ memcpy(&ctx->buf[partial], data, l);
+ data += l;
+ len -= l;
+
+ sha256_blocks(&ctx->state, ctx->buf, 1);
}
- do {
- sha256_transform(sctx->state, src, W);
- done += 64;
- src = data + done;
- } while (done + 63 < len);
+ nblocks = len / SHA256_BLOCK_SIZE;
+ len %= SHA256_BLOCK_SIZE;
+
+ if (nblocks) {
+ sha256_blocks(&ctx->state, data, nblocks);
+ data += nblocks * SHA256_BLOCK_SIZE;
+ }
+ partial = 0;
+ }
+ if (len)
+ memcpy(&ctx->buf[partial], data, len);
+}
+EXPORT_SYMBOL(__sha256_update);
- memzero_explicit(W, sizeof(W));
+static void __sha256_final(struct __sha256_ctx *ctx,
+ u8 *out, size_t digest_size)
+{
+ u64 bitcount = ctx->bytecount << 3;
+ size_t partial = ctx->bytecount % SHA256_BLOCK_SIZE;
+ ctx->buf[partial++] = 0x80;
+ if (partial > SHA256_BLOCK_SIZE - 8) {
+ memset(&ctx->buf[partial], 0, SHA256_BLOCK_SIZE - partial);
+ sha256_blocks(&ctx->state, ctx->buf, 1);
partial = 0;
}
- memcpy(sctx->buf + partial, src, len - done);
+ memset(&ctx->buf[partial], 0, SHA256_BLOCK_SIZE - 8 - partial);
+ *(__be64 *)&ctx->buf[SHA256_BLOCK_SIZE - 8] = cpu_to_be64(bitcount);
+ sha256_blocks(&ctx->state, ctx->buf, 1);
+
+ for (size_t i = 0; i < digest_size; i += 4)
+ put_unaligned_be32(ctx->state.h[i / 4], out + i);
}
-EXPORT_SYMBOL(sha256_update);
-void sha224_update(struct sha256_state *sctx, const u8 *data, unsigned int len)
+void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE])
{
- sha256_update(sctx, data, len);
+ __sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE);
+ memzero_explicit(ctx, sizeof(*ctx));
}
-EXPORT_SYMBOL(sha224_update);
+EXPORT_SYMBOL(sha224_final);
-static void __sha256_final(struct sha256_state *sctx, u8 *out, int digest_words)
+void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE])
{
- __be32 *dst = (__be32 *)out;
- __be64 bits;
- unsigned int index, pad_len;
- int i;
- static const u8 padding[64] = { 0x80, };
+ __sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE);
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL(sha256_final);
+
+void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE])
+{
+ struct sha224_ctx ctx;
- /* Save number of bits */
- bits = cpu_to_be64(sctx->count << 3);
+ sha224_init(&ctx);
+ sha224_update(&ctx, data, len);
+ sha224_final(&ctx, out);
+}
+EXPORT_SYMBOL(sha224);
- /* Pad out to 56 mod 64. */
- index = sctx->count & 0x3f;
- pad_len = (index < 56) ? (56 - index) : ((64+56) - index);
- sha256_update(sctx, padding, pad_len);
+void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE])
+{
+ struct sha256_ctx ctx;
- /* Append length (before padding) */
- sha256_update(sctx, (const u8 *)&bits, sizeof(bits));
+ sha256_init(&ctx);
+ sha256_update(&ctx, data, len);
+ sha256_final(&ctx, out);
+}
+EXPORT_SYMBOL(sha256);
- /* Store state in digest */
- for (i = 0; i < digest_words; i++)
- put_unaligned_be32(sctx->state[i], &dst[i]);
+/*
+ * Pre-boot environments (as indicated by __DISABLE_EXPORTS being defined) just
+ * need the generic SHA-256 code. Omit all other features from them.
+ */
+#ifndef __DISABLE_EXPORTS
- /* Zeroize sensitive information. */
- memzero_explicit(sctx, sizeof(*sctx));
+#ifndef sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, size_t len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ return false;
+}
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+ return false;
}
+#endif
-void sha256_final(struct sha256_state *sctx, u8 *out)
+/* Sequential fallback implementation of sha256_finup_2x() */
+static noinline_for_stack void sha256_finup_2x_sequential(
+ const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2,
+ size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE])
{
- __sha256_final(sctx, out, 8);
+ struct __sha256_ctx mut_ctx;
+
+ mut_ctx = *ctx;
+ __sha256_update(&mut_ctx, data1, len);
+ __sha256_final(&mut_ctx, out1, SHA256_DIGEST_SIZE);
+
+ mut_ctx = *ctx;
+ __sha256_update(&mut_ctx, data2, len);
+ __sha256_final(&mut_ctx, out2, SHA256_DIGEST_SIZE);
}
-EXPORT_SYMBOL(sha256_final);
-void sha224_final(struct sha256_state *sctx, u8 *out)
+void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1,
+ const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ if (ctx == NULL)
+ ctx = &initial_sha256_ctx;
+
+ if (likely(sha256_finup_2x_arch(&ctx->ctx, data1, data2, len, out1,
+ out2)))
+ return;
+ sha256_finup_2x_sequential(&ctx->ctx, data1, data2, len, out1, out2);
+}
+EXPORT_SYMBOL_GPL(sha256_finup_2x);
+
+bool sha256_finup_2x_is_optimized(void)
{
- __sha256_final(sctx, out, 7);
+ return sha256_finup_2x_is_optimized_arch();
}
-EXPORT_SYMBOL(sha224_final);
+EXPORT_SYMBOL_GPL(sha256_finup_2x_is_optimized);
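sha256_finup_2x() finishes two equal-length messages from a common starting context, letting the architecture code interleave the two hashes when it can, and sha256_finup_2x_is_optimized() tells callers whether batching this way actually helps. A hedged usage sketch (per the code above, passing a NULL context starts both hashes from the initial SHA-256 state; the caller name is illustrative):

#include <crypto/sha2.h>

static void example_hash_pair(const u8 *a, const u8 *b, size_t len)
{
	u8 da[SHA256_DIGEST_SIZE], db[SHA256_DIGEST_SIZE];

	sha256_finup_2x(NULL, a, b, len, da, db);
}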
+
+static void __hmac_sha256_preparekey(struct sha256_block_state *istate,
+ struct sha256_block_state *ostate,
+ const u8 *raw_key, size_t raw_key_len,
+ const struct sha256_block_state *iv)
+{
+ union {
+ u8 b[SHA256_BLOCK_SIZE];
+ unsigned long w[SHA256_BLOCK_SIZE / sizeof(unsigned long)];
+ } derived_key = { 0 };
+
+ if (unlikely(raw_key_len > SHA256_BLOCK_SIZE)) {
+ if (iv == &sha224_iv)
+ sha224(raw_key, raw_key_len, derived_key.b);
+ else
+ sha256(raw_key, raw_key_len, derived_key.b);
+ } else {
+ memcpy(derived_key.b, raw_key, raw_key_len);
+ }
-void sha256(const u8 *data, unsigned int len, u8 *out)
+ for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+ derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
+ *istate = *iv;
+ sha256_blocks(istate, derived_key.b, 1);
+
+ for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+ derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
+ HMAC_IPAD_VALUE);
+ *ostate = *iv;
+ sha256_blocks(ostate, derived_key.b, 1);
+
+ memzero_explicit(&derived_key, sizeof(derived_key));
+}
+
+void hmac_sha224_preparekey(struct hmac_sha224_key *key,
+ const u8 *raw_key, size_t raw_key_len)
{
- struct sha256_state sctx;
+ __hmac_sha256_preparekey(&key->key.istate, &key->key.ostate,
+ raw_key, raw_key_len, &sha224_iv);
+}
+EXPORT_SYMBOL_GPL(hmac_sha224_preparekey);
- sha256_init(&sctx);
- sha256_update(&sctx, data, len);
- sha256_final(&sctx, out);
+void hmac_sha256_preparekey(struct hmac_sha256_key *key,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_sha256_preparekey(&key->key.istate, &key->key.ostate,
+ raw_key, raw_key_len, &sha256_iv);
}
-EXPORT_SYMBOL(sha256);
+EXPORT_SYMBOL_GPL(hmac_sha256_preparekey);
+
+void __hmac_sha256_init(struct __hmac_sha256_ctx *ctx,
+ const struct __hmac_sha256_key *key)
+{
+ __sha256_init(&ctx->sha_ctx, &key->istate, SHA256_BLOCK_SIZE);
+ ctx->ostate = key->ostate;
+}
+EXPORT_SYMBOL_GPL(__hmac_sha256_init);
+
+void hmac_sha224_init_usingrawkey(struct hmac_sha224_ctx *ctx,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
+ raw_key, raw_key_len, &sha224_iv);
+ ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE;
+}
+EXPORT_SYMBOL_GPL(hmac_sha224_init_usingrawkey);
+
+void hmac_sha256_init_usingrawkey(struct hmac_sha256_ctx *ctx,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
+ raw_key, raw_key_len, &sha256_iv);
+ ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE;
+}
+EXPORT_SYMBOL_GPL(hmac_sha256_init_usingrawkey);
+
+static void __hmac_sha256_final(struct __hmac_sha256_ctx *ctx,
+ u8 *out, size_t digest_size)
+{
+ /* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */
+ __sha256_final(&ctx->sha_ctx, ctx->sha_ctx.buf, digest_size);
+ memset(&ctx->sha_ctx.buf[digest_size], 0,
+ SHA256_BLOCK_SIZE - digest_size);
+ ctx->sha_ctx.buf[digest_size] = 0x80;
+ *(__be32 *)&ctx->sha_ctx.buf[SHA256_BLOCK_SIZE - 4] =
+ cpu_to_be32(8 * (SHA256_BLOCK_SIZE + digest_size));
+
+ /* Compute the outer hash, which gives the HMAC value. */
+ sha256_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1);
+ for (size_t i = 0; i < digest_size; i += 4)
+ put_unaligned_be32(ctx->ostate.h[i / 4], out + i);
+
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+
+void hmac_sha224_final(struct hmac_sha224_ctx *ctx,
+ u8 out[SHA224_DIGEST_SIZE])
+{
+ __hmac_sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE);
+}
+EXPORT_SYMBOL_GPL(hmac_sha224_final);
+
+void hmac_sha256_final(struct hmac_sha256_ctx *ctx,
+ u8 out[SHA256_DIGEST_SIZE])
+{
+ __hmac_sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE);
+}
+EXPORT_SYMBOL_GPL(hmac_sha256_final);
+
+void hmac_sha224(const struct hmac_sha224_key *key,
+ const u8 *data, size_t data_len, u8 out[SHA224_DIGEST_SIZE])
+{
+ struct hmac_sha224_ctx ctx;
+
+ hmac_sha224_init(&ctx, key);
+ hmac_sha224_update(&ctx, data, data_len);
+ hmac_sha224_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha224);
+
+void hmac_sha256(const struct hmac_sha256_key *key,
+ const u8 *data, size_t data_len, u8 out[SHA256_DIGEST_SIZE])
+{
+ struct hmac_sha256_ctx ctx;
+
+ hmac_sha256_init(&ctx, key);
+ hmac_sha256_update(&ctx, data, data_len);
+ hmac_sha256_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha256);
+
+void hmac_sha224_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+ const u8 *data, size_t data_len,
+ u8 out[SHA224_DIGEST_SIZE])
+{
+ struct hmac_sha224_ctx ctx;
+
+ hmac_sha224_init_usingrawkey(&ctx, raw_key, raw_key_len);
+ hmac_sha224_update(&ctx, data, data_len);
+ hmac_sha224_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha224_usingrawkey);
+
+void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+ const u8 *data, size_t data_len,
+ u8 out[SHA256_DIGEST_SIZE])
+{
+ struct hmac_sha256_ctx ctx;
+
+ hmac_sha256_init_usingrawkey(&ctx, raw_key, raw_key_len);
+ hmac_sha256_update(&ctx, data, data_len);
+ hmac_sha256_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha256_usingrawkey);
+
+#if defined(sha256_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
+static int __init sha256_mod_init(void)
+{
+#ifdef sha256_mod_init_arch
+ sha256_mod_init_arch();
+#endif
+ if (fips_enabled) {
+ /*
+ * FIPS cryptographic algorithm self-test. As per the FIPS
+ * Implementation Guidance, testing HMAC-SHA256 satisfies the
+ * test requirement for SHA-224, SHA-256, and HMAC-SHA224 too.
+ */
+ u8 mac[SHA256_DIGEST_SIZE];
+
+ hmac_sha256_usingrawkey(fips_test_key, sizeof(fips_test_key),
+ fips_test_data, sizeof(fips_test_data),
+ mac);
+ if (memcmp(fips_test_hmac_sha256_value, mac, sizeof(mac)) != 0)
+ panic("sha256: FIPS self-test failed\n");
+ }
+ return 0;
+}
+subsys_initcall(sha256_mod_init);
+
+static void __exit sha256_mod_exit(void)
+{
+}
+module_exit(sha256_mod_exit);
+#endif
+
+#endif /* !__DISABLE_EXPORTS */
+MODULE_DESCRIPTION("SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions");
MODULE_LICENSE("GPL");
diff --git a/lib/crypto/sha3.c b/lib/crypto/sha3.c
new file mode 100644
index 000000000000..32b7074de792
--- /dev/null
+++ b/lib/crypto/sha3.c
@@ -0,0 +1,411 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-3, as specified in
+ * https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
+ *
+ * SHA-3 code by Jeff Garzik <jeff@garzik.org>
+ * Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ * David Howells <dhowells@redhat.com>
+ *
+ * See also Documentation/crypto/sha3.rst
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <crypto/sha3.h>
+#include <crypto/utils.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/unaligned.h>
+#include "fips.h"
+
+/*
+ * On some 32-bit architectures, such as h8300, GCC ends up using over 1 KB of
+ * stack if the round calculation gets inlined into the loop in
+ * sha3_keccakf_generic(). On the other hand, on 64-bit architectures with
+ * plenty of [64-bit wide] general purpose registers, not inlining it severely
+ * hurts performance. So let's use 64-bitness as a heuristic to decide whether
+ * to inline or not.
+ */
+#ifdef CONFIG_64BIT
+#define SHA3_INLINE inline
+#else
+#define SHA3_INLINE noinline
+#endif
+
+#define SHA3_KECCAK_ROUNDS 24
+
+static const u64 sha3_keccakf_rndc[SHA3_KECCAK_ROUNDS] = {
+ 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
+ 0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
+ 0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
+ 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
+ 0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
+ 0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
+ 0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
+ 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
+};
+
+/*
+ * Perform a single round of Keccak mixing.
+ */
+static SHA3_INLINE void sha3_keccakf_one_round_generic(u64 st[25], int round)
+{
+ u64 t[5], tt, bc[5];
+
+ /* Theta */
+ bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
+ bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
+ bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
+ bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
+ bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
+
+ t[0] = bc[4] ^ rol64(bc[1], 1);
+ t[1] = bc[0] ^ rol64(bc[2], 1);
+ t[2] = bc[1] ^ rol64(bc[3], 1);
+ t[3] = bc[2] ^ rol64(bc[4], 1);
+ t[4] = bc[3] ^ rol64(bc[0], 1);
+
+ st[0] ^= t[0];
+
+ /* Rho Pi */
+ tt = st[1];
+ st[ 1] = rol64(st[ 6] ^ t[1], 44);
+ st[ 6] = rol64(st[ 9] ^ t[4], 20);
+ st[ 9] = rol64(st[22] ^ t[2], 61);
+ st[22] = rol64(st[14] ^ t[4], 39);
+ st[14] = rol64(st[20] ^ t[0], 18);
+ st[20] = rol64(st[ 2] ^ t[2], 62);
+ st[ 2] = rol64(st[12] ^ t[2], 43);
+ st[12] = rol64(st[13] ^ t[3], 25);
+ st[13] = rol64(st[19] ^ t[4], 8);
+ st[19] = rol64(st[23] ^ t[3], 56);
+ st[23] = rol64(st[15] ^ t[0], 41);
+ st[15] = rol64(st[ 4] ^ t[4], 27);
+ st[ 4] = rol64(st[24] ^ t[4], 14);
+ st[24] = rol64(st[21] ^ t[1], 2);
+ st[21] = rol64(st[ 8] ^ t[3], 55);
+ st[ 8] = rol64(st[16] ^ t[1], 45);
+ st[16] = rol64(st[ 5] ^ t[0], 36);
+ st[ 5] = rol64(st[ 3] ^ t[3], 28);
+ st[ 3] = rol64(st[18] ^ t[3], 21);
+ st[18] = rol64(st[17] ^ t[2], 15);
+ st[17] = rol64(st[11] ^ t[1], 10);
+ st[11] = rol64(st[ 7] ^ t[2], 6);
+ st[ 7] = rol64(st[10] ^ t[0], 3);
+ st[10] = rol64( tt ^ t[1], 1);
+
+ /* Chi */
+ bc[ 0] = ~st[ 1] & st[ 2];
+ bc[ 1] = ~st[ 2] & st[ 3];
+ bc[ 2] = ~st[ 3] & st[ 4];
+ bc[ 3] = ~st[ 4] & st[ 0];
+ bc[ 4] = ~st[ 0] & st[ 1];
+ st[ 0] ^= bc[ 0];
+ st[ 1] ^= bc[ 1];
+ st[ 2] ^= bc[ 2];
+ st[ 3] ^= bc[ 3];
+ st[ 4] ^= bc[ 4];
+
+ bc[ 0] = ~st[ 6] & st[ 7];
+ bc[ 1] = ~st[ 7] & st[ 8];
+ bc[ 2] = ~st[ 8] & st[ 9];
+ bc[ 3] = ~st[ 9] & st[ 5];
+ bc[ 4] = ~st[ 5] & st[ 6];
+ st[ 5] ^= bc[ 0];
+ st[ 6] ^= bc[ 1];
+ st[ 7] ^= bc[ 2];
+ st[ 8] ^= bc[ 3];
+ st[ 9] ^= bc[ 4];
+
+ bc[ 0] = ~st[11] & st[12];
+ bc[ 1] = ~st[12] & st[13];
+ bc[ 2] = ~st[13] & st[14];
+ bc[ 3] = ~st[14] & st[10];
+ bc[ 4] = ~st[10] & st[11];
+ st[10] ^= bc[ 0];
+ st[11] ^= bc[ 1];
+ st[12] ^= bc[ 2];
+ st[13] ^= bc[ 3];
+ st[14] ^= bc[ 4];
+
+ bc[ 0] = ~st[16] & st[17];
+ bc[ 1] = ~st[17] & st[18];
+ bc[ 2] = ~st[18] & st[19];
+ bc[ 3] = ~st[19] & st[15];
+ bc[ 4] = ~st[15] & st[16];
+ st[15] ^= bc[ 0];
+ st[16] ^= bc[ 1];
+ st[17] ^= bc[ 2];
+ st[18] ^= bc[ 3];
+ st[19] ^= bc[ 4];
+
+ bc[ 0] = ~st[21] & st[22];
+ bc[ 1] = ~st[22] & st[23];
+ bc[ 2] = ~st[23] & st[24];
+ bc[ 3] = ~st[24] & st[20];
+ bc[ 4] = ~st[20] & st[21];
+ st[20] ^= bc[ 0];
+ st[21] ^= bc[ 1];
+ st[22] ^= bc[ 2];
+ st[23] ^= bc[ 3];
+ st[24] ^= bc[ 4];
+
+ /* Iota */
+ st[0] ^= sha3_keccakf_rndc[round];
+}
+
+/* Generic implementation of the Keccak-f[1600] permutation */
+static void sha3_keccakf_generic(struct sha3_state *state)
+{
+ /*
+ * Temporarily convert the state words from little-endian to native-
+ * endian so that they can be operated on. Note that on little-endian
+ * machines this conversion is a no-op and is optimized out.
+ */
+
+ for (int i = 0; i < ARRAY_SIZE(state->words); i++)
+ state->native_words[i] = le64_to_cpu(state->words[i]);
+
+ for (int round = 0; round < SHA3_KECCAK_ROUNDS; round++)
+ sha3_keccakf_one_round_generic(state->native_words, round);
+
+ for (int i = 0; i < ARRAY_SIZE(state->words); i++)
+ state->words[i] = cpu_to_le64(state->native_words[i]);
+}
+
+/*
+ * Generic implementation of absorbing the given nonzero number of full blocks
+ * into the sponge function Keccak[r=8*block_size, c=1600-8*block_size].
+ */
+static void __maybe_unused
+sha3_absorb_blocks_generic(struct sha3_state *state, const u8 *data,
+ size_t nblocks, size_t block_size)
+{
+ do {
+ for (size_t i = 0; i < block_size; i += 8)
+ state->words[i / 8] ^= get_unaligned((__le64 *)&data[i]);
+ sha3_keccakf_generic(state);
+ data += block_size;
+ } while (--nblocks);
+}
+
+#ifdef CONFIG_CRYPTO_LIB_SHA3_ARCH
+#include "sha3.h" /* $(SRCARCH)/sha3.h */
+#else
+#define sha3_keccakf sha3_keccakf_generic
+#define sha3_absorb_blocks sha3_absorb_blocks_generic
+#endif
+
+void __sha3_update(struct __sha3_ctx *ctx, const u8 *in, size_t in_len)
+{
+ const size_t block_size = ctx->block_size;
+ size_t absorb_offset = ctx->absorb_offset;
+
+ /* Warn if squeezing has already begun. */
+ WARN_ON_ONCE(absorb_offset >= block_size);
+
+ if (absorb_offset && absorb_offset + in_len >= block_size) {
+ crypto_xor(&ctx->state.bytes[absorb_offset], in,
+ block_size - absorb_offset);
+ in += block_size - absorb_offset;
+ in_len -= block_size - absorb_offset;
+ sha3_keccakf(&ctx->state);
+ absorb_offset = 0;
+ }
+
+ if (in_len >= block_size) {
+ size_t nblocks = in_len / block_size;
+
+ sha3_absorb_blocks(&ctx->state, in, nblocks, block_size);
+ in += nblocks * block_size;
+ in_len -= nblocks * block_size;
+ }
+
+ if (in_len) {
+ crypto_xor(&ctx->state.bytes[absorb_offset], in, in_len);
+ absorb_offset += in_len;
+ }
+ ctx->absorb_offset = absorb_offset;
+}
+EXPORT_SYMBOL_GPL(__sha3_update);
+
+void sha3_final(struct sha3_ctx *sha3_ctx, u8 *out)
+{
+ struct __sha3_ctx *ctx = &sha3_ctx->ctx;
+
+ ctx->state.bytes[ctx->absorb_offset] ^= 0x06;
+ ctx->state.bytes[ctx->block_size - 1] ^= 0x80;
+ sha3_keccakf(&ctx->state);
+ memcpy(out, ctx->state.bytes, ctx->digest_size);
+ sha3_zeroize_ctx(sha3_ctx);
+}
+EXPORT_SYMBOL_GPL(sha3_final);
+
+void shake_squeeze(struct shake_ctx *shake_ctx, u8 *out, size_t out_len)
+{
+ struct __sha3_ctx *ctx = &shake_ctx->ctx;
+ const size_t block_size = ctx->block_size;
+ size_t squeeze_offset = ctx->squeeze_offset;
+
+ if (ctx->absorb_offset < block_size) {
+ /* First squeeze: */
+
+ /* Add the domain separation suffix and padding. */
+ ctx->state.bytes[ctx->absorb_offset] ^= 0x1f;
+ ctx->state.bytes[block_size - 1] ^= 0x80;
+
+ /* Indicate that squeezing has begun. */
+ ctx->absorb_offset = block_size;
+
+ /*
+ * Indicate that no output is pending yet, i.e. sha3_keccakf()
+ * will need to be called before the first copy.
+ */
+ squeeze_offset = block_size;
+ }
+ while (out_len) {
+ if (squeeze_offset == block_size) {
+ sha3_keccakf(&ctx->state);
+ squeeze_offset = 0;
+ }
+ size_t copy = min(out_len, block_size - squeeze_offset);
+
+ memcpy(out, &ctx->state.bytes[squeeze_offset], copy);
+ out += copy;
+ out_len -= copy;
+ squeeze_offset += copy;
+ }
+ ctx->squeeze_offset = squeeze_offset;
+}
+EXPORT_SYMBOL_GPL(shake_squeeze);
+
+#ifndef sha3_224_arch
+static inline bool sha3_224_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_224_DIGEST_SIZE])
+{
+ return false;
+}
+#endif
+#ifndef sha3_256_arch
+static inline bool sha3_256_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_256_DIGEST_SIZE])
+{
+ return false;
+}
+#endif
+#ifndef sha3_384_arch
+static inline bool sha3_384_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_384_DIGEST_SIZE])
+{
+ return false;
+}
+#endif
+#ifndef sha3_512_arch
+static inline bool sha3_512_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_512_DIGEST_SIZE])
+{
+ return false;
+}
+#endif
+
+void sha3_224(const u8 *in, size_t in_len, u8 out[SHA3_224_DIGEST_SIZE])
+{
+ struct sha3_ctx ctx;
+
+ if (sha3_224_arch(in, in_len, out))
+ return;
+ sha3_224_init(&ctx);
+ sha3_update(&ctx, in, in_len);
+ sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_224);
+
+void sha3_256(const u8 *in, size_t in_len, u8 out[SHA3_256_DIGEST_SIZE])
+{
+ struct sha3_ctx ctx;
+
+ if (sha3_256_arch(in, in_len, out))
+ return;
+ sha3_256_init(&ctx);
+ sha3_update(&ctx, in, in_len);
+ sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_256);
+
+void sha3_384(const u8 *in, size_t in_len, u8 out[SHA3_384_DIGEST_SIZE])
+{
+ struct sha3_ctx ctx;
+
+ if (sha3_384_arch(in, in_len, out))
+ return;
+ sha3_384_init(&ctx);
+ sha3_update(&ctx, in, in_len);
+ sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_384);
+
+void sha3_512(const u8 *in, size_t in_len, u8 out[SHA3_512_DIGEST_SIZE])
+{
+ struct sha3_ctx ctx;
+
+ if (sha3_512_arch(in, in_len, out))
+ return;
+ sha3_512_init(&ctx);
+ sha3_update(&ctx, in, in_len);
+ sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_512);
+
+void shake128(const u8 *in, size_t in_len, u8 *out, size_t out_len)
+{
+ struct shake_ctx ctx;
+
+ shake128_init(&ctx);
+ shake_update(&ctx, in, in_len);
+ shake_squeeze(&ctx, out, out_len);
+ shake_zeroize_ctx(&ctx);
+}
+EXPORT_SYMBOL_GPL(shake128);
+
+void shake256(const u8 *in, size_t in_len, u8 *out, size_t out_len)
+{
+ struct shake_ctx ctx;
+
+ shake256_init(&ctx);
+ shake_update(&ctx, in, in_len);
+ shake_squeeze(&ctx, out, out_len);
+ shake_zeroize_ctx(&ctx);
+}
+EXPORT_SYMBOL_GPL(shake256);
+
+#if defined(sha3_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
+static int __init sha3_mod_init(void)
+{
+#ifdef sha3_mod_init_arch
+ sha3_mod_init_arch();
+#endif
+ if (fips_enabled) {
+ /*
+ * FIPS cryptographic algorithm self-test. As per the FIPS
+ * Implementation Guidance, testing any SHA-3 algorithm
+ * satisfies the test requirement for all of them.
+ */
+ u8 hash[SHA3_256_DIGEST_SIZE];
+
+ sha3_256(fips_test_data, sizeof(fips_test_data), hash);
+ if (memcmp(fips_test_sha3_256_value, hash, sizeof(hash)) != 0)
+ panic("sha3: FIPS self-test failed\n");
+ }
+ return 0;
+}
+subsys_initcall(sha3_mod_init);
+
+static void __exit sha3_mod_exit(void)
+{
+}
+module_exit(sha3_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("SHA-3 library functions");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/sha512.c b/lib/crypto/sha512.c
new file mode 100644
index 000000000000..605eab51aabd
--- /dev/null
+++ b/lib/crypto/sha512.c
@@ -0,0 +1,440 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2003 Kyle McMartin <kyle@debian.org>
+ * Copyright 2025 Google LLC
+ */
+
+#include <crypto/hmac.h>
+#include <crypto/sha2.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/overflow.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+#include <linux/wordpart.h>
+#include "fips.h"
+
+static const struct sha512_block_state sha384_iv = {
+ .h = {
+ SHA384_H0, SHA384_H1, SHA384_H2, SHA384_H3,
+ SHA384_H4, SHA384_H5, SHA384_H6, SHA384_H7,
+ },
+};
+
+static const struct sha512_block_state sha512_iv = {
+ .h = {
+ SHA512_H0, SHA512_H1, SHA512_H2, SHA512_H3,
+ SHA512_H4, SHA512_H5, SHA512_H6, SHA512_H7,
+ },
+};
+
+static const u64 sha512_K[80] = {
+ 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL,
+ 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+ 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL,
+ 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+ 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL,
+ 0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+ 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL,
+ 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+ 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL,
+ 0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+ 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL,
+ 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+ 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL,
+ 0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+ 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL,
+ 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+ 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL,
+ 0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+ 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL,
+ 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+ 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL,
+ 0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+ 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL,
+ 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+ 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL,
+ 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+ 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
+};
+
+#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define Maj(x, y, z) (((x) & (y)) | ((z) & ((x) | (y))))
+#define e0(x) (ror64((x), 28) ^ ror64((x), 34) ^ ror64((x), 39))
+#define e1(x) (ror64((x), 14) ^ ror64((x), 18) ^ ror64((x), 41))
+#define s0(x) (ror64((x), 1) ^ ror64((x), 8) ^ ((x) >> 7))
+#define s1(x) (ror64((x), 19) ^ ror64((x), 61) ^ ((x) >> 6))
+
+static void sha512_block_generic(struct sha512_block_state *state,
+ const u8 *data)
+{
+ u64 a = state->h[0];
+ u64 b = state->h[1];
+ u64 c = state->h[2];
+ u64 d = state->h[3];
+ u64 e = state->h[4];
+ u64 f = state->h[5];
+ u64 g = state->h[6];
+ u64 h = state->h[7];
+ u64 t1, t2;
+ u64 W[16];
+
+ for (int j = 0; j < 16; j++)
+ W[j] = get_unaligned_be64(data + j * sizeof(u64));
+
+ for (int i = 0; i < 80; i += 8) {
+ if ((i & 15) == 0 && i != 0) {
+ for (int j = 0; j < 16; j++) {
+ W[j & 15] += s1(W[(j - 2) & 15]) +
+ W[(j - 7) & 15] +
+ s0(W[(j - 15) & 15]);
+ }
+ }
+ t1 = h + e1(e) + Ch(e, f, g) + sha512_K[i] + W[(i & 15)];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2;
+ t1 = g + e1(d) + Ch(d, e, f) + sha512_K[i+1] + W[(i & 15) + 1];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2;
+ t1 = f + e1(c) + Ch(c, d, e) + sha512_K[i+2] + W[(i & 15) + 2];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2;
+ t1 = e + e1(b) + Ch(b, c, d) + sha512_K[i+3] + W[(i & 15) + 3];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2;
+ t1 = d + e1(a) + Ch(a, b, c) + sha512_K[i+4] + W[(i & 15) + 4];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2;
+ t1 = c + e1(h) + Ch(h, a, b) + sha512_K[i+5] + W[(i & 15) + 5];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2;
+ t1 = b + e1(g) + Ch(g, h, a) + sha512_K[i+6] + W[(i & 15) + 6];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2;
+ t1 = a + e1(f) + Ch(f, g, h) + sha512_K[i+7] + W[(i & 15) + 7];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2;
+ }
+
+ state->h[0] += a;
+ state->h[1] += b;
+ state->h[2] += c;
+ state->h[3] += d;
+ state->h[4] += e;
+ state->h[5] += f;
+ state->h[6] += g;
+ state->h[7] += h;
+}
+
+static void __maybe_unused
+sha512_blocks_generic(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ do {
+ sha512_block_generic(state, data);
+ data += SHA512_BLOCK_SIZE;
+ } while (--nblocks);
+}
+
+#ifdef CONFIG_CRYPTO_LIB_SHA512_ARCH
+#include "sha512.h" /* $(SRCARCH)/sha512.h */
+#else
+#define sha512_blocks sha512_blocks_generic
+#endif
+
+static void __sha512_init(struct __sha512_ctx *ctx,
+ const struct sha512_block_state *iv,
+ u64 initial_bytecount)
+{
+ ctx->state = *iv;
+ ctx->bytecount_lo = initial_bytecount;
+ ctx->bytecount_hi = 0;
+}
+
+void sha384_init(struct sha384_ctx *ctx)
+{
+ __sha512_init(&ctx->ctx, &sha384_iv, 0);
+}
+EXPORT_SYMBOL_GPL(sha384_init);
+
+void sha512_init(struct sha512_ctx *ctx)
+{
+ __sha512_init(&ctx->ctx, &sha512_iv, 0);
+}
+EXPORT_SYMBOL_GPL(sha512_init);
+
+void __sha512_update(struct __sha512_ctx *ctx, const u8 *data, size_t len)
+{
+ size_t partial = ctx->bytecount_lo % SHA512_BLOCK_SIZE;
+
+ if (check_add_overflow(ctx->bytecount_lo, len, &ctx->bytecount_lo))
+ ctx->bytecount_hi++;
+
+ if (partial + len >= SHA512_BLOCK_SIZE) {
+ size_t nblocks;
+
+ if (partial) {
+ size_t l = SHA512_BLOCK_SIZE - partial;
+
+ memcpy(&ctx->buf[partial], data, l);
+ data += l;
+ len -= l;
+
+ sha512_blocks(&ctx->state, ctx->buf, 1);
+ }
+
+ nblocks = len / SHA512_BLOCK_SIZE;
+ len %= SHA512_BLOCK_SIZE;
+
+ if (nblocks) {
+ sha512_blocks(&ctx->state, data, nblocks);
+ data += nblocks * SHA512_BLOCK_SIZE;
+ }
+ partial = 0;
+ }
+ if (len)
+ memcpy(&ctx->buf[partial], data, len);
+}
+EXPORT_SYMBOL_GPL(__sha512_update);
+
+static void __sha512_final(struct __sha512_ctx *ctx,
+ u8 *out, size_t digest_size)
+{
+ u64 bitcount_hi = (ctx->bytecount_hi << 3) | (ctx->bytecount_lo >> 61);
+ u64 bitcount_lo = ctx->bytecount_lo << 3;
+ size_t partial = ctx->bytecount_lo % SHA512_BLOCK_SIZE;
+
+ ctx->buf[partial++] = 0x80;
+ if (partial > SHA512_BLOCK_SIZE - 16) {
+ memset(&ctx->buf[partial], 0, SHA512_BLOCK_SIZE - partial);
+ sha512_blocks(&ctx->state, ctx->buf, 1);
+ partial = 0;
+ }
+ memset(&ctx->buf[partial], 0, SHA512_BLOCK_SIZE - 16 - partial);
+ *(__be64 *)&ctx->buf[SHA512_BLOCK_SIZE - 16] = cpu_to_be64(bitcount_hi);
+ *(__be64 *)&ctx->buf[SHA512_BLOCK_SIZE - 8] = cpu_to_be64(bitcount_lo);
+ sha512_blocks(&ctx->state, ctx->buf, 1);
+
+ for (size_t i = 0; i < digest_size; i += 8)
+ put_unaligned_be64(ctx->state.h[i / 8], out + i);
+}
+
+void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE])
+{
+ __sha512_final(&ctx->ctx, out, SHA384_DIGEST_SIZE);
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(sha384_final);
+
+void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE])
+{
+ __sha512_final(&ctx->ctx, out, SHA512_DIGEST_SIZE);
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(sha512_final);
+
+void sha384(const u8 *data, size_t len, u8 out[SHA384_DIGEST_SIZE])
+{
+ struct sha384_ctx ctx;
+
+ sha384_init(&ctx);
+ sha384_update(&ctx, data, len);
+ sha384_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha384);
+
+void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE])
+{
+ struct sha512_ctx ctx;
+
+ sha512_init(&ctx);
+ sha512_update(&ctx, data, len);
+ sha512_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha512);
+
+static void __hmac_sha512_preparekey(struct sha512_block_state *istate,
+ struct sha512_block_state *ostate,
+ const u8 *raw_key, size_t raw_key_len,
+ const struct sha512_block_state *iv)
+{
+ union {
+ u8 b[SHA512_BLOCK_SIZE];
+ unsigned long w[SHA512_BLOCK_SIZE / sizeof(unsigned long)];
+ } derived_key = { 0 };
+
+ if (unlikely(raw_key_len > SHA512_BLOCK_SIZE)) {
+ if (iv == &sha384_iv)
+ sha384(raw_key, raw_key_len, derived_key.b);
+ else
+ sha512(raw_key, raw_key_len, derived_key.b);
+ } else {
+ memcpy(derived_key.b, raw_key, raw_key_len);
+ }
+
+ for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+ derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
+ *istate = *iv;
+ sha512_blocks(istate, derived_key.b, 1);
+
+ for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+ derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
+ HMAC_IPAD_VALUE);
+ *ostate = *iv;
+ sha512_blocks(ostate, derived_key.b, 1);
+
+ memzero_explicit(&derived_key, sizeof(derived_key));
+}
+
+void hmac_sha384_preparekey(struct hmac_sha384_key *key,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_sha512_preparekey(&key->key.istate, &key->key.ostate,
+ raw_key, raw_key_len, &sha384_iv);
+}
+EXPORT_SYMBOL_GPL(hmac_sha384_preparekey);
+
+void hmac_sha512_preparekey(struct hmac_sha512_key *key,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_sha512_preparekey(&key->key.istate, &key->key.ostate,
+ raw_key, raw_key_len, &sha512_iv);
+}
+EXPORT_SYMBOL_GPL(hmac_sha512_preparekey);
+
+void __hmac_sha512_init(struct __hmac_sha512_ctx *ctx,
+ const struct __hmac_sha512_key *key)
+{
+ __sha512_init(&ctx->sha_ctx, &key->istate, SHA512_BLOCK_SIZE);
+ ctx->ostate = key->ostate;
+}
+EXPORT_SYMBOL_GPL(__hmac_sha512_init);
+
+void hmac_sha384_init_usingrawkey(struct hmac_sha384_ctx *ctx,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_sha512_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
+ raw_key, raw_key_len, &sha384_iv);
+ ctx->ctx.sha_ctx.bytecount_lo = SHA512_BLOCK_SIZE;
+ ctx->ctx.sha_ctx.bytecount_hi = 0;
+}
+EXPORT_SYMBOL_GPL(hmac_sha384_init_usingrawkey);
+
+void hmac_sha512_init_usingrawkey(struct hmac_sha512_ctx *ctx,
+ const u8 *raw_key, size_t raw_key_len)
+{
+ __hmac_sha512_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
+ raw_key, raw_key_len, &sha512_iv);
+ ctx->ctx.sha_ctx.bytecount_lo = SHA512_BLOCK_SIZE;
+ ctx->ctx.sha_ctx.bytecount_hi = 0;
+}
+EXPORT_SYMBOL_GPL(hmac_sha512_init_usingrawkey);
+
+static void __hmac_sha512_final(struct __hmac_sha512_ctx *ctx,
+ u8 *out, size_t digest_size)
+{
+ /* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */
+ __sha512_final(&ctx->sha_ctx, ctx->sha_ctx.buf, digest_size);
+ memset(&ctx->sha_ctx.buf[digest_size], 0,
+ SHA512_BLOCK_SIZE - digest_size);
+ ctx->sha_ctx.buf[digest_size] = 0x80;
+ *(__be32 *)&ctx->sha_ctx.buf[SHA512_BLOCK_SIZE - 4] =
+ cpu_to_be32(8 * (SHA512_BLOCK_SIZE + digest_size));
+
+ /* Compute the outer hash, which gives the HMAC value. */
+ sha512_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1);
+ for (size_t i = 0; i < digest_size; i += 8)
+ put_unaligned_be64(ctx->ostate.h[i / 8], out + i);
+
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+
+void hmac_sha384_final(struct hmac_sha384_ctx *ctx,
+ u8 out[SHA384_DIGEST_SIZE])
+{
+ __hmac_sha512_final(&ctx->ctx, out, SHA384_DIGEST_SIZE);
+}
+EXPORT_SYMBOL_GPL(hmac_sha384_final);
+
+void hmac_sha512_final(struct hmac_sha512_ctx *ctx,
+ u8 out[SHA512_DIGEST_SIZE])
+{
+ __hmac_sha512_final(&ctx->ctx, out, SHA512_DIGEST_SIZE);
+}
+EXPORT_SYMBOL_GPL(hmac_sha512_final);
+
+void hmac_sha384(const struct hmac_sha384_key *key,
+ const u8 *data, size_t data_len, u8 out[SHA384_DIGEST_SIZE])
+{
+ struct hmac_sha384_ctx ctx;
+
+ hmac_sha384_init(&ctx, key);
+ hmac_sha384_update(&ctx, data, data_len);
+ hmac_sha384_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha384);
+
+void hmac_sha512(const struct hmac_sha512_key *key,
+ const u8 *data, size_t data_len, u8 out[SHA512_DIGEST_SIZE])
+{
+ struct hmac_sha512_ctx ctx;
+
+ hmac_sha512_init(&ctx, key);
+ hmac_sha512_update(&ctx, data, data_len);
+ hmac_sha512_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha512);
+
+void hmac_sha384_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+ const u8 *data, size_t data_len,
+ u8 out[SHA384_DIGEST_SIZE])
+{
+ struct hmac_sha384_ctx ctx;
+
+ hmac_sha384_init_usingrawkey(&ctx, raw_key, raw_key_len);
+ hmac_sha384_update(&ctx, data, data_len);
+ hmac_sha384_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha384_usingrawkey);
+
+void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+ const u8 *data, size_t data_len,
+ u8 out[SHA512_DIGEST_SIZE])
+{
+ struct hmac_sha512_ctx ctx;
+
+ hmac_sha512_init_usingrawkey(&ctx, raw_key, raw_key_len);
+ hmac_sha512_update(&ctx, data, data_len);
+ hmac_sha512_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha512_usingrawkey);
+
+#if defined(sha512_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
+static int __init sha512_mod_init(void)
+{
+#ifdef sha512_mod_init_arch
+ sha512_mod_init_arch();
+#endif
+ if (fips_enabled) {
+ /*
+ * FIPS cryptographic algorithm self-test. As per the FIPS
+ * Implementation Guidance, testing HMAC-SHA512 satisfies the
+ * test requirement for SHA-384, SHA-512, and HMAC-SHA384 too.
+ */
+ u8 mac[SHA512_DIGEST_SIZE];
+
+ hmac_sha512_usingrawkey(fips_test_key, sizeof(fips_test_key),
+ fips_test_data, sizeof(fips_test_data),
+ mac);
+ if (memcmp(fips_test_hmac_sha512_value, mac, sizeof(mac)) != 0)
+ panic("sha512: FIPS self-test failed\n");
+ }
+ return 0;
+}
+subsys_initcall(sha512_mod_init);
+
+static void __exit sha512_mod_exit(void)
+{
+}
+module_exit(sha512_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/simd.c b/lib/crypto/simd.c
new file mode 100644
index 000000000000..9c36cb3bb49c
--- /dev/null
+++ b/lib/crypto/simd.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SIMD testing utility functions
+ *
+ * Copyright 2024 Google LLC
+ */
+
+#include <crypto/internal/simd.h>
+
+DEFINE_PER_CPU(bool, crypto_simd_disabled_for_test);
+EXPORT_PER_CPU_SYMBOL_GPL(crypto_simd_disabled_for_test);
diff --git a/lib/crypto/sm3.c b/lib/crypto/sm3.c
new file mode 100644
index 000000000000..c6b9ad8a3ac6
--- /dev/null
+++ b/lib/crypto/sm3.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * SM3 secure hash, as specified by OSCCA GM/T 0004-2012 SM3 and described
+ * at https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02
+ *
+ * Copyright (C) 2017 ARM Limited or its affiliates.
+ * Copyright (C) 2017 Gilad Ben-Yossef <gilad@benyossef.com>
+ * Copyright (C) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <crypto/sm3.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+static const u32 ____cacheline_aligned K[64] = {
+ 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb,
+ 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc,
+ 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce,
+ 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6,
+ 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c,
+ 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce,
+ 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec,
+ 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5,
+ 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53,
+ 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d,
+ 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4,
+ 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43,
+ 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c,
+ 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce,
+ 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec,
+ 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
+};
+
+/*
+ * Transform the message X, which consists of sixteen 32-bit words. See
+ * GM/T 0004-2012 for details.
+ */
+#define R(i, a, b, c, d, e, f, g, h, t, w1, w2) \
+ do { \
+ ss1 = rol32((rol32((a), 12) + (e) + (t)), 7); \
+ ss2 = ss1 ^ rol32((a), 12); \
+ d += FF ## i(a, b, c) + ss2 + ((w1) ^ (w2)); \
+ h += GG ## i(e, f, g) + ss1 + (w1); \
+ b = rol32((b), 9); \
+ f = rol32((f), 19); \
+ h = P0((h)); \
+ } while (0)
+
+#define R1(a, b, c, d, e, f, g, h, t, w1, w2) \
+ R(1, a, b, c, d, e, f, g, h, t, w1, w2)
+#define R2(a, b, c, d, e, f, g, h, t, w1, w2) \
+ R(2, a, b, c, d, e, f, g, h, t, w1, w2)
+
+#define FF1(x, y, z) (x ^ y ^ z)
+#define FF2(x, y, z) ((x & y) | (x & z) | (y & z))
+
+#define GG1(x, y, z) FF1(x, y, z)
+#define GG2(x, y, z) ((x & y) | (~x & z))
+
+/* Message expansion */
+#define P0(x) ((x) ^ rol32((x), 9) ^ rol32((x), 17))
+#define P1(x) ((x) ^ rol32((x), 15) ^ rol32((x), 23))
+#define I(i) (W[i] = get_unaligned_be32(data + i * 4))
+#define W1(i) (W[i & 0x0f])
+#define W2(i) (W[i & 0x0f] = \
+ P1(W[i & 0x0f] \
+ ^ W[(i-9) & 0x0f] \
+ ^ rol32(W[(i-3) & 0x0f], 15)) \
+ ^ rol32(W[(i-13) & 0x0f], 7) \
+ ^ W[(i-6) & 0x0f])
+
+static void sm3_transform(struct sm3_state *sctx, u8 const *data, u32 W[16])
+{
+ u32 a, b, c, d, e, f, g, h, ss1, ss2;
+
+ a = sctx->state[0];
+ b = sctx->state[1];
+ c = sctx->state[2];
+ d = sctx->state[3];
+ e = sctx->state[4];
+ f = sctx->state[5];
+ g = sctx->state[6];
+ h = sctx->state[7];
+
+ R1(a, b, c, d, e, f, g, h, K[0], I(0), I(4));
+ R1(d, a, b, c, h, e, f, g, K[1], I(1), I(5));
+ R1(c, d, a, b, g, h, e, f, K[2], I(2), I(6));
+ R1(b, c, d, a, f, g, h, e, K[3], I(3), I(7));
+ R1(a, b, c, d, e, f, g, h, K[4], W1(4), I(8));
+ R1(d, a, b, c, h, e, f, g, K[5], W1(5), I(9));
+ R1(c, d, a, b, g, h, e, f, K[6], W1(6), I(10));
+ R1(b, c, d, a, f, g, h, e, K[7], W1(7), I(11));
+ R1(a, b, c, d, e, f, g, h, K[8], W1(8), I(12));
+ R1(d, a, b, c, h, e, f, g, K[9], W1(9), I(13));
+ R1(c, d, a, b, g, h, e, f, K[10], W1(10), I(14));
+ R1(b, c, d, a, f, g, h, e, K[11], W1(11), I(15));
+ R1(a, b, c, d, e, f, g, h, K[12], W1(12), W2(16));
+ R1(d, a, b, c, h, e, f, g, K[13], W1(13), W2(17));
+ R1(c, d, a, b, g, h, e, f, K[14], W1(14), W2(18));
+ R1(b, c, d, a, f, g, h, e, K[15], W1(15), W2(19));
+
+ R2(a, b, c, d, e, f, g, h, K[16], W1(16), W2(20));
+ R2(d, a, b, c, h, e, f, g, K[17], W1(17), W2(21));
+ R2(c, d, a, b, g, h, e, f, K[18], W1(18), W2(22));
+ R2(b, c, d, a, f, g, h, e, K[19], W1(19), W2(23));
+ R2(a, b, c, d, e, f, g, h, K[20], W1(20), W2(24));
+ R2(d, a, b, c, h, e, f, g, K[21], W1(21), W2(25));
+ R2(c, d, a, b, g, h, e, f, K[22], W1(22), W2(26));
+ R2(b, c, d, a, f, g, h, e, K[23], W1(23), W2(27));
+ R2(a, b, c, d, e, f, g, h, K[24], W1(24), W2(28));
+ R2(d, a, b, c, h, e, f, g, K[25], W1(25), W2(29));
+ R2(c, d, a, b, g, h, e, f, K[26], W1(26), W2(30));
+ R2(b, c, d, a, f, g, h, e, K[27], W1(27), W2(31));
+ R2(a, b, c, d, e, f, g, h, K[28], W1(28), W2(32));
+ R2(d, a, b, c, h, e, f, g, K[29], W1(29), W2(33));
+ R2(c, d, a, b, g, h, e, f, K[30], W1(30), W2(34));
+ R2(b, c, d, a, f, g, h, e, K[31], W1(31), W2(35));
+
+ R2(a, b, c, d, e, f, g, h, K[32], W1(32), W2(36));
+ R2(d, a, b, c, h, e, f, g, K[33], W1(33), W2(37));
+ R2(c, d, a, b, g, h, e, f, K[34], W1(34), W2(38));
+ R2(b, c, d, a, f, g, h, e, K[35], W1(35), W2(39));
+ R2(a, b, c, d, e, f, g, h, K[36], W1(36), W2(40));
+ R2(d, a, b, c, h, e, f, g, K[37], W1(37), W2(41));
+ R2(c, d, a, b, g, h, e, f, K[38], W1(38), W2(42));
+ R2(b, c, d, a, f, g, h, e, K[39], W1(39), W2(43));
+ R2(a, b, c, d, e, f, g, h, K[40], W1(40), W2(44));
+ R2(d, a, b, c, h, e, f, g, K[41], W1(41), W2(45));
+ R2(c, d, a, b, g, h, e, f, K[42], W1(42), W2(46));
+ R2(b, c, d, a, f, g, h, e, K[43], W1(43), W2(47));
+ R2(a, b, c, d, e, f, g, h, K[44], W1(44), W2(48));
+ R2(d, a, b, c, h, e, f, g, K[45], W1(45), W2(49));
+ R2(c, d, a, b, g, h, e, f, K[46], W1(46), W2(50));
+ R2(b, c, d, a, f, g, h, e, K[47], W1(47), W2(51));
+
+ R2(a, b, c, d, e, f, g, h, K[48], W1(48), W2(52));
+ R2(d, a, b, c, h, e, f, g, K[49], W1(49), W2(53));
+ R2(c, d, a, b, g, h, e, f, K[50], W1(50), W2(54));
+ R2(b, c, d, a, f, g, h, e, K[51], W1(51), W2(55));
+ R2(a, b, c, d, e, f, g, h, K[52], W1(52), W2(56));
+ R2(d, a, b, c, h, e, f, g, K[53], W1(53), W2(57));
+ R2(c, d, a, b, g, h, e, f, K[54], W1(54), W2(58));
+ R2(b, c, d, a, f, g, h, e, K[55], W1(55), W2(59));
+ R2(a, b, c, d, e, f, g, h, K[56], W1(56), W2(60));
+ R2(d, a, b, c, h, e, f, g, K[57], W1(57), W2(61));
+ R2(c, d, a, b, g, h, e, f, K[58], W1(58), W2(62));
+ R2(b, c, d, a, f, g, h, e, K[59], W1(59), W2(63));
+ R2(a, b, c, d, e, f, g, h, K[60], W1(60), W2(64));
+ R2(d, a, b, c, h, e, f, g, K[61], W1(61), W2(65));
+ R2(c, d, a, b, g, h, e, f, K[62], W1(62), W2(66));
+ R2(b, c, d, a, f, g, h, e, K[63], W1(63), W2(67));
+
+ sctx->state[0] ^= a;
+ sctx->state[1] ^= b;
+ sctx->state[2] ^= c;
+ sctx->state[3] ^= d;
+ sctx->state[4] ^= e;
+ sctx->state[5] ^= f;
+ sctx->state[6] ^= g;
+ sctx->state[7] ^= h;
+}
+#undef R
+#undef R1
+#undef R2
+#undef I
+#undef W1
+#undef W2
+
+void sm3_block_generic(struct sm3_state *sctx, u8 const *data, int blocks)
+{
+ u32 W[16];
+
+ do {
+ sm3_transform(sctx, data, W);
+ data += SM3_BLOCK_SIZE;
+ } while (--blocks);
+
+ memzero_explicit(W, sizeof(W));
+}
+EXPORT_SYMBOL_GPL(sm3_block_generic);
+
+MODULE_DESCRIPTION("Generic SM3 library");
+MODULE_LICENSE("GPL v2");
diff --git a/lib/crypto/sparc/md5.h b/lib/crypto/sparc/md5.h
new file mode 100644
index 000000000000..3995f3e075eb
--- /dev/null
+++ b/lib/crypto/sparc/md5.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * MD5 accelerated using the sparc64 crypto opcodes
+ *
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ * Copyright (c) Mathias Krause <minipli@googlemail.com>
+ * Copyright (c) Cryptoapi developers.
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ */
+
+#include <asm/elf.h>
+#include <asm/opcodes.h>
+#include <asm/pstate.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_md5_opcodes);
+
+asmlinkage void md5_sparc64_transform(struct md5_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void md5_blocks(struct md5_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_md5_opcodes)) {
+ cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
+ md5_sparc64_transform(state, data, nblocks);
+ le32_to_cpu_array(state->h, ARRAY_SIZE(state->h));
+ } else {
+ md5_blocks_generic(state, data, nblocks);
+ }
+}
+
+#define md5_mod_init_arch md5_mod_init_arch
+static void md5_mod_init_arch(void)
+{
+ unsigned long cfr;
+
+ if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+ return;
+
+ __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+ if (!(cfr & CFR_MD5))
+ return;
+
+ static_branch_enable(&have_md5_opcodes);
+ pr_info("Using sparc64 md5 opcode optimized MD5 implementation\n");
+}
diff --git a/lib/crypto/sparc/md5_asm.S b/lib/crypto/sparc/md5_asm.S
new file mode 100644
index 000000000000..60b544e4d205
--- /dev/null
+++ b/lib/crypto/sparc/md5_asm.S
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/opcodes.h>
+#include <asm/visasm.h>
+
+ENTRY(md5_sparc64_transform)
+ /* %o0 = state, %o1 = data, %o2 = nblocks */
+ VISEntryHalf
+ ld [%o0 + 0x00], %f0
+ ld [%o0 + 0x04], %f1
+ andcc %o1, 0x7, %g0
+ ld [%o0 + 0x08], %f2
+ bne,pn %xcc, 10f
+ ld [%o0 + 0x0c], %f3
+
+1:
+ ldd [%o1 + 0x00], %f8
+ ldd [%o1 + 0x08], %f10
+ ldd [%o1 + 0x10], %f12
+ ldd [%o1 + 0x18], %f14
+ ldd [%o1 + 0x20], %f16
+ ldd [%o1 + 0x28], %f18
+ ldd [%o1 + 0x30], %f20
+ ldd [%o1 + 0x38], %f22
+
+ MD5
+
+ subcc %o2, 1, %o2
+ bne,pt %xcc, 1b
+ add %o1, 0x40, %o1
+
+5:
+ st %f0, [%o0 + 0x00]
+ st %f1, [%o0 + 0x04]
+ st %f2, [%o0 + 0x08]
+ st %f3, [%o0 + 0x0c]
+ retl
+ VISExitHalf
+10:
+ alignaddr %o1, %g0, %o1
+
+ ldd [%o1 + 0x00], %f10
+1:
+ ldd [%o1 + 0x08], %f12
+ ldd [%o1 + 0x10], %f14
+ ldd [%o1 + 0x18], %f16
+ ldd [%o1 + 0x20], %f18
+ ldd [%o1 + 0x28], %f20
+ ldd [%o1 + 0x30], %f22
+ ldd [%o1 + 0x38], %f24
+ ldd [%o1 + 0x40], %f26
+
+ faligndata %f10, %f12, %f8
+ faligndata %f12, %f14, %f10
+ faligndata %f14, %f16, %f12
+ faligndata %f16, %f18, %f14
+ faligndata %f18, %f20, %f16
+ faligndata %f20, %f22, %f18
+ faligndata %f22, %f24, %f20
+ faligndata %f24, %f26, %f22
+
+ MD5
+
+ subcc %o2, 1, %o2
+ fsrc2 %f26, %f10
+ bne,pt %xcc, 1b
+ add %o1, 0x40, %o1
+
+ ba,a,pt %xcc, 5b
+ENDPROC(md5_sparc64_transform)
diff --git a/lib/crypto/sparc/sha1.h b/lib/crypto/sparc/sha1.h
new file mode 100644
index 000000000000..bdf771fcc1f7
--- /dev/null
+++ b/lib/crypto/sparc/sha1.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SHA-1 accelerated using the sparc64 crypto opcodes
+ *
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ * Copyright (c) Mathias Krause <minipli@googlemail.com>
+ */
+
+#include <asm/elf.h>
+#include <asm/opcodes.h>
+#include <asm/pstate.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha1_opcodes);
+
+asmlinkage void sha1_sparc64_transform(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha1_blocks(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_sha1_opcodes))
+ sha1_sparc64_transform(state, data, nblocks);
+ else
+ sha1_blocks_generic(state, data, nblocks);
+}
+
+#define sha1_mod_init_arch sha1_mod_init_arch
+static void sha1_mod_init_arch(void)
+{
+ unsigned long cfr;
+
+ if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+ return;
+
+ __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+ if (!(cfr & CFR_SHA1))
+ return;
+
+ static_branch_enable(&have_sha1_opcodes);
+ pr_info("Using sparc64 sha1 opcode optimized SHA-1 implementation\n");
+}
diff --git a/lib/crypto/sparc/sha1_asm.S b/lib/crypto/sparc/sha1_asm.S
new file mode 100644
index 000000000000..00b46bac1b08
--- /dev/null
+++ b/lib/crypto/sparc/sha1_asm.S
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/opcodes.h>
+#include <asm/visasm.h>
+
+ENTRY(sha1_sparc64_transform)
+ /* %o0 = state, %o1 = data, %o2 = nblocks */
+ VISEntryHalf
+ ld [%o0 + 0x00], %f0
+ ld [%o0 + 0x04], %f1
+ ld [%o0 + 0x08], %f2
+ andcc %o1, 0x7, %g0
+ ld [%o0 + 0x0c], %f3
+ bne,pn %xcc, 10f
+ ld [%o0 + 0x10], %f4
+
+1:
+ ldd [%o1 + 0x00], %f8
+ ldd [%o1 + 0x08], %f10
+ ldd [%o1 + 0x10], %f12
+ ldd [%o1 + 0x18], %f14
+ ldd [%o1 + 0x20], %f16
+ ldd [%o1 + 0x28], %f18
+ ldd [%o1 + 0x30], %f20
+ ldd [%o1 + 0x38], %f22
+
+ SHA1
+
+ subcc %o2, 1, %o2
+ bne,pt %xcc, 1b
+ add %o1, 0x40, %o1
+
+5:
+ st %f0, [%o0 + 0x00]
+ st %f1, [%o0 + 0x04]
+ st %f2, [%o0 + 0x08]
+ st %f3, [%o0 + 0x0c]
+ st %f4, [%o0 + 0x10]
+ retl
+ VISExitHalf
+10:
+ alignaddr %o1, %g0, %o1
+
+ ldd [%o1 + 0x00], %f10
+1:
+ ldd [%o1 + 0x08], %f12
+ ldd [%o1 + 0x10], %f14
+ ldd [%o1 + 0x18], %f16
+ ldd [%o1 + 0x20], %f18
+ ldd [%o1 + 0x28], %f20
+ ldd [%o1 + 0x30], %f22
+ ldd [%o1 + 0x38], %f24
+ ldd [%o1 + 0x40], %f26
+
+ faligndata %f10, %f12, %f8
+ faligndata %f12, %f14, %f10
+ faligndata %f14, %f16, %f12
+ faligndata %f16, %f18, %f14
+ faligndata %f18, %f20, %f16
+ faligndata %f20, %f22, %f18
+ faligndata %f22, %f24, %f20
+ faligndata %f24, %f26, %f22
+
+ SHA1
+
+ subcc %o2, 1, %o2
+ fsrc2 %f26, %f10
+ bne,pt %xcc, 1b
+ add %o1, 0x40, %o1
+
+ ba,a,pt %xcc, 5b
+ENDPROC(sha1_sparc64_transform)
diff --git a/lib/crypto/sparc/sha256.h b/lib/crypto/sparc/sha256.h
new file mode 100644
index 000000000000..b2f4419ec778
--- /dev/null
+++ b/lib/crypto/sparc/sha256.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SHA-256 accelerated using the sparc64 sha256 opcodes
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
+ */
+
+#include <asm/elf.h>
+#include <asm/opcodes.h>
+#include <asm/pstate.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_opcodes);
+
+asmlinkage void sha256_sparc64_transform(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha256_blocks(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_sha256_opcodes))
+ sha256_sparc64_transform(state, data, nblocks);
+ else
+ sha256_blocks_generic(state, data, nblocks);
+}
+
+#define sha256_mod_init_arch sha256_mod_init_arch
+static void sha256_mod_init_arch(void)
+{
+ unsigned long cfr;
+
+ if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+ return;
+
+ __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+ if (!(cfr & CFR_SHA256))
+ return;
+
+ static_branch_enable(&have_sha256_opcodes);
+ pr_info("Using sparc64 sha256 opcode optimized SHA-256/SHA-224 implementation\n");
+}
diff --git a/lib/crypto/sparc/sha256_asm.S b/lib/crypto/sparc/sha256_asm.S
new file mode 100644
index 000000000000..ddcdd3daf31e
--- /dev/null
+++ b/lib/crypto/sparc/sha256_asm.S
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/opcodes.h>
+#include <asm/visasm.h>
+
+ENTRY(sha256_sparc64_transform)
+ /* %o0 = state, %o1 = data, %o2 = nblocks */
+ VISEntryHalf
+ ld [%o0 + 0x00], %f0
+ ld [%o0 + 0x04], %f1
+ ld [%o0 + 0x08], %f2
+ ld [%o0 + 0x0c], %f3
+ ld [%o0 + 0x10], %f4
+ ld [%o0 + 0x14], %f5
+ andcc %o1, 0x7, %g0
+ ld [%o0 + 0x18], %f6
+ bne,pn %xcc, 10f
+ ld [%o0 + 0x1c], %f7
+
+1:
+ ldd [%o1 + 0x00], %f8
+ ldd [%o1 + 0x08], %f10
+ ldd [%o1 + 0x10], %f12
+ ldd [%o1 + 0x18], %f14
+ ldd [%o1 + 0x20], %f16
+ ldd [%o1 + 0x28], %f18
+ ldd [%o1 + 0x30], %f20
+ ldd [%o1 + 0x38], %f22
+
+ SHA256
+
+ subcc %o2, 1, %o2
+ bne,pt %xcc, 1b
+ add %o1, 0x40, %o1
+
+5:
+ st %f0, [%o0 + 0x00]
+ st %f1, [%o0 + 0x04]
+ st %f2, [%o0 + 0x08]
+ st %f3, [%o0 + 0x0c]
+ st %f4, [%o0 + 0x10]
+ st %f5, [%o0 + 0x14]
+ st %f6, [%o0 + 0x18]
+ st %f7, [%o0 + 0x1c]
+ retl
+ VISExitHalf
+10:
+ alignaddr %o1, %g0, %o1
+
+ ldd [%o1 + 0x00], %f10
+1:
+ ldd [%o1 + 0x08], %f12
+ ldd [%o1 + 0x10], %f14
+ ldd [%o1 + 0x18], %f16
+ ldd [%o1 + 0x20], %f18
+ ldd [%o1 + 0x28], %f20
+ ldd [%o1 + 0x30], %f22
+ ldd [%o1 + 0x38], %f24
+ ldd [%o1 + 0x40], %f26
+
+ faligndata %f10, %f12, %f8
+ faligndata %f12, %f14, %f10
+ faligndata %f14, %f16, %f12
+ faligndata %f16, %f18, %f14
+ faligndata %f18, %f20, %f16
+ faligndata %f20, %f22, %f18
+ faligndata %f22, %f24, %f20
+ faligndata %f24, %f26, %f22
+
+ SHA256
+
+ subcc %o2, 1, %o2
+ fsrc2 %f26, %f10
+ bne,pt %xcc, 1b
+ add %o1, 0x40, %o1
+
+ ba,a,pt %xcc, 5b
+ENDPROC(sha256_sparc64_transform)
diff --git a/lib/crypto/sparc/sha512.h b/lib/crypto/sparc/sha512.h
new file mode 100644
index 000000000000..a8c37a7d4c39
--- /dev/null
+++ b/lib/crypto/sparc/sha512.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SHA-512 accelerated using the sparc64 sha512 opcodes
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2003 Kyle McMartin <kyle@debian.org>
+ */
+
+#include <asm/elf.h>
+#include <asm/opcodes.h>
+#include <asm/pstate.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha512_opcodes);
+
+asmlinkage void sha512_sparc64_transform(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks);
+
+static void sha512_blocks(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_sha512_opcodes))
+ sha512_sparc64_transform(state, data, nblocks);
+ else
+ sha512_blocks_generic(state, data, nblocks);
+}
+
+#define sha512_mod_init_arch sha512_mod_init_arch
+static void sha512_mod_init_arch(void)
+{
+ unsigned long cfr;
+
+ if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+ return;
+
+ __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+ if (!(cfr & CFR_SHA512))
+ return;
+
+ static_branch_enable(&have_sha512_opcodes);
+ pr_info("Using sparc64 sha512 opcode optimized SHA-512/SHA-384 implementation\n");
+}
diff --git a/lib/crypto/sparc/sha512_asm.S b/lib/crypto/sparc/sha512_asm.S
new file mode 100644
index 000000000000..9932b4fe1b59
--- /dev/null
+++ b/lib/crypto/sparc/sha512_asm.S
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/opcodes.h>
+#include <asm/visasm.h>
+
+ENTRY(sha512_sparc64_transform)
+ /* %o0 = state, %o1 = data, %o2 = nblocks */
+ VISEntry
+ ldd [%o0 + 0x00], %f0
+ ldd [%o0 + 0x08], %f2
+ ldd [%o0 + 0x10], %f4
+ ldd [%o0 + 0x18], %f6
+ ldd [%o0 + 0x20], %f8
+ ldd [%o0 + 0x28], %f10
+ andcc %o1, 0x7, %g0
+ ldd [%o0 + 0x30], %f12
+ bne,pn %xcc, 10f
+ ldd [%o0 + 0x38], %f14
+
+1:
+ ldd [%o1 + 0x00], %f16
+ ldd [%o1 + 0x08], %f18
+ ldd [%o1 + 0x10], %f20
+ ldd [%o1 + 0x18], %f22
+ ldd [%o1 + 0x20], %f24
+ ldd [%o1 + 0x28], %f26
+ ldd [%o1 + 0x30], %f28
+ ldd [%o1 + 0x38], %f30
+ ldd [%o1 + 0x40], %f32
+ ldd [%o1 + 0x48], %f34
+ ldd [%o1 + 0x50], %f36
+ ldd [%o1 + 0x58], %f38
+ ldd [%o1 + 0x60], %f40
+ ldd [%o1 + 0x68], %f42
+ ldd [%o1 + 0x70], %f44
+ ldd [%o1 + 0x78], %f46
+
+ SHA512
+
+ subcc %o2, 1, %o2
+ bne,pt %xcc, 1b
+ add %o1, 0x80, %o1
+
+5:
+ std %f0, [%o0 + 0x00]
+ std %f2, [%o0 + 0x08]
+ std %f4, [%o0 + 0x10]
+ std %f6, [%o0 + 0x18]
+ std %f8, [%o0 + 0x20]
+ std %f10, [%o0 + 0x28]
+ std %f12, [%o0 + 0x30]
+ std %f14, [%o0 + 0x38]
+ retl
+ VISExit
+10:
+ alignaddr %o1, %g0, %o1
+
+ ldd [%o1 + 0x00], %f18
+1:
+ ldd [%o1 + 0x08], %f20
+ ldd [%o1 + 0x10], %f22
+ ldd [%o1 + 0x18], %f24
+ ldd [%o1 + 0x20], %f26
+ ldd [%o1 + 0x28], %f28
+ ldd [%o1 + 0x30], %f30
+ ldd [%o1 + 0x38], %f32
+ ldd [%o1 + 0x40], %f34
+ ldd [%o1 + 0x48], %f36
+ ldd [%o1 + 0x50], %f38
+ ldd [%o1 + 0x58], %f40
+ ldd [%o1 + 0x60], %f42
+ ldd [%o1 + 0x68], %f44
+ ldd [%o1 + 0x70], %f46
+ ldd [%o1 + 0x78], %f48
+ ldd [%o1 + 0x80], %f50
+
+ faligndata %f18, %f20, %f16
+ faligndata %f20, %f22, %f18
+ faligndata %f22, %f24, %f20
+ faligndata %f24, %f26, %f22
+ faligndata %f26, %f28, %f24
+ faligndata %f28, %f30, %f26
+ faligndata %f30, %f32, %f28
+ faligndata %f32, %f34, %f30
+ faligndata %f34, %f36, %f32
+ faligndata %f36, %f38, %f34
+ faligndata %f38, %f40, %f36
+ faligndata %f40, %f42, %f38
+ faligndata %f42, %f44, %f40
+ faligndata %f44, %f46, %f42
+ faligndata %f46, %f48, %f44
+ faligndata %f48, %f50, %f46
+
+ SHA512
+
+ subcc %o2, 1, %o2
+ fsrc2 %f50, %f18
+ bne,pt %xcc, 1b
+ add %o1, 0x80, %o1
+
+ ba,a,pt %xcc, 5b
+ENDPROC(sha512_sparc64_transform)
diff --git a/lib/crypto/tests/Kconfig b/lib/crypto/tests/Kconfig
new file mode 100644
index 000000000000..61d435c450bb
--- /dev/null
+++ b/lib/crypto/tests/Kconfig
@@ -0,0 +1,118 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+config CRYPTO_LIB_BLAKE2B_KUNIT_TEST
+ tristate "KUnit tests for BLAKE2b" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_BLAKE2B
+ help
+ KUnit tests for the BLAKE2b cryptographic hash function.
+
+config CRYPTO_LIB_BLAKE2S_KUNIT_TEST
+ tristate "KUnit tests for BLAKE2s" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ # No need to select CRYPTO_LIB_BLAKE2S here, as that option doesn't
+ # exist; the BLAKE2s code is always built-in for the /dev/random driver.
+ help
+ KUnit tests for the BLAKE2s cryptographic hash function.
+
+config CRYPTO_LIB_CURVE25519_KUNIT_TEST
+ tristate "KUnit tests for Curve25519" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_CURVE25519
+ help
+ KUnit tests for the Curve25519 Diffie-Hellman function.
+
+config CRYPTO_LIB_MD5_KUNIT_TEST
+ tristate "KUnit tests for MD5" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_MD5
+ help
+ KUnit tests for the MD5 cryptographic hash function and its
+ corresponding HMAC.
+
+config CRYPTO_LIB_POLY1305_KUNIT_TEST
+ tristate "KUnit tests for Poly1305" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_POLY1305
+ help
+ KUnit tests for the Poly1305 library functions.
+
+config CRYPTO_LIB_POLYVAL_KUNIT_TEST
+ tristate "KUnit tests for POLYVAL" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_POLYVAL
+ help
+ KUnit tests for the POLYVAL library functions.
+
+config CRYPTO_LIB_SHA1_KUNIT_TEST
+ tristate "KUnit tests for SHA-1" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_SHA1
+ help
+ KUnit tests for the SHA-1 cryptographic hash function and its
+ corresponding HMAC.
+
+# Option is named *_SHA256_KUNIT_TEST, though both SHA-224 and SHA-256 tests are
+# included, for consistency with the naming used elsewhere (e.g. CRYPTO_SHA256).
+config CRYPTO_LIB_SHA256_KUNIT_TEST
+ tristate "KUnit tests for SHA-224 and SHA-256" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_SHA256
+ help
+ KUnit tests for the SHA-224 and SHA-256 cryptographic hash functions
+ and their corresponding HMACs.
+
+# Option is named *_SHA512_KUNIT_TEST, though both SHA-384 and SHA-512 tests are
+# included, for consistency with the naming used elsewhere (e.g. CRYPTO_SHA512).
+config CRYPTO_LIB_SHA512_KUNIT_TEST
+ tristate "KUnit tests for SHA-384 and SHA-512" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_SHA512
+ help
+ KUnit tests for the SHA-384 and SHA-512 cryptographic hash functions
+ and their corresponding HMACs.
+
+config CRYPTO_LIB_SHA3_KUNIT_TEST
+ tristate "KUnit tests for SHA-3" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_SHA3
+ help
+ KUnit tests for the SHA-3 cryptographic hash and XOF functions,
+ including SHA3-224, SHA3-256, SHA3-384, SHA3-512, SHAKE128, and
+ SHAKE256.
+
+config CRYPTO_LIB_BENCHMARK_VISIBLE
+ bool
+
+config CRYPTO_LIB_BENCHMARK
+ bool "Include benchmarks in KUnit tests for cryptographic functions"
+ depends on CRYPTO_LIB_BENCHMARK_VISIBLE
+ help
+ Include benchmarks in the KUnit tests for cryptographic functions.
+ The benchmark results are printed to the kernel log when the
+ corresponding KUnit test suite runs.
+
+ This is useful for evaluating the performance of the cryptographic
+ functions. However, it will increase the runtime of the KUnit tests.
+
+ If you're only interested in correctness testing, leave this disabled.
diff --git a/lib/crypto/tests/Makefile b/lib/crypto/tests/Makefile
new file mode 100644
index 000000000000..5109a0651925
--- /dev/null
+++ b/lib/crypto/tests/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+obj-$(CONFIG_CRYPTO_LIB_BLAKE2B_KUNIT_TEST) += blake2b_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_KUNIT_TEST) += blake2s_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_CURVE25519_KUNIT_TEST) += curve25519_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_MD5_KUNIT_TEST) += md5_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_POLY1305_KUNIT_TEST) += poly1305_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_POLYVAL_KUNIT_TEST) += polyval_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_SHA1_KUNIT_TEST) += sha1_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_SHA256_KUNIT_TEST) += sha224_kunit.o sha256_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_SHA512_KUNIT_TEST) += sha384_kunit.o sha512_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_SHA3_KUNIT_TEST) += sha3_kunit.o
diff --git a/lib/crypto/tests/blake2b-testvecs.h b/lib/crypto/tests/blake2b-testvecs.h
new file mode 100644
index 000000000000..9e407dbc219c
--- /dev/null
+++ b/lib/crypto/tests/blake2b-testvecs.h
@@ -0,0 +1,342 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py blake2b */
+
+static const struct {
+ size_t data_len;
+ u8 digest[BLAKE2B_HASH_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0x78, 0x6a, 0x02, 0xf7, 0x42, 0x01, 0x59, 0x03,
+ 0xc6, 0xc6, 0xfd, 0x85, 0x25, 0x52, 0xd2, 0x72,
+ 0x91, 0x2f, 0x47, 0x40, 0xe1, 0x58, 0x47, 0x61,
+ 0x8a, 0x86, 0xe2, 0x17, 0xf7, 0x1f, 0x54, 0x19,
+ 0xd2, 0x5e, 0x10, 0x31, 0xaf, 0xee, 0x58, 0x53,
+ 0x13, 0x89, 0x64, 0x44, 0x93, 0x4e, 0xb0, 0x4b,
+ 0x90, 0x3a, 0x68, 0x5b, 0x14, 0x48, 0xb7, 0x55,
+ 0xd5, 0x6f, 0x70, 0x1a, 0xfe, 0x9b, 0xe2, 0xce,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x6f, 0x2e, 0xcc, 0x83, 0x53, 0xa3, 0x20, 0x16,
+ 0x5b, 0xda, 0xd0, 0x04, 0xd3, 0xcb, 0xe4, 0x37,
+ 0x5b, 0xf0, 0x84, 0x36, 0xe1, 0xad, 0x45, 0xcc,
+ 0x4d, 0x7f, 0x09, 0x68, 0xb2, 0x62, 0x93, 0x7f,
+ 0x72, 0x32, 0xe8, 0xa7, 0x2f, 0x1f, 0x6f, 0xc6,
+ 0x14, 0xd6, 0x70, 0xae, 0x0c, 0xf0, 0xf3, 0xce,
+ 0x64, 0x4d, 0x22, 0xdf, 0xc7, 0xa7, 0xf8, 0xa8,
+ 0x18, 0x23, 0xd8, 0x6c, 0xaf, 0x65, 0xa2, 0x54,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0x04, 0x13, 0xe2, 0x10, 0xbe, 0x65, 0xde, 0xce,
+ 0x61, 0xa8, 0xe0, 0xd6, 0x35, 0xb1, 0xb8, 0x88,
+ 0xd2, 0xea, 0x45, 0x3a, 0xe1, 0x8d, 0x94, 0xb5,
+ 0x66, 0x06, 0x98, 0x96, 0x39, 0xf8, 0x0e, 0xcb,
+ 0x34, 0xa6, 0xa8, 0x17, 0xfe, 0x56, 0xbc, 0xa9,
+ 0x5e, 0x1b, 0xb1, 0xde, 0x3c, 0xc7, 0x78, 0x4f,
+ 0x39, 0xc6, 0xfc, 0xa8, 0xb3, 0x27, 0x66, 0x3e,
+ 0x4e, 0xb5, 0x5d, 0x08, 0x89, 0xee, 0xd1, 0xe0,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0x2b, 0x4a, 0xa3, 0x4e, 0x2b, 0x7a, 0x47, 0x20,
+ 0x30, 0x5b, 0x09, 0x17, 0x3a, 0xf4, 0xcc, 0xf0,
+ 0xf7, 0x7b, 0x97, 0x68, 0x98, 0x9f, 0x4f, 0x09,
+ 0x46, 0x25, 0xe7, 0xd6, 0x53, 0x6b, 0xf9, 0x68,
+ 0x48, 0x12, 0x44, 0x8c, 0x9a, 0xc8, 0xd4, 0x42,
+ 0xeb, 0x2c, 0x5f, 0x41, 0xba, 0x17, 0xd0, 0xc3,
+ 0xad, 0xfd, 0xfb, 0x42, 0x33, 0xcb, 0x08, 0x5d,
+ 0xd2, 0x5c, 0x3d, 0xde, 0x87, 0x4d, 0xd6, 0xe4,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0xbf, 0x40, 0xf2, 0x38, 0x44, 0x8e, 0x24, 0x5e,
+ 0xbc, 0x67, 0xbb, 0xf0, 0x10, 0x9a, 0x79, 0xbb,
+ 0x36, 0x55, 0xce, 0xd2, 0xba, 0x04, 0x0d, 0xe8,
+ 0x30, 0x29, 0x5c, 0x2a, 0xa6, 0x3a, 0x4f, 0x37,
+ 0xac, 0x5f, 0xd4, 0x13, 0xa2, 0xf4, 0xfe, 0x80,
+ 0x61, 0xd7, 0x58, 0x66, 0x0c, 0x7f, 0xa2, 0x56,
+ 0x6b, 0x52, 0x7c, 0x22, 0x73, 0x7f, 0x17, 0xaa,
+ 0x91, 0x5a, 0x22, 0x06, 0xd9, 0x00, 0x48, 0x12,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0x41, 0x04, 0x65, 0x93, 0x81, 0x9a, 0x20, 0x0a,
+ 0x00, 0x60, 0x00, 0x64, 0x4c, 0x04, 0x3d, 0xe0,
+ 0x6b, 0x17, 0x0c, 0xe1, 0x0e, 0x28, 0x8b, 0xa0,
+ 0x76, 0xd2, 0x79, 0xb0, 0x33, 0x60, 0x61, 0x27,
+ 0xf2, 0x64, 0xf1, 0x8a, 0xe5, 0x3e, 0xaa, 0x37,
+ 0x60, 0xad, 0x2d, 0x75, 0x13, 0xae, 0xd8, 0x9e,
+ 0xec, 0xe0, 0xe4, 0x40, 0x2f, 0x59, 0x44, 0xb0,
+ 0x66, 0x7a, 0x68, 0x38, 0xce, 0x21, 0x99, 0x2a,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0x19, 0x6f, 0x9d, 0xc7, 0x87, 0x12, 0x5c, 0xa3,
+ 0xe2, 0xd3, 0xf1, 0x82, 0xec, 0xf3, 0x55, 0x9c,
+ 0x86, 0xd1, 0x6d, 0xde, 0xcf, 0x5b, 0xec, 0x4c,
+ 0x43, 0x25, 0x85, 0x90, 0xef, 0xe8, 0xe3, 0x5f,
+ 0x2c, 0x3a, 0x84, 0x07, 0xb8, 0x55, 0xfd, 0x5e,
+ 0xa4, 0x45, 0xf2, 0xac, 0xe4, 0xbd, 0xc7, 0x96,
+ 0x80, 0x59, 0x3e, 0xc9, 0xb1, 0x60, 0xb1, 0x2b,
+ 0x17, 0x49, 0x7d, 0x3e, 0x7d, 0x4d, 0x70, 0x24,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x73, 0x72, 0xd5, 0x0a, 0x97, 0xb4, 0x7d, 0xdb,
+ 0x05, 0x14, 0x8e, 0x40, 0xc2, 0x9a, 0x8a, 0x74,
+ 0x4b, 0xda, 0x7e, 0xfc, 0x97, 0x57, 0x23, 0x39,
+ 0xdc, 0x57, 0x09, 0x13, 0x24, 0xfc, 0xf3, 0x23,
+ 0x55, 0x48, 0xdd, 0xe5, 0x07, 0x9a, 0x6f, 0x7b,
+ 0x62, 0xea, 0x4d, 0x79, 0xb4, 0xb9, 0xc5, 0x86,
+ 0xc0, 0x34, 0xd6, 0xd2, 0x6c, 0xc3, 0x94, 0xfb,
+ 0x34, 0xd6, 0x62, 0xae, 0xb8, 0x99, 0xf1, 0x38,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x42, 0x3a, 0xe3, 0xa2, 0xae, 0x5a, 0x28, 0xce,
+ 0xf1, 0x3c, 0x97, 0xc2, 0x34, 0xf6, 0xb5, 0x1e,
+ 0xfc, 0x31, 0xb4, 0x04, 0x61, 0xb7, 0x54, 0x0b,
+ 0x0d, 0x1a, 0x22, 0x9c, 0x04, 0x67, 0x5c, 0x4c,
+ 0x75, 0x1b, 0x10, 0x0b, 0x99, 0xe2, 0xb1, 0x5e,
+ 0x5d, 0x4b, 0x7a, 0xe6, 0xf6, 0xb5, 0x62, 0xee,
+ 0x2d, 0x44, 0x57, 0xb2, 0x96, 0x73, 0x5e, 0xb9,
+ 0x6a, 0xb2, 0xb3, 0x16, 0xa3, 0xd9, 0x6a, 0x60,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x50, 0xb9, 0xbe, 0xb2, 0x69, 0x07, 0x45, 0x5b,
+ 0x59, 0xde, 0x8d, 0xbf, 0x08, 0xdc, 0x2e, 0x7f,
+ 0x93, 0x29, 0xc1, 0x91, 0xe8, 0x74, 0x03, 0x89,
+ 0x20, 0xfb, 0xb2, 0x4b, 0xe8, 0x68, 0x6f, 0xe1,
+ 0xb4, 0x30, 0xbe, 0x11, 0x3c, 0x43, 0x19, 0x66,
+ 0x72, 0x78, 0xb7, 0xf4, 0xe9, 0x09, 0x18, 0x4e,
+ 0xae, 0x4a, 0x24, 0xe0, 0x6f, 0x44, 0x02, 0xe3,
+ 0xfd, 0xda, 0xb3, 0x3e, 0x3c, 0x6d, 0x54, 0x2e,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0xd6, 0xf2, 0xa9, 0x61, 0x3f, 0xce, 0x2a, 0x68,
+ 0x19, 0x86, 0xff, 0xd1, 0xee, 0x89, 0x3b, 0xa4,
+ 0x10, 0x9a, 0x91, 0x50, 0x35, 0x48, 0x9e, 0xf5,
+ 0x9c, 0x95, 0xe0, 0xfb, 0x92, 0x0f, 0xa8, 0xf7,
+ 0x6c, 0x43, 0x85, 0xf1, 0x6e, 0x11, 0x4e, 0x67,
+ 0x78, 0xd7, 0x53, 0x25, 0x0c, 0xf8, 0xce, 0x38,
+ 0x74, 0x08, 0xb0, 0x3c, 0x53, 0x20, 0x4d, 0xc4,
+ 0x9a, 0xf5, 0x78, 0xe8, 0x41, 0x8f, 0xed, 0x1f,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0xe8, 0xb2, 0xc5, 0xa7, 0xf5, 0xfa, 0xee, 0xa0,
+ 0x57, 0xba, 0x58, 0xf9, 0x0a, 0xf2, 0x64, 0x16,
+ 0xa8, 0xa6, 0x03, 0x85, 0x3b, 0xb8, 0x6f, 0xca,
+ 0x76, 0xc3, 0xa1, 0x2b, 0xec, 0xef, 0xc4, 0x66,
+ 0x11, 0xdf, 0x03, 0x85, 0x9d, 0x0c, 0x37, 0x7b,
+ 0xa9, 0x7b, 0x44, 0xfb, 0x11, 0x8f, 0x3f, 0x71,
+ 0xcd, 0x81, 0x43, 0x2e, 0x71, 0x5c, 0x54, 0x9f,
+ 0xca, 0x0f, 0x01, 0x91, 0xca, 0xaa, 0x93, 0xe9,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x05, 0x8e, 0x9d, 0xdc, 0xe9, 0x36, 0x3e, 0x73,
+ 0x63, 0x59, 0x69, 0x81, 0x0b, 0x8c, 0xc7, 0x9e,
+ 0xcc, 0xe7, 0x9c, 0x19, 0x54, 0xa7, 0x2f, 0x86,
+ 0xb5, 0xea, 0xae, 0x6d, 0xfe, 0x4e, 0x6e, 0x83,
+ 0x8d, 0x1a, 0x1c, 0x70, 0x3f, 0x34, 0xa1, 0x04,
+ 0x59, 0xd1, 0xbb, 0xaa, 0x58, 0xf7, 0xce, 0xfb,
+ 0x86, 0x66, 0x22, 0xfc, 0x78, 0x74, 0x6e, 0x85,
+ 0xf1, 0x59, 0x7d, 0x9e, 0x1c, 0x3b, 0xc6, 0x65,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x6b, 0x1f, 0x7c, 0x9a, 0x65, 0x7f, 0x09, 0x61,
+ 0xe5, 0x04, 0x9a, 0xf1, 0x4b, 0x36, 0x8e, 0x41,
+ 0x86, 0xcf, 0x86, 0x19, 0xd8, 0xc9, 0x34, 0x70,
+ 0x67, 0xd1, 0x03, 0x72, 0x12, 0xf7, 0x27, 0x92,
+ 0x2e, 0x3d, 0x2b, 0x54, 0x9a, 0x48, 0xa4, 0xc2,
+ 0x61, 0xea, 0x6a, 0xe8, 0xdd, 0x07, 0x41, 0x85,
+ 0x58, 0x6d, 0xcd, 0x12, 0x0d, 0xbc, 0xb1, 0x23,
+ 0xb2, 0xdb, 0x24, 0x1f, 0xc4, 0xa7, 0xae, 0xda,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x50, 0xd8, 0xdc, 0xb2, 0x50, 0x24, 0x7a, 0x49,
+ 0xb1, 0x00, 0x73, 0x16, 0x1f, 0xce, 0xf9, 0xe8,
+ 0x77, 0x0a, 0x27, 0x74, 0xc7, 0xeb, 0xf0, 0x62,
+ 0xb9, 0xf3, 0x24, 0xa6, 0x03, 0x18, 0x40, 0xde,
+ 0x9b, 0x1d, 0xa8, 0xd0, 0xbf, 0x66, 0xa3, 0xc1,
+ 0x31, 0x04, 0x95, 0xc7, 0xc3, 0xb7, 0x11, 0xe2,
+ 0x1e, 0x31, 0x49, 0x98, 0x06, 0xab, 0xf0, 0xe6,
+ 0x5c, 0xac, 0x88, 0x28, 0x0b, 0x3d, 0xb2, 0xc2,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0xd4, 0x2b, 0x6b, 0x9e, 0xfc, 0x44, 0xc0, 0x90,
+ 0x64, 0x77, 0x5d, 0xf3, 0x44, 0xb6, 0x92, 0x8f,
+ 0x80, 0xe2, 0xe4, 0x9b, 0xaf, 0x49, 0x04, 0xea,
+ 0x29, 0xf7, 0x4a, 0x33, 0x3f, 0xc7, 0x3b, 0xab,
+ 0xa1, 0x71, 0x7f, 0xa2, 0x8e, 0x03, 0xa0, 0xd6,
+ 0xa7, 0xcd, 0xe0, 0xf8, 0xd7, 0x3b, 0xa4, 0x0d,
+ 0x84, 0x79, 0x12, 0x72, 0x3f, 0x8e, 0x48, 0x35,
+ 0x76, 0x4f, 0x56, 0xe9, 0x21, 0x40, 0x19, 0xbe,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0x84, 0xd4, 0xd8, 0x6c, 0x60, 0x3d, 0x6e, 0xfd,
+ 0x84, 0xb7, 0xdf, 0xba, 0x13, 0x5e, 0x07, 0x94,
+ 0x5b, 0x6b, 0x62, 0x1d, 0x82, 0x02, 0xa7, 0xb3,
+ 0x21, 0xdf, 0x42, 0x20, 0x85, 0xa8, 0x6f, 0x30,
+ 0xf7, 0x03, 0xba, 0x66, 0x0e, 0xa6, 0x42, 0x21,
+ 0x37, 0xe8, 0xed, 0x5b, 0x22, 0xf5, 0x4e, 0xa5,
+ 0xe5, 0x80, 0x1b, 0x47, 0xf0, 0x49, 0xb3, 0xe5,
+ 0x6e, 0xd9, 0xd9, 0x95, 0x3d, 0x2e, 0x42, 0x13,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0x71, 0x17, 0xab, 0x93, 0xfe, 0x3b, 0xa4, 0xe6,
+ 0xcb, 0xb0, 0xea, 0x95, 0xe7, 0x1a, 0x01, 0xc0,
+ 0x12, 0x33, 0xfe, 0xcc, 0x79, 0x15, 0xae, 0x56,
+ 0xd2, 0x70, 0x44, 0x60, 0x54, 0x42, 0xa8, 0x69,
+ 0x7e, 0xc3, 0x90, 0xa0, 0x0c, 0x63, 0x39, 0xff,
+ 0x55, 0x53, 0xb8, 0x46, 0xef, 0x06, 0xcb, 0xba,
+ 0x73, 0xf4, 0x76, 0x22, 0xf1, 0x60, 0x98, 0xbc,
+ 0xbf, 0x76, 0x95, 0x85, 0x13, 0x1d, 0x11, 0x3b,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0x3a, 0xaa, 0x85, 0xa0, 0x8c, 0x8e, 0xe1, 0x9c,
+ 0x9b, 0x43, 0x72, 0x7f, 0x40, 0x88, 0x3b, 0xd1,
+ 0xc4, 0xd8, 0x2b, 0x69, 0xa6, 0x74, 0x47, 0x69,
+ 0x5f, 0x7d, 0xab, 0x75, 0xa9, 0xf9, 0x88, 0x54,
+ 0xce, 0x57, 0xcc, 0x9d, 0xac, 0x13, 0x91, 0xdb,
+ 0x6d, 0x5c, 0xd8, 0xf4, 0x35, 0xc9, 0x30, 0xf0,
+ 0x4b, 0x91, 0x25, 0xab, 0x92, 0xa8, 0xc8, 0x6f,
+ 0xa0, 0xeb, 0x71, 0x56, 0x95, 0xab, 0xfd, 0xd7,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0xe1, 0xe9, 0xbe, 0x6c, 0x96, 0xe2, 0xe8, 0xa6,
+ 0x53, 0xcd, 0x79, 0x77, 0x57, 0x51, 0x2f, 0xb2,
+ 0x9f, 0xfc, 0x09, 0xaa, 0x2c, 0xbc, 0x6c, 0x5f,
+ 0xb0, 0xf2, 0x12, 0x39, 0x54, 0xd7, 0x27, 0xf8,
+ 0x33, 0x5d, 0xd4, 0x8a, 0xca, 0xd8, 0x2e, 0xbb,
+ 0x02, 0x82, 0xca, 0x1b, 0x54, 0xfa, 0xd6, 0xf4,
+ 0x49, 0x63, 0xfc, 0xc8, 0x73, 0xd4, 0x26, 0x8d,
+ 0x4f, 0x1c, 0x56, 0xa7, 0xf4, 0x58, 0x6f, 0x51,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0xf2, 0xf6, 0xe1, 0x16, 0x98, 0x69, 0x74, 0x5f,
+ 0x6c, 0xc4, 0x9d, 0x34, 0xa2, 0x84, 0x5d, 0x47,
+ 0xac, 0x39, 0xe0, 0x14, 0x2d, 0x78, 0xfa, 0x27,
+ 0xd5, 0x18, 0xaf, 0x26, 0x89, 0xa4, 0x69, 0xd3,
+ 0x56, 0xde, 0xfe, 0x4b, 0x9f, 0x0c, 0x9d, 0x5a,
+ 0x9a, 0x73, 0x3e, 0x3c, 0x76, 0x4b, 0x96, 0xca,
+ 0x49, 0xda, 0x05, 0x8c, 0x53, 0xbb, 0x85, 0x89,
+ 0x60, 0xc7, 0xe0, 0xb3, 0x51, 0x18, 0xd2, 0xd2,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0xfc, 0x5c, 0xcf, 0xbf, 0x29, 0xe3, 0x01, 0xef,
+ 0x4b, 0x40, 0x70, 0x01, 0xca, 0x4d, 0x46, 0xce,
+ 0xa9, 0x95, 0x5d, 0xb4, 0xf1, 0x79, 0x29, 0xdb,
+ 0xac, 0x32, 0x3d, 0xd9, 0x60, 0x9e, 0x6b, 0xb8,
+ 0x28, 0x62, 0xb7, 0x4a, 0xbb, 0x33, 0xb9, 0xd0,
+ 0x83, 0xe0, 0xd7, 0x5a, 0x2d, 0x01, 0x4c, 0x61,
+ 0x9e, 0x7d, 0x2d, 0x2d, 0x60, 0x29, 0x5e, 0x60,
+ 0x10, 0xb7, 0x41, 0x00, 0x3f, 0xe5, 0xf7, 0x52,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0xf8, 0xe5, 0x4b, 0xe5, 0x89, 0xf9, 0x1b, 0x43,
+ 0xbb, 0x65, 0x3d, 0xa0, 0xb4, 0xdc, 0x04, 0x26,
+ 0x68, 0x15, 0xae, 0x4d, 0xd6, 0x03, 0xb7, 0x27,
+ 0x06, 0x8c, 0x2a, 0x82, 0x51, 0x96, 0xbf, 0x83,
+ 0x38, 0x96, 0x21, 0x8a, 0xd9, 0xf9, 0x4e, 0x38,
+ 0xc6, 0xb3, 0xbd, 0xfe, 0xd3, 0x49, 0x90, 0xbc,
+ 0xa1, 0x77, 0xd0, 0xa0, 0x3c, 0x2b, 0x4e, 0x10,
+ 0x34, 0xc3, 0x17, 0x85, 0x3d, 0xec, 0xa8, 0x05,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0x38, 0x56, 0xaf, 0x83, 0x68, 0x9c, 0xba, 0xe3,
+ 0xec, 0x51, 0xf5, 0xf4, 0x93, 0x48, 0x1d, 0xe6,
+ 0xad, 0xa8, 0x8c, 0x70, 0x2a, 0xd9, 0xaa, 0x43,
+ 0x04, 0x40, 0x95, 0xc1, 0xe6, 0x8a, 0xf5, 0x01,
+ 0x6b, 0x79, 0xd9, 0xb4, 0xd0, 0x1d, 0x93, 0x26,
+ 0xfe, 0xf5, 0x07, 0x57, 0xda, 0x08, 0x0a, 0x82,
+ 0xc9, 0x17, 0x13, 0x5b, 0x9e, 0x11, 0x96, 0xa5,
+ 0xd0, 0x92, 0xcd, 0xf1, 0xa3, 0x5b, 0x43, 0x21,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[BLAKE2B_HASH_SIZE] = {
+ 0xa4, 0xf8, 0xf6, 0xa1, 0x36, 0x89, 0xc0, 0x2a,
+ 0xc3, 0x42, 0x32, 0x71, 0xe5, 0xea, 0x14, 0x77,
+ 0xf3, 0x99, 0x91, 0x87, 0x49, 0xc2, 0x8d, 0xa5,
+ 0x2f, 0xed, 0x01, 0x35, 0x39, 0x64, 0x09, 0x25,
+ 0xe3, 0xa8, 0x50, 0x97, 0x35, 0x8b, 0xf5, 0x19,
+ 0x1e, 0xd5, 0x9f, 0x03, 0x0b, 0x65, 0x55, 0x0e,
+ 0xa0, 0xb7, 0xda, 0x18, 0x7b, 0x7f, 0x88, 0x55,
+ 0x1f, 0xdb, 0x82, 0x6b, 0x98, 0x90, 0x1c, 0xdd,
+};
+
+static const u8 blake2b_keyed_testvec_consolidated[BLAKE2B_HASH_SIZE] = {
+ 0x2b, 0x89, 0x36, 0x3a, 0x36, 0xe4, 0x18, 0x38,
+ 0xc4, 0x5b, 0x5c, 0xa5, 0x9a, 0xed, 0xf2, 0xee,
+ 0x5a, 0xb6, 0x82, 0x6c, 0x63, 0xf2, 0x29, 0x57,
+ 0xc7, 0xd5, 0x32, 0x27, 0xba, 0x88, 0xb1, 0xab,
+ 0xf2, 0x2a, 0xc1, 0xea, 0xf3, 0x91, 0x89, 0x66,
+ 0x47, 0x1e, 0x5b, 0xc6, 0x98, 0x12, 0xe9, 0x25,
+ 0xbf, 0x72, 0xd2, 0x3f, 0x88, 0x97, 0x17, 0x51,
+ 0xed, 0x96, 0xfb, 0xe9, 0xca, 0x52, 0x42, 0xc9,
+};
diff --git a/lib/crypto/tests/blake2b_kunit.c b/lib/crypto/tests/blake2b_kunit.c
new file mode 100644
index 000000000000..bc0be7da1e76
--- /dev/null
+++ b/lib/crypto/tests/blake2b_kunit.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/blake2b.h>
+#include "blake2b-testvecs.h"
+
+/*
+ * The following are compatibility functions that present BLAKE2b as an unkeyed
+ * hash function that produces hashes of fixed length BLAKE2B_HASH_SIZE, so that
+ * hash-test-template.h can be reused to test it.
+ */
+
+static void blake2b_default(const u8 *data, size_t len,
+ u8 out[BLAKE2B_HASH_SIZE])
+{
+ blake2b(NULL, 0, data, len, out, BLAKE2B_HASH_SIZE);
+}
+
+static void blake2b_init_default(struct blake2b_ctx *ctx)
+{
+ blake2b_init(ctx, BLAKE2B_HASH_SIZE);
+}
+
+/*
+ * Generate the HASH_KUNIT_CASES using hash-test-template.h. These test BLAKE2b
+ * with a key length of 0 and a hash length of BLAKE2B_HASH_SIZE.
+ */
+#define HASH blake2b_default
+#define HASH_CTX blake2b_ctx
+#define HASH_SIZE BLAKE2B_HASH_SIZE
+#define HASH_INIT blake2b_init_default
+#define HASH_UPDATE blake2b_update
+#define HASH_FINAL blake2b_final
+#include "hash-test-template.h"
+
+/*
+ * BLAKE2b specific test case which tests all possible combinations of key
+ * length and hash length.
+ */
+static void test_blake2b_all_key_and_hash_lens(struct kunit *test)
+{
+ const size_t data_len = 100;
+ u8 *data = &test_buf[0];
+ u8 *key = data + data_len;
+ u8 *hash = key + BLAKE2B_KEY_SIZE;
+ struct blake2b_ctx main_ctx;
+ u8 main_hash[BLAKE2B_HASH_SIZE];
+
+ rand_bytes_seeded_from_len(data, data_len);
+ blake2b_init(&main_ctx, BLAKE2B_HASH_SIZE);
+ for (int key_len = 0; key_len <= BLAKE2B_KEY_SIZE; key_len++) {
+ rand_bytes_seeded_from_len(key, key_len);
+ for (int out_len = 1; out_len <= BLAKE2B_HASH_SIZE; out_len++) {
+ blake2b(key, key_len, data, data_len, hash, out_len);
+ blake2b_update(&main_ctx, hash, out_len);
+ }
+ }
+ blake2b_final(&main_ctx, main_hash);
+ KUNIT_ASSERT_MEMEQ(test, main_hash, blake2b_keyed_testvec_consolidated,
+ BLAKE2B_HASH_SIZE);
+}
+
+/*
+ * BLAKE2b specific test case which tests using a guarded buffer for all allowed
+ * key lengths. Also tests both blake2b() and blake2b_init_key().
+ */
+static void test_blake2b_with_guarded_key_buf(struct kunit *test)
+{
+ const size_t data_len = 100;
+
+ rand_bytes(test_buf, data_len);
+ for (int key_len = 0; key_len <= BLAKE2B_KEY_SIZE; key_len++) {
+ u8 key[BLAKE2B_KEY_SIZE];
+ u8 *guarded_key = &test_buf[TEST_BUF_LEN - key_len];
+ u8 hash1[BLAKE2B_HASH_SIZE];
+ u8 hash2[BLAKE2B_HASH_SIZE];
+ struct blake2b_ctx ctx;
+
+ rand_bytes(key, key_len);
+ memcpy(guarded_key, key, key_len);
+
+ blake2b(key, key_len, test_buf, data_len,
+ hash1, BLAKE2B_HASH_SIZE);
+ blake2b(guarded_key, key_len, test_buf, data_len,
+ hash2, BLAKE2B_HASH_SIZE);
+ KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2B_HASH_SIZE);
+
+ blake2b_init_key(&ctx, BLAKE2B_HASH_SIZE, guarded_key, key_len);
+ blake2b_update(&ctx, test_buf, data_len);
+ blake2b_final(&ctx, hash2);
+ KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2B_HASH_SIZE);
+ }
+}
+
+/*
+ * BLAKE2b specific test case which tests using a guarded output buffer for all
+ * allowed output lengths.
+ */
+static void test_blake2b_with_guarded_out_buf(struct kunit *test)
+{
+ const size_t data_len = 100;
+
+ rand_bytes(test_buf, data_len);
+ for (int out_len = 1; out_len <= BLAKE2B_HASH_SIZE; out_len++) {
+ u8 hash[BLAKE2B_HASH_SIZE];
+ u8 *guarded_hash = &test_buf[TEST_BUF_LEN - out_len];
+
+ blake2b(NULL, 0, test_buf, data_len, hash, out_len);
+ blake2b(NULL, 0, test_buf, data_len, guarded_hash, out_len);
+ KUNIT_ASSERT_MEMEQ(test, hash, guarded_hash, out_len);
+ }
+}
+
+static struct kunit_case blake2b_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(test_blake2b_all_key_and_hash_lens),
+ KUNIT_CASE(test_blake2b_with_guarded_key_buf),
+ KUNIT_CASE(test_blake2b_with_guarded_out_buf),
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite blake2b_test_suite = {
+ .name = "blake2b",
+ .test_cases = blake2b_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(blake2b_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for BLAKE2b");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/blake2s-testvecs.h b/lib/crypto/tests/blake2s-testvecs.h
new file mode 100644
index 000000000000..6f978b79a59b
--- /dev/null
+++ b/lib/crypto/tests/blake2s-testvecs.h
@@ -0,0 +1,238 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py blake2s */
+
+static const struct {
+ size_t data_len;
+ u8 digest[BLAKE2S_HASH_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0x69, 0x21, 0x7a, 0x30, 0x79, 0x90, 0x80, 0x94,
+ 0xe1, 0x11, 0x21, 0xd0, 0x42, 0x35, 0x4a, 0x7c,
+ 0x1f, 0x55, 0xb6, 0x48, 0x2c, 0xa1, 0xa5, 0x1e,
+ 0x1b, 0x25, 0x0d, 0xfd, 0x1e, 0xd0, 0xee, 0xf9,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x7c, 0xab, 0x53, 0xe2, 0x48, 0x87, 0xdf, 0x64,
+ 0x98, 0x6a, 0xc1, 0x7e, 0xf0, 0x01, 0x4d, 0xc9,
+ 0x07, 0x4f, 0xb8, 0x2f, 0x46, 0xd7, 0xee, 0xa9,
+ 0xad, 0xe5, 0xf8, 0x21, 0xac, 0xfe, 0x17, 0x58,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0x5e, 0x63, 0x2c, 0xd0, 0xf8, 0x7b, 0xf5, 0xae,
+ 0x61, 0x97, 0x94, 0x57, 0xc8, 0x76, 0x22, 0xd9,
+ 0x8b, 0x04, 0x5e, 0xf1, 0x5d, 0xd0, 0xfc, 0xd9,
+ 0x0c, 0x19, 0x2e, 0xe2, 0xc5, 0xd9, 0x73, 0x51,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0x33, 0x65, 0xa6, 0x37, 0xbf, 0xf8, 0x4f, 0x15,
+ 0x4c, 0xac, 0x9e, 0xa4, 0x3b, 0x02, 0x07, 0x0c,
+ 0x80, 0x86, 0x0d, 0x6c, 0xe4, 0xaf, 0x1c, 0xbc,
+ 0x0b, 0x9c, 0x0a, 0x98, 0xc2, 0x99, 0x71, 0xcd,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0x59, 0xd2, 0x10, 0xd3, 0x75, 0xac, 0x48, 0x32,
+ 0xb1, 0xea, 0xee, 0xcf, 0x0a, 0xd2, 0x8b, 0x15,
+ 0x5d, 0x72, 0x71, 0x4c, 0xa7, 0x29, 0xb0, 0x7a,
+ 0x44, 0x48, 0x8a, 0x54, 0x54, 0x54, 0x41, 0xf5,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0xdc, 0xfc, 0x46, 0x81, 0xc6, 0x1b, 0x2b, 0x47,
+ 0x8b, 0xed, 0xe0, 0x73, 0x34, 0x38, 0x53, 0x92,
+ 0x97, 0x2f, 0xfb, 0x51, 0xab, 0x4f, 0x2d, 0x9d,
+ 0x69, 0x04, 0xa9, 0x5d, 0x33, 0xef, 0xcb, 0x1c,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0xd6, 0x2a, 0x7f, 0x96, 0x04, 0x4d, 0x16, 0xc8,
+ 0x49, 0xe0, 0x37, 0x33, 0xe3, 0x7b, 0x34, 0x56,
+ 0x99, 0xc5, 0x78, 0x57, 0x06, 0x02, 0xb4, 0xea,
+ 0x80, 0xc4, 0xf8, 0x8f, 0x8d, 0x2b, 0xe4, 0x05,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x8b, 0x58, 0x62, 0xb5, 0x85, 0xf6, 0x83, 0x36,
+ 0xf5, 0x34, 0xb8, 0xd4, 0xbc, 0x5c, 0x8b, 0x38,
+ 0xfd, 0x15, 0xcd, 0x44, 0x83, 0x25, 0x71, 0xe1,
+ 0xd5, 0xe8, 0xa1, 0xa4, 0x36, 0x98, 0x7e, 0x68,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x7e, 0xeb, 0x06, 0x87, 0xdf, 0x1a, 0xdc, 0xe5,
+ 0xfb, 0x64, 0xd4, 0xd1, 0x5d, 0x9e, 0x75, 0xc0,
+ 0xb9, 0xad, 0x55, 0x6c, 0xe6, 0xba, 0x4d, 0x98,
+ 0x2f, 0xbf, 0x72, 0xad, 0x61, 0x37, 0xf6, 0x11,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x72, 0xdb, 0x43, 0x16, 0x57, 0x8e, 0x3a, 0x96,
+ 0xf3, 0x98, 0x19, 0x24, 0x17, 0x3b, 0xe8, 0xad,
+ 0xa1, 0x9b, 0xa4, 0x1b, 0x74, 0x85, 0x2e, 0x24,
+ 0x70, 0xea, 0x31, 0x5a, 0x1c, 0xbe, 0x43, 0xb5,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x32, 0x48, 0xb0, 0xf0, 0x3f, 0xbb, 0xd2, 0xa3,
+ 0xfd, 0xf6, 0x28, 0x4a, 0x2a, 0xc5, 0xbe, 0x4b,
+ 0x73, 0x50, 0x63, 0xd6, 0x16, 0x00, 0xef, 0xed,
+ 0xfe, 0x97, 0x41, 0x29, 0xb2, 0x84, 0xc4, 0xa3,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x17, 0xda, 0x6b, 0x96, 0x6a, 0xa6, 0xa4, 0xa6,
+ 0xa6, 0xf3, 0x9d, 0x18, 0x19, 0x8d, 0x98, 0x7c,
+ 0x66, 0x38, 0xe8, 0x99, 0xe7, 0x0a, 0x50, 0x92,
+ 0xaf, 0x11, 0x80, 0x05, 0x66, 0xed, 0xab, 0x74,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x13, 0xd5, 0x8b, 0x22, 0xae, 0x90, 0x7b, 0x67,
+ 0x87, 0x4e, 0x3c, 0x35, 0x4e, 0x01, 0xf0, 0xb1,
+ 0xd3, 0xd1, 0x67, 0xbb, 0x43, 0xdb, 0x7c, 0x75,
+ 0xa4, 0xc7, 0x64, 0x83, 0x1e, 0x9b, 0x98, 0xad,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x6f, 0xe0, 0x5d, 0x9d, 0xd5, 0x78, 0x29, 0xfb,
+ 0xd0, 0x77, 0xd1, 0x8a, 0xf0, 0x80, 0xcb, 0x81,
+ 0x71, 0x9e, 0x4d, 0x49, 0xde, 0x74, 0x2a, 0x37,
+ 0xc0, 0xd5, 0xf0, 0xfa, 0x50, 0xe6, 0x23, 0xfe,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x89, 0xac, 0xf6, 0xe7, 0x5e, 0xba, 0x53, 0xf4,
+ 0x92, 0x32, 0xd5, 0x64, 0xfb, 0xc4, 0x08, 0xac,
+ 0x2c, 0x19, 0x6e, 0x63, 0x13, 0x75, 0xd0, 0x60,
+ 0x54, 0x35, 0x82, 0xc4, 0x6d, 0x03, 0x1a, 0x05,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0x1c, 0xaf, 0x94, 0x7d, 0x9c, 0xce, 0x57, 0x64,
+ 0xf8, 0xa8, 0x25, 0x45, 0x32, 0x86, 0x2b, 0x04,
+ 0xb3, 0x2e, 0x67, 0xca, 0x73, 0x04, 0x2f, 0xab,
+ 0xcc, 0xda, 0x9e, 0x42, 0xa1, 0xaf, 0x83, 0x5a,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0x21, 0xdf, 0xdc, 0x29, 0xd9, 0xfc, 0x7b, 0xe7,
+ 0x3a, 0xc4, 0xe1, 0x61, 0xc5, 0xb5, 0xe1, 0xee,
+ 0x7a, 0x9d, 0x0c, 0x66, 0x36, 0x63, 0xe4, 0x12,
+ 0x62, 0xe2, 0xf5, 0x68, 0x72, 0xfc, 0x1e, 0x18,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0x6e, 0xc7, 0x2e, 0xac, 0xd0, 0xbb, 0x22, 0xe0,
+ 0xc2, 0x40, 0xb2, 0xfe, 0x8c, 0xaf, 0x9e, 0xcf,
+ 0x32, 0x06, 0xc6, 0x45, 0x29, 0xbd, 0xe0, 0x7f,
+ 0x53, 0x32, 0xc3, 0x2b, 0x2f, 0x68, 0x12, 0xcd,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0x76, 0xba, 0x52, 0xb5, 0x09, 0xf5, 0x19, 0x09,
+ 0x70, 0x1c, 0x09, 0x28, 0xb4, 0xaa, 0x98, 0x6a,
+ 0x79, 0xe7, 0x5e, 0xcd, 0xe8, 0xa4, 0x73, 0x69,
+ 0x1f, 0xf8, 0x05, 0x0a, 0xb4, 0xfe, 0xf9, 0x63,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0xf7, 0xad, 0xf9, 0xc8, 0x0e, 0x04, 0x2f, 0xdf,
+ 0xbe, 0x39, 0x79, 0x07, 0x0d, 0xd8, 0x1b, 0x06,
+ 0x42, 0x3a, 0x43, 0x93, 0xf6, 0x7c, 0xc4, 0xe5,
+ 0xc2, 0xd5, 0xd0, 0xa6, 0x35, 0x6c, 0xbd, 0x17,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0x38, 0xd7, 0xab, 0x7e, 0x08, 0xdc, 0x1e, 0xab,
+ 0x55, 0xbb, 0x3b, 0x7b, 0x6a, 0x17, 0xcc, 0x79,
+ 0xa7, 0x02, 0x62, 0x66, 0x9b, 0xca, 0xee, 0xc0,
+ 0x3d, 0x75, 0x34, 0x2e, 0x55, 0x82, 0x26, 0x3c,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0xf7, 0xeb, 0x2f, 0x24, 0x98, 0x54, 0x04, 0x5a,
+ 0x19, 0xe4, 0x12, 0x9d, 0x97, 0xbc, 0x87, 0xa5,
+ 0x0b, 0x85, 0x29, 0xa1, 0x36, 0x89, 0xc9, 0xba,
+ 0xa0, 0xe0, 0xac, 0x99, 0x7d, 0xa4, 0x51, 0x9f,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x8f, 0xe8, 0xa7, 0x79, 0x02, 0xbb, 0x4a, 0x56,
+ 0x66, 0x91, 0xef, 0x22, 0xd1, 0x09, 0x26, 0x6c,
+ 0xa9, 0x13, 0xd7, 0x44, 0xc7, 0x19, 0x9c, 0x0b,
+ 0xfb, 0x4f, 0xca, 0x72, 0x8f, 0x34, 0xf7, 0x82,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0xaa, 0x21, 0xbb, 0x25, 0x4b, 0x66, 0x6e, 0x29,
+ 0x71, 0xc1, 0x44, 0x67, 0x19, 0xed, 0xe6, 0xe6,
+ 0x61, 0x13, 0xf4, 0xb7, 0x02, 0x94, 0x81, 0x0f,
+ 0xa7, 0x4d, 0xbb, 0x2c, 0xb8, 0xeb, 0x41, 0x0e,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[BLAKE2S_HASH_SIZE] = {
+ 0x84, 0x21, 0xbb, 0x73, 0x64, 0x47, 0x45, 0xe0,
+ 0xc1, 0x83, 0x78, 0xf1, 0xea, 0xe5, 0xfd, 0xdb,
+ 0x01, 0xda, 0xb7, 0x86, 0x70, 0x3b, 0x83, 0xb3,
+ 0xbc, 0xd9, 0xfd, 0x96, 0xbd, 0x50, 0x06, 0x67,
+};
+
+static const u8 blake2s_keyed_testvec_consolidated[BLAKE2S_HASH_SIZE] = {
+ 0xa6, 0xad, 0xcd, 0xb8, 0xd9, 0xdd, 0xc7, 0x70,
+ 0x07, 0x09, 0x7f, 0x9f, 0x41, 0xa9, 0x70, 0xa4,
+ 0x1c, 0xca, 0x61, 0xbb, 0x58, 0xb5, 0xb2, 0x1d,
+ 0xd1, 0x71, 0x16, 0xb0, 0x49, 0x4f, 0x9e, 0x1b,
+};
diff --git a/lib/crypto/tests/blake2s_kunit.c b/lib/crypto/tests/blake2s_kunit.c
new file mode 100644
index 000000000000..6832d9aa7b82
--- /dev/null
+++ b/lib/crypto/tests/blake2s_kunit.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/blake2s.h>
+#include "blake2s-testvecs.h"
+
+/*
+ * The following are compatibility functions that present BLAKE2s as an unkeyed
+ * hash function that produces hashes of fixed length BLAKE2S_HASH_SIZE, so that
+ * hash-test-template.h can be reused to test it.
+ */
+
+static void blake2s_default(const u8 *data, size_t len,
+ u8 out[BLAKE2S_HASH_SIZE])
+{
+ blake2s(NULL, 0, data, len, out, BLAKE2S_HASH_SIZE);
+}
+
+static void blake2s_init_default(struct blake2s_ctx *ctx)
+{
+ blake2s_init(ctx, BLAKE2S_HASH_SIZE);
+}
+
+/*
+ * Generate the HASH_KUNIT_CASES using hash-test-template.h. These test BLAKE2s
+ * with a key length of 0 and a hash length of BLAKE2S_HASH_SIZE.
+ */
+#define HASH blake2s_default
+#define HASH_CTX blake2s_ctx
+#define HASH_SIZE BLAKE2S_HASH_SIZE
+#define HASH_INIT blake2s_init_default
+#define HASH_UPDATE blake2s_update
+#define HASH_FINAL blake2s_final
+#include "hash-test-template.h"
+
+/*
+ * BLAKE2s specific test case which tests all possible combinations of key
+ * length and hash length.
+ */
+static void test_blake2s_all_key_and_hash_lens(struct kunit *test)
+{
+ const size_t data_len = 100;
+ u8 *data = &test_buf[0];
+ u8 *key = data + data_len;
+ u8 *hash = key + BLAKE2S_KEY_SIZE;
+ struct blake2s_ctx main_ctx;
+ u8 main_hash[BLAKE2S_HASH_SIZE];
+
+ rand_bytes_seeded_from_len(data, data_len);
+ blake2s_init(&main_ctx, BLAKE2S_HASH_SIZE);
+ for (int key_len = 0; key_len <= BLAKE2S_KEY_SIZE; key_len++) {
+ rand_bytes_seeded_from_len(key, key_len);
+ for (int out_len = 1; out_len <= BLAKE2S_HASH_SIZE; out_len++) {
+ blake2s(key, key_len, data, data_len, hash, out_len);
+ blake2s_update(&main_ctx, hash, out_len);
+ }
+ }
+ blake2s_final(&main_ctx, main_hash);
+ KUNIT_ASSERT_MEMEQ(test, main_hash, blake2s_keyed_testvec_consolidated,
+ BLAKE2S_HASH_SIZE);
+}
+
+/*
+ * BLAKE2s specific test case which tests using a guarded buffer for all allowed
+ * key lengths. Also tests both blake2s() and blake2s_init_key().
+ */
+static void test_blake2s_with_guarded_key_buf(struct kunit *test)
+{
+ const size_t data_len = 100;
+
+ rand_bytes(test_buf, data_len);
+ for (int key_len = 0; key_len <= BLAKE2S_KEY_SIZE; key_len++) {
+ u8 key[BLAKE2S_KEY_SIZE];
+ u8 *guarded_key = &test_buf[TEST_BUF_LEN - key_len];
+ u8 hash1[BLAKE2S_HASH_SIZE];
+ u8 hash2[BLAKE2S_HASH_SIZE];
+ struct blake2s_ctx ctx;
+
+ rand_bytes(key, key_len);
+ memcpy(guarded_key, key, key_len);
+
+ blake2s(key, key_len, test_buf, data_len,
+ hash1, BLAKE2S_HASH_SIZE);
+ blake2s(guarded_key, key_len, test_buf, data_len,
+ hash2, BLAKE2S_HASH_SIZE);
+ KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE);
+
+ blake2s_init_key(&ctx, BLAKE2S_HASH_SIZE, guarded_key, key_len);
+ blake2s_update(&ctx, test_buf, data_len);
+ blake2s_final(&ctx, hash2);
+ KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE);
+ }
+}
+
+/*
+ * BLAKE2s specific test case which tests using a guarded output buffer for all
+ * allowed output lengths.
+ */
+static void test_blake2s_with_guarded_out_buf(struct kunit *test)
+{
+ const size_t data_len = 100;
+
+ rand_bytes(test_buf, data_len);
+ for (int out_len = 1; out_len <= BLAKE2S_HASH_SIZE; out_len++) {
+ u8 hash[BLAKE2S_HASH_SIZE];
+ u8 *guarded_hash = &test_buf[TEST_BUF_LEN - out_len];
+
+ blake2s(NULL, 0, test_buf, data_len, hash, out_len);
+ blake2s(NULL, 0, test_buf, data_len, guarded_hash, out_len);
+ KUNIT_ASSERT_MEMEQ(test, hash, guarded_hash, out_len);
+ }
+}
+
+static struct kunit_case blake2s_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(test_blake2s_all_key_and_hash_lens),
+ KUNIT_CASE(test_blake2s_with_guarded_key_buf),
+ KUNIT_CASE(test_blake2s_with_guarded_out_buf),
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite blake2s_test_suite = {
+ .name = "blake2s",
+ .test_cases = blake2s_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(blake2s_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for BLAKE2s");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/curve25519-selftest.c b/lib/crypto/tests/curve25519_kunit.c
index c85e85381e78..248d05f66b35 100644
--- a/lib/crypto/curve25519-selftest.c
+++ b/lib/crypto/tests/curve25519_kunit.c
@@ -4,6 +4,8 @@
*/
#include <crypto/curve25519.h>
+#include <kunit/test.h>
+#include <linux/timekeeping.h>
struct curve25519_test_vector {
u8 private[CURVE25519_KEY_SIZE];
@@ -11,7 +13,7 @@ struct curve25519_test_vector {
u8 result[CURVE25519_KEY_SIZE];
bool valid;
};
-static const struct curve25519_test_vector curve25519_test_vectors[] __initconst = {
+static const struct curve25519_test_vector curve25519_test_vectors[] = {
{
.private = { 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d,
0x3c, 0x16, 0xc1, 0x72, 0x51, 0xb2, 0x66, 0x45,
@@ -1280,42 +1282,82 @@ static const struct curve25519_test_vector curve25519_test_vectors[] __initconst
}
};
-bool __init curve25519_selftest(void)
+static void test_curve25519(struct kunit *test)
{
- bool success = true, ret, ret2;
- size_t i = 0, j;
- u8 in[CURVE25519_KEY_SIZE];
- u8 out[CURVE25519_KEY_SIZE], out2[CURVE25519_KEY_SIZE],
- out3[CURVE25519_KEY_SIZE];
+ for (size_t i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) {
+ const struct curve25519_test_vector *vec =
+ &curve25519_test_vectors[i];
+ u8 out[CURVE25519_KEY_SIZE] = {};
+ bool ret;
- for (i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) {
- memset(out, 0, CURVE25519_KEY_SIZE);
- ret = curve25519(out, curve25519_test_vectors[i].private,
- curve25519_test_vectors[i].public);
- if (ret != curve25519_test_vectors[i].valid ||
- memcmp(out, curve25519_test_vectors[i].result,
- CURVE25519_KEY_SIZE)) {
- pr_err("curve25519 self-test %zu: FAIL\n", i + 1);
- success = false;
- }
+ ret = curve25519(out, vec->private, vec->public);
+ KUNIT_EXPECT_EQ_MSG(test, ret, vec->valid,
+ "Wrong return value with test vector %zu",
+ i);
+ KUNIT_EXPECT_MEMEQ_MSG(test, out, vec->result, sizeof(out),
+ "Wrong output with test vector %zu", i);
}
+}
+
+static void test_curve25519_basepoint(struct kunit *test)
+{
+ for (size_t i = 0; i < 5; ++i) {
+ u8 in[CURVE25519_KEY_SIZE];
+ u8 out[CURVE25519_KEY_SIZE];
+ u8 out2[CURVE25519_KEY_SIZE];
+ bool ret, ret2;
- for (i = 0; i < 5; ++i) {
get_random_bytes(in, sizeof(in));
ret = curve25519_generate_public(out, in);
ret2 = curve25519(out2, in, (u8[CURVE25519_KEY_SIZE]){ 9 });
- curve25519_generic(out3, in, (u8[CURVE25519_KEY_SIZE]){ 9 });
- if (ret != ret2 ||
- memcmp(out, out2, CURVE25519_KEY_SIZE) ||
- memcmp(out, out3, CURVE25519_KEY_SIZE)) {
- pr_err("curve25519 basepoint self-test %zu: FAIL: input - 0x",
- i + 1);
- for (j = CURVE25519_KEY_SIZE; j-- > 0;)
- printk(KERN_CONT "%02x", in[j]);
- printk(KERN_CONT "\n");
- success = false;
- }
+ KUNIT_EXPECT_EQ_MSG(test, ret, ret2,
+ "in=%*phN", CURVE25519_KEY_SIZE, in);
+ KUNIT_EXPECT_MEMEQ_MSG(test, out, out2, CURVE25519_KEY_SIZE,
+ "in=%*phN", CURVE25519_KEY_SIZE, in);
}
+}
+
+static void benchmark_curve25519(struct kunit *test)
+{
+ const u8 *private = curve25519_test_vectors[0].private;
+ const u8 *public = curve25519_test_vectors[0].public;
+ const size_t warmup_niter = 5000;
+ const size_t benchmark_niter = 1024;
+ u8 out[CURVE25519_KEY_SIZE];
+ bool ok = true;
+ u64 t;
+
+ if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK))
+ kunit_skip(test, "not enabled");
- return success;
+ /* Warm-up */
+ for (size_t i = 0; i < warmup_niter; i++)
+ ok &= curve25519(out, private, public);
+
+ /* Benchmark */
+ preempt_disable();
+ t = ktime_get_ns();
+ for (size_t i = 0; i < benchmark_niter; i++)
+ ok &= curve25519(out, private, public);
+ t = ktime_get_ns() - t;
+ preempt_enable();
+ KUNIT_EXPECT_TRUE(test, ok);
+ kunit_info(test, "%llu ops/s",
+ div64_u64((u64)benchmark_niter * NSEC_PER_SEC, t ?: 1));
}
+
+static struct kunit_case curve25519_test_cases[] = {
+ KUNIT_CASE(test_curve25519),
+ KUNIT_CASE(test_curve25519_basepoint),
+ KUNIT_CASE(benchmark_curve25519),
+ {},
+};
+
+static struct kunit_suite curve25519_test_suite = {
+ .name = "curve25519",
+ .test_cases = curve25519_test_cases,
+};
+kunit_test_suite(curve25519_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for Curve25519");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/hash-test-template.h b/lib/crypto/tests/hash-test-template.h
new file mode 100644
index 000000000000..61b43e62779f
--- /dev/null
+++ b/lib/crypto/tests/hash-test-template.h
@@ -0,0 +1,568 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Test cases for hash functions, including a benchmark. This is included by
+ * KUnit test suites that want to use it. See sha512_kunit.c for an example.
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <kunit/run-in-irq-context.h>
+#include <kunit/test.h>
+#include <linux/vmalloc.h>
+
+/* test_buf is a guarded buffer, i.e. &test_buf[TEST_BUF_LEN] is not mapped. */
+#define TEST_BUF_LEN 16384
+static u8 *test_buf;
+
+static u8 *orig_test_buf;
+
+static u64 random_seed;
+
+/*
+ * This is a simple linear congruential generator. It is used only for testing,
+ * which does not require cryptographically secure random numbers. A hard-coded
+ * algorithm is used instead of <linux/prandom.h> so that it matches the
+ * algorithm used by the test vector generation script. This allows the input
+ * data in random test vectors to be concisely stored as just the seed.
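+ *
+ * (The constants used here, multiplier 25214903917 = 0x5deece66d, increment
+ * 11, modulus 2**48, and output = state bits 47..16, are the well-known
+ * drand48/java.util.Random parameters; for example, starting from a seed of
+ * 1, the first 32-bit output is 0x5deec = 384748.)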
+ */
+static u32 rand32(void)
+{
+ random_seed = (random_seed * 25214903917 + 11) & ((1ULL << 48) - 1);
+ return random_seed >> 16;
+}
+
+static void rand_bytes(u8 *out, size_t len)
+{
+ for (size_t i = 0; i < len; i++)
+ out[i] = rand32();
+}
+
+static void rand_bytes_seeded_from_len(u8 *out, size_t len)
+{
+ random_seed = len;
+ rand_bytes(out, len);
+}
+
+static bool rand_bool(void)
+{
+ return rand32() % 2;
+}
+
+/* Generate a random length, preferring small lengths. */
+static size_t rand_length(size_t max_len)
+{
+ size_t len;
+
+ switch (rand32() % 3) {
+ case 0:
+ len = rand32() % 128;
+ break;
+ case 1:
+ len = rand32() % 3072;
+ break;
+ default:
+ len = rand32();
+ break;
+ }
+ return len % (max_len + 1);
+}
+
+static size_t rand_offset(size_t max_offset)
+{
+ return min(rand32() % 128, max_offset);
+}
+
+static int hash_suite_init(struct kunit_suite *suite)
+{
+ /*
+ * Allocate the test buffer using vmalloc() with a page-aligned length
+ * so that it is immediately followed by a guard page. This allows
+ * buffer overreads to be detected, even in assembly code.
+ */
+ size_t alloc_len = round_up(TEST_BUF_LEN, PAGE_SIZE);
+
+ orig_test_buf = vmalloc(alloc_len);
+ if (!orig_test_buf)
+ return -ENOMEM;
+
+ test_buf = orig_test_buf + alloc_len - TEST_BUF_LEN;
+ return 0;
+}
+
+static void hash_suite_exit(struct kunit_suite *suite)
+{
+ vfree(orig_test_buf);
+ orig_test_buf = NULL;
+ test_buf = NULL;
+}
+
+/*
+ * Test the hash function against a list of test vectors.
+ *
+ * Note that it's only necessary to run each test vector in one way (e.g.,
+ * one-shot instead of incremental), since consistency between different ways of
+ * using the APIs is verified by other test cases.
+ */
+static void test_hash_test_vectors(struct kunit *test)
+{
+ for (size_t i = 0; i < ARRAY_SIZE(hash_testvecs); i++) {
+ size_t data_len = hash_testvecs[i].data_len;
+ u8 actual_hash[HASH_SIZE];
+
+ KUNIT_ASSERT_LE(test, data_len, TEST_BUF_LEN);
+ rand_bytes_seeded_from_len(test_buf, data_len);
+
+ HASH(test_buf, data_len, actual_hash);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, actual_hash, hash_testvecs[i].digest, HASH_SIZE,
+ "Wrong result with test vector %zu; data_len=%zu", i,
+ data_len);
+ }
+}
+
+/*
+ * Test that the hash function produces correct results for *every* length up to
+ * 4096 bytes. To do this, generate seeded random data, then calculate a hash
+ * value for each length 0..4096, then hash the hash values. Verify just the
+ * final hash value, which should match only when all hash values were correct.
+ */
+static void test_hash_all_lens_up_to_4096(struct kunit *test)
+{
+ struct HASH_CTX ctx;
+ u8 hash[HASH_SIZE];
+
+ static_assert(TEST_BUF_LEN >= 4096);
+ rand_bytes_seeded_from_len(test_buf, 4096);
+ HASH_INIT(&ctx);
+ for (size_t len = 0; len <= 4096; len++) {
+ HASH(test_buf, len, hash);
+ HASH_UPDATE(&ctx, hash, HASH_SIZE);
+ }
+ HASH_FINAL(&ctx, hash);
+ KUNIT_ASSERT_MEMEQ(test, hash, hash_testvec_consolidated, HASH_SIZE);
+}
+
+/*
+ * Test that the hash function produces the same result with a one-shot
+ * computation as it does with an incremental computation.
+ */
+static void test_hash_incremental_updates(struct kunit *test)
+{
+ for (int i = 0; i < 1000; i++) {
+ size_t total_len, offset;
+ struct HASH_CTX ctx;
+ u8 hash1[HASH_SIZE];
+ u8 hash2[HASH_SIZE];
+ size_t num_parts = 0;
+ size_t remaining_len, cur_offset;
+
+ total_len = rand_length(TEST_BUF_LEN);
+ offset = rand_offset(TEST_BUF_LEN - total_len);
+ rand_bytes(&test_buf[offset], total_len);
+
+ /* Compute the hash value in one shot. */
+ HASH(&test_buf[offset], total_len, hash1);
+
+ /*
+ * Compute the hash value incrementally, using a randomly
+ * selected sequence of update lengths that sum to total_len.
+ */
+ HASH_INIT(&ctx);
+ remaining_len = total_len;
+ cur_offset = offset;
+ while (rand_bool()) {
+ size_t part_len = rand_length(remaining_len);
+
+ HASH_UPDATE(&ctx, &test_buf[cur_offset], part_len);
+ num_parts++;
+ cur_offset += part_len;
+ remaining_len -= part_len;
+ }
+ if (remaining_len != 0 || rand_bool()) {
+ HASH_UPDATE(&ctx, &test_buf[cur_offset], remaining_len);
+ num_parts++;
+ }
+ HASH_FINAL(&ctx, hash2);
+
+ /* Verify that the two hash values are the same. */
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, hash1, hash2, HASH_SIZE,
+ "Incremental test failed with total_len=%zu num_parts=%zu offset=%zu",
+ total_len, num_parts, offset);
+ }
+}
+
+/*
+ * Test that the hash function does not overrun any buffers. Uses a guard page
+ * to catch buffer overruns even if they occur in assembly code.
+ */
+static void test_hash_buffer_overruns(struct kunit *test)
+{
+ const size_t max_tested_len = TEST_BUF_LEN - sizeof(struct HASH_CTX);
+ void *const buf_end = &test_buf[TEST_BUF_LEN];
+ struct HASH_CTX *guarded_ctx = buf_end - sizeof(*guarded_ctx);
+
+ rand_bytes(test_buf, TEST_BUF_LEN);
+
+ for (int i = 0; i < 100; i++) {
+ size_t len = rand_length(max_tested_len);
+ struct HASH_CTX ctx;
+ u8 hash[HASH_SIZE];
+
+ /* Check for overruns of the data buffer. */
+ HASH(buf_end - len, len, hash);
+ HASH_INIT(&ctx);
+ HASH_UPDATE(&ctx, buf_end - len, len);
+ HASH_FINAL(&ctx, hash);
+
+ /* Check for overruns of the hash value buffer. */
+ HASH(test_buf, len, buf_end - HASH_SIZE);
+ HASH_INIT(&ctx);
+ HASH_UPDATE(&ctx, test_buf, len);
+ HASH_FINAL(&ctx, buf_end - HASH_SIZE);
+
+		/* Check for overruns of the hash context. */
+ HASH_INIT(guarded_ctx);
+ HASH_UPDATE(guarded_ctx, test_buf, len);
+ HASH_FINAL(guarded_ctx, hash);
+ }
+}
+
+/*
+ * Test that the caller is permitted to alias the output digest and source data
+ * buffer, and also modify the source data buffer after it has been used.
+ */
+static void test_hash_overlaps(struct kunit *test)
+{
+ const size_t max_tested_len = TEST_BUF_LEN - HASH_SIZE;
+ struct HASH_CTX ctx;
+ u8 hash[HASH_SIZE];
+
+ rand_bytes(test_buf, TEST_BUF_LEN);
+
+ for (int i = 0; i < 100; i++) {
+ size_t len = rand_length(max_tested_len);
+ size_t offset = HASH_SIZE + rand_offset(max_tested_len - len);
+ bool left_end = rand_bool();
+ u8 *ovl_hash = left_end ? &test_buf[offset] :
+ &test_buf[offset + len - HASH_SIZE];
+
+ HASH(&test_buf[offset], len, hash);
+ HASH(&test_buf[offset], len, ovl_hash);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, hash, ovl_hash, HASH_SIZE,
+ "Overlap test 1 failed with len=%zu offset=%zu left_end=%d",
+ len, offset, left_end);
+
+ /* Repeat the above test, but this time use init+update+final */
+ HASH(&test_buf[offset], len, hash);
+ HASH_INIT(&ctx);
+ HASH_UPDATE(&ctx, &test_buf[offset], len);
+ HASH_FINAL(&ctx, ovl_hash);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, hash, ovl_hash, HASH_SIZE,
+ "Overlap test 2 failed with len=%zu offset=%zu left_end=%d",
+ len, offset, left_end);
+
+ /* Test modifying the source data after it was used. */
+ HASH(&test_buf[offset], len, hash);
+ HASH_INIT(&ctx);
+ HASH_UPDATE(&ctx, &test_buf[offset], len);
+ rand_bytes(&test_buf[offset], len);
+ HASH_FINAL(&ctx, ovl_hash);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, hash, ovl_hash, HASH_SIZE,
+ "Overlap test 3 failed with len=%zu offset=%zu left_end=%d",
+ len, offset, left_end);
+ }
+}
+
+/*
+ * Test that if the same data is hashed at different alignments in memory, the
+ * results are the same.
+ */
+static void test_hash_alignment_consistency(struct kunit *test)
+{
+ u8 hash1[128 + HASH_SIZE];
+ u8 hash2[128 + HASH_SIZE];
+
+ for (int i = 0; i < 100; i++) {
+ size_t len = rand_length(TEST_BUF_LEN);
+ size_t data_offs1 = rand_offset(TEST_BUF_LEN - len);
+ size_t data_offs2 = rand_offset(TEST_BUF_LEN - len);
+ size_t hash_offs1 = rand_offset(128);
+ size_t hash_offs2 = rand_offset(128);
+
+ rand_bytes(&test_buf[data_offs1], len);
+ HASH(&test_buf[data_offs1], len, &hash1[hash_offs1]);
+ memmove(&test_buf[data_offs2], &test_buf[data_offs1], len);
+ HASH(&test_buf[data_offs2], len, &hash2[hash_offs2]);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, &hash1[hash_offs1], &hash2[hash_offs2], HASH_SIZE,
+ "Alignment consistency test failed with len=%zu data_offs=(%zu,%zu) hash_offs=(%zu,%zu)",
+ len, data_offs1, data_offs2, hash_offs1, hash_offs2);
+ }
+}
+
+/* Test that HASH_FINAL zeroizes the context. */
+static void test_hash_ctx_zeroization(struct kunit *test)
+{
+ static const u8 zeroes[sizeof(struct HASH_CTX)];
+ struct HASH_CTX ctx;
+
+ rand_bytes(test_buf, 128);
+ HASH_INIT(&ctx);
+ HASH_UPDATE(&ctx, test_buf, 128);
+ HASH_FINAL(&ctx, test_buf);
+ KUNIT_ASSERT_MEMEQ_MSG(test, &ctx, zeroes, sizeof(ctx),
+ "Hash context was not zeroized by finalization");
+}
+
+#define IRQ_TEST_DATA_LEN 256
+#define IRQ_TEST_NUM_BUFFERS 3 /* matches max concurrency level */
+
+struct hash_irq_test1_state {
+ u8 expected_hashes[IRQ_TEST_NUM_BUFFERS][HASH_SIZE];
+ atomic_t seqno;
+};
+
+/*
+ * Compute the hash of one of the test messages and verify that it matches the
+ * expected hash from @state->expected_hashes. To increase the chance of
+ * detecting problems, cycle through multiple messages.
+ */
+static bool hash_irq_test1_func(void *state_)
+{
+ struct hash_irq_test1_state *state = state_;
+ u32 i = (u32)atomic_inc_return(&state->seqno) % IRQ_TEST_NUM_BUFFERS;
+ u8 actual_hash[HASH_SIZE];
+
+ HASH(&test_buf[i * IRQ_TEST_DATA_LEN], IRQ_TEST_DATA_LEN, actual_hash);
+ return memcmp(actual_hash, state->expected_hashes[i], HASH_SIZE) == 0;
+}
+
+/*
+ * Test that if hashes are computed in task, softirq, and hardirq context
+ * concurrently, then all results are as expected.
+ */
+static void test_hash_interrupt_context_1(struct kunit *test)
+{
+ struct hash_irq_test1_state state = {};
+
+ /* Prepare some test messages and compute the expected hash of each. */
+ rand_bytes(test_buf, IRQ_TEST_NUM_BUFFERS * IRQ_TEST_DATA_LEN);
+ for (int i = 0; i < IRQ_TEST_NUM_BUFFERS; i++)
+ HASH(&test_buf[i * IRQ_TEST_DATA_LEN], IRQ_TEST_DATA_LEN,
+ state.expected_hashes[i]);
+
+ kunit_run_irq_test(test, hash_irq_test1_func, 100000, &state);
+}
+
+struct hash_irq_test2_hash_ctx {
+ struct HASH_CTX hash_ctx;
+ atomic_t in_use;
+ int offset;
+ int step;
+};
+
+struct hash_irq_test2_state {
+ struct hash_irq_test2_hash_ctx ctxs[IRQ_TEST_NUM_BUFFERS];
+ u8 expected_hash[HASH_SIZE];
+ u16 update_lens[32];
+ int num_steps;
+};
+
+static bool hash_irq_test2_func(void *state_)
+{
+ struct hash_irq_test2_state *state = state_;
+ struct hash_irq_test2_hash_ctx *ctx;
+ bool ret = true;
+
+ for (ctx = &state->ctxs[0]; ctx < &state->ctxs[ARRAY_SIZE(state->ctxs)];
+ ctx++) {
+ if (atomic_cmpxchg(&ctx->in_use, 0, 1) == 0)
+ break;
+ }
+ if (WARN_ON_ONCE(ctx == &state->ctxs[ARRAY_SIZE(state->ctxs)])) {
+ /*
+ * This should never happen, as the number of contexts is equal
+ * to the maximum concurrency level of kunit_run_irq_test().
+ */
+ return false;
+ }
+
+ if (ctx->step == 0) {
+ /* Init step */
+ HASH_INIT(&ctx->hash_ctx);
+ ctx->offset = 0;
+ ctx->step++;
+ } else if (ctx->step < state->num_steps - 1) {
+ /* Update step */
+ HASH_UPDATE(&ctx->hash_ctx, &test_buf[ctx->offset],
+ state->update_lens[ctx->step - 1]);
+ ctx->offset += state->update_lens[ctx->step - 1];
+ ctx->step++;
+ } else {
+ /* Final step */
+ u8 actual_hash[HASH_SIZE];
+
+ if (WARN_ON_ONCE(ctx->offset != TEST_BUF_LEN))
+ ret = false;
+ HASH_FINAL(&ctx->hash_ctx, actual_hash);
+ if (memcmp(actual_hash, state->expected_hash, HASH_SIZE) != 0)
+ ret = false;
+ ctx->step = 0;
+ }
+ atomic_set_release(&ctx->in_use, 0);
+ return ret;
+}
+
+/*
+ * Test that if hashes are computed in task, softirq, and hardirq context
+ * concurrently, *including doing different parts of the same incremental
+ * computation in different contexts*, then all results are as expected.
+ * Besides detecting bugs similar to those that test_hash_interrupt_context_1
+ * can detect, this test case can also detect bugs where hash function
+ * implementations don't correctly handle these mixed incremental computations.
+ */
+static void test_hash_interrupt_context_2(struct kunit *test)
+{
+ struct hash_irq_test2_state *state;
+ int remaining = TEST_BUF_LEN;
+
+ state = kunit_kzalloc(test, sizeof(*state), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, state);
+
+ rand_bytes(test_buf, TEST_BUF_LEN);
+ HASH(test_buf, TEST_BUF_LEN, state->expected_hash);
+
+ /*
+ * Generate a list of update lengths to use. Ensure that it contains
+ * multiple entries but is limited to a maximum length.
+ */
+ static_assert(TEST_BUF_LEN / 4096 > 1);
+ for (state->num_steps = 0;
+ state->num_steps < ARRAY_SIZE(state->update_lens) - 1 && remaining;
+ state->num_steps++) {
+ state->update_lens[state->num_steps] =
+ rand_length(min(remaining, 4096));
+ remaining -= state->update_lens[state->num_steps];
+ }
+ if (remaining)
+ state->update_lens[state->num_steps++] = remaining;
+ state->num_steps += 2; /* for init and final */
+
+ kunit_run_irq_test(test, hash_irq_test2_func, 250000, state);
+}
+
+#define UNKEYED_HASH_KUNIT_CASES \
+ KUNIT_CASE(test_hash_test_vectors), \
+ KUNIT_CASE(test_hash_all_lens_up_to_4096), \
+ KUNIT_CASE(test_hash_incremental_updates), \
+ KUNIT_CASE(test_hash_buffer_overruns), \
+ KUNIT_CASE(test_hash_overlaps), \
+ KUNIT_CASE(test_hash_alignment_consistency), \
+ KUNIT_CASE(test_hash_ctx_zeroization), \
+ KUNIT_CASE(test_hash_interrupt_context_1), \
+ KUNIT_CASE(test_hash_interrupt_context_2)
+/* benchmark_hash is omitted so that the suites can put it last. */
+
+#ifdef HMAC
+/*
+ * Test the corresponding HMAC variant.
+ *
+ * This test case is fairly short, since HMAC is just a simple C wrapper around
+ * the underlying unkeyed hash function, which is already well-tested by the
+ * other test cases. It's not useful to test things like data alignment or
+ * interrupt context again for HMAC, nor to have a long list of test vectors.
+ *
+ * Thus, just do a single consolidated test, which covers all data lengths up to
+ * 4096 bytes and all key lengths up to 292 bytes. For each data length, select
+ * a key length, generate the inputs from a seed, and compute the HMAC value.
+ * Concatenate all these HMAC values together, and compute the HMAC of that.
+ * Verify that value. If this fails, then the HMAC implementation is wrong.
+ * This won't show which specific input failed, but that should be fine. Any
+ * failure would likely be non-input-specific or also show in the unkeyed tests.
+ */
+static void test_hmac(struct kunit *test)
+{
+ static const u8 zeroes[sizeof(struct HMAC_CTX)];
+ u8 *raw_key;
+ struct HMAC_KEY key;
+ struct HMAC_CTX ctx;
+ u8 mac[HASH_SIZE];
+ u8 mac2[HASH_SIZE];
+
+ static_assert(TEST_BUF_LEN >= 4096 + 293);
+ rand_bytes_seeded_from_len(test_buf, 4096);
+ raw_key = &test_buf[4096];
+
+ rand_bytes_seeded_from_len(raw_key, 32);
+ HMAC_PREPAREKEY(&key, raw_key, 32);
+ HMAC_INIT(&ctx, &key);
+ for (size_t data_len = 0; data_len <= 4096; data_len++) {
+ /*
+		 * Cycle through key lengths as well. Somewhat arbitrarily cap
+		 * them at 292 by reducing modulo 293, which is larger than the
+		 * largest hash block size (the size at which the key starts
+		 * being hashed down to one block); going higher would not be
+		 * useful. 293 is prime to reduce correlation with data_len.
+ */
+ size_t key_len = data_len % 293;
+
+ HMAC_UPDATE(&ctx, test_buf, data_len);
+
+ rand_bytes_seeded_from_len(raw_key, key_len);
+ HMAC_USINGRAWKEY(raw_key, key_len, test_buf, data_len, mac);
+ HMAC_UPDATE(&ctx, mac, HASH_SIZE);
+
+ /* Verify that HMAC() is consistent with HMAC_USINGRAWKEY(). */
+ HMAC_PREPAREKEY(&key, raw_key, key_len);
+ HMAC(&key, test_buf, data_len, mac2);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, mac, mac2, HASH_SIZE,
+ "HMAC gave different results with raw and prepared keys");
+ }
+ HMAC_FINAL(&ctx, mac);
+ KUNIT_EXPECT_MEMEQ_MSG(test, mac, hmac_testvec_consolidated, HASH_SIZE,
+ "HMAC gave wrong result");
+ KUNIT_EXPECT_MEMEQ_MSG(test, &ctx, zeroes, sizeof(ctx),
+ "HMAC context was not zeroized by finalization");
+}
+#define HASH_KUNIT_CASES UNKEYED_HASH_KUNIT_CASES, KUNIT_CASE(test_hmac)
+#else
+#define HASH_KUNIT_CASES UNKEYED_HASH_KUNIT_CASES
+#endif
+
+/* Benchmark the hash function on various data lengths. */
+static void benchmark_hash(struct kunit *test)
+{
+ static const size_t lens_to_test[] = {
+ 1, 16, 64, 127, 128, 200, 256,
+ 511, 512, 1024, 3173, 4096, 16384,
+ };
+ u8 hash[HASH_SIZE];
+
+ if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK))
+ kunit_skip(test, "not enabled");
+
+ /* Warm-up */
+ for (size_t i = 0; i < 10000000; i += TEST_BUF_LEN)
+ HASH(test_buf, TEST_BUF_LEN, hash);
+
+ for (size_t i = 0; i < ARRAY_SIZE(lens_to_test); i++) {
+ size_t len = lens_to_test[i];
+ /* The '+ 128' tries to account for per-message overhead. */
+ size_t num_iters = 10000000 / (len + 128);
+ u64 t;
+
+ KUNIT_ASSERT_LE(test, len, TEST_BUF_LEN);
+ preempt_disable();
+ t = ktime_get_ns();
+ for (size_t j = 0; j < num_iters; j++)
+ HASH(test_buf, len, hash);
+ t = ktime_get_ns() - t;
+ preempt_enable();
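+		/* bytes * 1000 / ns == MB/s, where MB is 10**6 bytes */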
+ kunit_info(test, "len=%zu: %llu MB/s", len,
+ div64_u64((u64)len * num_iters * 1000, t ?: 1));
+ }
+}
diff --git a/lib/crypto/tests/md5-testvecs.h b/lib/crypto/tests/md5-testvecs.h
new file mode 100644
index 000000000000..be6727feb296
--- /dev/null
+++ b/lib/crypto/tests/md5-testvecs.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py md5 */
+
+static const struct {
+ size_t data_len;
+ u8 digest[MD5_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0xd4, 0x1d, 0x8c, 0xd9, 0x8f, 0x00, 0xb2, 0x04,
+ 0xe9, 0x80, 0x09, 0x98, 0xec, 0xf8, 0x42, 0x7e,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x16, 0x7b, 0x86, 0xf2, 0x1d, 0xf3, 0x76, 0xc9,
+ 0x6f, 0x10, 0xa0, 0x61, 0x5b, 0x14, 0x20, 0x0b,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0x2d, 0x30, 0x96, 0xc7, 0x43, 0x40, 0xed, 0xb2,
+ 0xfb, 0x84, 0x63, 0x9a, 0xec, 0xc7, 0x3c, 0x3c,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0xe5, 0x0f, 0xce, 0xe0, 0xc8, 0xff, 0x4e, 0x08,
+ 0x5e, 0x19, 0xe5, 0xf2, 0x08, 0x11, 0x19, 0x16,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0xe8, 0xca, 0x29, 0x05, 0x2f, 0xd1, 0xf3, 0x99,
+ 0x40, 0x71, 0xf5, 0xc2, 0xf7, 0xf8, 0x17, 0x3e,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0xe3, 0x20, 0xc1, 0xd8, 0x21, 0x14, 0x44, 0x59,
+ 0x1a, 0xf5, 0x91, 0xaf, 0x69, 0xbe, 0x93, 0x9d,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0xfb, 0x06, 0xb0, 0xf0, 0x00, 0x10, 0x4b, 0x68,
+ 0x3d, 0x75, 0xf9, 0x70, 0xde, 0xbb, 0x32, 0x16,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x52, 0x86, 0x48, 0x8b, 0xae, 0x91, 0x7c, 0x4e,
+ 0xc2, 0x2a, 0x69, 0x07, 0x35, 0xcc, 0xb2, 0x88,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0xfa, 0xd3, 0xf6, 0xe6, 0x7b, 0x1a, 0xc6, 0x05,
+ 0x73, 0x35, 0x02, 0xab, 0xc7, 0xb3, 0x47, 0xcb,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0xc5, 0x59, 0x29, 0xe9, 0x0a, 0x4a, 0x86, 0x43,
+ 0x7c, 0xaf, 0xdf, 0x83, 0xd3, 0xb8, 0x33, 0x5f,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x80, 0x05, 0x75, 0x39, 0xec, 0x44, 0x8a, 0x81,
+ 0xe7, 0x6e, 0x8d, 0xd1, 0xc6, 0xeb, 0xc2, 0xf0,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x3f, 0x02, 0xe8, 0xc6, 0xb8, 0x6a, 0x39, 0xc3,
+ 0xa4, 0x1c, 0xd9, 0x8f, 0x4a, 0x71, 0x40, 0x30,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x89, 0x4f, 0x79, 0x3e, 0xff, 0x0c, 0x22, 0x60,
+ 0xa2, 0xdc, 0x10, 0x5f, 0x23, 0x0a, 0xe7, 0xc6,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x06, 0x56, 0x61, 0xb8, 0x8a, 0x82, 0x77, 0x1b,
+ 0x2c, 0x35, 0xb8, 0x9f, 0xd6, 0xf7, 0xbd, 0x5a,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x5d, 0xdf, 0x7d, 0xc8, 0x43, 0x96, 0x3b, 0xdb,
+ 0xc7, 0x0e, 0x44, 0x42, 0x23, 0xf7, 0xed, 0xdf,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0xf6, 0x5f, 0x26, 0x51, 0x8a, 0x5a, 0x46, 0x8f,
+ 0x48, 0x72, 0x90, 0x74, 0x9d, 0x87, 0xbd, 0xdf,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0xd8, 0x2c, 0xc9, 0x76, 0xfa, 0x67, 0x2e, 0xa6,
+ 0xc8, 0x12, 0x4a, 0x64, 0xaa, 0x0b, 0x3d, 0xbd,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0xe2, 0x7e, 0xb4, 0x5f, 0xe1, 0x74, 0x51, 0xfc,
+ 0xe0, 0xc8, 0xd5, 0xe6, 0x8b, 0x40, 0xd2, 0x0e,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0xcd, 0x7d, 0x56, 0xa9, 0x4c, 0x47, 0xea, 0xc2,
+ 0x34, 0x0b, 0x84, 0x05, 0xf9, 0xad, 0xbb, 0x46,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0x63, 0x6e, 0x58, 0xb3, 0x94, 0x6b, 0x83, 0x5f,
+ 0x1f, 0x0e, 0xd3, 0x66, 0x78, 0x71, 0x98, 0x42,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0x9d, 0x68, 0xfc, 0x26, 0x8b, 0x4c, 0xa8, 0xe7,
+ 0x30, 0x0b, 0x19, 0x52, 0x6e, 0xa5, 0x65, 0x1c,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0x1c, 0xaa, 0x7d, 0xee, 0x91, 0x01, 0xe2, 0x5a,
+ 0xec, 0xe9, 0xde, 0x57, 0x0a, 0xb6, 0x4c, 0x2f,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x1b, 0x31, 0xe3, 0x14, 0x07, 0x16, 0x17, 0xc6,
+ 0x98, 0x79, 0x88, 0x23, 0xb6, 0x3b, 0x25, 0xc4,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0xc6, 0x3d, 0x56, 0x90, 0xf0, 0xf6, 0xe6, 0x50,
+ 0xf4, 0x76, 0x78, 0x67, 0xa3, 0xdd, 0x62, 0x7b,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[MD5_DIGEST_SIZE] = {
+ 0x70, 0x86, 0x9e, 0x6c, 0xa4, 0xc6, 0x71, 0x43,
+ 0x26, 0x02, 0x1b, 0x3f, 0xfd, 0x56, 0x9f, 0xa6,
+};
+
+static const u8 hmac_testvec_consolidated[MD5_DIGEST_SIZE] = {
+ 0x10, 0x02, 0x74, 0xf6, 0x4d, 0xb3, 0x3c, 0xc7,
+ 0xa1, 0xf7, 0xe6, 0xd4, 0x32, 0x64, 0xfa, 0x6d,
+};
diff --git a/lib/crypto/tests/md5_kunit.c b/lib/crypto/tests/md5_kunit.c
new file mode 100644
index 000000000000..38bd52c25ae3
--- /dev/null
+++ b/lib/crypto/tests/md5_kunit.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/md5.h>
+#include "md5-testvecs.h"
+
+#define HASH md5
+#define HASH_CTX md5_ctx
+#define HASH_SIZE MD5_DIGEST_SIZE
+#define HASH_INIT md5_init
+#define HASH_UPDATE md5_update
+#define HASH_FINAL md5_final
+#define HMAC_KEY hmac_md5_key
+#define HMAC_CTX hmac_md5_ctx
+#define HMAC_PREPAREKEY hmac_md5_preparekey
+#define HMAC_INIT hmac_md5_init
+#define HMAC_UPDATE hmac_md5_update
+#define HMAC_FINAL hmac_md5_final
+#define HMAC hmac_md5
+#define HMAC_USINGRAWKEY hmac_md5_usingrawkey
+#include "hash-test-template.h"
+
+static struct kunit_case hash_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite hash_test_suite = {
+ .name = "md5",
+ .test_cases = hash_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(hash_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for MD5 and HMAC-MD5");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/poly1305-testvecs.h b/lib/crypto/tests/poly1305-testvecs.h
new file mode 100644
index 000000000000..ecf8662225c8
--- /dev/null
+++ b/lib/crypto/tests/poly1305-testvecs.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py poly1305 */
+
+static const struct {
+ size_t data_len;
+ u8 digest[POLY1305_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0xe8, 0x2d, 0x67, 0x2c, 0x01, 0x48, 0xf9, 0xb7,
+ 0x87, 0x85, 0x3f, 0xcf, 0x18, 0x66, 0x8c, 0xd3,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0xb8, 0xad, 0xca, 0x6b, 0x32, 0xba, 0x34, 0x42,
+ 0x54, 0x10, 0x28, 0xf5, 0x0f, 0x7e, 0x8e, 0xe3,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0xb8, 0xf7, 0xf4, 0xc2, 0x85, 0x33, 0x80, 0x63,
+ 0xd1, 0x45, 0xda, 0xf8, 0x7c, 0x79, 0x42, 0xd1,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0xf3, 0x73, 0x7b, 0x60, 0x24, 0xcc, 0x5d, 0x3e,
+ 0xd1, 0x95, 0x86, 0xce, 0x89, 0x0a, 0x33, 0xba,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0x0a, 0x1a, 0x2d, 0x39, 0xea, 0x49, 0x8f, 0xb7,
+ 0x90, 0xb6, 0x74, 0x3b, 0x41, 0x3b, 0x96, 0x11,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0x99, 0x05, 0xe3, 0xa7, 0x9e, 0x2a, 0xd2, 0x42,
+ 0xb9, 0x45, 0x0c, 0x08, 0xe7, 0x10, 0xe4, 0xe1,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0xe1, 0xb2, 0x15, 0xee, 0xa2, 0xf3, 0x04, 0xac,
+ 0xdd, 0x27, 0x57, 0x95, 0x2f, 0x45, 0xa8, 0xd3,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x1c, 0xf3, 0xab, 0x39, 0xc0, 0x69, 0x49, 0x69,
+ 0x89, 0x6f, 0x1f, 0x03, 0x16, 0xe7, 0xc0, 0xf0,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x30, 0xb0, 0x32, 0x87, 0x51, 0x55, 0x9c, 0x39,
+ 0x38, 0x42, 0x06, 0xe9, 0x2a, 0x3e, 0x2c, 0x92,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x2c, 0x04, 0x16, 0x36, 0x55, 0x25, 0x2d, 0xc6,
+ 0x3d, 0x70, 0x5b, 0x88, 0x46, 0xb6, 0x71, 0x77,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x03, 0x87, 0xdd, 0xbe, 0xe8, 0x30, 0xf2, 0x15,
+ 0x40, 0x44, 0x29, 0x7b, 0xb1, 0xe9, 0x9d, 0xe7,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x29, 0x83, 0x4f, 0xcb, 0x5a, 0x93, 0x25, 0xad,
+ 0x05, 0xa4, 0xb3, 0x24, 0x77, 0x62, 0x2d, 0x3d,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x20, 0x0e, 0x2c, 0x05, 0xe2, 0x0b, 0x85, 0xa0,
+ 0x24, 0x73, 0x7f, 0x65, 0x70, 0x6c, 0x3e, 0xb0,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0xef, 0x2f, 0x98, 0x42, 0xc2, 0x90, 0x55, 0xea,
+ 0xba, 0x28, 0x76, 0xfd, 0x9e, 0x3e, 0x4d, 0x53,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x9e, 0x75, 0x4b, 0xc7, 0x69, 0x68, 0x51, 0x90,
+ 0xdc, 0x29, 0xc8, 0xfa, 0x86, 0xf1, 0xc9, 0xb3,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0x9d, 0x13, 0xf5, 0x54, 0xe6, 0xe3, 0x45, 0x38,
+ 0x8b, 0x6d, 0x5c, 0xc4, 0x50, 0xeb, 0x90, 0xcb,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0xaa, 0xb2, 0x3e, 0x3c, 0x2a, 0xfc, 0x62, 0x0e,
+ 0xd4, 0xe6, 0xe5, 0x5c, 0x6b, 0x9f, 0x3d, 0xc7,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0xd6, 0x8c, 0xea, 0x8a, 0x0f, 0x68, 0xa9, 0xa8,
+ 0x67, 0x86, 0xf9, 0xc1, 0x4c, 0x26, 0x10, 0x6d,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0xdc, 0xc1, 0x54, 0xe7, 0x38, 0xc6, 0xdb, 0x24,
+ 0xa7, 0x0b, 0xff, 0xd3, 0x1b, 0x93, 0x01, 0xa6,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0x8f, 0x88, 0x3e, 0x9c, 0x7b, 0x2e, 0x82, 0x5a,
+ 0x1d, 0x31, 0x82, 0xcc, 0x69, 0xb4, 0x16, 0x26,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0x23, 0x45, 0x94, 0xa8, 0x11, 0x54, 0x9d, 0xf2,
+ 0xa1, 0x9a, 0xca, 0xf9, 0x3e, 0x65, 0x52, 0xfd,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0x7b, 0xfc, 0xa9, 0x1e, 0x03, 0xad, 0xef, 0x03,
+ 0xe2, 0x20, 0x92, 0xc7, 0x54, 0x83, 0xfa, 0x37,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x46, 0xab, 0x8c, 0x75, 0xb3, 0x10, 0xa6, 0x3f,
+ 0x74, 0x55, 0x42, 0x6d, 0x69, 0x35, 0xd2, 0xf5,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0xd0, 0xfe, 0x26, 0xc2, 0xca, 0x94, 0x2d, 0x52,
+ 0x2d, 0xe1, 0x11, 0xdd, 0x42, 0x28, 0x83, 0xa6,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[POLY1305_DIGEST_SIZE] = {
+ 0x9d, 0x07, 0x5d, 0xc9, 0x6c, 0xeb, 0x62, 0x5d,
+ 0x02, 0x5f, 0xe1, 0xe3, 0xd1, 0x71, 0x69, 0x34,
+};
+
+static const u8 poly1305_allones_macofmacs[POLY1305_DIGEST_SIZE] = {
+ 0x0c, 0x26, 0x6b, 0x45, 0x87, 0x06, 0xcf, 0xc4,
+ 0x3f, 0x70, 0x7d, 0xb3, 0x50, 0xdd, 0x81, 0x25,
+};
diff --git a/lib/crypto/tests/poly1305_kunit.c b/lib/crypto/tests/poly1305_kunit.c
new file mode 100644
index 000000000000..7ac191bd96b6
--- /dev/null
+++ b/lib/crypto/tests/poly1305_kunit.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/poly1305.h>
+#include "poly1305-testvecs.h"
+
+/*
+ * A fixed key used when presenting Poly1305 as an unkeyed hash function in
+ * order to reuse hash-test-template.h. At the beginning of the test suite,
+ * this is initialized to bytes generated from a fixed seed.
+ */
+static u8 test_key[POLY1305_KEY_SIZE];
+
+/* This probably should be in the actual API, but just define it here for now */
+static void poly1305(const u8 key[POLY1305_KEY_SIZE], const u8 *data,
+ size_t len, u8 out[POLY1305_DIGEST_SIZE])
+{
+ struct poly1305_desc_ctx ctx;
+
+ poly1305_init(&ctx, key);
+ poly1305_update(&ctx, data, len);
+ poly1305_final(&ctx, out);
+}
+
+static void poly1305_init_withtestkey(struct poly1305_desc_ctx *ctx)
+{
+ poly1305_init(ctx, test_key);
+}
+
+static void poly1305_withtestkey(const u8 *data, size_t len,
+ u8 out[POLY1305_DIGEST_SIZE])
+{
+ poly1305(test_key, data, len, out);
+}
+
+/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */
+#define HASH poly1305_withtestkey
+#define HASH_CTX poly1305_desc_ctx
+#define HASH_SIZE POLY1305_DIGEST_SIZE
+#define HASH_INIT poly1305_init_withtestkey
+#define HASH_UPDATE poly1305_update
+#define HASH_FINAL poly1305_final
+#include "hash-test-template.h"
+
+static int poly1305_suite_init(struct kunit_suite *suite)
+{
+ rand_bytes_seeded_from_len(test_key, POLY1305_KEY_SIZE);
+ return hash_suite_init(suite);
+}
+
+static void poly1305_suite_exit(struct kunit_suite *suite)
+{
+ hash_suite_exit(suite);
+}
+
+/*
+ * Poly1305 test case which uses a key and message consisting only of one bits:
+ *
+ * - Using an all-one-bits r_key tests the key clamping.
+ * - Using an all-one-bits s_key tests carries in implementations of the
+ * addition mod 2**128 during finalization.
+ * - Using an all-one-bits message, and to a lesser extent r_key, tends to
+ *   maximize any intermediate accumulator values. This increases the chance of
+ * detecting bugs that occur only in rare cases where the accumulator values
+ * get very large, for example the bug fixed by commit 678cce4019d746da
+ * ("crypto: x86/poly1305 - fix overflow during partial reduction").
+ *
+ * Accumulator overflow bugs may be specific to particular update lengths (in
+ * blocks) and/or particular values of the previous accumulator.  Note that the
+ * accumulator starts at 0 which gives the lowest chance of an overflow. Thus,
+ * a single all-one-bits test vector may be insufficient.
+ *
+ * Considering that, do the following test: continuously update a single
+ * Poly1305 context with all-one-bits data of varying lengths (0, 16, 32, ...,
+ * 4096 bytes). After each update, generate the MAC from the current context,
+ * and feed that MAC into a separate Poly1305 context. Repeat that entire
+ * sequence of updates 32 times without re-initializing either context,
+ * resulting in a total of 8224 (32 * 257) MAC computations from a long-running,
+ * cumulative context.  Finally, generate and verify the MAC of all the MACs.
+ */
+static void test_poly1305_allones_keys_and_message(struct kunit *test)
+{
+ struct poly1305_desc_ctx mac_ctx, macofmacs_ctx;
+ u8 mac[POLY1305_DIGEST_SIZE];
+
+ static_assert(TEST_BUF_LEN >= 4096);
+ memset(test_buf, 0xff, 4096);
+
+ poly1305_init(&mac_ctx, test_buf);
+ poly1305_init(&macofmacs_ctx, test_buf);
+ for (int i = 0; i < 32; i++) {
+ for (size_t len = 0; len <= 4096; len += 16) {
+ struct poly1305_desc_ctx tmp_ctx;
+
+ poly1305_update(&mac_ctx, test_buf, len);
+ tmp_ctx = mac_ctx;
+ poly1305_final(&tmp_ctx, mac);
+ poly1305_update(&macofmacs_ctx, mac,
+ POLY1305_DIGEST_SIZE);
+ }
+ }
+ poly1305_final(&macofmacs_ctx, mac);
+ KUNIT_ASSERT_MEMEQ(test, mac, poly1305_allones_macofmacs,
+ POLY1305_DIGEST_SIZE);
+}
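To make the key-clamping point above concrete, here is a minimal sketch of the r_key clamp defined by RFC 8439 section 2.5 (illustrative only, not code from this patch; the function name is hypothetical). With an all-one-bits r_key every clamped bit starts out set, so an implementation that skips or botches the clamp produces a different MAC:

static void poly1305_clamp_r(u8 r[16])
{
	/* Clear the top four bits of bytes 3, 7, 11, and 15 of r... */
	r[3] &= 0x0f;
	r[7] &= 0x0f;
	r[11] &= 0x0f;
	r[15] &= 0x0f;
	/* ...and the bottom two bits of bytes 4, 8, and 12. */
	r[4] &= 0xfc;
	r[8] &= 0xfc;
	r[12] &= 0xfc;
}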
+
+/*
+ * Poly1305 test case which uses r_key=1, s_key=0, and a 48-byte message
+ * consisting of three blocks with integer values [2**128 - i, 0, 0]. In this
+ * case, the result of the polynomial evaluation is 2**130 - i. For small
+ * values of i, this is very close to the modulus 2**130 - 5, which helps catch
+ * edge case bugs in the modular reduction logic.
+ */
+static void test_poly1305_reduction_edge_cases(struct kunit *test)
+{
+ static const u8 key[POLY1305_KEY_SIZE] = { 1 }; /* r_key=1, s_key=0 */
+ u8 data[3 * POLY1305_BLOCK_SIZE] = {};
+ u8 expected_mac[POLY1305_DIGEST_SIZE];
+ u8 actual_mac[POLY1305_DIGEST_SIZE];
+
+ for (int i = 1; i <= 10; i++) {
+ /* Set the first data block to 2**128 - i. */
+ data[0] = -i;
+ memset(&data[1], 0xff, POLY1305_BLOCK_SIZE - 1);
+
+ /*
+ * Assuming s_key=0, the expected MAC as an integer is
+		 * ((2**130 - i) mod (2**130 - 5) + 0) mod 2**128.
+		 * If 1 <= i <= 5, that's 5 - i.  If 6 <= i <= 10,
+		 * that's 2**128 - i.
+ */
+ if (i <= 5) {
+ expected_mac[0] = 5 - i;
+ memset(&expected_mac[1], 0, POLY1305_DIGEST_SIZE - 1);
+ } else {
+ expected_mac[0] = -i;
+ memset(&expected_mac[1], 0xff,
+ POLY1305_DIGEST_SIZE - 1);
+ }
+
+ /* Compute and verify the MAC. */
+ poly1305(key, data, sizeof(data), actual_mac);
+ KUNIT_ASSERT_MEMEQ(test, actual_mac, expected_mac,
+ POLY1305_DIGEST_SIZE);
+ }
+}
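For reference, the values asserted above follow directly from the Poly1305 definition in RFC 8439: each 16-byte block has an implicit pad bit worth 2**128 added before it is absorbed, and the accumulator is multiplied by r after each block. With r = 1 and blocks [2**128 - i, 0, 0]:

	h1 = (0  + (2**128 - i) + 2**128) * 1 = 2**129 - i
	h2 = (h1 + 0            + 2**128) * 1 = 2**129 + 2**128 - i
	h3 = (h2 + 0            + 2**128) * 1 = 2**130 - i

The tag is (h3 mod (2**130 - 5) + s_key) mod 2**128, which with s_key = 0 gives 5 - i for 1 <= i <= 5 and 2**128 - i for 6 <= i <= 10, matching expected_mac.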
+
+static struct kunit_case poly1305_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(test_poly1305_allones_keys_and_message),
+ KUNIT_CASE(test_poly1305_reduction_edge_cases),
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite poly1305_test_suite = {
+ .name = "poly1305",
+ .test_cases = poly1305_test_cases,
+ .suite_init = poly1305_suite_init,
+ .suite_exit = poly1305_suite_exit,
+};
+kunit_test_suite(poly1305_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for Poly1305");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/polyval-testvecs.h b/lib/crypto/tests/polyval-testvecs.h
new file mode 100644
index 000000000000..3d33f60d58bb
--- /dev/null
+++ b/lib/crypto/tests/polyval-testvecs.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py polyval */
+
+static const struct {
+ size_t data_len;
+ u8 digest[POLYVAL_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0xb5, 0x51, 0x69, 0x89, 0xd4, 0x3c, 0x59, 0xca,
+ 0x6a, 0x1c, 0x2a, 0xe9, 0xa1, 0x9c, 0x6c, 0x83,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0xf4, 0x50, 0xaf, 0x07, 0xda, 0x42, 0xa7, 0x41,
+ 0x4d, 0x24, 0x88, 0x87, 0xe3, 0x40, 0x73, 0x7c,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0x9e, 0x88, 0x78, 0x71, 0x4c, 0x55, 0x87, 0xe8,
+ 0xb4, 0x96, 0x3d, 0x56, 0xc8, 0xb2, 0xe1, 0x68,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0x9e, 0x81, 0x37, 0x8f, 0x49, 0xf7, 0xa2, 0xe4,
+ 0x04, 0x45, 0x12, 0x78, 0x45, 0x42, 0x27, 0xad,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0x60, 0x19, 0xd0, 0xa4, 0xf0, 0xde, 0x9e, 0xe7,
+ 0x6a, 0x89, 0x1a, 0xea, 0x80, 0x14, 0xa9, 0xa3,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0x0c, 0xa2, 0x70, 0x4d, 0x7c, 0x89, 0xac, 0x41,
+ 0xc2, 0x9e, 0x0d, 0x07, 0x07, 0x6a, 0x7f, 0xd5,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x91, 0xd3, 0xa9, 0x5c, 0x79, 0x3d, 0x6b, 0x84,
+ 0x99, 0x54, 0xa7, 0xb4, 0x06, 0x66, 0xfd, 0x1c,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x29, 0x37, 0xb8, 0xe5, 0xd8, 0x27, 0x4d, 0xfb,
+ 0x83, 0x4f, 0x67, 0xf7, 0xf9, 0xc1, 0x0a, 0x9d,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x17, 0xa9, 0x06, 0x2c, 0xf3, 0xe8, 0x2e, 0xa6,
+ 0x6b, 0xb2, 0x1f, 0x5d, 0x94, 0x3c, 0x02, 0xa2,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x7c, 0x80, 0x74, 0xd7, 0xa1, 0x37, 0x30, 0x64,
+ 0x3b, 0xa4, 0xa3, 0x98, 0xde, 0x47, 0x10, 0x23,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x27, 0x3a, 0xcf, 0xf5, 0xaf, 0x9f, 0xd8, 0xd8,
+ 0x2d, 0x6a, 0x91, 0xfb, 0xb8, 0xfa, 0xbe, 0x0c,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x97, 0x6e, 0xc4, 0xbe, 0x6b, 0x15, 0xa6, 0x7c,
+ 0xc4, 0xa2, 0xb8, 0x0a, 0x0e, 0x9c, 0xc7, 0x3a,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x2b, 0xc3, 0x98, 0xba, 0x6e, 0x42, 0xf8, 0x18,
+ 0x85, 0x69, 0x15, 0x37, 0x10, 0x60, 0xe6, 0xac,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x88, 0x21, 0x77, 0x89, 0xd7, 0x93, 0x90, 0xfc,
+ 0xf3, 0xb0, 0xe3, 0xfb, 0x14, 0xe2, 0xcf, 0x74,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0x66, 0x3d, 0x3e, 0x08, 0xa0, 0x49, 0x81, 0x68,
+ 0x3e, 0x3b, 0xc8, 0x80, 0x55, 0xd4, 0x15, 0xe9,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0x05, 0xf5, 0x06, 0x66, 0xe7, 0x11, 0x08, 0x84,
+ 0xff, 0x94, 0x50, 0x85, 0x65, 0x95, 0x2a, 0x20,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0xd3, 0xa0, 0x51, 0x69, 0xb5, 0x38, 0xae, 0x1b,
+ 0xe1, 0xa2, 0x89, 0xc6, 0x8d, 0x2b, 0x62, 0x37,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0x37, 0x6d, 0x6a, 0x14, 0xdc, 0xa5, 0x37, 0xfc,
+ 0xfe, 0x67, 0x76, 0xb2, 0x64, 0x68, 0x64, 0x05,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0xe3, 0x12, 0x0c, 0x58, 0x46, 0x45, 0x27, 0x7a,
+ 0x0e, 0xa2, 0xfa, 0x2c, 0x35, 0x73, 0x6c, 0x94,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0x63, 0x0d, 0xa1, 0xbc, 0x6e, 0x3e, 0xd3, 0x1d,
+ 0x28, 0x52, 0xd2, 0xf4, 0x30, 0x2d, 0xff, 0xc4,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0xb2, 0x91, 0x49, 0xe2, 0x02, 0x98, 0x00, 0x79,
+ 0x71, 0xb9, 0xd7, 0xd4, 0xb5, 0x94, 0x6d, 0x7d,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x58, 0x96, 0x48, 0x69, 0x05, 0x17, 0xe1, 0x6d,
+ 0xbc, 0xf2, 0x3d, 0x10, 0x96, 0x00, 0x74, 0x58,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0x99, 0x3c, 0xcb, 0x4d, 0x64, 0xc9, 0xa9, 0x41,
+ 0x52, 0x93, 0xfd, 0x65, 0xc4, 0xcc, 0xa5, 0xe5,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[POLYVAL_DIGEST_SIZE] = {
+ 0xdf, 0x68, 0x52, 0x99, 0x92, 0xc3, 0xe8, 0x88,
+ 0x29, 0x13, 0xc8, 0x35, 0x67, 0xa3, 0xd3, 0xad,
+};
+
+static const u8 polyval_allones_hashofhashes[POLYVAL_DIGEST_SIZE] = {
+ 0xd5, 0xf7, 0xfd, 0xb2, 0xa6, 0xef, 0x0b, 0x85,
+ 0x0d, 0x0a, 0x06, 0x10, 0xbc, 0x64, 0x94, 0x73,
+};
diff --git a/lib/crypto/tests/polyval_kunit.c b/lib/crypto/tests/polyval_kunit.c
new file mode 100644
index 000000000000..e59f598c1572
--- /dev/null
+++ b/lib/crypto/tests/polyval_kunit.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/polyval.h>
+#include "polyval-testvecs.h"
+
+/*
+ * A fixed key used when presenting POLYVAL as an unkeyed hash function in order
+ * to reuse hash-test-template.h. At the beginning of the test suite, this is
+ * initialized to a key prepared from bytes generated from a fixed seed.
+ */
+static struct polyval_key test_key;
+
+static void polyval_init_withtestkey(struct polyval_ctx *ctx)
+{
+ polyval_init(ctx, &test_key);
+}
+
+static void polyval_withtestkey(const u8 *data, size_t len,
+ u8 out[POLYVAL_BLOCK_SIZE])
+{
+ polyval(&test_key, data, len, out);
+}
+
+/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */
+#define HASH polyval_withtestkey
+#define HASH_CTX polyval_ctx
+#define HASH_SIZE POLYVAL_BLOCK_SIZE
+#define HASH_INIT polyval_init_withtestkey
+#define HASH_UPDATE polyval_update
+#define HASH_FINAL polyval_final
+#include "hash-test-template.h"
+
+/*
+ * Test an example from RFC8452 ("AES-GCM-SIV: Nonce Misuse-Resistant
+ * Authenticated Encryption") to ensure compatibility with that.
+ */
+static void test_polyval_rfc8452_testvec(struct kunit *test)
+{
+ static const u8 raw_key[POLYVAL_BLOCK_SIZE] =
+ "\x31\x07\x28\xd9\x91\x1f\x1f\x38"
+ "\x37\xb2\x43\x16\xc3\xfa\xb9\xa0";
+ static const u8 data[48] =
+ "\x65\x78\x61\x6d\x70\x6c\x65\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x48\x65\x6c\x6c\x6f\x20\x77\x6f"
+ "\x72\x6c\x64\x00\x00\x00\x00\x00"
+ "\x38\x00\x00\x00\x00\x00\x00\x00"
+ "\x58\x00\x00\x00\x00\x00\x00\x00";
+ static const u8 expected_hash[POLYVAL_BLOCK_SIZE] =
+ "\xad\x7f\xcf\x0b\x51\x69\x85\x16"
+ "\x62\x67\x2f\x3c\x5f\x95\x13\x8f";
+ u8 hash[POLYVAL_BLOCK_SIZE];
+ struct polyval_key key;
+
+ polyval_preparekey(&key, raw_key);
+ polyval(&key, data, sizeof(data), hash);
+ KUNIT_ASSERT_MEMEQ(test, hash, expected_hash, sizeof(hash));
+}
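As background for the test vector above, RFC 8452 section 3 defines POLYVAL over 16-byte little-endian field elements X_1, ..., X_s roughly as follows (paraphrased here, not text from this patch):

	S_0 = 0
	S_j = dot(S_(j-1) XOR X_j, H)        for j = 1..s
	POLYVAL(H, X_1, ..., X_s) = S_s

where dot(a, b) = a * b * x**-128 in GF(2**128) modulo x**128 + x**127 + x**126 + x**121 + 1. In the 48-byte message above, the first two blocks are "example" and "Hello world" zero-padded to 16 bytes, and the final block appears to encode their bit lengths (0x38 = 56, 0x58 = 88) as little-endian 64-bit values, mirroring the length block that AES-GCM-SIV feeds into POLYVAL.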
+
+/*
+ * Test a key and messages containing all one bits. This is useful to detect
+ * overflow bugs in implementations that emulate carryless multiplication using
+ * a series of standard multiplications with the bits spread out.
+ */
+static void test_polyval_allones_key_and_message(struct kunit *test)
+{
+ struct polyval_key key;
+ struct polyval_ctx hashofhashes_ctx;
+ u8 hash[POLYVAL_BLOCK_SIZE];
+
+ static_assert(TEST_BUF_LEN >= 4096);
+ memset(test_buf, 0xff, 4096);
+
+ polyval_preparekey(&key, test_buf);
+ polyval_init(&hashofhashes_ctx, &key);
+ for (size_t len = 0; len <= 4096; len += 16) {
+ polyval(&key, test_buf, len, hash);
+ polyval_update(&hashofhashes_ctx, hash, sizeof(hash));
+ }
+ polyval_final(&hashofhashes_ctx, hash);
+ KUNIT_ASSERT_MEMEQ(test, hash, polyval_allones_hashofhashes,
+ sizeof(hash));
+}
+
+#define MAX_LEN_FOR_KEY_CHECK 1024
+
+/*
+ * Given two prepared keys which should be identical (but may differ in
+ * alignment and/or whether they are followed by a guard page or not), verify
+ * that they produce consistent results on various data lengths.
+ */
+static void check_key_consistency(struct kunit *test,
+ const struct polyval_key *key1,
+ const struct polyval_key *key2)
+{
+ u8 *data = test_buf;
+ u8 hash1[POLYVAL_BLOCK_SIZE];
+ u8 hash2[POLYVAL_BLOCK_SIZE];
+
+ rand_bytes(data, MAX_LEN_FOR_KEY_CHECK);
+ KUNIT_ASSERT_MEMEQ(test, key1, key2, sizeof(*key1));
+
+ for (int i = 0; i < 100; i++) {
+ size_t len = rand_length(MAX_LEN_FOR_KEY_CHECK);
+
+ polyval(key1, data, len, hash1);
+ polyval(key2, data, len, hash2);
+ KUNIT_ASSERT_MEMEQ(test, hash1, hash2, sizeof(hash1));
+ }
+}
+
+/* Test that no buffer overreads occur on either raw_key or polyval_key. */
+static void test_polyval_with_guarded_key(struct kunit *test)
+{
+ u8 raw_key[POLYVAL_BLOCK_SIZE];
+ u8 *guarded_raw_key = &test_buf[TEST_BUF_LEN - sizeof(raw_key)];
+ struct polyval_key key1, key2;
+ struct polyval_key *guarded_key =
+ (struct polyval_key *)&test_buf[TEST_BUF_LEN - sizeof(key1)];
+
+ /* Prepare with regular buffers. */
+ rand_bytes(raw_key, sizeof(raw_key));
+ polyval_preparekey(&key1, raw_key);
+
+ /* Prepare with guarded raw_key, then check that it works. */
+ memcpy(guarded_raw_key, raw_key, sizeof(raw_key));
+ polyval_preparekey(&key2, guarded_raw_key);
+ check_key_consistency(test, &key1, &key2);
+
+ /* Prepare guarded polyval_key, then check that it works. */
+ polyval_preparekey(guarded_key, raw_key);
+ check_key_consistency(test, &key1, guarded_key);
+}
+
+/*
+ * Test that polyval_key only needs to be aligned to
+ * __alignof__(struct polyval_key), i.e. 8 bytes. The assembly code may prefer
+ * 16-byte or higher alignment, but it mustn't require it.
+ */
+static void test_polyval_with_minimally_aligned_key(struct kunit *test)
+{
+ u8 raw_key[POLYVAL_BLOCK_SIZE];
+ struct polyval_key key;
+ struct polyval_key *minaligned_key =
+ (struct polyval_key *)&test_buf[MAX_LEN_FOR_KEY_CHECK +
+ __alignof__(struct polyval_key)];
+
+ KUNIT_ASSERT_TRUE(test, IS_ALIGNED((uintptr_t)minaligned_key,
+ __alignof__(struct polyval_key)));
+ KUNIT_ASSERT_TRUE(test,
+ !IS_ALIGNED((uintptr_t)minaligned_key,
+ 2 * __alignof__(struct polyval_key)));
+
+ rand_bytes(raw_key, sizeof(raw_key));
+ polyval_preparekey(&key, raw_key);
+ polyval_preparekey(minaligned_key, raw_key);
+ check_key_consistency(test, &key, minaligned_key);
+}
+
+struct polyval_irq_test_state {
+ struct polyval_key expected_key;
+ u8 raw_key[POLYVAL_BLOCK_SIZE];
+};
+
+static bool polyval_irq_test_func(void *state_)
+{
+ struct polyval_irq_test_state *state = state_;
+ struct polyval_key key;
+
+ polyval_preparekey(&key, state->raw_key);
+ return memcmp(&key, &state->expected_key, sizeof(key)) == 0;
+}
+
+/*
+ * Test that polyval_preparekey() produces the same output regardless of whether
+ * FPU or vector registers are usable when it is called.
+ */
+static void test_polyval_preparekey_in_irqs(struct kunit *test)
+{
+ struct polyval_irq_test_state state;
+
+ rand_bytes(state.raw_key, sizeof(state.raw_key));
+ polyval_preparekey(&state.expected_key, state.raw_key);
+ kunit_run_irq_test(test, polyval_irq_test_func, 20000, &state);
+}
+
+static int polyval_suite_init(struct kunit_suite *suite)
+{
+ u8 raw_key[POLYVAL_BLOCK_SIZE];
+
+ rand_bytes_seeded_from_len(raw_key, sizeof(raw_key));
+ polyval_preparekey(&test_key, raw_key);
+ return hash_suite_init(suite);
+}
+
+static void polyval_suite_exit(struct kunit_suite *suite)
+{
+ hash_suite_exit(suite);
+}
+
+static struct kunit_case polyval_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(test_polyval_rfc8452_testvec),
+ KUNIT_CASE(test_polyval_allones_key_and_message),
+ KUNIT_CASE(test_polyval_with_guarded_key),
+ KUNIT_CASE(test_polyval_with_minimally_aligned_key),
+ KUNIT_CASE(test_polyval_preparekey_in_irqs),
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite polyval_test_suite = {
+ .name = "polyval",
+ .test_cases = polyval_test_cases,
+ .suite_init = polyval_suite_init,
+ .suite_exit = polyval_suite_exit,
+};
+kunit_test_suite(polyval_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for POLYVAL");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/sha1-testvecs.h b/lib/crypto/tests/sha1-testvecs.h
new file mode 100644
index 000000000000..f5d050122e84
--- /dev/null
+++ b/lib/crypto/tests/sha1-testvecs.h
@@ -0,0 +1,212 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha1 */
+
+static const struct {
+ size_t data_len;
+ u8 digest[SHA1_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d,
+ 0x32, 0x55, 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90,
+ 0xaf, 0xd8, 0x07, 0x09,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x0a, 0xd0, 0x52, 0xdd, 0x9f, 0x32, 0x40, 0x55,
+ 0x21, 0xe4, 0x3c, 0x6e, 0xbd, 0xc5, 0x2f, 0x5a,
+ 0x02, 0x54, 0x93, 0xb2,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0x13, 0x83, 0x82, 0x03, 0x23, 0xff, 0x46, 0xd6,
+ 0x12, 0x7f, 0xad, 0x05, 0x2b, 0xc3, 0x4a, 0x42,
+ 0x49, 0x6a, 0xf8, 0x84,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0xe4, 0xdf, 0x7b, 0xdc, 0xe8, 0x6e, 0x81, 0x97,
+ 0x1e, 0x0f, 0xe8, 0x8b, 0x76, 0xa8, 0x59, 0x04,
+ 0xae, 0x92, 0x1a, 0x7c,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0x8c, 0x1c, 0x30, 0xd8, 0xbc, 0xc4, 0xc3, 0xf5,
+ 0xf8, 0x83, 0x0d, 0x1e, 0x04, 0x5d, 0x29, 0xb5,
+ 0x68, 0x89, 0xc1, 0xe9,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0x6c, 0x1d, 0x72, 0x31, 0xa5, 0x03, 0x4f, 0xdc,
+ 0xff, 0x2d, 0x06, 0x3e, 0x24, 0x26, 0x34, 0x8d,
+ 0x60, 0xa4, 0x67, 0x16,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0x37, 0x53, 0x33, 0xfa, 0xd0, 0x21, 0xad, 0xe7,
+ 0xa5, 0x43, 0xf1, 0x94, 0x64, 0x11, 0x47, 0x9c,
+ 0x72, 0xb5, 0x78, 0xb4,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x51, 0x5c, 0xd8, 0x5a, 0xa9, 0xde, 0x7b, 0x2a,
+ 0xa2, 0xff, 0x70, 0x09, 0x56, 0x88, 0x40, 0x2b,
+ 0x50, 0x93, 0x82, 0x47,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0xbc, 0x9c, 0xab, 0x93, 0x06, 0xd5, 0xdb, 0xac,
+ 0x2c, 0x33, 0x15, 0x83, 0x56, 0xf6, 0x91, 0x20,
+ 0x09, 0xc7, 0xb2, 0x6b,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x26, 0x90, 0x3b, 0x47, 0xe1, 0x92, 0x42, 0xd0,
+ 0x85, 0x63, 0x2e, 0x6b, 0x68, 0xa4, 0xc4, 0x4c,
+ 0xe6, 0xf4, 0xb0, 0x52,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x55, 0x6f, 0x87, 0xdc, 0x34, 0x3d, 0xe2, 0x4f,
+ 0xc3, 0x81, 0xa4, 0x82, 0x79, 0x84, 0x64, 0x01,
+ 0x55, 0xa0, 0x1e, 0x36,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0xb7, 0xd5, 0x5f, 0xa4, 0xef, 0xbf, 0x4f, 0x96,
+ 0x01, 0xc1, 0x06, 0xe3, 0x75, 0xa8, 0x90, 0x92,
+ 0x4c, 0x5f, 0xf1, 0x21,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x70, 0x0c, 0xea, 0xa4, 0x93, 0xd0, 0x56, 0xf0,
+ 0x6f, 0xbb, 0x53, 0x42, 0x5b, 0xe3, 0xf2, 0xb0,
+ 0x30, 0x66, 0x8e, 0x75,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x15, 0x01, 0xbc, 0xb0, 0xee, 0xd8, 0xeb, 0xa8,
+ 0x7d, 0xd9, 0x4d, 0x50, 0x2e, 0x41, 0x30, 0xba,
+ 0x41, 0xaa, 0x7b, 0x02,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x98, 0x05, 0x52, 0xf5, 0x0f, 0xf0, 0xd3, 0x97,
+ 0x15, 0x8c, 0xa3, 0x9a, 0x2b, 0x4d, 0x67, 0x57,
+ 0x29, 0xa0, 0xac, 0x61,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0x1f, 0x47, 0xf0, 0xcc, 0xd7, 0xda, 0xa5, 0x3b,
+ 0x39, 0xb4, 0x5b, 0xa8, 0x33, 0xd4, 0xca, 0x2f,
+ 0xdd, 0xf2, 0x39, 0x89,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0xb9, 0x75, 0xe6, 0x57, 0x42, 0x7f, 0x8b, 0x0a,
+ 0xcc, 0x53, 0x10, 0x69, 0x45, 0xac, 0xfd, 0x11,
+ 0xf7, 0x1f, 0x4e, 0x6f,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0x63, 0x66, 0xcb, 0x44, 0xc1, 0x2c, 0xa2, 0x06,
+ 0x5d, 0xb9, 0x8e, 0x31, 0xcb, 0x4f, 0x4e, 0x49,
+ 0xe0, 0xfb, 0x3c, 0x4e,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0x35, 0xbc, 0x74, 0xfb, 0x31, 0x9c, 0xd4, 0xdd,
+ 0xe8, 0x87, 0xa7, 0x56, 0x3b, 0x08, 0xe5, 0x49,
+ 0xe1, 0xe9, 0xc9, 0xa8,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0x43, 0x00, 0xea, 0xcd, 0x4e, 0x7c, 0xe9, 0xe4,
+ 0x32, 0xce, 0x25, 0xa8, 0xcd, 0x20, 0xa8, 0xaa,
+ 0x7b, 0x63, 0x2c, 0x3c,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0xd0, 0x67, 0x26, 0x0e, 0x22, 0x72, 0xaa, 0x63,
+ 0xfc, 0x34, 0x55, 0x07, 0xab, 0xc8, 0x64, 0xb6,
+ 0xc4, 0xea, 0xd5, 0x7c,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0x6b, 0xc9, 0x5e, 0xb9, 0x41, 0x19, 0x50, 0x35,
+ 0xf1, 0x39, 0xfe, 0xd9, 0x72, 0x6d, 0xd0, 0x55,
+ 0xb8, 0x1f, 0x1a, 0x95,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x70, 0x5d, 0x10, 0x2e, 0x4e, 0x44, 0xc9, 0x80,
+ 0x8f, 0xba, 0x13, 0xbc, 0xd0, 0x77, 0x78, 0xc7,
+ 0x84, 0xe3, 0x24, 0x43,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0xa8, 0x82, 0xca, 0x08, 0xb4, 0x84, 0x09, 0x13,
+ 0xc0, 0x9c, 0x26, 0x18, 0xcf, 0x0f, 0xf3, 0x08,
+ 0xff, 0xa1, 0xe4, 0x5d,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[SHA1_DIGEST_SIZE] = {
+ 0xe1, 0x72, 0xa5, 0x3c, 0xda, 0xf2, 0xe5, 0x56,
+ 0xb8, 0xb5, 0x35, 0x6e, 0xce, 0xc8, 0x37, 0x57,
+ 0x31, 0xb4, 0x05, 0xdd,
+};
+
+static const u8 hmac_testvec_consolidated[SHA1_DIGEST_SIZE] = {
+ 0x9d, 0xe5, 0xb1, 0x43, 0x97, 0x95, 0x16, 0x52,
+ 0xa0, 0x7a, 0xc0, 0xe2, 0xc1, 0x60, 0x64, 0x7c,
+ 0x24, 0xf9, 0x34, 0xd7,
+};
diff --git a/lib/crypto/tests/sha1_kunit.c b/lib/crypto/tests/sha1_kunit.c
new file mode 100644
index 000000000000..24ba8d5669c8
--- /dev/null
+++ b/lib/crypto/tests/sha1_kunit.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/sha1.h>
+#include "sha1-testvecs.h"
+
+#define HASH sha1
+#define HASH_CTX sha1_ctx
+#define HASH_SIZE SHA1_DIGEST_SIZE
+#define HASH_INIT sha1_init
+#define HASH_UPDATE sha1_update
+#define HASH_FINAL sha1_final
+#define HMAC_KEY hmac_sha1_key
+#define HMAC_CTX hmac_sha1_ctx
+#define HMAC_PREPAREKEY hmac_sha1_preparekey
+#define HMAC_INIT hmac_sha1_init
+#define HMAC_UPDATE hmac_sha1_update
+#define HMAC_FINAL hmac_sha1_final
+#define HMAC hmac_sha1
+#define HMAC_USINGRAWKEY hmac_sha1_usingrawkey
+#include "hash-test-template.h"
+
+static struct kunit_case hash_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite hash_test_suite = {
+ .name = "sha1",
+ .test_cases = hash_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(hash_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-1 and HMAC-SHA1");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/sha224-testvecs.h b/lib/crypto/tests/sha224-testvecs.h
new file mode 100644
index 000000000000..523adc178e1a
--- /dev/null
+++ b/lib/crypto/tests/sha224-testvecs.h
@@ -0,0 +1,238 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha224 */
+
+static const struct {
+ size_t data_len;
+ u8 digest[SHA224_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0xd1, 0x4a, 0x02, 0x8c, 0x2a, 0x3a, 0x2b, 0xc9,
+ 0x47, 0x61, 0x02, 0xbb, 0x28, 0x82, 0x34, 0xc4,
+ 0x15, 0xa2, 0xb0, 0x1f, 0x82, 0x8e, 0xa6, 0x2a,
+ 0xc5, 0xb3, 0xe4, 0x2f,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0xe3, 0x4d, 0x79, 0x17, 0x75, 0x35, 0xdc, 0xd2,
+ 0x27, 0xc9, 0x9d, 0x0b, 0x90, 0x0f, 0x21, 0x5d,
+ 0x95, 0xfb, 0x9c, 0x6d, 0xa8, 0xec, 0x19, 0x15,
+ 0x12, 0xef, 0xf5, 0x0f,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0x81, 0xc7, 0x60, 0x0d, 0x6d, 0x13, 0x75, 0x70,
+ 0x4b, 0xc0, 0xab, 0xea, 0x04, 0xe3, 0x78, 0x7e,
+ 0x73, 0xb9, 0x0f, 0xb6, 0xae, 0x90, 0xf3, 0x94,
+ 0xb2, 0x56, 0xda, 0xc8,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0x24, 0xf0, 0x8c, 0x6e, 0x9d, 0xd6, 0x06, 0x80,
+ 0x0a, 0x03, 0xee, 0x9b, 0x33, 0xec, 0x83, 0x42,
+ 0x2c, 0x8b, 0xe7, 0xc7, 0xc6, 0x04, 0xfb, 0xc6,
+ 0xa3, 0x3a, 0x4d, 0xc9,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0x1c, 0x08, 0xa8, 0x55, 0x8f, 0xc6, 0x0a, 0xea,
+ 0x2f, 0x1b, 0x54, 0xff, 0x8d, 0xd2, 0xa3, 0xc7,
+ 0x42, 0xc2, 0x93, 0x3d, 0x73, 0x18, 0x84, 0xba,
+ 0x75, 0x49, 0x34, 0xfd,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0x45, 0xdd, 0xb5, 0xf0, 0x3c, 0xda, 0xe6, 0xd4,
+ 0x6c, 0x86, 0x91, 0x29, 0x11, 0x2f, 0x88, 0x7d,
+ 0xd8, 0x3c, 0xa3, 0xd6, 0xdd, 0x1e, 0xac, 0x98,
+ 0xff, 0xf0, 0x14, 0x69,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0x0b, 0xfb, 0x71, 0x4c, 0x06, 0x7a, 0xd5, 0x89,
+ 0x76, 0x0a, 0x43, 0x8b, 0x2b, 0x47, 0x12, 0x56,
+ 0xa7, 0x64, 0x33, 0x1d, 0xd3, 0x44, 0x17, 0x95,
+ 0x23, 0xe7, 0x53, 0x01,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0xc4, 0xae, 0x9c, 0x33, 0xd5, 0x1d, 0xf4, 0xa7,
+ 0xfd, 0xb7, 0xd4, 0x6b, 0xc3, 0xeb, 0xa8, 0xbf,
+ 0xfb, 0x07, 0x89, 0x4b, 0x07, 0x15, 0x22, 0xec,
+ 0xe1, 0x45, 0x84, 0xba,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0xad, 0x01, 0x34, 0x2a, 0xe2, 0x3b, 0x58, 0x06,
+ 0x9f, 0x20, 0xc8, 0xfb, 0xf3, 0x20, 0x82, 0xa6,
+ 0x9f, 0xee, 0x7a, 0xbe, 0xdf, 0xf3, 0x5d, 0x57,
+ 0x9b, 0xce, 0x79, 0x96,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0xa7, 0xa6, 0x47, 0xf7, 0xed, 0x2a, 0xe5, 0xe3,
+ 0xc0, 0x1e, 0x7b, 0x40, 0xe4, 0xf7, 0x40, 0x65,
+ 0x42, 0xc1, 0x6f, 0x7d, 0x8d, 0x0d, 0x17, 0x4f,
+ 0xd3, 0xbc, 0x0d, 0x85,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0xc4, 0x9c, 0xb5, 0x6a, 0x01, 0x2d, 0x10, 0xa9,
+ 0x5f, 0xa4, 0x5a, 0xe1, 0xba, 0x40, 0x12, 0x09,
+ 0x7b, 0xea, 0xdb, 0xa6, 0x7b, 0xcb, 0x56, 0xf0,
+ 0xfd, 0x5b, 0xe2, 0xe7,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x14, 0xda, 0x0e, 0x01, 0xca, 0x78, 0x7d, 0x2d,
+ 0x85, 0xa3, 0xca, 0x0e, 0x80, 0xf9, 0x95, 0x10,
+ 0xa1, 0x7b, 0xa5, 0xaa, 0xfc, 0x95, 0x05, 0x08,
+ 0x53, 0xda, 0x52, 0xee,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0xa5, 0x24, 0xc4, 0x54, 0xe1, 0x50, 0xab, 0xee,
+ 0x22, 0xc1, 0xa7, 0x27, 0x15, 0x2c, 0x6f, 0xf7,
+ 0x4c, 0x31, 0xe5, 0x15, 0x25, 0x4e, 0x71, 0xc6,
+ 0x7e, 0xa0, 0x11, 0x5d,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x73, 0xd0, 0x8c, 0xce, 0xed, 0xed, 0x9f, 0xaa,
+ 0x21, 0xaf, 0xa2, 0x08, 0x80, 0x16, 0x15, 0x59,
+ 0x3f, 0x1d, 0x7f, 0x0a, 0x79, 0x3d, 0x7b, 0x58,
+ 0xf8, 0xc8, 0x5c, 0x27,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x31, 0xa7, 0xa1, 0xca, 0x49, 0x72, 0x75, 0xcc,
+ 0x6e, 0x02, 0x9e, 0xad, 0xea, 0x86, 0x5c, 0x91,
+ 0x02, 0xe4, 0xc9, 0xf9, 0xd3, 0x9e, 0x74, 0x50,
+ 0xd8, 0x43, 0x6b, 0x85,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0x40, 0x60, 0x8b, 0xb0, 0x03, 0xa9, 0x75, 0xab,
+ 0x2d, 0x5b, 0x20, 0x9a, 0x05, 0x72, 0xb7, 0xa8,
+ 0xce, 0xf2, 0x4f, 0x66, 0x62, 0xe3, 0x7e, 0x24,
+ 0xd6, 0xe2, 0xea, 0xfa,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0x4f, 0x5f, 0x9f, 0x1e, 0xb3, 0x66, 0x81, 0xdb,
+ 0x41, 0x5d, 0x65, 0x97, 0x00, 0x8d, 0xdc, 0x62,
+ 0x03, 0xb0, 0x4d, 0x6b, 0x5c, 0x7f, 0x1e, 0xa0,
+ 0xfe, 0xfc, 0x0e, 0xb8,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0x08, 0xa8, 0xa1, 0xc0, 0xd8, 0xf9, 0xb4, 0xaa,
+ 0x53, 0x22, 0xa1, 0x73, 0x0b, 0x45, 0xa0, 0x20,
+ 0x72, 0xf3, 0xa9, 0xbc, 0x51, 0xd0, 0x20, 0x79,
+ 0x69, 0x97, 0xf7, 0xe3,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0xe8, 0x60, 0x5f, 0xb9, 0x12, 0xe1, 0x6b, 0x24,
+ 0xc5, 0xe8, 0x43, 0xa9, 0x5c, 0x3f, 0x65, 0xed,
+ 0xbe, 0xfd, 0x77, 0xf5, 0x47, 0xf2, 0x75, 0x21,
+ 0xc2, 0x8f, 0x54, 0x8f,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0xc7, 0xdf, 0x50, 0x16, 0x10, 0x01, 0xb7, 0xdf,
+ 0x34, 0x1d, 0x18, 0xa2, 0xd5, 0xad, 0x1f, 0x50,
+ 0xf7, 0xa8, 0x9a, 0x72, 0xfb, 0xfd, 0xd9, 0x1c,
+ 0x57, 0xac, 0x08, 0x97,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0xdf, 0x16, 0x76, 0x7f, 0xc0, 0x16, 0x84, 0x63,
+ 0xac, 0xcf, 0xd0, 0x78, 0x1e, 0x96, 0x67, 0xc5,
+ 0x3c, 0x06, 0xe9, 0xdb, 0x6e, 0x7d, 0xd0, 0x07,
+ 0xaa, 0xb1, 0x56, 0xc9,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0x49, 0xec, 0x5c, 0x18, 0xd7, 0x5b, 0xda, 0xed,
+ 0x5b, 0x59, 0xde, 0x09, 0x34, 0xb2, 0x49, 0x43,
+ 0x62, 0x6a, 0x0a, 0x63, 0x6a, 0x51, 0x08, 0x37,
+ 0x8c, 0xb6, 0x29, 0x84,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x3d, 0xc2, 0xc8, 0x43, 0xcf, 0xb7, 0x33, 0x14,
+ 0x04, 0x93, 0xed, 0xe2, 0xcd, 0x8a, 0x69, 0x5c,
+ 0x5a, 0xd5, 0x9b, 0x52, 0xdf, 0x48, 0xa7, 0xaa,
+ 0x28, 0x2b, 0x5d, 0x27,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0xa7, 0xaf, 0xda, 0x92, 0xe2, 0xe7, 0x61, 0xdc,
+ 0xa1, 0x32, 0x53, 0x2a, 0x3f, 0x41, 0x5c, 0x7e,
+ 0xc9, 0x89, 0xda, 0x1c, 0xf7, 0x8d, 0x00, 0xbd,
+ 0x21, 0x73, 0xb1, 0x69,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[SHA224_DIGEST_SIZE] = {
+ 0x9e, 0xb8, 0x82, 0xab, 0x83, 0x37, 0xe4, 0x63,
+ 0x84, 0xee, 0x21, 0x15, 0xc2, 0xbb, 0xa3, 0x17,
+ 0x8f, 0xc4, 0x99, 0x33, 0xa0, 0x2c, 0x9f, 0xec,
+ 0xca, 0xd0, 0xf3, 0x73,
+};
+
+static const u8 hmac_testvec_consolidated[SHA224_DIGEST_SIZE] = {
+ 0x66, 0x34, 0x79, 0x92, 0x47, 0x0e, 0xcd, 0x70,
+ 0xb0, 0x8b, 0x91, 0xcb, 0x94, 0x2f, 0x67, 0x65,
+ 0x2f, 0xc9, 0xd2, 0x91, 0x32, 0xaf, 0xf7, 0x5f,
+ 0xb6, 0x01, 0x5b, 0xf2,
+};
diff --git a/lib/crypto/tests/sha224_kunit.c b/lib/crypto/tests/sha224_kunit.c
new file mode 100644
index 000000000000..962ad46b9c99
--- /dev/null
+++ b/lib/crypto/tests/sha224_kunit.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/sha2.h>
+#include "sha224-testvecs.h"
+
+#define HASH sha224
+#define HASH_CTX sha224_ctx
+#define HASH_SIZE SHA224_DIGEST_SIZE
+#define HASH_INIT sha224_init
+#define HASH_UPDATE sha224_update
+#define HASH_FINAL sha224_final
+#define HMAC_KEY hmac_sha224_key
+#define HMAC_CTX hmac_sha224_ctx
+#define HMAC_PREPAREKEY hmac_sha224_preparekey
+#define HMAC_INIT hmac_sha224_init
+#define HMAC_UPDATE hmac_sha224_update
+#define HMAC_FINAL hmac_sha224_final
+#define HMAC hmac_sha224
+#define HMAC_USINGRAWKEY hmac_sha224_usingrawkey
+#include "hash-test-template.h"
+
+static struct kunit_case hash_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite hash_test_suite = {
+ .name = "sha224",
+ .test_cases = hash_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(hash_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-224 and HMAC-SHA224");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/sha256-testvecs.h b/lib/crypto/tests/sha256-testvecs.h
new file mode 100644
index 000000000000..2db7b2042034
--- /dev/null
+++ b/lib/crypto/tests/sha256-testvecs.h
@@ -0,0 +1,238 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha256 */
+
+static const struct {
+ size_t data_len;
+ u8 digest[SHA256_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14,
+ 0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f, 0xb9, 0x24,
+ 0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c,
+ 0xa4, 0x95, 0x99, 0x1b, 0x78, 0x52, 0xb8, 0x55,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x45, 0xf8, 0x3d, 0x17, 0xe1, 0x0b, 0x34, 0xfc,
+ 0xa0, 0x1e, 0xb8, 0xf4, 0x45, 0x4d, 0xac, 0x34,
+ 0xa7, 0x77, 0xd9, 0x40, 0x4a, 0x46, 0x4e, 0x73,
+ 0x2c, 0xf4, 0xab, 0xf2, 0xc0, 0xda, 0x94, 0xc4,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0xf9, 0xd3, 0x52, 0x2f, 0xd5, 0xe0, 0x99, 0x15,
+ 0x1c, 0xd6, 0xa9, 0x24, 0x4f, 0x40, 0xba, 0x25,
+ 0x33, 0x43, 0x3e, 0xe1, 0x78, 0x6a, 0xfe, 0x7d,
+ 0x07, 0xe2, 0x29, 0x7b, 0x6d, 0xc5, 0x73, 0xf5,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0x71, 0xf7, 0xa1, 0xef, 0x69, 0x86, 0x0e, 0xe4,
+ 0x87, 0x25, 0x58, 0x4c, 0x07, 0x2c, 0xfc, 0x60,
+ 0xc5, 0xf6, 0xe2, 0x44, 0xaa, 0xfb, 0x41, 0xc7,
+ 0x2b, 0xc5, 0x01, 0x8c, 0x39, 0x98, 0x30, 0x37,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0x09, 0x95, 0x9a, 0xfa, 0x25, 0x18, 0x86, 0x06,
+ 0xfe, 0x65, 0xc9, 0x2f, 0x91, 0x15, 0x74, 0x06,
+ 0x6c, 0xbf, 0xef, 0x7b, 0x0b, 0xc7, 0x2c, 0x05,
+ 0xdd, 0x17, 0x5d, 0x6f, 0x8a, 0xa5, 0xde, 0x3c,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0xe5, 0x52, 0x3c, 0x85, 0xea, 0x1b, 0xe1, 0x6c,
+ 0xe0, 0xdb, 0xc3, 0xef, 0xf0, 0xca, 0xc2, 0xe1,
+ 0xb9, 0x36, 0xa1, 0x28, 0xb6, 0x9e, 0xf5, 0x6e,
+ 0x70, 0xf7, 0xf9, 0xa7, 0x1c, 0xd3, 0x22, 0xd0,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0x5f, 0x84, 0xd4, 0xd7, 0x2e, 0x80, 0x09, 0xef,
+ 0x1c, 0x77, 0x7c, 0x25, 0x59, 0x63, 0x88, 0x64,
+ 0xfd, 0x56, 0xea, 0x23, 0xf4, 0x4f, 0x2e, 0x49,
+ 0xcd, 0xb4, 0xaa, 0xc7, 0x5c, 0x8b, 0x75, 0x84,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x22, 0x6e, 0xca, 0xda, 0x00, 0x2d, 0x90, 0x96,
+ 0x24, 0xf8, 0x55, 0x17, 0x11, 0xda, 0x42, 0x1c,
+ 0x78, 0x4e, 0xbf, 0xd9, 0xc5, 0xcf, 0xf3, 0xe3,
+ 0xaf, 0xd3, 0x60, 0xcd, 0xaa, 0xe2, 0xc7, 0x22,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x97, 0xe2, 0x74, 0xdc, 0x6b, 0xa4, 0xaf, 0x32,
+ 0x3b, 0x50, 0x6d, 0x80, 0xb5, 0xd3, 0x0c, 0x36,
+ 0xea, 0x3f, 0x5d, 0x36, 0xa7, 0x49, 0x51, 0xf3,
+ 0xbd, 0x69, 0x68, 0x60, 0x9b, 0xde, 0x73, 0xf5,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x13, 0x74, 0xb1, 0x72, 0xd6, 0x53, 0x48, 0x28,
+ 0x42, 0xd8, 0xba, 0x64, 0x20, 0x60, 0xb6, 0x4c,
+ 0xc3, 0xac, 0x5d, 0x93, 0x8c, 0xb9, 0xd4, 0xcc,
+ 0xb4, 0x9f, 0x31, 0x1f, 0xeb, 0x68, 0x35, 0x58,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0xda, 0xbe, 0xd7, 0xbc, 0x6e, 0xe6, 0x5a, 0x57,
+ 0xeb, 0x9a, 0x93, 0xaa, 0x66, 0xd0, 0xe0, 0xc4,
+ 0x29, 0x7f, 0xe9, 0x3b, 0x8e, 0xdf, 0x81, 0x82,
+ 0x8d, 0x15, 0x11, 0x59, 0x4e, 0x13, 0xa5, 0x58,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x8c, 0x1a, 0xba, 0x40, 0x66, 0x94, 0x19, 0xf4,
+ 0x2e, 0xa2, 0xae, 0x94, 0x53, 0x18, 0xb6, 0xfd,
+ 0xa0, 0x12, 0xc5, 0xef, 0xd5, 0xd6, 0x1b, 0xa1,
+ 0x37, 0xea, 0x19, 0x44, 0x35, 0x54, 0x85, 0x74,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0xfd, 0x07, 0xd8, 0x77, 0x7d, 0x8b, 0x4f, 0xee,
+ 0x60, 0x60, 0x26, 0xef, 0x2a, 0x86, 0xfb, 0x67,
+ 0xeb, 0x31, 0x27, 0x03, 0x99, 0x3c, 0xde, 0xe5,
+ 0x84, 0x72, 0x71, 0x4c, 0x33, 0x7b, 0x87, 0x13,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x97, 0xc5, 0x58, 0x38, 0x20, 0xc7, 0xde, 0xfa,
+ 0xdd, 0x9b, 0x10, 0xc6, 0xc2, 0x2f, 0x94, 0xb5,
+ 0xc0, 0x33, 0xc0, 0x20, 0x1c, 0x2f, 0xb4, 0x28,
+ 0x5e, 0x36, 0xfa, 0x8c, 0x24, 0x1c, 0x18, 0x27,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x62, 0x17, 0x84, 0x26, 0x98, 0x30, 0x57, 0xca,
+ 0x4f, 0x32, 0xd9, 0x09, 0x09, 0x34, 0xe2, 0xcb,
+ 0x92, 0x45, 0xd5, 0xeb, 0x8b, 0x9b, 0x3c, 0xd8,
+ 0xaa, 0xc7, 0xd2, 0x2b, 0x04, 0xab, 0xb3, 0x35,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0x7f, 0xe1, 0x09, 0x78, 0x5d, 0x61, 0xfa, 0x5e,
+ 0x9b, 0x8c, 0xb1, 0xa9, 0x09, 0x69, 0xb4, 0x24,
+ 0x54, 0xf2, 0x1c, 0xc9, 0x5f, 0xfb, 0x59, 0x9d,
+ 0x36, 0x1b, 0x37, 0x44, 0xfc, 0x64, 0x79, 0xb6,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0xd2, 0x3b, 0x3a, 0xe7, 0x13, 0x4f, 0xbd, 0x29,
+ 0x6b, 0xd2, 0x79, 0x26, 0x6c, 0xd2, 0x22, 0x43,
+ 0x25, 0x34, 0x9b, 0x9b, 0x22, 0xb0, 0x9f, 0x61,
+ 0x1d, 0xf4, 0xe2, 0x65, 0x68, 0x95, 0x02, 0x6c,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0x0c, 0x34, 0x53, 0x3f, 0x0f, 0x8a, 0x39, 0x8d,
+ 0x63, 0xe4, 0x83, 0x6e, 0x11, 0x7d, 0x14, 0x8e,
+ 0x5b, 0xf0, 0x4d, 0xca, 0x23, 0x24, 0xb5, 0xd2,
+ 0x13, 0x3f, 0xd9, 0xde, 0x84, 0x74, 0x26, 0x59,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0xa8, 0xb8, 0x83, 0x01, 0x1b, 0x38, 0x7a, 0xca,
+ 0x59, 0xe9, 0x5b, 0x37, 0x6a, 0xab, 0xb4, 0x85,
+ 0x94, 0x73, 0x26, 0x04, 0xef, 0xed, 0xf4, 0x0d,
+ 0xd6, 0x09, 0x21, 0x09, 0x96, 0x78, 0xe3, 0xcf,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0x0b, 0x12, 0x66, 0x96, 0x78, 0x4f, 0x2c, 0x35,
+ 0xa4, 0xed, 0xbc, 0xb8, 0x30, 0xa6, 0x37, 0x9b,
+ 0x94, 0x13, 0xae, 0x86, 0xf0, 0x20, 0xfb, 0x49,
+ 0x8f, 0x5d, 0x20, 0x70, 0x60, 0x2b, 0x02, 0x70,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0xe4, 0xbd, 0xe4, 0x3b, 0x85, 0xf4, 0x6f, 0x11,
+ 0xad, 0xc4, 0x79, 0xcc, 0x8e, 0x6d, 0x8b, 0x15,
+ 0xbb, 0xf9, 0xd3, 0x65, 0xe1, 0xf8, 0x8d, 0x22,
+ 0x65, 0x66, 0x66, 0xb3, 0xf5, 0xd0, 0x9c, 0xaf,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0x90, 0x5f, 0xe0, 0xfc, 0xb1, 0xdc, 0x38, 0x1b,
+ 0xe5, 0x37, 0x3f, 0xd2, 0xcc, 0x48, 0xc4, 0xbc,
+ 0xb4, 0xfd, 0xf7, 0x71, 0x5f, 0x6b, 0xf4, 0xc4,
+ 0xa6, 0x08, 0x7e, 0xfc, 0x4e, 0x96, 0xf7, 0xc2,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x1f, 0x34, 0x0a, 0x3b, 0xdb, 0xf7, 0x7a, 0xdb,
+ 0x3d, 0x89, 0x85, 0x0c, 0xd2, 0xf0, 0x0c, 0xbd,
+ 0x25, 0x39, 0x14, 0x06, 0x28, 0x0f, 0x6b, 0x5f,
+ 0xe3, 0x1f, 0x2a, 0xb6, 0xca, 0x56, 0x41, 0xa1,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0x7b, 0x01, 0x2d, 0x84, 0x70, 0xee, 0xe0, 0x77,
+ 0x3c, 0x17, 0x63, 0xfe, 0x40, 0xd7, 0xfd, 0xa1,
+ 0x75, 0x90, 0xb8, 0x3e, 0x50, 0xcd, 0x06, 0xb7,
+ 0xb9, 0xb9, 0x2b, 0x91, 0x4f, 0xba, 0xe4, 0x4c,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[SHA256_DIGEST_SIZE] = {
+ 0x78, 0x1c, 0xb1, 0x9f, 0xe5, 0xe1, 0xcb, 0x41,
+ 0x8b, 0x34, 0x00, 0x33, 0x57, 0xc3, 0x1c, 0x8f,
+ 0x5c, 0x84, 0xc7, 0x8b, 0x87, 0x6a, 0x13, 0x29,
+ 0x2f, 0xc4, 0x1a, 0x70, 0x5e, 0x40, 0xf2, 0xfe,
+};
+
+static const u8 hmac_testvec_consolidated[SHA256_DIGEST_SIZE] = {
+ 0x56, 0x96, 0x2e, 0x23, 0x3f, 0x94, 0x89, 0x0d,
+ 0x0f, 0x24, 0x36, 0x2e, 0x19, 0x3d, 0xb5, 0xac,
+ 0xb8, 0xcd, 0xf1, 0xc9, 0xca, 0xac, 0xee, 0x9d,
+ 0x62, 0xe6, 0x81, 0xe5, 0x96, 0xf9, 0x38, 0xf5,
+};
diff --git a/lib/crypto/tests/sha256_kunit.c b/lib/crypto/tests/sha256_kunit.c
new file mode 100644
index 000000000000..5dccdee79693
--- /dev/null
+++ b/lib/crypto/tests/sha256_kunit.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/sha2.h>
+#include "sha256-testvecs.h"
+
+/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */
+#define HASH sha256
+#define HASH_CTX sha256_ctx
+#define HASH_SIZE SHA256_DIGEST_SIZE
+#define HASH_INIT sha256_init
+#define HASH_UPDATE sha256_update
+#define HASH_FINAL sha256_final
+#define HMAC_KEY hmac_sha256_key
+#define HMAC_CTX hmac_sha256_ctx
+#define HMAC_PREPAREKEY hmac_sha256_preparekey
+#define HMAC_INIT hmac_sha256_init
+#define HMAC_UPDATE hmac_sha256_update
+#define HMAC_FINAL hmac_sha256_final
+#define HMAC hmac_sha256
+#define HMAC_USINGRAWKEY hmac_sha256_usingrawkey
+#include "hash-test-template.h"
+
+static void free_guarded_buf(void *buf)
+{
+ vfree(buf);
+}
+
+/*
+ * Allocate a KUnit-managed buffer of @len bytes that is immediately followed
+ * by an unmapped page, and assert that the allocation succeeds.
+ */
+static void *alloc_guarded_buf(struct kunit *test, size_t len)
+{
+ size_t full_len = round_up(len, PAGE_SIZE);
+ void *buf = vmalloc(full_len);
+
+ KUNIT_ASSERT_NOT_NULL(test, buf);
+ KUNIT_ASSERT_EQ(test, 0,
+ kunit_add_action_or_reset(test, free_guarded_buf, buf));
+ return buf + full_len - len;
+}
+
+/*
+ * Test for sha256_finup_2x(). Specifically, choose various data lengths and
+ * salt lengths, and for each one, verify that sha256_finup_2x() produces the
+ * same results as sha256_update() and sha256_final().
+ *
+ * Use guarded buffers for all inputs and outputs to reliably detect any
+ * out-of-bounds reads or writes, even if they occur in assembly code.
+ */
+static void test_sha256_finup_2x(struct kunit *test)
+{
+ const size_t max_data_len = 16384;
+ u8 *data1_buf, *data2_buf, *hash1, *hash2;
+ u8 expected_hash1[SHA256_DIGEST_SIZE];
+ u8 expected_hash2[SHA256_DIGEST_SIZE];
+ u8 salt[SHA256_BLOCK_SIZE];
+ struct sha256_ctx *ctx;
+
+ data1_buf = alloc_guarded_buf(test, max_data_len);
+ data2_buf = alloc_guarded_buf(test, max_data_len);
+ hash1 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE);
+ hash2 = alloc_guarded_buf(test, SHA256_DIGEST_SIZE);
+ ctx = alloc_guarded_buf(test, sizeof(*ctx));
+
+ rand_bytes(data1_buf, max_data_len);
+ rand_bytes(data2_buf, max_data_len);
+ rand_bytes(salt, sizeof(salt));
+ memset(ctx, 0, sizeof(*ctx));
+
+ for (size_t i = 0; i < 500; i++) {
+ size_t salt_len = rand_length(sizeof(salt));
+ size_t data_len = rand_length(max_data_len);
+ const u8 *data1 = data1_buf + max_data_len - data_len;
+ const u8 *data2 = data2_buf + max_data_len - data_len;
+ struct sha256_ctx orig_ctx;
+
+ sha256_init(ctx);
+ sha256_update(ctx, salt, salt_len);
+ orig_ctx = *ctx;
+
+ sha256_finup_2x(ctx, data1, data2, data_len, hash1, hash2);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, ctx, &orig_ctx, sizeof(*ctx),
+ "sha256_finup_2x() modified its ctx argument");
+
+ sha256_update(ctx, data1, data_len);
+ sha256_final(ctx, expected_hash1);
+ sha256_update(&orig_ctx, data2, data_len);
+ sha256_final(&orig_ctx, expected_hash2);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, hash1, expected_hash1, SHA256_DIGEST_SIZE,
+ "Wrong hash1 with salt_len=%zu data_len=%zu", salt_len,
+ data_len);
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, hash2, expected_hash2, SHA256_DIGEST_SIZE,
+ "Wrong hash2 with salt_len=%zu data_len=%zu", salt_len,
+ data_len);
+ }
+}
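The test above also illustrates the intended calling pattern for sha256_finup_2x(): absorb any common prefix once, then finish two equal-length messages in a single interleaved pass. A minimal caller sketch, using only the functions exercised by this test (the wrapper name and its parameters are hypothetical):

static void sha256_two_salted(const u8 *salt, size_t salt_len,
			      const u8 *msg_a, const u8 *msg_b, size_t msg_len,
			      u8 out_a[SHA256_DIGEST_SIZE],
			      u8 out_b[SHA256_DIGEST_SIZE])
{
	struct sha256_ctx ctx;

	sha256_init(&ctx);
	sha256_update(&ctx, salt, salt_len);
	/* Finishes both messages; does not modify ctx (asserted above). */
	sha256_finup_2x(&ctx, msg_a, msg_b, msg_len, out_a, out_b);
}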
+
+/* Test sha256_finup_2x() with ctx == NULL */
+static void test_sha256_finup_2x_defaultctx(struct kunit *test)
+{
+ const size_t data_len = 128;
+ struct sha256_ctx ctx;
+ u8 hash1_a[SHA256_DIGEST_SIZE];
+ u8 hash2_a[SHA256_DIGEST_SIZE];
+ u8 hash1_b[SHA256_DIGEST_SIZE];
+ u8 hash2_b[SHA256_DIGEST_SIZE];
+
+ rand_bytes(test_buf, 2 * data_len);
+
+ sha256_init(&ctx);
+ sha256_finup_2x(&ctx, test_buf, &test_buf[data_len], data_len, hash1_a,
+ hash2_a);
+
+ sha256_finup_2x(NULL, test_buf, &test_buf[data_len], data_len, hash1_b,
+ hash2_b);
+
+ KUNIT_ASSERT_MEMEQ(test, hash1_a, hash1_b, SHA256_DIGEST_SIZE);
+ KUNIT_ASSERT_MEMEQ(test, hash2_a, hash2_b, SHA256_DIGEST_SIZE);
+}
+
+/*
+ * Test that sha256_finup_2x() and sha256_update/final() produce consistent
+ * results with total message lengths that require more than 32 bits.
+ */
+static void test_sha256_finup_2x_hugelen(struct kunit *test)
+{
+ const size_t data_len = 4 * SHA256_BLOCK_SIZE;
+ struct sha256_ctx ctx = {};
+ u8 expected_hash[SHA256_DIGEST_SIZE];
+ u8 hash[SHA256_DIGEST_SIZE];
+
+ rand_bytes(test_buf, data_len);
+ for (size_t align = 0; align < SHA256_BLOCK_SIZE; align++) {
+ sha256_init(&ctx);
+ ctx.ctx.bytecount = 0x123456789abcd00 + align;
+
+ sha256_finup_2x(&ctx, test_buf, test_buf, data_len, hash, hash);
+
+ sha256_update(&ctx, test_buf, data_len);
+ sha256_final(&ctx, expected_hash);
+
+ KUNIT_ASSERT_MEMEQ(test, hash, expected_hash,
+ SHA256_DIGEST_SIZE);
+ }
+}
+
+/* Benchmark for sha256_finup_2x() */
+static void benchmark_sha256_finup_2x(struct kunit *test)
+{
+ /*
+ * Try a few different salt lengths, since sha256_finup_2x() performance
+ * may vary slightly for the same data_len depending on how many bytes
+ * were already processed in the initial context.
+ */
+ static const size_t salt_lens_to_test[] = { 0, 32, 64 };
+ const size_t data_len = 4096;
+ const size_t num_iters = 4096;
+ struct sha256_ctx ctx;
+ u8 hash1[SHA256_DIGEST_SIZE];
+ u8 hash2[SHA256_DIGEST_SIZE];
+
+ if (!IS_ENABLED(CONFIG_CRYPTO_LIB_BENCHMARK))
+ kunit_skip(test, "not enabled");
+ if (!sha256_finup_2x_is_optimized())
+ kunit_skip(test, "not relevant");
+
+ rand_bytes(test_buf, data_len * 2);
+
+ /* Warm-up */
+ for (size_t i = 0; i < num_iters; i++)
+ sha256_finup_2x(NULL, &test_buf[0], &test_buf[data_len],
+ data_len, hash1, hash2);
+
+ for (size_t i = 0; i < ARRAY_SIZE(salt_lens_to_test); i++) {
+ size_t salt_len = salt_lens_to_test[i];
+ u64 t0, t1;
+
+ /*
+ * Prepare the initial context. The time to process the salt is
+ * not measured; we're just interested in sha256_finup_2x().
+ */
+ sha256_init(&ctx);
+ sha256_update(&ctx, test_buf, salt_len);
+
+ preempt_disable();
+ t0 = ktime_get_ns();
+ for (size_t j = 0; j < num_iters; j++)
+ sha256_finup_2x(&ctx, &test_buf[0], &test_buf[data_len],
+ data_len, hash1, hash2);
+ t1 = ktime_get_ns();
+ preempt_enable();
+ kunit_info(test, "data_len=%zu salt_len=%zu: %llu MB/s",
+ data_len, salt_len,
+ div64_u64((u64)data_len * 2 * num_iters * 1000,
+ t1 - t0 ?: 1));
+ }
+}
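The throughput printed above follows from simple arithmetic: each call hashes two data_len-byte messages, so over num_iters calls

	MB/s = (2 * data_len * num_iters) bytes / (t1 - t0) ns
	     = (2 * data_len * num_iters * 1000) / (t1 - t0)

since 1 byte/ns equals 10**9 bytes/s, i.e. 1000 MB/s with MB = 10**6 bytes; the "?: 1" only guards against a zero elapsed time.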
+
+static struct kunit_case hash_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(test_sha256_finup_2x),
+ KUNIT_CASE(test_sha256_finup_2x_defaultctx),
+ KUNIT_CASE(test_sha256_finup_2x_hugelen),
+ KUNIT_CASE(benchmark_hash),
+ KUNIT_CASE(benchmark_sha256_finup_2x),
+ {},
+};
+
+static struct kunit_suite hash_test_suite = {
+ .name = "sha256",
+ .test_cases = hash_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(hash_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-256 and HMAC-SHA256");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/sha3-testvecs.h b/lib/crypto/tests/sha3-testvecs.h
new file mode 100644
index 000000000000..8d614a5fa0c3
--- /dev/null
+++ b/lib/crypto/tests/sha3-testvecs.h
@@ -0,0 +1,249 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha3 */
+
+/* SHA3-256 test vectors */
+
+static const struct {
+ size_t data_len;
+ u8 digest[SHA3_256_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0xa7, 0xff, 0xc6, 0xf8, 0xbf, 0x1e, 0xd7, 0x66,
+ 0x51, 0xc1, 0x47, 0x56, 0xa0, 0x61, 0xd6, 0x62,
+ 0xf5, 0x80, 0xff, 0x4d, 0xe4, 0x3b, 0x49, 0xfa,
+ 0x82, 0xd8, 0x0a, 0x4b, 0x80, 0xf8, 0x43, 0x4a,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x11, 0x03, 0xe7, 0x84, 0x51, 0x50, 0x86, 0x35,
+ 0x71, 0x8a, 0x70, 0xe3, 0xc4, 0x26, 0x7b, 0x21,
+ 0x02, 0x13, 0xa0, 0x81, 0xe8, 0xe6, 0x14, 0x25,
+ 0x07, 0x34, 0xe5, 0xc5, 0x40, 0x06, 0xf2, 0x8b,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0x2f, 0x6f, 0x6d, 0x47, 0x48, 0x52, 0x11, 0xb9,
+ 0xe4, 0x3d, 0xc8, 0x71, 0xcf, 0xb2, 0xee, 0xae,
+ 0x5b, 0xf4, 0x12, 0x84, 0x5b, 0x1c, 0xec, 0x6c,
+ 0xc1, 0x66, 0x88, 0xaa, 0xc3, 0x40, 0xbd, 0x7e,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0xec, 0x02, 0xe8, 0x81, 0x4f, 0x84, 0x41, 0x69,
+ 0x06, 0xd8, 0xdc, 0x1d, 0x01, 0x78, 0xd7, 0xcb,
+ 0x39, 0xdf, 0xd3, 0x12, 0x1c, 0x99, 0xfd, 0xf3,
+ 0x5c, 0x83, 0xc9, 0xc2, 0x7a, 0x7b, 0x6a, 0x05,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0xff, 0x6f, 0xc3, 0x41, 0xc3, 0x5f, 0x34, 0x6d,
+ 0xa7, 0xdf, 0x3e, 0xc2, 0x8b, 0x29, 0xb6, 0xf1,
+ 0xf8, 0x67, 0xfd, 0xcd, 0xb1, 0x9f, 0x38, 0x08,
+ 0x1d, 0x8d, 0xd9, 0xc2, 0x43, 0x66, 0x18, 0x6c,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0xe4, 0xb1, 0x06, 0x17, 0xf8, 0x8b, 0x91, 0x95,
+ 0xe7, 0x57, 0x66, 0xac, 0x08, 0xb2, 0x03, 0x3e,
+ 0xf7, 0x84, 0x1f, 0xe3, 0x25, 0xa3, 0x11, 0xd2,
+ 0x11, 0xa4, 0x78, 0x74, 0x2a, 0x43, 0x20, 0xa5,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0xeb, 0x57, 0x5f, 0x20, 0xa3, 0x6b, 0xc7, 0xb4,
+ 0x66, 0x2a, 0xa0, 0x30, 0x3b, 0x52, 0x00, 0xc9,
+ 0xce, 0x6a, 0xd8, 0x1e, 0xbe, 0xed, 0xa1, 0xd1,
+ 0xbe, 0x63, 0xc7, 0xe1, 0xe2, 0x66, 0x67, 0x0c,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0xf0, 0x67, 0xad, 0x66, 0xbe, 0xec, 0x5a, 0xfd,
+ 0x29, 0xd2, 0x4f, 0x1d, 0xb2, 0x24, 0xb8, 0x90,
+ 0x05, 0x28, 0x0e, 0x66, 0x67, 0x74, 0x2d, 0xee,
+ 0x66, 0x25, 0x11, 0xd1, 0x76, 0xa2, 0xfc, 0x3a,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x57, 0x56, 0x21, 0xb3, 0x2d, 0x2d, 0xe1, 0x9d,
+ 0xbf, 0x2c, 0x82, 0xa8, 0xad, 0x7e, 0x6c, 0x46,
+ 0xfb, 0x30, 0xeb, 0xce, 0xcf, 0xed, 0x2d, 0x65,
+ 0xe7, 0xe4, 0x96, 0x69, 0xe0, 0x48, 0xd2, 0xb6,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x7b, 0xba, 0x67, 0x15, 0xe5, 0x21, 0xc4, 0x69,
+ 0xd3, 0xef, 0x5c, 0x97, 0x9f, 0x5b, 0xba, 0x9c,
+ 0xfa, 0x55, 0x64, 0xec, 0xb5, 0x37, 0x53, 0x1b,
+ 0x3f, 0x4c, 0x0a, 0xed, 0x51, 0x98, 0x2b, 0x52,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x44, 0xb6, 0x6b, 0x83, 0x09, 0x83, 0x55, 0x83,
+ 0xde, 0x1f, 0xcc, 0x33, 0xef, 0xdc, 0x05, 0xbb,
+ 0x3b, 0x63, 0x76, 0x45, 0xe4, 0x8e, 0x14, 0x7a,
+ 0x2d, 0xae, 0x90, 0xce, 0x68, 0xc3, 0xa4, 0xf2,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x50, 0x3e, 0x99, 0x4e, 0x28, 0x2b, 0xc9, 0xf4,
+ 0xf5, 0xeb, 0x2b, 0x16, 0x04, 0x2d, 0xf5, 0xbe,
+ 0xc0, 0x91, 0x41, 0x2a, 0x8e, 0x69, 0x5e, 0x39,
+ 0x53, 0x2c, 0xc1, 0x18, 0xa5, 0xeb, 0xd8, 0xda,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x90, 0x0b, 0xa6, 0x92, 0x84, 0x30, 0xaf, 0xee,
+ 0x38, 0x59, 0x83, 0x83, 0xe9, 0xfe, 0xab, 0x86,
+ 0x79, 0x1b, 0xcd, 0xe7, 0x0a, 0x0f, 0x58, 0x53,
+ 0x36, 0xab, 0x12, 0xe1, 0x5c, 0x97, 0xc1, 0xfb,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x2b, 0x52, 0x1e, 0x54, 0xbe, 0x38, 0x4c, 0x3e,
+ 0x73, 0x37, 0x18, 0xf5, 0x25, 0x2c, 0xc8, 0xc7,
+ 0xda, 0x7e, 0xb6, 0x47, 0x9d, 0xf4, 0x46, 0xce,
+ 0xfa, 0x80, 0x20, 0x6b, 0xbd, 0xfd, 0x2a, 0xd8,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x45, 0xf0, 0xf5, 0x9b, 0xd9, 0x91, 0x26, 0xd5,
+ 0x91, 0x3b, 0xf8, 0x87, 0x8b, 0x34, 0x02, 0x31,
+ 0x64, 0xab, 0xf4, 0x1c, 0x6e, 0x34, 0x72, 0xdf,
+ 0x32, 0x6d, 0xe5, 0xd2, 0x67, 0x5e, 0x86, 0x93,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0xb3, 0xaf, 0x71, 0x64, 0xfa, 0xd4, 0xf1, 0x07,
+ 0x38, 0xef, 0x04, 0x8e, 0x89, 0xf4, 0x02, 0xd2,
+ 0xa5, 0xaf, 0x3b, 0xf5, 0x67, 0x56, 0xcf, 0xa9,
+ 0x8e, 0x43, 0xf5, 0xb5, 0xe3, 0x91, 0x8e, 0xe7,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0x51, 0xac, 0x0a, 0x65, 0xb7, 0x96, 0x20, 0xcf,
+ 0x88, 0xf6, 0x97, 0x35, 0x89, 0x0d, 0x31, 0x0f,
+ 0xbe, 0x17, 0xbe, 0x62, 0x03, 0x67, 0xc0, 0xee,
+ 0x4f, 0xc1, 0xe3, 0x7f, 0x6f, 0xab, 0xac, 0xb4,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0x7e, 0xea, 0xa8, 0xd7, 0xde, 0x20, 0x1b, 0x58,
+ 0x24, 0xd8, 0x26, 0x40, 0x36, 0x5f, 0x3f, 0xaa,
+ 0xe5, 0x5a, 0xea, 0x98, 0x58, 0xd4, 0xd6, 0xfc,
+ 0x20, 0x4c, 0x5c, 0x4f, 0xaf, 0x56, 0xc7, 0xc3,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0x61, 0xb1, 0xb1, 0x3e, 0x0e, 0x7e, 0x90, 0x3d,
+ 0x31, 0x54, 0xbd, 0xc9, 0x0d, 0x53, 0x62, 0xf1,
+ 0xcd, 0x18, 0x80, 0xf9, 0x91, 0x75, 0x41, 0xb3,
+ 0x51, 0x39, 0x57, 0xa7, 0xa8, 0x1e, 0xfb, 0xc9,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0xab, 0x29, 0xda, 0x10, 0xc4, 0x11, 0x2d, 0x5c,
+ 0xd1, 0xce, 0x1c, 0x95, 0xfa, 0xc6, 0xc7, 0xb0,
+ 0x1b, 0xd1, 0xdc, 0x6f, 0xa0, 0x9d, 0x1b, 0x23,
+ 0xfb, 0x6e, 0x90, 0x97, 0xd0, 0x75, 0x44, 0x7a,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0x02, 0x45, 0x95, 0xf4, 0x19, 0xb5, 0x93, 0x29,
+ 0x90, 0xf2, 0x63, 0x3f, 0x89, 0xe8, 0xa5, 0x31,
+ 0x76, 0xf2, 0x89, 0x79, 0x66, 0xd3, 0x96, 0xdf,
+ 0x33, 0xd1, 0xa6, 0x17, 0x73, 0xb1, 0xd0, 0x45,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0xd1, 0x8e, 0x22, 0xea, 0x44, 0x87, 0x6e, 0x9d,
+ 0xfb, 0x36, 0x02, 0x20, 0x63, 0xb7, 0x69, 0x45,
+ 0x25, 0x41, 0x69, 0xe0, 0x9b, 0x87, 0xcf, 0xa3,
+ 0x51, 0xbb, 0xfc, 0x8d, 0xf7, 0x29, 0xa7, 0xea,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x11, 0x86, 0x7d, 0x84, 0xf9, 0x8c, 0x6e, 0xc4,
+ 0x64, 0x36, 0xc6, 0xf3, 0x42, 0x92, 0x31, 0x2b,
+ 0x1e, 0x12, 0xe6, 0x4d, 0xbe, 0xfa, 0x77, 0x3f,
+ 0x89, 0x41, 0x33, 0x58, 0x1c, 0x98, 0x16, 0x0a,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0xb2, 0xba, 0x0c, 0x8c, 0x9d, 0xbb, 0x1e, 0xb0,
+ 0x03, 0xb5, 0xdf, 0x4f, 0xf5, 0x35, 0xdb, 0xec,
+ 0x60, 0xf2, 0x5b, 0xb6, 0xd0, 0x49, 0xd3, 0xed,
+ 0x55, 0xc0, 0x7a, 0xd7, 0xaf, 0xa1, 0xea, 0x53,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[SHA3_256_DIGEST_SIZE] = {
+ 0x3b, 0x33, 0x67, 0xf8, 0xea, 0x92, 0x78, 0x62,
+ 0xdd, 0xbe, 0x72, 0x15, 0xbd, 0x6f, 0xfa, 0xe5,
+ 0x5e, 0xab, 0x9f, 0xb1, 0xe4, 0x23, 0x7c, 0x2c,
+ 0x80, 0xcf, 0x09, 0x75, 0xf8, 0xe2, 0xfa, 0x30,
+};
+
+/* SHAKE test vectors */
+
+static const u8 shake128_testvec_consolidated[SHA3_256_DIGEST_SIZE] = {
+ 0x89, 0x88, 0x3a, 0x44, 0xec, 0xfe, 0x3c, 0xeb,
+ 0x2f, 0x1c, 0x1d, 0xda, 0x9e, 0x36, 0x64, 0xf0,
+ 0x85, 0x4c, 0x49, 0x12, 0x76, 0x5a, 0x4d, 0xe7,
+ 0xa8, 0xfd, 0xcd, 0xbe, 0x45, 0xb4, 0x6f, 0xb0,
+};
+
+static const u8 shake256_testvec_consolidated[SHA3_256_DIGEST_SIZE] = {
+ 0x5a, 0xfd, 0x66, 0x62, 0x5c, 0x37, 0x2b, 0x41,
+ 0x77, 0x1c, 0x01, 0x5d, 0x64, 0x7c, 0x63, 0x7a,
+ 0x7c, 0x76, 0x9e, 0xa8, 0xd1, 0xb0, 0x8e, 0x02,
+ 0x16, 0x9b, 0xfe, 0x0e, 0xb5, 0xd8, 0x6a, 0xb5,
+};
diff --git a/lib/crypto/tests/sha384-testvecs.h b/lib/crypto/tests/sha384-testvecs.h
new file mode 100644
index 000000000000..6e2f572dd792
--- /dev/null
+++ b/lib/crypto/tests/sha384-testvecs.h
@@ -0,0 +1,290 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha384 */
+
+static const struct {
+ size_t data_len;
+ u8 digest[SHA384_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0x38, 0xb0, 0x60, 0xa7, 0x51, 0xac, 0x96, 0x38,
+ 0x4c, 0xd9, 0x32, 0x7e, 0xb1, 0xb1, 0xe3, 0x6a,
+ 0x21, 0xfd, 0xb7, 0x11, 0x14, 0xbe, 0x07, 0x43,
+ 0x4c, 0x0c, 0xc7, 0xbf, 0x63, 0xf6, 0xe1, 0xda,
+ 0x27, 0x4e, 0xde, 0xbf, 0xe7, 0x6f, 0x65, 0xfb,
+ 0xd5, 0x1a, 0xd2, 0xf1, 0x48, 0x98, 0xb9, 0x5b,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x07, 0x34, 0x9d, 0x74, 0x48, 0x76, 0xa5, 0x72,
+ 0x78, 0x02, 0xb8, 0x6e, 0x21, 0x59, 0xb0, 0x75,
+ 0x09, 0x68, 0x11, 0x39, 0x53, 0x61, 0xee, 0x8d,
+ 0xf2, 0x01, 0xf3, 0x90, 0x53, 0x7c, 0xd3, 0xde,
+ 0x13, 0x9f, 0xd2, 0x74, 0x28, 0xfe, 0xe1, 0xc8,
+ 0x2e, 0x95, 0xc6, 0x7d, 0x69, 0x4d, 0x04, 0xc6,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0xc4, 0xef, 0x6e, 0x8c, 0x19, 0x1c, 0xaa, 0x0e,
+ 0x86, 0xf2, 0x68, 0xa1, 0xa0, 0x2d, 0x2e, 0xb2,
+ 0x84, 0xbc, 0x5d, 0x53, 0x31, 0xf8, 0x03, 0x75,
+ 0x56, 0xf4, 0x8b, 0x23, 0x1a, 0x68, 0x15, 0x9a,
+ 0x60, 0xb2, 0xec, 0x05, 0xe1, 0xd4, 0x5e, 0x9e,
+ 0xe8, 0x7c, 0x9d, 0xe4, 0x0f, 0x9c, 0x3a, 0xdd,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0x29, 0xd2, 0x02, 0xa2, 0x77, 0x24, 0xc7, 0xa7,
+ 0x23, 0x0c, 0x3e, 0x30, 0x56, 0x47, 0xdb, 0x75,
+ 0xd4, 0x41, 0xf8, 0xb3, 0x8e, 0x26, 0xf6, 0x92,
+ 0xbc, 0x20, 0x2e, 0x96, 0xcc, 0x81, 0x5f, 0x32,
+ 0x82, 0x60, 0xe2, 0xcf, 0x23, 0xd7, 0x3c, 0x90,
+ 0xb2, 0x56, 0x8f, 0xb6, 0x0f, 0xf0, 0x6b, 0x80,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0x21, 0x4c, 0xac, 0xfe, 0xbd, 0x40, 0x74, 0x1f,
+ 0xa2, 0x2d, 0x2f, 0x35, 0x91, 0xfd, 0xc9, 0x97,
+ 0x88, 0x12, 0x6c, 0x0c, 0x6e, 0xd8, 0x50, 0x0b,
+ 0x4b, 0x2c, 0x89, 0xa6, 0xa6, 0x4a, 0xad, 0xd7,
+ 0x72, 0x62, 0x2c, 0x62, 0x81, 0xcd, 0x24, 0x74,
+ 0xf5, 0x44, 0x05, 0xa0, 0x97, 0xea, 0xf1, 0x78,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0x06, 0x8b, 0x92, 0x9f, 0x8b, 0x64, 0xb2, 0x80,
+ 0xde, 0xcc, 0xde, 0xc3, 0x2f, 0x22, 0x27, 0xe8,
+ 0x3b, 0x6e, 0x16, 0x21, 0x14, 0x81, 0xbe, 0x5b,
+ 0xa7, 0xa7, 0x14, 0x8a, 0x00, 0x8f, 0x0d, 0x38,
+ 0x11, 0x63, 0xe8, 0x3e, 0xb9, 0xf1, 0xcf, 0x87,
+ 0xb1, 0x28, 0xe5, 0xa1, 0x89, 0xa8, 0x7a, 0xde,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0x9e, 0x37, 0x76, 0x62, 0x98, 0x39, 0xbe, 0xfd,
+ 0x2b, 0x91, 0x20, 0x54, 0x8f, 0x21, 0xe7, 0x30,
+ 0x0a, 0x01, 0x7a, 0x65, 0x0b, 0xc9, 0xb3, 0x89,
+ 0x3c, 0xb6, 0xd3, 0xa8, 0xff, 0xc9, 0x1b, 0x5c,
+ 0xd4, 0xac, 0xb4, 0x7e, 0xba, 0x94, 0xc3, 0x8a,
+ 0x26, 0x41, 0xf6, 0xd5, 0xed, 0x6f, 0x27, 0xa7,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x03, 0x1f, 0xef, 0x5a, 0x16, 0x28, 0x78, 0x10,
+ 0x29, 0xe8, 0xe2, 0xe4, 0x84, 0x36, 0x19, 0x10,
+ 0xaa, 0xea, 0xde, 0x06, 0x39, 0x5f, 0xb2, 0x36,
+ 0xca, 0x24, 0x4f, 0x7b, 0x66, 0xf7, 0xe7, 0x31,
+ 0xf3, 0x9b, 0x74, 0x1e, 0x17, 0x20, 0x88, 0x62,
+ 0x50, 0xeb, 0x5f, 0x9a, 0xa7, 0x2c, 0xf4, 0xc9,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x10, 0xce, 0xed, 0x26, 0xb8, 0xac, 0xc1, 0x1b,
+ 0xe6, 0xb9, 0xeb, 0x7c, 0xae, 0xcd, 0x55, 0x5a,
+ 0x20, 0x2a, 0x7b, 0x43, 0xe6, 0x3e, 0xf0, 0x3f,
+ 0xd9, 0x2f, 0x8c, 0x52, 0xe2, 0xf0, 0xb6, 0x24,
+ 0x2e, 0xa4, 0xac, 0x24, 0x3a, 0x54, 0x99, 0x71,
+ 0x65, 0xab, 0x97, 0x2d, 0xb6, 0xe6, 0x94, 0x20,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x24, 0x6d, 0x9f, 0x59, 0x42, 0x36, 0xca, 0x34,
+ 0x36, 0x41, 0xa2, 0xcd, 0x69, 0xdf, 0x3d, 0xcb,
+ 0x64, 0x94, 0x54, 0xb2, 0xed, 0xc1, 0x1c, 0x31,
+ 0xe3, 0x26, 0xcb, 0x71, 0xe6, 0x98, 0xb2, 0x56,
+ 0x74, 0x30, 0xa9, 0x15, 0x98, 0x9d, 0xb3, 0x07,
+ 0xcc, 0xa8, 0xcc, 0x6f, 0x42, 0xb0, 0x9d, 0x2b,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x85, 0x1f, 0xbc, 0x5e, 0x2a, 0x00, 0x7d, 0xc2,
+ 0x21, 0x4c, 0x28, 0x14, 0xc5, 0xd8, 0x0c, 0xe8,
+ 0x55, 0xa5, 0xa0, 0x77, 0xda, 0x8f, 0xce, 0xd4,
+ 0xf0, 0xcb, 0x30, 0xb8, 0x9c, 0x47, 0xe1, 0x33,
+ 0x92, 0x18, 0xc5, 0x1f, 0xf2, 0xef, 0xb5, 0xe5,
+ 0xbc, 0x63, 0xa6, 0xe5, 0x9a, 0xc9, 0xcc, 0xf1,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x26, 0xd2, 0x4c, 0xb6, 0xce, 0xd8, 0x22, 0x2b,
+ 0x44, 0x10, 0x6f, 0x59, 0xf7, 0x0d, 0xb9, 0x3f,
+ 0x7d, 0x29, 0x75, 0xf1, 0x71, 0xb2, 0x71, 0x23,
+ 0xef, 0x68, 0xb7, 0x25, 0xae, 0xb8, 0x45, 0xf8,
+ 0xa3, 0xb2, 0x2d, 0x7a, 0x83, 0x0a, 0x05, 0x61,
+ 0xbc, 0x73, 0xf1, 0xf9, 0xba, 0xfb, 0x3d, 0xc2,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x7c, 0xe5, 0x7f, 0x5e, 0xea, 0xd9, 0x7e, 0x54,
+ 0x14, 0x30, 0x6f, 0x37, 0x02, 0x71, 0x0f, 0xf1,
+ 0x14, 0x16, 0xfa, 0xeb, 0x6e, 0x1e, 0xf0, 0xbe,
+ 0x10, 0xed, 0x01, 0xbf, 0xa0, 0x9d, 0xcb, 0x07,
+ 0x5f, 0x8b, 0x7f, 0x44, 0xe1, 0xd9, 0x13, 0xf0,
+ 0x29, 0xa2, 0x54, 0x32, 0xd9, 0xb0, 0x69, 0x69,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0xc5, 0x54, 0x1f, 0xcb, 0x9d, 0x8f, 0xdf, 0xbf,
+ 0xab, 0x55, 0x92, 0x1d, 0x3b, 0x93, 0x79, 0x26,
+ 0xdf, 0xba, 0x9a, 0x28, 0xff, 0xa0, 0x6c, 0xae,
+ 0x7b, 0x53, 0x8d, 0xfa, 0xef, 0x35, 0x88, 0x19,
+ 0x16, 0xb8, 0x72, 0x86, 0x76, 0x2a, 0xf5, 0xe6,
+ 0xec, 0xb2, 0xd7, 0xd4, 0xbe, 0x1a, 0xe4, 0x9f,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x74, 0x9d, 0x77, 0xfb, 0xe8, 0x0f, 0x0c, 0x2d,
+ 0x86, 0x0d, 0x49, 0xea, 0x2b, 0xd0, 0x13, 0xd1,
+ 0xe8, 0xb8, 0xe1, 0xa3, 0x7b, 0x48, 0xab, 0x6a,
+ 0x21, 0x2b, 0x4c, 0x48, 0x32, 0xb5, 0xdc, 0x31,
+ 0x7f, 0xd0, 0x32, 0x67, 0x9a, 0xc0, 0x85, 0x53,
+ 0xef, 0xe9, 0xfb, 0xe1, 0x8b, 0xd8, 0xcc, 0xc2,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0x7b, 0xa9, 0xde, 0xa3, 0x07, 0x5c, 0x4c, 0xaa,
+ 0x31, 0xc6, 0x9e, 0x55, 0xd4, 0x3f, 0x52, 0xdd,
+ 0xde, 0x36, 0x70, 0x96, 0x59, 0x6e, 0x90, 0x78,
+ 0x4c, 0x6a, 0x27, 0xde, 0x83, 0x84, 0xc3, 0x35,
+ 0x53, 0x76, 0x1d, 0xbf, 0x83, 0x64, 0xcf, 0xf2,
+ 0xb0, 0x3e, 0x07, 0x27, 0xe4, 0x25, 0x6c, 0x56,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0x53, 0x50, 0xf7, 0x3b, 0x86, 0x1d, 0x7a, 0xe2,
+ 0x5d, 0x9b, 0x71, 0xfa, 0x25, 0x23, 0x5a, 0xfe,
+ 0x8c, 0xb9, 0xac, 0x8a, 0x9d, 0x6c, 0x99, 0xbc,
+ 0x01, 0x9e, 0xa0, 0xd6, 0x3c, 0x03, 0x46, 0x21,
+ 0xb6, 0xd0, 0xb0, 0xb3, 0x23, 0x23, 0x58, 0xf1,
+ 0xea, 0x4e, 0xf2, 0x1a, 0x2f, 0x14, 0x2b, 0x5a,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0x06, 0x03, 0xb3, 0xba, 0x14, 0xe0, 0x28, 0x07,
+ 0xd5, 0x15, 0x97, 0x1f, 0x87, 0xef, 0x80, 0xba,
+ 0x48, 0x03, 0xb6, 0xc5, 0x47, 0xca, 0x8c, 0x95,
+ 0xed, 0x95, 0xfd, 0x27, 0xb6, 0x83, 0xda, 0x6d,
+ 0xa7, 0xb2, 0x1a, 0xd2, 0xb5, 0x89, 0xbb, 0xb4,
+ 0x00, 0xbc, 0x86, 0x54, 0x7d, 0x5a, 0x91, 0x63,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0xd3, 0xe0, 0x6e, 0x7d, 0x80, 0x08, 0x53, 0x07,
+ 0x8c, 0x0f, 0xc2, 0xce, 0x9f, 0x09, 0x86, 0x31,
+ 0x28, 0x24, 0x3c, 0x3e, 0x2d, 0x36, 0xb4, 0x28,
+ 0xc7, 0x1b, 0x70, 0xf9, 0x35, 0x9b, 0x10, 0xfa,
+ 0xc8, 0x5e, 0x2b, 0x32, 0x7f, 0x65, 0xd2, 0x68,
+ 0xb2, 0x84, 0x90, 0xf6, 0xc8, 0x6e, 0xb8, 0xdb,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0x39, 0xeb, 0xc4, 0xb3, 0x08, 0xe2, 0xdd, 0xf3,
+ 0x9f, 0x5e, 0x44, 0x93, 0x63, 0x8b, 0x39, 0x57,
+ 0xd7, 0xe8, 0x7e, 0x3d, 0x74, 0xf8, 0xf6, 0xab,
+ 0xfe, 0x74, 0x51, 0xe4, 0x1b, 0x4a, 0x23, 0xbc,
+ 0x69, 0xfc, 0xbb, 0xa7, 0x71, 0xa7, 0x86, 0x24,
+ 0xcc, 0x85, 0x70, 0xf2, 0x31, 0x0d, 0x47, 0xc0,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0x23, 0xc3, 0x97, 0x06, 0x79, 0xbe, 0x8a, 0xe9,
+ 0x1f, 0x1a, 0x43, 0xad, 0xe6, 0x76, 0x23, 0x13,
+ 0x64, 0xae, 0xda, 0xe7, 0x8b, 0x88, 0x96, 0xb6,
+ 0xa9, 0x1a, 0xb7, 0x80, 0x8e, 0x1c, 0x94, 0x98,
+ 0x09, 0x08, 0xdb, 0x8e, 0x4d, 0x0a, 0x09, 0x65,
+ 0xe5, 0x21, 0x1c, 0xd9, 0xab, 0x64, 0xbb, 0xea,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0x4f, 0x4a, 0x88, 0x9f, 0x40, 0x89, 0xfe, 0xb6,
+ 0xda, 0x9d, 0xcd, 0xa5, 0x27, 0xd2, 0x29, 0x71,
+ 0x58, 0x60, 0xd4, 0x55, 0xfe, 0x92, 0xcd, 0x51,
+ 0x8b, 0xec, 0x3b, 0xd3, 0xd1, 0x3e, 0x8d, 0x36,
+ 0x7b, 0xb1, 0x41, 0xef, 0xec, 0x9d, 0xdf, 0xcd,
+ 0x4e, 0xde, 0x5a, 0xe5, 0xe5, 0x16, 0x14, 0x54,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0xb5, 0xa5, 0x3e, 0x86, 0x39, 0x20, 0x49, 0x4c,
+ 0xcd, 0xb6, 0xdd, 0x03, 0xfe, 0x36, 0x6e, 0xa6,
+ 0xfc, 0xff, 0x19, 0x33, 0x0c, 0x52, 0xea, 0x37,
+ 0x94, 0xda, 0x5b, 0x27, 0xd1, 0x99, 0x5a, 0x89,
+ 0x40, 0x78, 0xfa, 0x96, 0xb9, 0x2f, 0xa0, 0x48,
+ 0xc9, 0xf8, 0x5c, 0xf0, 0x95, 0xf4, 0xea, 0x61,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0x6f, 0x48, 0x6f, 0x21, 0xb9, 0xc1, 0xcc, 0x92,
+ 0x4e, 0xed, 0x6b, 0xef, 0x51, 0x88, 0xdf, 0xfd,
+ 0xcb, 0x3d, 0x44, 0x9c, 0x37, 0x85, 0xb4, 0xc5,
+ 0xeb, 0x60, 0x55, 0x58, 0x01, 0x47, 0xbf, 0x75,
+ 0x9b, 0xa8, 0x82, 0x8c, 0xec, 0xe8, 0x0e, 0x58,
+ 0xc1, 0x26, 0xa2, 0x45, 0x87, 0x3e, 0xfb, 0x8d,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[SHA384_DIGEST_SIZE] = {
+ 0xfc, 0xcb, 0xe6, 0x42, 0xf0, 0x9e, 0x2b, 0x77,
+ 0x7b, 0x62, 0xe8, 0x70, 0x86, 0xbf, 0xaf, 0x3c,
+ 0xbb, 0x02, 0xd9, 0x86, 0xdc, 0xba, 0xd3, 0xa4,
+ 0x0d, 0x8d, 0xb3, 0x2d, 0x0b, 0xa3, 0x84, 0x04,
+ 0x7c, 0x16, 0x37, 0xaf, 0xba, 0x1e, 0xd4, 0x2f,
+ 0x4c, 0x57, 0x55, 0x86, 0x52, 0x47, 0x9a, 0xec,
+};
+
+static const u8 hmac_testvec_consolidated[SHA384_DIGEST_SIZE] = {
+ 0x82, 0xcf, 0x7d, 0x80, 0x71, 0xdb, 0x91, 0x09,
+ 0x67, 0xe8, 0x44, 0x4a, 0x0d, 0x03, 0xb1, 0xf9,
+ 0x62, 0xde, 0x4e, 0xbb, 0x1f, 0x41, 0xcd, 0x62,
+ 0x39, 0x6b, 0x2d, 0x44, 0x0e, 0xde, 0x98, 0x73,
+ 0xdd, 0xeb, 0x9d, 0x53, 0xfb, 0xee, 0xd1, 0xc3,
+ 0x96, 0xdb, 0xfc, 0x2a, 0x38, 0x90, 0x02, 0x53,
+};
diff --git a/lib/crypto/tests/sha384_kunit.c b/lib/crypto/tests/sha384_kunit.c
new file mode 100644
index 000000000000..e1ef5c995bb6
--- /dev/null
+++ b/lib/crypto/tests/sha384_kunit.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/sha2.h>
+#include "sha384-testvecs.h"
+
+#define HASH sha384
+#define HASH_CTX sha384_ctx
+#define HASH_SIZE SHA384_DIGEST_SIZE
+#define HASH_INIT sha384_init
+#define HASH_UPDATE sha384_update
+#define HASH_FINAL sha384_final
+#define HMAC_KEY hmac_sha384_key
+#define HMAC_CTX hmac_sha384_ctx
+#define HMAC_PREPAREKEY hmac_sha384_preparekey
+#define HMAC_INIT hmac_sha384_init
+#define HMAC_UPDATE hmac_sha384_update
+#define HMAC_FINAL hmac_sha384_final
+#define HMAC hmac_sha384
+#define HMAC_USINGRAWKEY hmac_sha384_usingrawkey
+#include "hash-test-template.h"
+
+static struct kunit_case hash_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite hash_test_suite = {
+ .name = "sha384",
+ .test_cases = hash_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(hash_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-384 and HMAC-SHA384");
+MODULE_LICENSE("GPL");
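The SHA-384 suite only binds the generic names used by hash-test-template.h to the SHA-384 API; the template itself, which generates HASH_KUNIT_CASES and benchmark_hash from these macros, is not shown here. As a rough, hypothetical illustration of the kind of check such a template can generate from the one-shot and incremental entry points named above (not the template's actual contents):

static void test_hash_example(struct kunit *test)
{
	static const u8 msg[] = "abc";
	struct sha384_ctx ctx;		/* HASH_CTX */
	u8 one_shot[SHA384_DIGEST_SIZE];
	u8 incremental[SHA384_DIGEST_SIZE];

	/* One-shot API (HASH) */
	sha384(msg, sizeof(msg) - 1, one_shot);

	/* Incremental API (HASH_INIT / HASH_UPDATE / HASH_FINAL) */
	sha384_init(&ctx);
	sha384_update(&ctx, msg, sizeof(msg) - 1);
	sha384_final(&ctx, incremental);

	KUNIT_ASSERT_MEMEQ_MSG(test, one_shot, incremental, SHA384_DIGEST_SIZE,
			       "one-shot and incremental digests differ");
}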
diff --git a/lib/crypto/tests/sha3_kunit.c b/lib/crypto/tests/sha3_kunit.c
new file mode 100644
index 000000000000..ed5fbe80337f
--- /dev/null
+++ b/lib/crypto/tests/sha3_kunit.c
@@ -0,0 +1,422 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+#include <crypto/sha3.h>
+#include "sha3-testvecs.h"
+
+#define HASH sha3_256
+#define HASH_CTX sha3_ctx
+#define HASH_SIZE SHA3_256_DIGEST_SIZE
+#define HASH_INIT sha3_256_init
+#define HASH_UPDATE sha3_update
+#define HASH_FINAL sha3_final
+#include "hash-test-template.h"
+
+/*
+ * Sample message and the output generated for the various algorithms by
+ * passing it to "openssl sha3-224" etc.
+ */
+static const u8 test_sha3_sample[] =
+ "The quick red fox jumped over the lazy brown dog!\n"
+ "The quick red fox jumped over the lazy brown dog!\n"
+ "The quick red fox jumped over the lazy brown dog!\n"
+ "The quick red fox jumped over the lazy brown dog!\n";
+
+static const u8 test_sha3_224[8 + SHA3_224_DIGEST_SIZE + 8] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+ 0xd6, 0xe8, 0xd8, 0x80, 0xfa, 0x42, 0x80, 0x70,
+ 0x7e, 0x7f, 0xd7, 0xd2, 0xd7, 0x7a, 0x35, 0x65,
+ 0xf0, 0x0b, 0x4f, 0x9f, 0x2a, 0x33, 0xca, 0x0a,
+ 0xef, 0xa6, 0x4c, 0xb8,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_sha3_256[8 + SHA3_256_DIGEST_SIZE + 8] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+ 0xdb, 0x3b, 0xb0, 0xb8, 0x8d, 0x15, 0x78, 0xe5,
+ 0x78, 0x76, 0x8e, 0x39, 0x7e, 0x89, 0x86, 0xb9,
+ 0x14, 0x3a, 0x1e, 0xe7, 0x96, 0x7c, 0xf3, 0x25,
+ 0x70, 0xbd, 0xc3, 0xa9, 0xae, 0x63, 0x71, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_sha3_384[8 + SHA3_384_DIGEST_SIZE + 8] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+ 0x2d, 0x4b, 0x29, 0x85, 0x19, 0x94, 0xaa, 0x31,
+ 0x9b, 0x04, 0x9d, 0x6e, 0x79, 0x66, 0xc7, 0x56,
+ 0x8a, 0x2e, 0x99, 0x84, 0x06, 0xcf, 0x10, 0x2d,
+ 0xec, 0xf0, 0x03, 0x04, 0x1f, 0xd5, 0x99, 0x63,
+ 0x2f, 0xc3, 0x2b, 0x0d, 0xd9, 0x45, 0xf7, 0xbb,
+ 0x0a, 0xc3, 0x46, 0xab, 0xfe, 0x4d, 0x94, 0xc2,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_sha3_512[8 + SHA3_512_DIGEST_SIZE + 8] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+ 0xdd, 0x71, 0x3b, 0x44, 0xb6, 0x6c, 0xd7, 0x78,
+ 0xe7, 0x93, 0xa1, 0x4c, 0xd7, 0x24, 0x16, 0xf1,
+ 0xfd, 0xa2, 0x82, 0x4e, 0xed, 0x59, 0xe9, 0x83,
+ 0x15, 0x38, 0x89, 0x7d, 0x39, 0x17, 0x0c, 0xb2,
+ 0xcf, 0x12, 0x80, 0x78, 0xa1, 0x78, 0x41, 0xeb,
+ 0xed, 0x21, 0x4c, 0xa4, 0x4a, 0x5f, 0x30, 0x1a,
+ 0x70, 0x98, 0x4f, 0x14, 0xa2, 0xd1, 0x64, 0x1b,
+ 0xc2, 0x0a, 0xff, 0x3b, 0xe8, 0x26, 0x41, 0x8f,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_shake128[8 + SHAKE128_DEFAULT_SIZE + 8] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+ 0x41, 0xd6, 0xb8, 0x9c, 0xf8, 0xe8, 0x54, 0xf2,
+ 0x5c, 0xde, 0x51, 0x12, 0xaf, 0x9e, 0x0d, 0x91,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_shake256[8 + SHAKE256_DEFAULT_SIZE + 8] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+ 0xab, 0x06, 0xd4, 0xf9, 0x8b, 0xfd, 0xb2, 0xc4,
+ 0xfe, 0xf1, 0xcc, 0xe2, 0x40, 0x45, 0xdd, 0x15,
+ 0xcb, 0xdd, 0x02, 0x8d, 0xb7, 0x9f, 0x1e, 0x67,
+ 0xd6, 0x7f, 0x98, 0x5e, 0x1b, 0x19, 0xf8, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
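Each expected array above sandwiches the digest between 8-byte all-zero guard regions, so the digest proper starts at byte 8. As a userspace sketch only (not part of the patch; it assumes OpenSSL 1.1.1 or later, which provides EVP_sha3_256()), the SHA3-256 value can be reproduced with the EVP one-shot API:

#include <openssl/evp.h>
#include <stdio.h>

int main(void)
{
	static const char sample[] =
		"The quick red fox jumped over the lazy brown dog!\n"
		"The quick red fox jumped over the lazy brown dog!\n"
		"The quick red fox jumped over the lazy brown dog!\n"
		"The quick red fox jumped over the lazy brown dog!\n";
	unsigned char md[EVP_MAX_MD_SIZE];
	unsigned int md_len;

	/* Hash without the trailing NUL, matching sizeof(...) - 1 above. */
	EVP_Digest(sample, sizeof(sample) - 1, md, &md_len,
		   EVP_sha3_256(), NULL);
	for (unsigned int i = 0; i < md_len; i++)
		printf("%02x", md[i]);
	printf("\n");	/* should match bytes 8..39 of test_sha3_256 */
	return 0;
}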
+
+static void test_sha3_224_basic(struct kunit *test)
+{
+ u8 out[8 + SHA3_224_DIGEST_SIZE + 8];
+
+ BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_224));
+
+ memset(out, 0, sizeof(out));
+ sha3_224(test_sha3_sample, sizeof(test_sha3_sample) - 1, out + 8);
+
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_224, sizeof(test_sha3_224),
+ "SHA3-224 gives wrong output");
+}
+
+static void test_sha3_256_basic(struct kunit *test)
+{
+ u8 out[8 + SHA3_256_DIGEST_SIZE + 8];
+
+ BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_256));
+
+ memset(out, 0, sizeof(out));
+ sha3_256(test_sha3_sample, sizeof(test_sha3_sample) - 1, out + 8);
+
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_256, sizeof(test_sha3_256),
+ "SHA3-256 gives wrong output");
+}
+
+static void test_sha3_384_basic(struct kunit *test)
+{
+ u8 out[8 + SHA3_384_DIGEST_SIZE + 8];
+
+ BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_384));
+
+ memset(out, 0, sizeof(out));
+ sha3_384(test_sha3_sample, sizeof(test_sha3_sample) - 1, out + 8);
+
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_384, sizeof(test_sha3_384),
+ "SHA3-384 gives wrong output");
+}
+
+static void test_sha3_512_basic(struct kunit *test)
+{
+ u8 out[8 + SHA3_512_DIGEST_SIZE + 8];
+
+ BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_512));
+
+ memset(out, 0, sizeof(out));
+ sha3_512(test_sha3_sample, sizeof(test_sha3_sample) - 1, out + 8);
+
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_512, sizeof(test_sha3_512),
+ "SHA3-512 gives wrong output");
+}
+
+static void test_shake128_basic(struct kunit *test)
+{
+ u8 out[8 + SHAKE128_DEFAULT_SIZE + 8];
+
+ BUILD_BUG_ON(sizeof(out) != sizeof(test_shake128));
+
+ memset(out, 0, sizeof(out));
+ shake128(test_sha3_sample, sizeof(test_sha3_sample) - 1,
+ out + 8, sizeof(out) - 16);
+
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake128, sizeof(test_shake128),
+ "SHAKE128 gives wrong output");
+}
+
+static void test_shake256_basic(struct kunit *test)
+{
+ u8 out[8 + SHAKE256_DEFAULT_SIZE + 8];
+
+ BUILD_BUG_ON(sizeof(out) != sizeof(test_shake256));
+
+ memset(out, 0, sizeof(out));
+ shake256(test_sha3_sample, sizeof(test_sha3_sample) - 1,
+ out + 8, sizeof(out) - 16);
+
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake256, sizeof(test_shake256),
+ "SHAKE256 gives wrong output");
+}
+
+/*
+ * Tests using the usable NIST example values (0-bit and 1600-bit messages).
+ *
+ * From: https://csrc.nist.gov/projects/cryptographic-standards-and-guidelines/example-values
+ */
+static const u8 test_nist_1600_sample[] = {
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3
+};
+
+static const u8 test_shake128_nist_0[] = {
+ 0x7f, 0x9c, 0x2b, 0xa4, 0xe8, 0x8f, 0x82, 0x7d,
+ 0x61, 0x60, 0x45, 0x50, 0x76, 0x05, 0x85, 0x3e
+};
+
+static const u8 test_shake128_nist_1600[] = {
+ 0x13, 0x1a, 0xb8, 0xd2, 0xb5, 0x94, 0x94, 0x6b,
+ 0x9c, 0x81, 0x33, 0x3f, 0x9b, 0xb6, 0xe0, 0xce,
+};
+
+static const u8 test_shake256_nist_0[] = {
+ 0x46, 0xb9, 0xdd, 0x2b, 0x0b, 0xa8, 0x8d, 0x13,
+ 0x23, 0x3b, 0x3f, 0xeb, 0x74, 0x3e, 0xeb, 0x24,
+ 0x3f, 0xcd, 0x52, 0xea, 0x62, 0xb8, 0x1b, 0x82,
+ 0xb5, 0x0c, 0x27, 0x64, 0x6e, 0xd5, 0x76, 0x2f
+};
+
+static const u8 test_shake256_nist_1600[] = {
+ 0xcd, 0x8a, 0x92, 0x0e, 0xd1, 0x41, 0xaa, 0x04,
+ 0x07, 0xa2, 0x2d, 0x59, 0x28, 0x86, 0x52, 0xe9,
+ 0xd9, 0xf1, 0xa7, 0xee, 0x0c, 0x1e, 0x7c, 0x1c,
+ 0xa6, 0x99, 0x42, 0x4d, 0xa8, 0x4a, 0x90, 0x4d,
+};
+
+static void test_shake128_nist(struct kunit *test)
+{
+ u8 out[SHAKE128_DEFAULT_SIZE];
+
+ shake128("", 0, out, sizeof(out));
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake128_nist_0, sizeof(out),
+ "SHAKE128 gives wrong output for NIST.0");
+
+ shake128(test_nist_1600_sample, sizeof(test_nist_1600_sample),
+ out, sizeof(out));
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake128_nist_1600, sizeof(out),
+ "SHAKE128 gives wrong output for NIST.1600");
+}
+
+static void test_shake256_nist(struct kunit *test)
+{
+ u8 out[SHAKE256_DEFAULT_SIZE];
+
+ shake256("", 0, out, sizeof(out));
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake256_nist_0, sizeof(out),
+ "SHAKE256 gives wrong output for NIST.0");
+
+ shake256(test_nist_1600_sample, sizeof(test_nist_1600_sample),
+ out, sizeof(out));
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake256_nist_1600, sizeof(out),
+ "SHAKE256 gives wrong output for NIST.1600");
+}
+
+static void shake(int alg, const u8 *in, size_t in_len, u8 *out, size_t out_len)
+{
+ if (alg == 0)
+ shake128(in, in_len, out, out_len);
+ else
+ shake256(in, in_len, out, out_len);
+}
+
+static void shake_init(struct shake_ctx *ctx, int alg)
+{
+ if (alg == 0)
+ shake128_init(ctx);
+ else
+ shake256_init(ctx);
+}
+
+/*
+ * Test each of SHAKE128 and SHAKE256 with all lengths 0 through 4096, for both
+ * the input and the output. The input and output lengths cycle through the
+ * values together, so each length is exercised once per algorithm. To verify
+ * all the SHAKE outputs, compute and check the SHA3-256 digest of their
+ * concatenation.
+ */
+static void test_shake_all_lens_up_to_4096(struct kunit *test)
+{
+ struct sha3_ctx main_ctx;
+ const size_t max_len = 4096;
+ u8 *const in = test_buf;
+ u8 *const out = &test_buf[TEST_BUF_LEN - max_len];
+ u8 main_hash[SHA3_256_DIGEST_SIZE];
+
+ KUNIT_ASSERT_LE(test, 2 * max_len, TEST_BUF_LEN);
+
+ rand_bytes_seeded_from_len(in, max_len);
+ for (int alg = 0; alg < 2; alg++) {
+ sha3_256_init(&main_ctx);
+ for (size_t in_len = 0; in_len <= max_len; in_len++) {
+ size_t out_len = (in_len * 293) % (max_len + 1);
+
+ shake(alg, in, in_len, out, out_len);
+ sha3_update(&main_ctx, out, out_len);
+ }
+ sha3_final(&main_ctx, main_hash);
+ if (alg == 0)
+ KUNIT_ASSERT_MEMEQ_MSG(test, main_hash,
+ shake128_testvec_consolidated,
+ sizeof(main_hash),
+ "shake128() gives wrong output");
+ else
+ KUNIT_ASSERT_MEMEQ_MSG(test, main_hash,
+ shake256_testvec_consolidated,
+ sizeof(main_hash),
+ "shake256() gives wrong output");
+ }
+}
+
+/*
+ * Test that a sequence of SHAKE squeezes gives the same output as a single
+ * squeeze of the same total length.
+ */
+static void test_shake_multiple_squeezes(struct kunit *test)
+{
+ const size_t max_len = 512;
+ u8 *ref_out;
+
+ KUNIT_ASSERT_GE(test, TEST_BUF_LEN, 2 * max_len);
+
+ ref_out = kunit_kzalloc(test, max_len, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, ref_out);
+
+ for (int i = 0; i < 2000; i++) {
+ const int alg = rand32() % 2;
+ const size_t in_len = rand_length(max_len);
+ const size_t out_len = rand_length(max_len);
+ const size_t in_offs = rand_offset(max_len - in_len);
+ const size_t out_offs = rand_offset(max_len - out_len);
+ u8 *const in = &test_buf[in_offs];
+ u8 *const out = &test_buf[out_offs];
+ struct shake_ctx ctx;
+ size_t remaining_len, j, num_parts;
+
+ rand_bytes(in, in_len);
+ rand_bytes(out, out_len);
+
+ /* Compute the output using the one-shot function. */
+ shake(alg, in, in_len, ref_out, out_len);
+
+ /* Compute the output using a random sequence of squeezes. */
+ shake_init(&ctx, alg);
+ shake_update(&ctx, in, in_len);
+ remaining_len = out_len;
+ j = 0;
+ num_parts = 0;
+ while (rand_bool()) {
+ size_t part_len = rand_length(remaining_len);
+
+ shake_squeeze(&ctx, &out[j], part_len);
+ num_parts++;
+ j += part_len;
+ remaining_len -= part_len;
+ }
+ if (remaining_len != 0 || rand_bool()) {
+ shake_squeeze(&ctx, &out[j], remaining_len);
+ num_parts++;
+ }
+
+ /* Verify that the outputs are the same. */
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, out, ref_out, out_len,
+ "Multi-squeeze test failed with in_len=%zu in_offs=%zu out_len=%zu out_offs=%zu num_parts=%zu alg=%d",
+ in_len, in_offs, out_len, out_offs, num_parts, alg);
+ }
+}
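Put differently, the SHAKE context is stateful across squeezes: each squeeze continues the output stream where the previous one stopped. A minimal sketch of that property, using only the shake128(), shake128_init(), shake_update() and shake_squeeze() calls already exercised above (illustrative only, not part of the test suite):

static bool shake128_squeeze_sketch(const u8 *msg, size_t len)
{
	struct shake_ctx ctx;
	u8 once[32], parts[32];

	/* One 32-byte output in a single call... */
	shake128(msg, len, once, sizeof(once));

	/* ...must equal two consecutive 16-byte squeezes. */
	shake128_init(&ctx);
	shake_update(&ctx, msg, len);
	shake_squeeze(&ctx, parts, 16);
	shake_squeeze(&ctx, parts + 16, 16);

	return memcmp(once, parts, sizeof(once)) == 0;
}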
+
+/*
+ * Test that SHAKE operations on buffers immediately followed by an unmapped
+ * page work as expected. This catches out-of-bounds memory accesses even if
+ * they occur in assembly code.
+ */
+static void test_shake_with_guarded_bufs(struct kunit *test)
+{
+ const size_t max_len = 512;
+ u8 *reg_buf;
+
+ KUNIT_ASSERT_GE(test, TEST_BUF_LEN, max_len);
+
+ reg_buf = kunit_kzalloc(test, max_len, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, reg_buf);
+
+ for (int alg = 0; alg < 2; alg++) {
+ for (size_t len = 0; len <= max_len; len++) {
+ u8 *guarded_buf = &test_buf[TEST_BUF_LEN - len];
+
+ rand_bytes(reg_buf, len);
+ memcpy(guarded_buf, reg_buf, len);
+
+ shake(alg, reg_buf, len, reg_buf, len);
+ shake(alg, guarded_buf, len, guarded_buf, len);
+
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, reg_buf, guarded_buf, len,
+ "Guard page test failed with len=%zu alg=%d",
+ len, alg);
+ }
+ }
+}
+
+static struct kunit_case sha3_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(test_sha3_224_basic),
+ KUNIT_CASE(test_sha3_256_basic),
+ KUNIT_CASE(test_sha3_384_basic),
+ KUNIT_CASE(test_sha3_512_basic),
+ KUNIT_CASE(test_shake128_basic),
+ KUNIT_CASE(test_shake256_basic),
+ KUNIT_CASE(test_shake128_nist),
+ KUNIT_CASE(test_shake256_nist),
+ KUNIT_CASE(test_shake_all_lens_up_to_4096),
+ KUNIT_CASE(test_shake_multiple_squeezes),
+ KUNIT_CASE(test_shake_with_guarded_bufs),
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite sha3_test_suite = {
+ .name = "sha3",
+ .test_cases = sha3_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(sha3_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for SHA3");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/sha512-testvecs.h b/lib/crypto/tests/sha512-testvecs.h
new file mode 100644
index 000000000000..c506aaf72ba7
--- /dev/null
+++ b/lib/crypto/tests/sha512-testvecs.h
@@ -0,0 +1,342 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha512 */
+
+static const struct {
+ size_t data_len;
+ u8 digest[SHA512_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0xcf, 0x83, 0xe1, 0x35, 0x7e, 0xef, 0xb8, 0xbd,
+ 0xf1, 0x54, 0x28, 0x50, 0xd6, 0x6d, 0x80, 0x07,
+ 0xd6, 0x20, 0xe4, 0x05, 0x0b, 0x57, 0x15, 0xdc,
+ 0x83, 0xf4, 0xa9, 0x21, 0xd3, 0x6c, 0xe9, 0xce,
+ 0x47, 0xd0, 0xd1, 0x3c, 0x5d, 0x85, 0xf2, 0xb0,
+ 0xff, 0x83, 0x18, 0xd2, 0x87, 0x7e, 0xec, 0x2f,
+ 0x63, 0xb9, 0x31, 0xbd, 0x47, 0x41, 0x7a, 0x81,
+ 0xa5, 0x38, 0x32, 0x7a, 0xf9, 0x27, 0xda, 0x3e,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x12, 0xf2, 0xb6, 0xec, 0x84, 0xa0, 0x8e, 0xcf,
+ 0x0d, 0xec, 0x17, 0xfd, 0x1f, 0x91, 0x15, 0x69,
+ 0xd2, 0xb9, 0x89, 0xff, 0x89, 0x9d, 0xd9, 0x0b,
+ 0x7a, 0x0f, 0x82, 0x94, 0x57, 0x5b, 0xf3, 0x08,
+ 0x42, 0x45, 0x23, 0x08, 0x44, 0x54, 0x35, 0x36,
+ 0xed, 0x4b, 0xb3, 0xa5, 0xbf, 0x17, 0xc1, 0x3c,
+ 0xdd, 0x25, 0x4a, 0x30, 0x64, 0xed, 0x66, 0x06,
+ 0x72, 0x05, 0xc2, 0x71, 0x5a, 0x6c, 0xd0, 0x75,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0x01, 0x37, 0x97, 0xcc, 0x0a, 0xcb, 0x61, 0xa4,
+ 0x93, 0x26, 0x36, 0x4b, 0xd2, 0x27, 0xea, 0xaf,
+ 0xda, 0xfa, 0x8f, 0x86, 0x12, 0x99, 0x7b, 0xc8,
+ 0x94, 0xa9, 0x1c, 0x70, 0x3f, 0x43, 0xf3, 0x9a,
+ 0x02, 0xc5, 0x0d, 0x8e, 0x01, 0xf8, 0x3a, 0xa6,
+ 0xec, 0xaa, 0xc5, 0xc7, 0x9d, 0x3d, 0x7f, 0x9d,
+ 0x47, 0x0e, 0x58, 0x2d, 0x9a, 0x2d, 0x51, 0x1d,
+ 0xc3, 0x77, 0xb2, 0x7f, 0x69, 0x9a, 0xc3, 0x50,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0xd4, 0xa3, 0xc2, 0x50, 0xa0, 0x33, 0xc6, 0xe4,
+ 0x50, 0x08, 0xea, 0xc9, 0xb8, 0x35, 0x55, 0x34,
+ 0x61, 0xb8, 0x2e, 0xa2, 0xe5, 0xdc, 0x04, 0x70,
+ 0x99, 0x86, 0x5f, 0xee, 0x2e, 0x1e, 0xd4, 0x40,
+ 0xf8, 0x88, 0x01, 0x84, 0x97, 0x16, 0x6a, 0xd3,
+ 0x0a, 0xb4, 0xe2, 0x34, 0xca, 0x1f, 0xfc, 0x6b,
+ 0x61, 0xf3, 0x7f, 0x72, 0xfa, 0x8d, 0x22, 0xcf,
+ 0x7f, 0x2d, 0x87, 0x9d, 0xbb, 0x59, 0xac, 0x53,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0x3b, 0xae, 0x66, 0x48, 0x0f, 0x35, 0x3c, 0xcd,
+ 0x92, 0xa9, 0xb6, 0xb9, 0xfe, 0x19, 0x90, 0x92,
+ 0x52, 0xa5, 0x02, 0xd1, 0x89, 0xcf, 0x98, 0x86,
+ 0x29, 0x28, 0xab, 0xc4, 0x9e, 0xcc, 0x75, 0x38,
+ 0x95, 0xa7, 0x59, 0x43, 0xef, 0x8c, 0x3a, 0xeb,
+ 0x40, 0xa3, 0xbe, 0x2b, 0x75, 0x0d, 0xfd, 0xc3,
+ 0xaf, 0x69, 0x08, 0xad, 0x9f, 0xc9, 0xf4, 0x96,
+ 0xa9, 0xc2, 0x2b, 0x1b, 0x66, 0x6f, 0x1d, 0x28,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0x9c, 0xfb, 0x3c, 0x40, 0xd5, 0x3b, 0xc4, 0xff,
+ 0x07, 0xa7, 0xf0, 0x24, 0xb7, 0xd6, 0x5e, 0x12,
+ 0x5b, 0x85, 0xb5, 0xa5, 0xe0, 0x82, 0xa6, 0xda,
+ 0x30, 0x13, 0x2f, 0x1a, 0xe3, 0xd0, 0x55, 0xcb,
+ 0x14, 0x19, 0xe2, 0x09, 0x91, 0x96, 0x26, 0xf9,
+ 0x38, 0xd7, 0xfa, 0x4a, 0xfb, 0x2f, 0x6f, 0xc0,
+ 0xf4, 0x95, 0xc3, 0x40, 0xf6, 0xdb, 0xe7, 0xc2,
+ 0x79, 0x23, 0xa4, 0x20, 0x96, 0x3a, 0x00, 0xbb,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0x92, 0x1a, 0x21, 0x06, 0x6e, 0x08, 0x84, 0x09,
+ 0x23, 0x8d, 0x63, 0xec, 0xd6, 0x72, 0xd3, 0x21,
+ 0x51, 0xe8, 0x65, 0x94, 0xf8, 0x1f, 0x5f, 0xa7,
+ 0xab, 0x6b, 0xae, 0x1c, 0x2c, 0xaf, 0xf9, 0x0c,
+ 0x7c, 0x5a, 0x74, 0x1d, 0x90, 0x26, 0x4a, 0xc3,
+ 0xa1, 0x60, 0xf4, 0x1d, 0xd5, 0x3c, 0x86, 0xe8,
+ 0x00, 0xb3, 0x99, 0x27, 0xb8, 0x9d, 0x3e, 0x17,
+ 0x32, 0x5a, 0x34, 0x3e, 0xc2, 0xb2, 0x6e, 0xbd,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x5a, 0x1f, 0x40, 0x5f, 0xee, 0xf2, 0xdd, 0x67,
+ 0x01, 0xcb, 0x26, 0x58, 0xf5, 0x1b, 0xe8, 0x7e,
+ 0xeb, 0x7d, 0x9d, 0xef, 0xd3, 0x55, 0xd6, 0x89,
+ 0x2e, 0xfc, 0x14, 0xe2, 0x98, 0x4c, 0x31, 0xaa,
+ 0x69, 0x00, 0xf9, 0x4e, 0xb0, 0x75, 0x1b, 0x71,
+ 0x93, 0x60, 0xdf, 0xa1, 0xaf, 0xba, 0xc2, 0xd3,
+ 0x6a, 0x22, 0xa0, 0xff, 0xb5, 0x66, 0x15, 0x66,
+ 0xd2, 0x24, 0x9a, 0x7e, 0xe4, 0xe5, 0x84, 0xdb,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x32, 0xbd, 0xcf, 0x72, 0xa9, 0x74, 0x87, 0xe6,
+ 0x2a, 0x53, 0x7e, 0x6d, 0xac, 0xc2, 0xdd, 0x2c,
+ 0x87, 0xb3, 0xf7, 0x90, 0x29, 0xc9, 0x16, 0x59,
+ 0xd2, 0x7e, 0x6e, 0x84, 0x1d, 0xa6, 0xaf, 0x3c,
+ 0xca, 0xd6, 0x1a, 0x24, 0xa4, 0xcd, 0x59, 0x44,
+ 0x20, 0xd7, 0xd2, 0x5b, 0x97, 0xda, 0xd5, 0xa9,
+ 0x23, 0xb1, 0xa4, 0x60, 0xb8, 0x05, 0x98, 0xdc,
+ 0xef, 0x89, 0x81, 0xe3, 0x3a, 0xf9, 0x24, 0x37,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x96, 0x3a, 0x1a, 0xdd, 0x1b, 0xeb, 0x1a, 0x55,
+ 0x24, 0x52, 0x3d, 0xec, 0x9d, 0x52, 0x2e, 0xa6,
+ 0xfe, 0x81, 0xd6, 0x98, 0xac, 0xcc, 0x60, 0x56,
+ 0x04, 0x9d, 0xa3, 0xf3, 0x56, 0x05, 0xe4, 0x8a,
+ 0x61, 0xaf, 0x6f, 0x6e, 0x8e, 0x75, 0x67, 0x3a,
+ 0xd2, 0xb0, 0x85, 0x2d, 0x17, 0xd2, 0x86, 0x8c,
+ 0x50, 0x4b, 0xdd, 0xef, 0x35, 0x00, 0xde, 0x29,
+ 0x3d, 0x4b, 0x04, 0x12, 0x8a, 0x81, 0xe2, 0xcc,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x9c, 0x6e, 0xf0, 0x6f, 0x71, 0x77, 0xd5, 0xd0,
+ 0xbb, 0x70, 0x1f, 0xcb, 0xbd, 0xd3, 0xfe, 0x23,
+ 0x71, 0x78, 0xad, 0x3a, 0xd2, 0x1e, 0x34, 0xf4,
+ 0x6d, 0xb4, 0xa2, 0x0a, 0x24, 0xcb, 0xe1, 0x99,
+ 0x07, 0xd0, 0x79, 0x8f, 0x7e, 0x69, 0x31, 0x68,
+ 0x29, 0xb5, 0x85, 0x82, 0x67, 0xdc, 0x4a, 0x8d,
+ 0x44, 0x04, 0x02, 0xc0, 0xfb, 0xd2, 0x19, 0x66,
+ 0x1e, 0x25, 0x8b, 0xd2, 0x5a, 0x59, 0x68, 0xc0,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0xb8, 0x8f, 0xa8, 0x29, 0x4d, 0xcf, 0x5f, 0x73,
+ 0x3c, 0x55, 0x43, 0xd9, 0x1c, 0xbc, 0x0c, 0x17,
+ 0x75, 0x0b, 0xc7, 0xb1, 0x1d, 0x9f, 0x7b, 0x2f,
+ 0x4c, 0x3d, 0x2a, 0x71, 0xfb, 0x1b, 0x0e, 0xca,
+ 0x4e, 0x96, 0xa0, 0x95, 0xee, 0xf4, 0x93, 0x76,
+ 0x36, 0xfb, 0x5d, 0xd3, 0x46, 0xc4, 0x1d, 0x41,
+ 0x32, 0x92, 0x9d, 0xed, 0xdb, 0x7f, 0xfa, 0xb3,
+ 0x91, 0x61, 0x3e, 0xd6, 0xb2, 0xca, 0x8d, 0x81,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x54, 0xac, 0x1a, 0xa1, 0xa6, 0xa3, 0x47, 0x2a,
+ 0x5a, 0xac, 0x1a, 0x3a, 0x4b, 0xa1, 0x11, 0x08,
+ 0xa7, 0x90, 0xec, 0x52, 0xcb, 0xaf, 0x68, 0x41,
+ 0x38, 0x44, 0x53, 0x94, 0x93, 0x30, 0xaf, 0x3a,
+ 0xec, 0x99, 0x3a, 0x7d, 0x2a, 0xd5, 0xb6, 0x05,
+ 0xf5, 0xa6, 0xbb, 0x9b, 0x82, 0xc2, 0xbd, 0x98,
+ 0x28, 0x62, 0x98, 0x3e, 0xe4, 0x27, 0x9b, 0xaa,
+ 0xce, 0x0a, 0x6f, 0xab, 0x1b, 0x16, 0xf3, 0xdd,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x04, 0x37, 0x60, 0xbc, 0xb3, 0xb1, 0xc6, 0x2d,
+ 0x42, 0xc5, 0xd7, 0x7e, 0xd9, 0x86, 0x82, 0xe0,
+ 0xf4, 0x62, 0xad, 0x75, 0x68, 0x0b, 0xc7, 0xa8,
+ 0xd6, 0x9a, 0x76, 0xe5, 0x29, 0xb8, 0x37, 0x30,
+ 0x0f, 0xc0, 0xbc, 0x81, 0x94, 0x7c, 0x13, 0xf4,
+ 0x9c, 0x27, 0xbc, 0x59, 0xa1, 0x70, 0x6a, 0x87,
+ 0x20, 0x12, 0x0a, 0x2a, 0x62, 0x5e, 0x6f, 0xca,
+ 0x91, 0x6b, 0x34, 0x7e, 0x4c, 0x0d, 0xf0, 0x6c,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x4b, 0x7c, 0x1f, 0x53, 0x52, 0xcc, 0x30, 0xed,
+ 0x91, 0x44, 0x6f, 0x0d, 0xb5, 0x41, 0x79, 0x99,
+ 0xaf, 0x82, 0x65, 0x52, 0x03, 0xf8, 0x55, 0x74,
+ 0x7c, 0xd9, 0x41, 0xd6, 0xe8, 0x91, 0xa4, 0x85,
+ 0xcb, 0x0a, 0x60, 0x08, 0x76, 0x07, 0x60, 0x99,
+ 0x89, 0x76, 0xba, 0x84, 0xbd, 0x0b, 0xf2, 0xb3,
+ 0xdc, 0xf3, 0x33, 0xd1, 0x9b, 0x0b, 0x2e, 0x5d,
+ 0xf6, 0x9d, 0x0f, 0x67, 0xf4, 0x86, 0xb3, 0xd5,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0x7d, 0x83, 0x78, 0x6a, 0x5d, 0x52, 0x42, 0x2a,
+ 0xb1, 0x97, 0xc6, 0x62, 0xa2, 0x2a, 0x7c, 0x8b,
+ 0xcd, 0x4f, 0xa4, 0x86, 0x19, 0xa4, 0x5b, 0x1d,
+ 0xc7, 0x6f, 0x2f, 0x9c, 0x03, 0xc3, 0x45, 0x2e,
+ 0xa7, 0x8e, 0x38, 0xf2, 0x57, 0x55, 0x89, 0x47,
+ 0xed, 0xeb, 0x81, 0xe2, 0xe0, 0x55, 0x9f, 0xe6,
+ 0xca, 0x03, 0x59, 0xd3, 0xd4, 0xba, 0xc9, 0x2d,
+ 0xaf, 0xbb, 0x62, 0x2e, 0xe6, 0x89, 0xe4, 0x11,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0xe9, 0x14, 0xe7, 0x01, 0xd0, 0x81, 0x09, 0x51,
+ 0x78, 0x1c, 0x8e, 0x6c, 0x00, 0xd3, 0x28, 0xa0,
+ 0x2a, 0x7b, 0xd6, 0x25, 0xca, 0xd0, 0xf9, 0xb8,
+ 0xd8, 0xcf, 0xd0, 0xb7, 0x48, 0x25, 0xb7, 0x6a,
+ 0x53, 0x8e, 0xf8, 0x52, 0x9c, 0x1f, 0x7d, 0xae,
+ 0x4c, 0x22, 0xd5, 0x9d, 0xf0, 0xaf, 0x98, 0x91,
+ 0x19, 0x1f, 0x99, 0xbd, 0xa6, 0xc2, 0x0f, 0x05,
+ 0xa5, 0x9f, 0x3e, 0x87, 0xed, 0xc3, 0xab, 0x92,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0x2e, 0xf4, 0x72, 0xd2, 0xd9, 0x4a, 0xd5, 0xf9,
+ 0x20, 0x03, 0x4a, 0xad, 0xed, 0xa9, 0x1b, 0x64,
+ 0x73, 0x38, 0xc6, 0x30, 0xa8, 0x7f, 0xf9, 0x3b,
+ 0x8c, 0xbc, 0xa1, 0x2d, 0x22, 0x7b, 0x84, 0x37,
+ 0xf5, 0xba, 0xee, 0xf0, 0x80, 0x9d, 0xe3, 0x82,
+ 0xbd, 0x07, 0x68, 0x15, 0x01, 0x22, 0xf6, 0x88,
+ 0x07, 0x0b, 0xfd, 0xb7, 0xb1, 0xc0, 0x68, 0x4b,
+ 0x8d, 0x05, 0xec, 0xfb, 0xcd, 0xde, 0xa4, 0x2a,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0x73, 0xe3, 0xe5, 0x87, 0x01, 0x0a, 0x29, 0x4d,
+ 0xad, 0x92, 0x67, 0x64, 0xc7, 0x71, 0x0b, 0x22,
+ 0x80, 0x8a, 0x6e, 0x8b, 0x20, 0x73, 0xb2, 0xd7,
+ 0x98, 0x20, 0x35, 0x42, 0x42, 0x5d, 0x85, 0x12,
+ 0xb0, 0x06, 0x69, 0x63, 0x5f, 0x5b, 0xe7, 0x63,
+ 0x6f, 0xe6, 0x18, 0xa6, 0xc1, 0xa6, 0xae, 0x27,
+ 0xa7, 0x6a, 0x73, 0x6b, 0x27, 0xd5, 0x47, 0xe1,
+ 0xa2, 0x7d, 0xe4, 0x0d, 0xbd, 0x23, 0x7b, 0x7a,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0x11, 0x5b, 0x77, 0x36, 0x6b, 0x3b, 0xe4, 0x42,
+ 0xe4, 0x92, 0x23, 0xcb, 0x0c, 0x06, 0xff, 0xb7,
+ 0x0c, 0x71, 0x64, 0xd9, 0x8a, 0x57, 0x75, 0x7b,
+ 0xa2, 0xd2, 0x17, 0x19, 0xbb, 0xb5, 0x3c, 0xb3,
+ 0x5f, 0xae, 0x35, 0x75, 0x8e, 0xa8, 0x97, 0x43,
+ 0xce, 0xfe, 0x41, 0x84, 0xfe, 0xcb, 0x18, 0x70,
+ 0x68, 0x2e, 0x16, 0x19, 0xd5, 0x10, 0x0d, 0x2f,
+ 0x61, 0x87, 0x79, 0xee, 0x5f, 0x24, 0xdd, 0x76,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0x9e, 0x96, 0xe1, 0x0a, 0xb2, 0xd5, 0xba, 0xcf,
+ 0x27, 0xba, 0x6f, 0x85, 0xe9, 0xbf, 0x96, 0xb9,
+ 0x5a, 0x00, 0x00, 0x06, 0xdc, 0xb7, 0xaf, 0x0a,
+ 0x8f, 0x1d, 0x31, 0xf6, 0xce, 0xc3, 0x50, 0x2e,
+ 0x61, 0x3a, 0x8b, 0x28, 0xaf, 0xb2, 0x50, 0x0d,
+ 0x00, 0x98, 0x02, 0x11, 0x6b, 0xfa, 0x51, 0xc1,
+ 0xde, 0xe1, 0x34, 0x9f, 0xda, 0x11, 0x63, 0xfa,
+ 0x0a, 0xa0, 0xa2, 0x67, 0x39, 0xeb, 0x9b, 0xf1,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0x46, 0x4e, 0x81, 0xd1, 0x08, 0x2a, 0x46, 0x12,
+ 0x4e, 0xae, 0x1f, 0x5d, 0x57, 0xe5, 0x19, 0xbc,
+ 0x76, 0x38, 0xb6, 0xa7, 0xe3, 0x72, 0x6d, 0xaf,
+ 0x80, 0x3b, 0xd0, 0xbc, 0x06, 0xe8, 0xab, 0xab,
+ 0x86, 0x4b, 0x0b, 0x7a, 0x61, 0xa6, 0x13, 0xff,
+ 0x64, 0x47, 0x89, 0xb7, 0x63, 0x8a, 0xa5, 0x4c,
+ 0x9f, 0x52, 0x70, 0xeb, 0x21, 0xe5, 0x2d, 0xe9,
+ 0xe7, 0xab, 0x1c, 0x0e, 0x74, 0xf5, 0x72, 0xec,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0xfa, 0x6e, 0xff, 0x3c, 0xc1, 0x98, 0x49, 0x42,
+ 0x34, 0x67, 0xd4, 0xd3, 0xfa, 0xae, 0x27, 0xe4,
+ 0x77, 0x11, 0x84, 0xd2, 0x57, 0x99, 0xf8, 0xfd,
+ 0x41, 0x50, 0x84, 0x80, 0x7f, 0xf7, 0xb2, 0xd3,
+ 0x88, 0x21, 0x9c, 0xe8, 0xb9, 0x05, 0xd3, 0x48,
+ 0x64, 0xc5, 0xb7, 0x29, 0xd9, 0x21, 0x17, 0xad,
+ 0x89, 0x9c, 0x79, 0x55, 0x51, 0x0b, 0x96, 0x3e,
+ 0x10, 0x40, 0xe1, 0xdd, 0x7b, 0x39, 0x40, 0x86,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0x41, 0xb3, 0xd2, 0x93, 0xcd, 0x79, 0x84, 0xc2,
+ 0xf5, 0xea, 0xf3, 0xb3, 0x94, 0x23, 0xaa, 0x76,
+ 0x87, 0x5f, 0xe3, 0xd2, 0x03, 0xd8, 0x00, 0xbb,
+ 0xa1, 0x55, 0xe4, 0xcb, 0x16, 0x04, 0x5b, 0xdf,
+ 0xf8, 0xd2, 0x63, 0x51, 0x02, 0x22, 0xc6, 0x0f,
+ 0x98, 0x2b, 0x12, 0x52, 0x25, 0x64, 0x93, 0xd9,
+ 0xab, 0xe9, 0x4d, 0x16, 0x4b, 0xf6, 0x09, 0x83,
+ 0x5c, 0x63, 0x1c, 0x41, 0x19, 0xf6, 0x76, 0xe3,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[SHA512_DIGEST_SIZE] = {
+ 0x5b, 0x9d, 0xf9, 0xab, 0x8c, 0x8e, 0x52, 0xdb,
+ 0x02, 0xa0, 0x4c, 0x24, 0x2d, 0xc4, 0xa8, 0x4e,
+ 0x9c, 0x93, 0x2f, 0x72, 0xa8, 0x75, 0xfb, 0xb5,
+ 0xdb, 0xef, 0x52, 0xc6, 0xa3, 0xfe, 0xeb, 0x6b,
+ 0x92, 0x79, 0x18, 0x05, 0xf6, 0xd7, 0xaf, 0x7b,
+ 0x36, 0xfc, 0x83, 0x2c, 0x7e, 0x7b, 0x59, 0x8b,
+ 0xf9, 0x81, 0xaa, 0x98, 0x38, 0x11, 0x97, 0x56,
+ 0x34, 0xe5, 0x2a, 0x4b, 0xf2, 0x9e, 0xf3, 0xdb,
+};
+
+static const u8 hmac_testvec_consolidated[SHA512_DIGEST_SIZE] = {
+ 0x40, 0xe7, 0xbc, 0x03, 0xdf, 0x22, 0xd4, 0x76,
+ 0x66, 0x45, 0xf8, 0x1d, 0x25, 0xdf, 0xbe, 0xa2,
+ 0x93, 0x06, 0x8c, 0x1d, 0x14, 0x23, 0x9b, 0x5c,
+ 0xfa, 0xac, 0xdf, 0xbd, 0xa2, 0x24, 0xe5, 0xf7,
+ 0xdc, 0xf7, 0xae, 0x96, 0xc1, 0x34, 0xe5, 0x24,
+ 0x16, 0x24, 0xdc, 0xee, 0x4f, 0x62, 0x1c, 0x67,
+ 0x4e, 0x02, 0x31, 0x4b, 0x9b, 0x65, 0x25, 0xeb,
+ 0x32, 0x2e, 0x24, 0xfb, 0xcd, 0x2b, 0x59, 0xd8,
+};
diff --git a/lib/crypto/tests/sha512_kunit.c b/lib/crypto/tests/sha512_kunit.c
new file mode 100644
index 000000000000..8923e2d7d3d4
--- /dev/null
+++ b/lib/crypto/tests/sha512_kunit.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/sha2.h>
+#include "sha512-testvecs.h"
+
+#define HASH sha512
+#define HASH_CTX sha512_ctx
+#define HASH_SIZE SHA512_DIGEST_SIZE
+#define HASH_INIT sha512_init
+#define HASH_UPDATE sha512_update
+#define HASH_FINAL sha512_final
+#define HMAC_KEY hmac_sha512_key
+#define HMAC_CTX hmac_sha512_ctx
+#define HMAC_PREPAREKEY hmac_sha512_preparekey
+#define HMAC_INIT hmac_sha512_init
+#define HMAC_UPDATE hmac_sha512_update
+#define HMAC_FINAL hmac_sha512_final
+#define HMAC hmac_sha512
+#define HMAC_USINGRAWKEY hmac_sha512_usingrawkey
+#include "hash-test-template.h"
+
+static struct kunit_case hash_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite hash_test_suite = {
+ .name = "sha512",
+ .test_cases = hash_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(hash_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for SHA-512 and HMAC-SHA512");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/utils.c b/lib/crypto/utils.c
index 53230ab1b195..dec381d5e906 100644
--- a/lib/crypto/utils.c
+++ b/lib/crypto/utils.c
@@ -5,9 +5,10 @@
* Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
*/
-#include <asm/unaligned.h>
-#include <crypto/algapi.h>
+#include <crypto/utils.h>
+#include <linux/export.h>
#include <linux/module.h>
+#include <linux/unaligned.h>
/*
* XOR @len bytes from @src1 and @src2 together, writing the result to @dst
@@ -85,4 +86,5 @@ void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int len)
}
EXPORT_SYMBOL_GPL(__crypto_xor);
+MODULE_DESCRIPTION("Crypto library utility functions");
MODULE_LICENSE("GPL");
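For context, __crypto_xor() XORs @len bytes of @src1 and @src2 into @dst, as the comment above describes. A minimal caller sketch, using only the signature visible in this hunk:

#include <crypto/utils.h>

/* out[i] = in[i] ^ keystream[i] for 0 <= i < len */
static void xor_with_keystream(u8 *out, const u8 *in,
			       const u8 *keystream, unsigned int len)
{
	__crypto_xor(out, in, keystream, len);
}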
diff --git a/lib/crypto/x86/.gitignore b/lib/crypto/x86/.gitignore
new file mode 100644
index 000000000000..580c839bb177
--- /dev/null
+++ b/lib/crypto/x86/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+poly1305-x86_64-cryptogams.S
diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S
new file mode 100644
index 000000000000..7b1d98ca7482
--- /dev/null
+++ b/lib/crypto/x86/blake2s-core.S
@@ -0,0 +1,291 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.iv, "aM", @progbits, 32
+.align 32
+.Liv:
+ .octa 0xA54FF53A3C6EF372BB67AE856A09E667
+ .octa 0x5BE0CD191F83D9AB9B05688C510E527F
+
+.section .rodata.cst16.ror16, "aM", @progbits, 16
+.align 16
+.Lror16:
+ .octa 0x0D0C0F0E09080B0A0504070601000302
+
+.section .rodata.cst16.ror8, "aM", @progbits, 16
+.align 16
+.Lror8:
+ .octa 0x0C0F0E0D080B0A090407060500030201
+
+.section .rodata.cst64.sigma, "aM", @progbits, 160
+.align 64
+.Lsigma:
+.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
+.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
+.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
+.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
+.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
+.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
+.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
+.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
+.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
+.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
+
+.section .rodata.cst64.sigma2, "aM", @progbits, 160
+.align 64
+.Lsigma2:
+.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
+.byte 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
+.byte 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
+.byte 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
+.byte 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
+.byte 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
+.byte 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
+.byte 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
+.byte 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
+.byte 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
+
+#define CTX %rdi
+#define DATA %rsi
+#define NBLOCKS %rdx
+#define INC %ecx
+
+.text
+//
+// void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
+// const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_ctx are used:
+// u32 h[8]; (inout)
+// u32 t[2]; (inout)
+// u32 f[2]; (in)
+//
+SYM_FUNC_START(blake2s_compress_ssse3)
+ movdqu (CTX),%xmm0 // Load h[0..3]
+ movdqu 16(CTX),%xmm1 // Load h[4..7]
+ movdqa .Lror16(%rip),%xmm12
+ movdqa .Lror8(%rip),%xmm13
+ movdqu 32(CTX),%xmm14 // Load t and f
+ movd INC,%xmm15 // Load inc
+ leaq .Lsigma+160(%rip),%r8
+ jmp .Lssse3_mainloop
+
+ .align 32
+.Lssse3_mainloop:
+ // Main loop: each iteration processes one 64-byte block.
+ movdqa %xmm0,%xmm10 // Save h[0..3] and let v[0..3] = h[0..3]
+ movdqa %xmm1,%xmm11 // Save h[4..7] and let v[4..7] = h[4..7]
+ paddq %xmm15,%xmm14 // t += inc (64-bit addition)
+ movdqa .Liv(%rip),%xmm2 // v[8..11] = iv[0..3]
+ movdqa %xmm14,%xmm3
+ pxor .Liv+16(%rip),%xmm3 // v[12..15] = iv[4..7] ^ [t, f]
+ leaq .Lsigma(%rip),%rcx
+
+.Lssse3_roundloop:
+ // Round loop: each iteration does 1 round (of 10 rounds total).
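+	// Each iteration gathers the 16 message words in the order given by
+	// the next row of .Lsigma, then applies the BLAKE2s G function to the
+	// four columns and then to the four diagonals:
+	//	a += b + m0;  d = ror32(d ^ a, 16);
+	//	c += d;       b = ror32(b ^ c, 12);
+	//	a += b + m1;  d = ror32(d ^ a, 8);
+	//	c += d;       b = ror32(b ^ c, 7);
+	// The 16- and 8-bit rotations use pshufb with the .Lror16/.Lror8
+	// tables (%xmm12/%xmm13); the 12- and 7-bit rotations use shift+or.
+	// The pshufd shuffles rotate the rows between the column and
+	// diagonal steps.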
+ movzbl (%rcx),%eax
+ movd (DATA,%rax,4),%xmm4
+ movzbl 1(%rcx),%eax
+ movd (DATA,%rax,4),%xmm5
+ movzbl 2(%rcx),%eax
+ movd (DATA,%rax,4),%xmm6
+ movzbl 3(%rcx),%eax
+ movd (DATA,%rax,4),%xmm7
+ punpckldq %xmm5,%xmm4
+ punpckldq %xmm7,%xmm6
+ punpcklqdq %xmm6,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $12,%xmm1
+ pslld $20,%xmm8
+ por %xmm8,%xmm1
+ movzbl 4(%rcx),%eax
+ movd (DATA,%rax,4),%xmm5
+ movzbl 5(%rcx),%eax
+ movd (DATA,%rax,4),%xmm6
+ movzbl 6(%rcx),%eax
+ movd (DATA,%rax,4),%xmm7
+ movzbl 7(%rcx),%eax
+ movd (DATA,%rax,4),%xmm4
+ punpckldq %xmm6,%xmm5
+ punpckldq %xmm4,%xmm7
+ punpcklqdq %xmm7,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $7,%xmm1
+ pslld $25,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movzbl 8(%rcx),%eax
+ movd (DATA,%rax,4),%xmm6
+ movzbl 9(%rcx),%eax
+ movd (DATA,%rax,4),%xmm7
+ movzbl 10(%rcx),%eax
+ movd (DATA,%rax,4),%xmm4
+ movzbl 11(%rcx),%eax
+ movd (DATA,%rax,4),%xmm5
+ punpckldq %xmm7,%xmm6
+ punpckldq %xmm5,%xmm4
+ punpcklqdq %xmm4,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $12,%xmm1
+ pslld $20,%xmm8
+ por %xmm8,%xmm1
+ movzbl 12(%rcx),%eax
+ movd (DATA,%rax,4),%xmm7
+ movzbl 13(%rcx),%eax
+ movd (DATA,%rax,4),%xmm4
+ movzbl 14(%rcx),%eax
+ movd (DATA,%rax,4),%xmm5
+ movzbl 15(%rcx),%eax
+ movd (DATA,%rax,4),%xmm6
+ punpckldq %xmm4,%xmm7
+ punpckldq %xmm6,%xmm5
+ punpcklqdq %xmm5,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $7,%xmm1
+ pslld $25,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ addq $16,%rcx
+ cmpq %r8,%rcx
+ jnz .Lssse3_roundloop
+
+ // Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm1
+ pxor %xmm10,%xmm0
+ pxor %xmm11,%xmm1
+ addq $64,DATA
+ decq NBLOCKS
+ jnz .Lssse3_mainloop
+
+ movdqu %xmm0,(CTX) // Store new h[0..3]
+ movdqu %xmm1,16(CTX) // Store new h[4..7]
+ movq %xmm14,32(CTX) // Store new t (f is unchanged)
+ RET
+SYM_FUNC_END(blake2s_compress_ssse3)
+
+//
+// void blake2s_compress_avx512(struct blake2s_ctx *ctx,
+// const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_ctx are used:
+// u32 h[8]; (inout)
+// u32 t[2]; (inout)
+// u32 f[2]; (in)
+//
+SYM_FUNC_START(blake2s_compress_avx512)
+ vmovdqu (CTX),%xmm0 // Load h[0..3]
+ vmovdqu 16(CTX),%xmm1 // Load h[4..7]
+ vmovdqu 32(CTX),%xmm4 // Load t and f
+ vmovd INC,%xmm5 // Load inc
+ vmovdqa .Liv(%rip),%xmm14 // Load iv[0..3]
+ vmovdqa .Liv+16(%rip),%xmm15 // Load iv[4..7]
+ jmp .Lavx512_mainloop
+
+ .align 32
+.Lavx512_mainloop:
+ // Main loop: each iteration processes one 64-byte block.
+ vmovdqa %xmm0,%xmm10 // Save h[0..3] and let v[0..3] = h[0..3]
+ vmovdqa %xmm1,%xmm11 // Save h[4..7] and let v[4..7] = h[4..7]
+ vpaddq %xmm5,%xmm4,%xmm4 // t += inc (64-bit addition)
+ vmovdqa %xmm14,%xmm2 // v[8..11] = iv[0..3]
+ vpxor %xmm15,%xmm4,%xmm3 // v[12..15] = iv[4..7] ^ [t, f]
+ vmovdqu (DATA),%ymm6 // Load first 8 data words
+ vmovdqu 32(DATA),%ymm7 // Load second 8 data words
+ addq $64,DATA
+ leaq .Lsigma2(%rip),%rax
+ movb $10,%cl // Set num rounds remaining
+
+.Lavx512_roundloop:
+ // Round loop: each iteration does 1 round (of 10 rounds total).
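+	// The message schedule is done with vpermi2d: the next row of
+	// .Lsigma2 is zero-extended into dword indices which gather this
+	// round's message words from %ymm6/%ymm7.  All four G-function
+	// rotations (16, 12, 8, 7) use vprord directly, so no byte-shuffle
+	// tables or shift+or sequences are needed here.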
+ vpmovzxbd (%rax),%ymm8
+ vpmovzxbd 8(%rax),%ymm9
+ addq $16,%rax
+ vpermi2d %ymm7,%ymm6,%ymm8
+ vpermi2d %ymm7,%ymm6,%ymm9
+ vmovdqa %ymm8,%ymm6
+ vmovdqa %ymm9,%ymm7
+ vpaddd %xmm8,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $16,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $12,%xmm1,%xmm1
+ vextracti128 $1,%ymm8,%xmm8
+ vpaddd %xmm8,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $8,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $7,%xmm1,%xmm1
+ vpshufd $0x93,%xmm0,%xmm0
+ vpshufd $0x4e,%xmm3,%xmm3
+ vpshufd $0x39,%xmm2,%xmm2
+ vpaddd %xmm9,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $16,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $12,%xmm1,%xmm1
+ vextracti128 $1,%ymm9,%xmm9
+ vpaddd %xmm9,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $8,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $7,%xmm1,%xmm1
+ vpshufd $0x39,%xmm0,%xmm0
+ vpshufd $0x4e,%xmm3,%xmm3
+ vpshufd $0x93,%xmm2,%xmm2
+ decb %cl
+ jne .Lavx512_roundloop
+
+ // Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
+ vpternlogd $0x96,%xmm10,%xmm2,%xmm0
+ vpternlogd $0x96,%xmm11,%xmm3,%xmm1
+ decq NBLOCKS
+ jne .Lavx512_mainloop
+
+ vmovdqu %xmm0,(CTX) // Store new h[0..3]
+ vmovdqu %xmm1,16(CTX) // Store new h[4..7]
+ vmovq %xmm4,32(CTX) // Store new t (f is unchanged)
+ vzeroupper
+ RET
+SYM_FUNC_END(blake2s_compress_avx512)
diff --git a/lib/crypto/x86/blake2s.h b/lib/crypto/x86/blake2s.h
new file mode 100644
index 000000000000..f8eed6cb042e
--- /dev/null
+++ b/lib/crypto/x86/blake2s.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <asm/cpufeature.h>
+#include <asm/fpu/api.h>
+#include <asm/processor.h>
+#include <asm/simd.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+
+asmlinkage void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc);
+asmlinkage void blake2s_compress_avx512(struct blake2s_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
+
+static void blake2s_compress(struct blake2s_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc)
+{
+ /* SIMD disables preemption, so relax after processing each page. */
+ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
+
+ if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
+ blake2s_compress_generic(ctx, data, nblocks, inc);
+ return;
+ }
+
+ do {
+ const size_t blocks = min_t(size_t, nblocks,
+ SZ_4K / BLAKE2S_BLOCK_SIZE);
+
+ kernel_fpu_begin();
+ if (static_branch_likely(&blake2s_use_avx512))
+ blake2s_compress_avx512(ctx, data, blocks, inc);
+ else
+ blake2s_compress_ssse3(ctx, data, blocks, inc);
+ kernel_fpu_end();
+
+ data += blocks * BLAKE2S_BLOCK_SIZE;
+ nblocks -= blocks;
+ } while (nblocks);
+}
+
+#define blake2s_mod_init_arch blake2s_mod_init_arch
+static void blake2s_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ static_branch_enable(&blake2s_use_ssse3);
+
+ if (boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX512F) &&
+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, NULL))
+ static_branch_enable(&blake2s_use_avx512);
+}
diff --git a/lib/crypto/x86/chacha-avx2-x86_64.S b/lib/crypto/x86/chacha-avx2-x86_64.S
new file mode 100644
index 000000000000..f3d8fc018249
--- /dev/null
+++ b/lib/crypto/x86/chacha-avx2-x86_64.S
@@ -0,0 +1,1021 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.ROT8, "aM", @progbits, 32
+.align 32
+ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
+ .octa 0x0e0d0c0f0a09080b0605040702010003
+
+.section .rodata.cst32.ROT16, "aM", @progbits, 32
+.align 32
+ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
+ .octa 0x0d0c0f0e09080b0a0504070601000302
+
+.section .rodata.cst32.CTRINC, "aM", @progbits, 32
+.align 32
+CTRINC: .octa 0x00000003000000020000000100000000
+ .octa 0x00000007000000060000000500000004
+
+.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL: .octa 0x00000000000000000000000000000000
+ .octa 0x00000000000000000000000000000001
+
+.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL: .octa 0x00000000000000000000000000000002
+ .octa 0x00000000000000000000000000000003
+
+.text
+
+SYM_FUNC_START(chacha_2block_xor_avx2)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 2 data blocks output, o
+ # %rdx: up to 2 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts two ChaCha blocks by loading the state
+ # matrix twice across four AVX registers. It performs matrix operations
+ # on four words in each matrix in parallel, but requires shuffling to
+ # rearrange the words after each round.
+
+ vzeroupper
+
+ # x0..3[0-2] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+
+ vmovdqa %ymm0,%ymm8
+ vmovdqa %ymm1,%ymm9
+ vmovdqa %ymm2,%ymm10
+ vmovdqa %ymm3,%ymm11
+
+ vmovdqa ROT8(%rip),%ymm4
+ vmovdqa ROT16(%rip),%ymm5
+
+ mov %rcx,%rax
+
+.Ldoubleround:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm5,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm6
+ vpslld $12,%ymm6,%ymm6
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm6,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm4,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm7
+ vpslld $7,%ymm7,%ymm7
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm5,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm6
+ vpslld $12,%ymm6,%ymm6
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm6,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm4,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm7
+ vpslld $7,%ymm7,%ymm7
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+
+ sub $2,%r8d
+ jnz .Ldoubleround
+
+ # o0 = i0 ^ (x0 + s0)
+ vpaddd %ymm8,%ymm0,%ymm7
+ cmp $0x10,%rax
+ jl .Lxorpart2
+ vpxor 0x00(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x00(%rsi)
+ vextracti128 $1,%ymm7,%xmm0
+ # o1 = i1 ^ (x1 + s1)
+ vpaddd %ymm9,%ymm1,%ymm7
+ cmp $0x20,%rax
+ jl .Lxorpart2
+ vpxor 0x10(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x10(%rsi)
+ vextracti128 $1,%ymm7,%xmm1
+ # o2 = i2 ^ (x2 + s2)
+ vpaddd %ymm10,%ymm2,%ymm7
+ cmp $0x30,%rax
+ jl .Lxorpart2
+ vpxor 0x20(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x20(%rsi)
+ vextracti128 $1,%ymm7,%xmm2
+ # o3 = i3 ^ (x3 + s3)
+ vpaddd %ymm11,%ymm3,%ymm7
+ cmp $0x40,%rax
+ jl .Lxorpart2
+ vpxor 0x30(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x30(%rsi)
+ vextracti128 $1,%ymm7,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm7
+ cmp $0x50,%rax
+ jl .Lxorpart2
+ vpxor 0x40(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm7
+ cmp $0x60,%rax
+ jl .Lxorpart2
+ vpxor 0x50(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm7
+ cmp $0x70,%rax
+ jl .Lxorpart2
+ vpxor 0x60(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm7
+ cmp $0x80,%rax
+ jl .Lxorpart2
+ vpxor 0x70(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x70(%rsi)
+
+.Ldone2:
+ vzeroupper
+ RET
+
+.Lxorpart2:
+ # xor remaining bytes from partial register into output
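+	# The tail is handled with an aligned bounce buffer on the stack:
+	# copy the remaining input bytes into it, XOR them against the
+	# keystream left in %xmm7, then copy the result back out to the
+	# destination at the same offset.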
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone2
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea 8(%rsp),%r10
+ sub $0x10,%rsp
+ and $~31,%rsp
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ vpxor 0x00(%rsp),%xmm7,%xmm7
+ vmovdqa %xmm7,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ lea -8(%r10),%rsp
+ jmp .Ldone2
+
+SYM_FUNC_END(chacha_2block_xor_avx2)
+
+SYM_FUNC_START(chacha_4block_xor_avx2)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 4 data blocks output, o
+ # %rdx: up to 4 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+	# This function encrypts four ChaCha blocks by loading the state
+	# matrix four times across eight AVX registers. It performs matrix
+	# operations on four words in two matrices in parallel, and
+	# sequentially with respect to the operations on the four words of
+	# the other two matrices. Since the required word shuffling has a
+	# rather high latency, we can do the arithmetic on two matrix pairs
+	# without much slowdown.
+
+ vzeroupper
+
+ # x0..3[0-4] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vmovdqa %ymm0,%ymm4
+ vmovdqa %ymm1,%ymm5
+ vmovdqa %ymm2,%ymm6
+ vmovdqa %ymm3,%ymm7
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+ vpaddd CTR4BL(%rip),%ymm7,%ymm7
+
+ vmovdqa %ymm0,%ymm11
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm3,%ymm14
+ vmovdqa %ymm7,%ymm15
+
+ vmovdqa ROT8(%rip),%ymm8
+ vmovdqa ROT16(%rip),%ymm9
+
+ mov %rcx,%rax
+
+.Ldoubleround4:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm9,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ vpshufd $0x39,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+ vpshufd $0x93,%ymm7,%ymm7
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm9,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ vpshufd $0x93,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+ vpshufd $0x39,%ymm7,%ymm7
+
+ sub $2,%r8d
+ jnz .Ldoubleround4
+
+ # o0 = i0 ^ (x0 + s0), first block
+ vpaddd %ymm11,%ymm0,%ymm10
+ cmp $0x10,%rax
+ jl .Lxorpart4
+ vpxor 0x00(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x00(%rsi)
+ vextracti128 $1,%ymm10,%xmm0
+ # o1 = i1 ^ (x1 + s1), first block
+ vpaddd %ymm12,%ymm1,%ymm10
+ cmp $0x20,%rax
+ jl .Lxorpart4
+ vpxor 0x10(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x10(%rsi)
+ vextracti128 $1,%ymm10,%xmm1
+ # o2 = i2 ^ (x2 + s2), first block
+ vpaddd %ymm13,%ymm2,%ymm10
+ cmp $0x30,%rax
+ jl .Lxorpart4
+ vpxor 0x20(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x20(%rsi)
+ vextracti128 $1,%ymm10,%xmm2
+ # o3 = i3 ^ (x3 + s3), first block
+ vpaddd %ymm14,%ymm3,%ymm10
+ cmp $0x40,%rax
+ jl .Lxorpart4
+ vpxor 0x30(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x30(%rsi)
+ vextracti128 $1,%ymm10,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm10
+ cmp $0x50,%rax
+ jl .Lxorpart4
+ vpxor 0x40(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm10
+ cmp $0x60,%rax
+ jl .Lxorpart4
+ vpxor 0x50(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm10
+ cmp $0x70,%rax
+ jl .Lxorpart4
+ vpxor 0x60(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm10
+ cmp $0x80,%rax
+ jl .Lxorpart4
+ vpxor 0x70(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x70(%rsi)
+
+ # o0 = i0 ^ (x0 + s0), third block
+ vpaddd %ymm11,%ymm4,%ymm10
+ cmp $0x90,%rax
+ jl .Lxorpart4
+ vpxor 0x80(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x80(%rsi)
+ vextracti128 $1,%ymm10,%xmm4
+ # o1 = i1 ^ (x1 + s1), third block
+ vpaddd %ymm12,%ymm5,%ymm10
+ cmp $0xa0,%rax
+ jl .Lxorpart4
+ vpxor 0x90(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x90(%rsi)
+ vextracti128 $1,%ymm10,%xmm5
+ # o2 = i2 ^ (x2 + s2), third block
+ vpaddd %ymm13,%ymm6,%ymm10
+ cmp $0xb0,%rax
+ jl .Lxorpart4
+ vpxor 0xa0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xa0(%rsi)
+ vextracti128 $1,%ymm10,%xmm6
+ # o3 = i3 ^ (x3 + s3), third block
+ vpaddd %ymm15,%ymm7,%ymm10
+ cmp $0xc0,%rax
+ jl .Lxorpart4
+ vpxor 0xb0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xb0(%rsi)
+ vextracti128 $1,%ymm10,%xmm7
+
+ # xor and write fourth block
+ vmovdqa %xmm4,%xmm10
+ cmp $0xd0,%rax
+ jl .Lxorpart4
+ vpxor 0xc0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xc0(%rsi)
+
+ vmovdqa %xmm5,%xmm10
+ cmp $0xe0,%rax
+ jl .Lxorpart4
+ vpxor 0xd0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xd0(%rsi)
+
+ vmovdqa %xmm6,%xmm10
+ cmp $0xf0,%rax
+ jl .Lxorpart4
+ vpxor 0xe0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xe0(%rsi)
+
+ vmovdqa %xmm7,%xmm10
+ cmp $0x100,%rax
+ jl .Lxorpart4
+ vpxor 0xf0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xf0(%rsi)
+
+.Ldone4:
+ vzeroupper
+ RET
+
+.Lxorpart4:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone4
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea 8(%rsp),%r10
+ sub $0x10,%rsp
+ and $~31,%rsp
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ vpxor 0x00(%rsp),%xmm10,%xmm10
+ vmovdqa %xmm10,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ lea -8(%r10),%rsp
+ jmp .Ldone4
+
+SYM_FUNC_END(chacha_4block_xor_avx2)
+
+SYM_FUNC_START(chacha_8block_xor_avx2)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 8 data blocks output, o
+ # %rdx: up to 8 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+	# This function encrypts eight consecutive ChaCha blocks by loading
+	# the state matrix into AVX registers eight times. As we need some
+	# scratch registers, we save the first four registers on the stack.
+	# The algorithm performs each operation on the corresponding word of
+	# each state matrix, and hence requires no word shuffling. For the
+	# final XOR step we transpose the matrix by interleaving 32-, 64- and
+	# then 128-bit words, which allows us to do the XOR in AVX registers.
+	# Rotation by 8 and 16 bits is done with the slightly faster byte
+	# shuffling, while rotation by 7 and 12 bits uses the traditional
+	# shift+OR.
+
+ vzeroupper
+ # 4 * 32 byte stack, 32-byte aligned
+ lea 8(%rsp),%r10
+ and $~31, %rsp
+ sub $0x80, %rsp
+ mov %rcx,%rax
+
+ # x0..15[0-7] = s[0..15]
+ vpbroadcastd 0x00(%rdi),%ymm0
+ vpbroadcastd 0x04(%rdi),%ymm1
+ vpbroadcastd 0x08(%rdi),%ymm2
+ vpbroadcastd 0x0c(%rdi),%ymm3
+ vpbroadcastd 0x10(%rdi),%ymm4
+ vpbroadcastd 0x14(%rdi),%ymm5
+ vpbroadcastd 0x18(%rdi),%ymm6
+ vpbroadcastd 0x1c(%rdi),%ymm7
+ vpbroadcastd 0x20(%rdi),%ymm8
+ vpbroadcastd 0x24(%rdi),%ymm9
+ vpbroadcastd 0x28(%rdi),%ymm10
+ vpbroadcastd 0x2c(%rdi),%ymm11
+ vpbroadcastd 0x30(%rdi),%ymm12
+ vpbroadcastd 0x34(%rdi),%ymm13
+ vpbroadcastd 0x38(%rdi),%ymm14
+ vpbroadcastd 0x3c(%rdi),%ymm15
+ # x0..3 on stack
+ vmovdqa %ymm0,0x00(%rsp)
+ vmovdqa %ymm1,0x20(%rsp)
+ vmovdqa %ymm2,0x40(%rsp)
+ vmovdqa %ymm3,0x60(%rsp)
+
+ vmovdqa CTRINC(%rip),%ymm1
+ vmovdqa ROT8(%rip),%ymm2
+ vmovdqa ROT16(%rip),%ymm3
+
+	# x12 += counter values 0-7
+ vpaddd %ymm1,%ymm12,%ymm12
+
+.Ldoubleround8:
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ vpaddd 0x00(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm3,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ vpaddd 0x20(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm3,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ vpaddd 0x40(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm3,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ vpaddd 0x60(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm3,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $12,%ymm4,%ymm0
+ vpsrld $20,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $12,%ymm5,%ymm0
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $12,%ymm6,%ymm0
+ vpsrld $20,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm11,%ymm7,%ymm7
+ vpslld $12,%ymm7,%ymm0
+ vpsrld $20,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ vpaddd 0x00(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm2,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ vpaddd 0x20(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm2,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ vpaddd 0x40(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm2,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ vpaddd 0x60(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm2,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm0
+ vpsrld $25,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm0
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm0
+ vpsrld $25,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm11,%ymm7,%ymm7
+ vpslld $7,%ymm7,%ymm0
+ vpsrld $25,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ vpaddd 0x00(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm3,%ymm15,%ymm15
+	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ vpaddd 0x20(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm3,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ vpaddd 0x40(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm3,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ vpaddd 0x60(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm3,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxor %ymm10,%ymm5,%ymm5
+ vpslld $12,%ymm5,%ymm0
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxor %ymm11,%ymm6,%ymm6
+ vpslld $12,%ymm6,%ymm0
+ vpsrld $20,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxor %ymm8,%ymm7,%ymm7
+ vpslld $12,%ymm7,%ymm0
+ vpsrld $20,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm9,%ymm4,%ymm4
+ vpslld $12,%ymm4,%ymm0
+ vpsrld $20,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ vpaddd 0x00(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm2,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ vpaddd 0x20(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm2,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ vpaddd 0x40(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm2,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ vpaddd 0x60(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm2,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxor %ymm10,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm0
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxor %ymm11,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm0
+ vpsrld $25,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxor %ymm8,%ymm7,%ymm7
+ vpslld $7,%ymm7,%ymm0
+ vpsrld $25,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm9,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm0
+ vpsrld $25,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+
+ sub $2,%r8d
+ jnz .Ldoubleround8
+
+	# x0..15[0-7] += s[0..15]
+ vpbroadcastd 0x00(%rdi),%ymm0
+ vpaddd 0x00(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpbroadcastd 0x04(%rdi),%ymm0
+ vpaddd 0x20(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpbroadcastd 0x08(%rdi),%ymm0
+ vpaddd 0x40(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpbroadcastd 0x0c(%rdi),%ymm0
+ vpaddd 0x60(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpbroadcastd 0x10(%rdi),%ymm0
+ vpaddd %ymm0,%ymm4,%ymm4
+ vpbroadcastd 0x14(%rdi),%ymm0
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpbroadcastd 0x18(%rdi),%ymm0
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpbroadcastd 0x1c(%rdi),%ymm0
+ vpaddd %ymm0,%ymm7,%ymm7
+ vpbroadcastd 0x20(%rdi),%ymm0
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpbroadcastd 0x24(%rdi),%ymm0
+ vpaddd %ymm0,%ymm9,%ymm9
+ vpbroadcastd 0x28(%rdi),%ymm0
+ vpaddd %ymm0,%ymm10,%ymm10
+ vpbroadcastd 0x2c(%rdi),%ymm0
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpbroadcastd 0x30(%rdi),%ymm0
+ vpaddd %ymm0,%ymm12,%ymm12
+ vpbroadcastd 0x34(%rdi),%ymm0
+ vpaddd %ymm0,%ymm13,%ymm13
+ vpbroadcastd 0x38(%rdi),%ymm0
+ vpaddd %ymm0,%ymm14,%ymm14
+ vpbroadcastd 0x3c(%rdi),%ymm0
+ vpaddd %ymm0,%ymm15,%ymm15
+
+	# x12 += counter values 0-7
+ vpaddd %ymm1,%ymm12,%ymm12
+
+ # interleave 32-bit words in state n, n+1
+ vmovdqa 0x00(%rsp),%ymm0
+ vmovdqa 0x20(%rsp),%ymm1
+ vpunpckldq %ymm1,%ymm0,%ymm2
+ vpunpckhdq %ymm1,%ymm0,%ymm1
+ vmovdqa %ymm2,0x00(%rsp)
+ vmovdqa %ymm1,0x20(%rsp)
+ vmovdqa 0x40(%rsp),%ymm0
+ vmovdqa 0x60(%rsp),%ymm1
+ vpunpckldq %ymm1,%ymm0,%ymm2
+ vpunpckhdq %ymm1,%ymm0,%ymm1
+ vmovdqa %ymm2,0x40(%rsp)
+ vmovdqa %ymm1,0x60(%rsp)
+ vmovdqa %ymm4,%ymm0
+ vpunpckldq %ymm5,%ymm0,%ymm4
+ vpunpckhdq %ymm5,%ymm0,%ymm5
+ vmovdqa %ymm6,%ymm0
+ vpunpckldq %ymm7,%ymm0,%ymm6
+ vpunpckhdq %ymm7,%ymm0,%ymm7
+ vmovdqa %ymm8,%ymm0
+ vpunpckldq %ymm9,%ymm0,%ymm8
+ vpunpckhdq %ymm9,%ymm0,%ymm9
+ vmovdqa %ymm10,%ymm0
+ vpunpckldq %ymm11,%ymm0,%ymm10
+ vpunpckhdq %ymm11,%ymm0,%ymm11
+ vmovdqa %ymm12,%ymm0
+ vpunpckldq %ymm13,%ymm0,%ymm12
+ vpunpckhdq %ymm13,%ymm0,%ymm13
+ vmovdqa %ymm14,%ymm0
+ vpunpckldq %ymm15,%ymm0,%ymm14
+ vpunpckhdq %ymm15,%ymm0,%ymm15
+
+ # interleave 64-bit words in state n, n+2
+ vmovdqa 0x00(%rsp),%ymm0
+ vmovdqa 0x40(%rsp),%ymm2
+ vpunpcklqdq %ymm2,%ymm0,%ymm1
+ vpunpckhqdq %ymm2,%ymm0,%ymm2
+ vmovdqa %ymm1,0x00(%rsp)
+ vmovdqa %ymm2,0x40(%rsp)
+ vmovdqa 0x20(%rsp),%ymm0
+ vmovdqa 0x60(%rsp),%ymm2
+ vpunpcklqdq %ymm2,%ymm0,%ymm1
+ vpunpckhqdq %ymm2,%ymm0,%ymm2
+ vmovdqa %ymm1,0x20(%rsp)
+ vmovdqa %ymm2,0x60(%rsp)
+ vmovdqa %ymm4,%ymm0
+ vpunpcklqdq %ymm6,%ymm0,%ymm4
+ vpunpckhqdq %ymm6,%ymm0,%ymm6
+ vmovdqa %ymm5,%ymm0
+ vpunpcklqdq %ymm7,%ymm0,%ymm5
+ vpunpckhqdq %ymm7,%ymm0,%ymm7
+ vmovdqa %ymm8,%ymm0
+ vpunpcklqdq %ymm10,%ymm0,%ymm8
+ vpunpckhqdq %ymm10,%ymm0,%ymm10
+ vmovdqa %ymm9,%ymm0
+ vpunpcklqdq %ymm11,%ymm0,%ymm9
+ vpunpckhqdq %ymm11,%ymm0,%ymm11
+ vmovdqa %ymm12,%ymm0
+ vpunpcklqdq %ymm14,%ymm0,%ymm12
+ vpunpckhqdq %ymm14,%ymm0,%ymm14
+ vmovdqa %ymm13,%ymm0
+ vpunpcklqdq %ymm15,%ymm0,%ymm13
+ vpunpckhqdq %ymm15,%ymm0,%ymm15
+
+ # interleave 128-bit words in state n, n+4
+ # xor/write first four blocks
+ vmovdqa 0x00(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm4,%ymm1,%ymm0
+ cmp $0x0020,%rax
+ jl .Lxorpart8
+ vpxor 0x0000(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0000(%rsi)
+ vperm2i128 $0x31,%ymm4,%ymm1,%ymm4
+
+ vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
+ cmp $0x0040,%rax
+ jl .Lxorpart8
+ vpxor 0x0020(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0020(%rsi)
+ vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
+
+ vmovdqa 0x40(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm6,%ymm1,%ymm0
+ cmp $0x0060,%rax
+ jl .Lxorpart8
+ vpxor 0x0040(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0040(%rsi)
+ vperm2i128 $0x31,%ymm6,%ymm1,%ymm6
+
+ vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
+ cmp $0x0080,%rax
+ jl .Lxorpart8
+ vpxor 0x0060(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0060(%rsi)
+ vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
+
+ vmovdqa 0x20(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
+ cmp $0x00a0,%rax
+ jl .Lxorpart8
+ vpxor 0x0080(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0080(%rsi)
+ vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
+
+ vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+ cmp $0x00c0,%rax
+ jl .Lxorpart8
+ vpxor 0x00a0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x00a0(%rsi)
+ vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+
+ vmovdqa 0x60(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm7,%ymm1,%ymm0
+ cmp $0x00e0,%rax
+ jl .Lxorpart8
+ vpxor 0x00c0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x00c0(%rsi)
+ vperm2i128 $0x31,%ymm7,%ymm1,%ymm7
+
+ vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
+ cmp $0x0100,%rax
+ jl .Lxorpart8
+ vpxor 0x00e0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x00e0(%rsi)
+ vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+
+ # xor remaining blocks, write to output
+ vmovdqa %ymm4,%ymm0
+ cmp $0x0120,%rax
+ jl .Lxorpart8
+ vpxor 0x0100(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0100(%rsi)
+
+ vmovdqa %ymm12,%ymm0
+ cmp $0x0140,%rax
+ jl .Lxorpart8
+ vpxor 0x0120(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0120(%rsi)
+
+ vmovdqa %ymm6,%ymm0
+ cmp $0x0160,%rax
+ jl .Lxorpart8
+ vpxor 0x0140(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0140(%rsi)
+
+ vmovdqa %ymm14,%ymm0
+ cmp $0x0180,%rax
+ jl .Lxorpart8
+ vpxor 0x0160(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0160(%rsi)
+
+ vmovdqa %ymm5,%ymm0
+ cmp $0x01a0,%rax
+ jl .Lxorpart8
+ vpxor 0x0180(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0180(%rsi)
+
+ vmovdqa %ymm13,%ymm0
+ cmp $0x01c0,%rax
+ jl .Lxorpart8
+ vpxor 0x01a0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x01a0(%rsi)
+
+ vmovdqa %ymm7,%ymm0
+ cmp $0x01e0,%rax
+ jl .Lxorpart8
+ vpxor 0x01c0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x01c0(%rsi)
+
+ vmovdqa %ymm15,%ymm0
+ cmp $0x0200,%rax
+ jl .Lxorpart8
+ vpxor 0x01e0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x01e0(%rsi)
+
+.Ldone8:
+ vzeroupper
+ lea -8(%r10),%rsp
+ RET
+
+.Lxorpart8:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x1f,%r9
+ jz .Ldone8
+ and $~0x1f,%rax
+
+ mov %rsi,%r11
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ vpxor 0x00(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ jmp .Ldone8
+
+SYM_FUNC_END(chacha_8block_xor_avx2)
diff --git a/lib/crypto/x86/chacha-avx512vl-x86_64.S b/lib/crypto/x86/chacha-avx512vl-x86_64.S
new file mode 100644
index 000000000000..259383e1ad44
--- /dev/null
+++ b/lib/crypto/x86/chacha-avx512vl-x86_64.S
@@ -0,0 +1,836 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
+ *
+ * Copyright (C) 2018 Martin Willi
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL: .octa 0x00000000000000000000000000000000
+ .octa 0x00000000000000000000000000000001
+
+.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL: .octa 0x00000000000000000000000000000002
+ .octa 0x00000000000000000000000000000003
+
+.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
+.align 32
+CTR8BL: .octa 0x00000003000000020000000100000000
+ .octa 0x00000007000000060000000500000004
+
+.text
+
+SYM_FUNC_START(chacha_2block_xor_avx512vl)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 2 data blocks output, o
+ # %rdx: up to 2 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts two ChaCha blocks by loading the state
+ # matrix twice across four AVX registers. It performs matrix operations
+ # on four words in each matrix in parallel, but requires shuffling to
+ # rearrange the words after each round.
+
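Concretely, each 256-bit register here holds one row of the state for both blocks: vbroadcasti128 copies the 16-byte row into both 128-bit lanes, and the CTR2BL constant then bumps the counter word of the upper lane by one so the two lanes generate consecutive blocks. A scalar sketch of that setup (illustrative names only, not the kernel's code):

#include <stdint.h>

/*
 * row[r][lane * 4 + w] models word w of state row r for block 'lane'
 * (0 or 1): vbroadcasti128 of each row followed by vpaddd with CTR2BL.
 */
static void load_two_blocks(uint32_t row[4][8], const uint32_t s[16])
{
	for (int r = 0; r < 4; r++)
		for (int lane = 0; lane < 2; lane++)
			for (int w = 0; w < 4; w++)
				row[r][lane * 4 + w] = s[r * 4 + w];

	/* CTR2BL adds 0 to the first block's counter and 1 to the second's */
	row[3][4] += 1;
}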
+ vzeroupper
+
+	# x0..3[0-1] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+
+ vmovdqa %ymm0,%ymm8
+ vmovdqa %ymm1,%ymm9
+ vmovdqa %ymm2,%ymm10
+ vmovdqa %ymm3,%ymm11
+
+.Ldoubleround:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+
+ sub $2,%r8d
+ jnz .Ldoubleround
+
+ # o0 = i0 ^ (x0 + s0)
+ vpaddd %ymm8,%ymm0,%ymm7
+ cmp $0x10,%rcx
+ jl .Lxorpart2
+ vpxord 0x00(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x00(%rsi)
+ vextracti128 $1,%ymm7,%xmm0
+ # o1 = i1 ^ (x1 + s1)
+ vpaddd %ymm9,%ymm1,%ymm7
+ cmp $0x20,%rcx
+ jl .Lxorpart2
+ vpxord 0x10(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x10(%rsi)
+ vextracti128 $1,%ymm7,%xmm1
+ # o2 = i2 ^ (x2 + s2)
+ vpaddd %ymm10,%ymm2,%ymm7
+ cmp $0x30,%rcx
+ jl .Lxorpart2
+ vpxord 0x20(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x20(%rsi)
+ vextracti128 $1,%ymm7,%xmm2
+ # o3 = i3 ^ (x3 + s3)
+ vpaddd %ymm11,%ymm3,%ymm7
+ cmp $0x40,%rcx
+ jl .Lxorpart2
+ vpxord 0x30(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x30(%rsi)
+ vextracti128 $1,%ymm7,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm7
+ cmp $0x50,%rcx
+ jl .Lxorpart2
+ vpxord 0x40(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm7
+ cmp $0x60,%rcx
+ jl .Lxorpart2
+ vpxord 0x50(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm7
+ cmp $0x70,%rcx
+ jl .Lxorpart2
+ vpxord 0x60(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm7
+ cmp $0x80,%rcx
+ jl .Lxorpart2
+ vpxord 0x70(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x70(%rsi)
+
+.Ldone2:
+ vzeroupper
+ RET
+
+.Lxorpart2:
+ # xor remaining bytes from partial register into output
+ mov %rcx,%rax
+ and $0xf,%rcx
+ jz .Ldone2
+ mov %rax,%r9
+ and $~0xf,%r9
+
+ mov $1,%rax
+ shld %cl,%rax,%rax
+ sub $1,%rax
+ kmovq %rax,%k1
+
+ vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
+ vpxord %xmm7,%xmm1,%xmm1
+ vmovdqu8 %xmm1,(%rsi,%r9){%k1}
+
+ jmp .Ldone2
+
+SYM_FUNC_END(chacha_2block_xor_avx512vl)
+
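Unlike the SSSE3 and AVX2 tails, which bounce a partial chunk through a stack buffer, the .Lxorpart2 path above builds the byte mask (1 << remaining) - 1 in %k1 and uses masked loads and stores, so only the valid bytes are ever read or written. A hedged C model of the same idea (illustrative, not the kernel's code):

#include <stdint.h>

/*
 * XOR the trailing 'rem' (0 < rem < 16) bytes of 'src' with the keystream
 * chunk 'stream' and store them to 'dst', touching only those bytes, as
 * the masked vmovdqu8/vpxord/vmovdqu8 sequence does with (1 << rem) - 1
 * loaded into %k1.
 */
static void xor_masked_tail(uint8_t *dst, const uint8_t *src,
			    const uint8_t stream[16], unsigned int rem)
{
	uint16_t mask = (1u << rem) - 1;

	for (unsigned int i = 0; i < 16; i++)
		if (mask & (1u << i))
			dst[i] = src[i] ^ stream[i];
}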
+SYM_FUNC_START(chacha_4block_xor_avx512vl)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 4 data blocks output, o
+ # %rdx: up to 4 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts four ChaCha blocks by loading the state
+ # matrix four times across eight AVX registers. It performs matrix
+	# operations on four words in two matrices in parallel, followed by
+	# the same operations on the four words of the other two matrices.
+	# Because the required word shuffling has a rather high latency, the
+	# arithmetic on the two matrix pairs can be interleaved without much
+	# slowdown.
+
+ vzeroupper
+
+	# x0..3[0-3] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vmovdqa %ymm0,%ymm4
+ vmovdqa %ymm1,%ymm5
+ vmovdqa %ymm2,%ymm6
+ vmovdqa %ymm3,%ymm7
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+ vpaddd CTR4BL(%rip),%ymm7,%ymm7
+
+ vmovdqa %ymm0,%ymm11
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm3,%ymm14
+ vmovdqa %ymm7,%ymm15
+
+.Ldoubleround4:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $16,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ vpshufd $0x39,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+ vpshufd $0x93,%ymm7,%ymm7
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $16,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ vpshufd $0x93,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+ vpshufd $0x39,%ymm7,%ymm7
+
+ sub $2,%r8d
+ jnz .Ldoubleround4
+
+ # o0 = i0 ^ (x0 + s0), first block
+ vpaddd %ymm11,%ymm0,%ymm10
+ cmp $0x10,%rcx
+ jl .Lxorpart4
+ vpxord 0x00(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x00(%rsi)
+ vextracti128 $1,%ymm10,%xmm0
+ # o1 = i1 ^ (x1 + s1), first block
+ vpaddd %ymm12,%ymm1,%ymm10
+ cmp $0x20,%rcx
+ jl .Lxorpart4
+ vpxord 0x10(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x10(%rsi)
+ vextracti128 $1,%ymm10,%xmm1
+ # o2 = i2 ^ (x2 + s2), first block
+ vpaddd %ymm13,%ymm2,%ymm10
+ cmp $0x30,%rcx
+ jl .Lxorpart4
+ vpxord 0x20(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x20(%rsi)
+ vextracti128 $1,%ymm10,%xmm2
+ # o3 = i3 ^ (x3 + s3), first block
+ vpaddd %ymm14,%ymm3,%ymm10
+ cmp $0x40,%rcx
+ jl .Lxorpart4
+ vpxord 0x30(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x30(%rsi)
+ vextracti128 $1,%ymm10,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm10
+ cmp $0x50,%rcx
+ jl .Lxorpart4
+ vpxord 0x40(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm10
+ cmp $0x60,%rcx
+ jl .Lxorpart4
+ vpxord 0x50(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm10
+ cmp $0x70,%rcx
+ jl .Lxorpart4
+ vpxord 0x60(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm10
+ cmp $0x80,%rcx
+ jl .Lxorpart4
+ vpxord 0x70(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x70(%rsi)
+
+ # o0 = i0 ^ (x0 + s0), third block
+ vpaddd %ymm11,%ymm4,%ymm10
+ cmp $0x90,%rcx
+ jl .Lxorpart4
+ vpxord 0x80(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x80(%rsi)
+ vextracti128 $1,%ymm10,%xmm4
+ # o1 = i1 ^ (x1 + s1), third block
+ vpaddd %ymm12,%ymm5,%ymm10
+ cmp $0xa0,%rcx
+ jl .Lxorpart4
+ vpxord 0x90(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x90(%rsi)
+ vextracti128 $1,%ymm10,%xmm5
+ # o2 = i2 ^ (x2 + s2), third block
+ vpaddd %ymm13,%ymm6,%ymm10
+ cmp $0xb0,%rcx
+ jl .Lxorpart4
+ vpxord 0xa0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xa0(%rsi)
+ vextracti128 $1,%ymm10,%xmm6
+ # o3 = i3 ^ (x3 + s3), third block
+ vpaddd %ymm15,%ymm7,%ymm10
+ cmp $0xc0,%rcx
+ jl .Lxorpart4
+ vpxord 0xb0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xb0(%rsi)
+ vextracti128 $1,%ymm10,%xmm7
+
+ # xor and write fourth block
+ vmovdqa %xmm4,%xmm10
+ cmp $0xd0,%rcx
+ jl .Lxorpart4
+ vpxord 0xc0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xc0(%rsi)
+
+ vmovdqa %xmm5,%xmm10
+ cmp $0xe0,%rcx
+ jl .Lxorpart4
+ vpxord 0xd0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xd0(%rsi)
+
+ vmovdqa %xmm6,%xmm10
+ cmp $0xf0,%rcx
+ jl .Lxorpart4
+ vpxord 0xe0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xe0(%rsi)
+
+ vmovdqa %xmm7,%xmm10
+ cmp $0x100,%rcx
+ jl .Lxorpart4
+ vpxord 0xf0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xf0(%rsi)
+
+.Ldone4:
+ vzeroupper
+ RET
+
+.Lxorpart4:
+ # xor remaining bytes from partial register into output
+ mov %rcx,%rax
+ and $0xf,%rcx
+ jz .Ldone4
+ mov %rax,%r9
+ and $~0xf,%r9
+
+ mov $1,%rax
+ shld %cl,%rax,%rax
+ sub $1,%rax
+ kmovq %rax,%k1
+
+ vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
+ vpxord %xmm10,%xmm1,%xmm1
+ vmovdqu8 %xmm1,(%rsi,%r9){%k1}
+
+ jmp .Ldone4
+
+SYM_FUNC_END(chacha_4block_xor_avx512vl)
+
+SYM_FUNC_START(chacha_8block_xor_avx512vl)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 8 data blocks output, o
+ # %rdx: up to 8 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts eight consecutive ChaCha blocks by loading
+ # the state matrix in AVX registers eight times. Compared to AVX2, this
+ # mostly benefits from the new rotate instructions in VL and the
+ # additional registers.
+
+ vzeroupper
+
+ # x0..15[0-7] = s[0..15]
+ vpbroadcastd 0x00(%rdi),%ymm0
+ vpbroadcastd 0x04(%rdi),%ymm1
+ vpbroadcastd 0x08(%rdi),%ymm2
+ vpbroadcastd 0x0c(%rdi),%ymm3
+ vpbroadcastd 0x10(%rdi),%ymm4
+ vpbroadcastd 0x14(%rdi),%ymm5
+ vpbroadcastd 0x18(%rdi),%ymm6
+ vpbroadcastd 0x1c(%rdi),%ymm7
+ vpbroadcastd 0x20(%rdi),%ymm8
+ vpbroadcastd 0x24(%rdi),%ymm9
+ vpbroadcastd 0x28(%rdi),%ymm10
+ vpbroadcastd 0x2c(%rdi),%ymm11
+ vpbroadcastd 0x30(%rdi),%ymm12
+ vpbroadcastd 0x34(%rdi),%ymm13
+ vpbroadcastd 0x38(%rdi),%ymm14
+ vpbroadcastd 0x3c(%rdi),%ymm15
+
+	# x12 += counter values 0-7
+ vpaddd CTR8BL(%rip),%ymm12,%ymm12
+
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm1,%ymm17
+ vmovdqa64 %ymm2,%ymm18
+ vmovdqa64 %ymm3,%ymm19
+ vmovdqa64 %ymm4,%ymm20
+ vmovdqa64 %ymm5,%ymm21
+ vmovdqa64 %ymm6,%ymm22
+ vmovdqa64 %ymm7,%ymm23
+ vmovdqa64 %ymm8,%ymm24
+ vmovdqa64 %ymm9,%ymm25
+ vmovdqa64 %ymm10,%ymm26
+ vmovdqa64 %ymm11,%ymm27
+ vmovdqa64 %ymm12,%ymm28
+ vmovdqa64 %ymm13,%ymm29
+ vmovdqa64 %ymm14,%ymm30
+ vmovdqa64 %ymm15,%ymm31
+
+.Ldoubleround8:
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ vpaddd %ymm0,%ymm4,%ymm0
+ vpxord %ymm0,%ymm12,%ymm12
+ vprold $16,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ vpaddd %ymm1,%ymm5,%ymm1
+ vpxord %ymm1,%ymm13,%ymm13
+ vprold $16,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ vpaddd %ymm2,%ymm6,%ymm2
+ vpxord %ymm2,%ymm14,%ymm14
+ vprold $16,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ vpaddd %ymm3,%ymm7,%ymm3
+ vpxord %ymm3,%ymm15,%ymm15
+ vprold $16,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxord %ymm8,%ymm4,%ymm4
+ vprold $12,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxord %ymm9,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxord %ymm10,%ymm6,%ymm6
+ vprold $12,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxord %ymm11,%ymm7,%ymm7
+ vprold $12,%ymm7,%ymm7
+
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ vpaddd %ymm0,%ymm4,%ymm0
+ vpxord %ymm0,%ymm12,%ymm12
+ vprold $8,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ vpaddd %ymm1,%ymm5,%ymm1
+ vpxord %ymm1,%ymm13,%ymm13
+ vprold $8,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ vpaddd %ymm2,%ymm6,%ymm2
+ vpxord %ymm2,%ymm14,%ymm14
+ vprold $8,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ vpaddd %ymm3,%ymm7,%ymm3
+ vpxord %ymm3,%ymm15,%ymm15
+ vprold $8,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxord %ymm8,%ymm4,%ymm4
+ vprold $7,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxord %ymm9,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxord %ymm10,%ymm6,%ymm6
+ vprold $7,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxord %ymm11,%ymm7,%ymm7
+ vprold $7,%ymm7,%ymm7
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ vpaddd %ymm0,%ymm5,%ymm0
+ vpxord %ymm0,%ymm15,%ymm15
+ vprold $16,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ vpaddd %ymm1,%ymm6,%ymm1
+ vpxord %ymm1,%ymm12,%ymm12
+ vprold $16,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ vpaddd %ymm2,%ymm7,%ymm2
+ vpxord %ymm2,%ymm13,%ymm13
+ vprold $16,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ vpaddd %ymm3,%ymm4,%ymm3
+ vpxord %ymm3,%ymm14,%ymm14
+ vprold $16,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxord %ymm10,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxord %ymm11,%ymm6,%ymm6
+ vprold $12,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxord %ymm8,%ymm7,%ymm7
+ vprold $12,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxord %ymm9,%ymm4,%ymm4
+ vprold $12,%ymm4,%ymm4
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ vpaddd %ymm0,%ymm5,%ymm0
+ vpxord %ymm0,%ymm15,%ymm15
+ vprold $8,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ vpaddd %ymm1,%ymm6,%ymm1
+ vpxord %ymm1,%ymm12,%ymm12
+ vprold $8,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ vpaddd %ymm2,%ymm7,%ymm2
+ vpxord %ymm2,%ymm13,%ymm13
+ vprold $8,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ vpaddd %ymm3,%ymm4,%ymm3
+ vpxord %ymm3,%ymm14,%ymm14
+ vprold $8,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxord %ymm10,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxord %ymm11,%ymm6,%ymm6
+ vprold $7,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxord %ymm8,%ymm7,%ymm7
+ vprold $7,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxord %ymm9,%ymm4,%ymm4
+ vprold $7,%ymm4,%ymm4
+
+ sub $2,%r8d
+ jnz .Ldoubleround8
+
+	# x0..15[0-7] += s[0..15]
+ vpaddd %ymm16,%ymm0,%ymm0
+ vpaddd %ymm17,%ymm1,%ymm1
+ vpaddd %ymm18,%ymm2,%ymm2
+ vpaddd %ymm19,%ymm3,%ymm3
+ vpaddd %ymm20,%ymm4,%ymm4
+ vpaddd %ymm21,%ymm5,%ymm5
+ vpaddd %ymm22,%ymm6,%ymm6
+ vpaddd %ymm23,%ymm7,%ymm7
+ vpaddd %ymm24,%ymm8,%ymm8
+ vpaddd %ymm25,%ymm9,%ymm9
+ vpaddd %ymm26,%ymm10,%ymm10
+ vpaddd %ymm27,%ymm11,%ymm11
+ vpaddd %ymm28,%ymm12,%ymm12
+ vpaddd %ymm29,%ymm13,%ymm13
+ vpaddd %ymm30,%ymm14,%ymm14
+ vpaddd %ymm31,%ymm15,%ymm15
+
+ # interleave 32-bit words in state n, n+1
+ vpunpckldq %ymm1,%ymm0,%ymm16
+ vpunpckhdq %ymm1,%ymm0,%ymm17
+ vpunpckldq %ymm3,%ymm2,%ymm18
+ vpunpckhdq %ymm3,%ymm2,%ymm19
+ vpunpckldq %ymm5,%ymm4,%ymm20
+ vpunpckhdq %ymm5,%ymm4,%ymm21
+ vpunpckldq %ymm7,%ymm6,%ymm22
+ vpunpckhdq %ymm7,%ymm6,%ymm23
+ vpunpckldq %ymm9,%ymm8,%ymm24
+ vpunpckhdq %ymm9,%ymm8,%ymm25
+ vpunpckldq %ymm11,%ymm10,%ymm26
+ vpunpckhdq %ymm11,%ymm10,%ymm27
+ vpunpckldq %ymm13,%ymm12,%ymm28
+ vpunpckhdq %ymm13,%ymm12,%ymm29
+ vpunpckldq %ymm15,%ymm14,%ymm30
+ vpunpckhdq %ymm15,%ymm14,%ymm31
+
+ # interleave 64-bit words in state n, n+2
+ vpunpcklqdq %ymm18,%ymm16,%ymm0
+ vpunpcklqdq %ymm19,%ymm17,%ymm1
+ vpunpckhqdq %ymm18,%ymm16,%ymm2
+ vpunpckhqdq %ymm19,%ymm17,%ymm3
+ vpunpcklqdq %ymm22,%ymm20,%ymm4
+ vpunpcklqdq %ymm23,%ymm21,%ymm5
+ vpunpckhqdq %ymm22,%ymm20,%ymm6
+ vpunpckhqdq %ymm23,%ymm21,%ymm7
+ vpunpcklqdq %ymm26,%ymm24,%ymm8
+ vpunpcklqdq %ymm27,%ymm25,%ymm9
+ vpunpckhqdq %ymm26,%ymm24,%ymm10
+ vpunpckhqdq %ymm27,%ymm25,%ymm11
+ vpunpcklqdq %ymm30,%ymm28,%ymm12
+ vpunpcklqdq %ymm31,%ymm29,%ymm13
+ vpunpckhqdq %ymm30,%ymm28,%ymm14
+ vpunpckhqdq %ymm31,%ymm29,%ymm15
+
+ # interleave 128-bit words in state n, n+4
+ # xor/write first four blocks
+ vmovdqa64 %ymm0,%ymm16
+ vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
+ cmp $0x0020,%rcx
+ jl .Lxorpart8
+ vpxord 0x0000(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0000(%rsi)
+ vmovdqa64 %ymm16,%ymm0
+ vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
+
+ vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
+ cmp $0x0040,%rcx
+ jl .Lxorpart8
+ vpxord 0x0020(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0020(%rsi)
+ vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
+
+ vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
+ cmp $0x0060,%rcx
+ jl .Lxorpart8
+ vpxord 0x0040(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0040(%rsi)
+ vperm2i128 $0x31,%ymm6,%ymm2,%ymm6
+
+ vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
+ cmp $0x0080,%rcx
+ jl .Lxorpart8
+ vpxord 0x0060(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0060(%rsi)
+ vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
+
+ vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
+ cmp $0x00a0,%rcx
+ jl .Lxorpart8
+ vpxord 0x0080(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0080(%rsi)
+ vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
+
+ vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+ cmp $0x00c0,%rcx
+ jl .Lxorpart8
+ vpxord 0x00a0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x00a0(%rsi)
+ vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+
+ vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
+ cmp $0x00e0,%rcx
+ jl .Lxorpart8
+ vpxord 0x00c0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x00c0(%rsi)
+ vperm2i128 $0x31,%ymm7,%ymm3,%ymm7
+
+ vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
+ cmp $0x0100,%rcx
+ jl .Lxorpart8
+ vpxord 0x00e0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x00e0(%rsi)
+ vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+
+ # xor remaining blocks, write to output
+ vmovdqa64 %ymm4,%ymm0
+ cmp $0x0120,%rcx
+ jl .Lxorpart8
+ vpxord 0x0100(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0100(%rsi)
+
+ vmovdqa64 %ymm12,%ymm0
+ cmp $0x0140,%rcx
+ jl .Lxorpart8
+ vpxord 0x0120(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0120(%rsi)
+
+ vmovdqa64 %ymm6,%ymm0
+ cmp $0x0160,%rcx
+ jl .Lxorpart8
+ vpxord 0x0140(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0140(%rsi)
+
+ vmovdqa64 %ymm14,%ymm0
+ cmp $0x0180,%rcx
+ jl .Lxorpart8
+ vpxord 0x0160(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0160(%rsi)
+
+ vmovdqa64 %ymm5,%ymm0
+ cmp $0x01a0,%rcx
+ jl .Lxorpart8
+ vpxord 0x0180(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0180(%rsi)
+
+ vmovdqa64 %ymm13,%ymm0
+ cmp $0x01c0,%rcx
+ jl .Lxorpart8
+ vpxord 0x01a0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x01a0(%rsi)
+
+ vmovdqa64 %ymm7,%ymm0
+ cmp $0x01e0,%rcx
+ jl .Lxorpart8
+ vpxord 0x01c0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x01c0(%rsi)
+
+ vmovdqa64 %ymm15,%ymm0
+ cmp $0x0200,%rcx
+ jl .Lxorpart8
+ vpxord 0x01e0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x01e0(%rsi)
+
+.Ldone8:
+ vzeroupper
+ RET
+
+.Lxorpart8:
+ # xor remaining bytes from partial register into output
+ mov %rcx,%rax
+ and $0x1f,%rcx
+ jz .Ldone8
+ mov %rax,%r9
+ and $~0x1f,%r9
+
+ mov $1,%rax
+ shld %cl,%rax,%rax
+ sub $1,%rax
+ kmovq %rax,%k1
+
+ vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
+ vpxord %ymm0,%ymm1,%ymm1
+ vmovdqu8 %ymm1,(%rsi,%r9){%k1}
+
+ jmp .Ldone8
+
+SYM_FUNC_END(chacha_8block_xor_avx512vl)
diff --git a/lib/crypto/x86/chacha-ssse3-x86_64.S b/lib/crypto/x86/chacha-ssse3-x86_64.S
new file mode 100644
index 000000000000..7111949cd5b9
--- /dev/null
+++ b/lib/crypto/x86/chacha-ssse3-x86_64.S
@@ -0,0 +1,791 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+.section .rodata.cst16.ROT8, "aM", @progbits, 16
+.align 16
+ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
+.section .rodata.cst16.ROT16, "aM", @progbits, 16
+.align 16
+ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
+.section .rodata.cst16.CTRINC, "aM", @progbits, 16
+.align 16
+CTRINC: .octa 0x00000003000000020000000100000000
+
+.text
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
+ * function performs matrix operations on four words in parallel, but requires
+ * shuffling to rearrange the words after each round. 8/16-bit word rotation is
+ * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
+ * rotation uses traditional shift+OR.
+ *
+ * The round count is given in %r8d.
+ *
+ * Clobbers: %r8d, %xmm4-%xmm7
+ */
+SYM_FUNC_START_LOCAL(chacha_permute)
+
+ movdqa ROT8(%rip),%xmm4
+ movdqa ROT16(%rip),%xmm5
+
+.Ldoubleround:
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm5,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm6
+ pslld $12,%xmm6
+ psrld $20,%xmm1
+ por %xmm6,%xmm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm4,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm7
+ pslld $7,%xmm7
+ psrld $25,%xmm1
+ por %xmm7,%xmm1
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ pshufd $0x39,%xmm1,%xmm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ pshufd $0x4e,%xmm2,%xmm2
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ pshufd $0x93,%xmm3,%xmm3
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm5,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm6
+ pslld $12,%xmm6
+ psrld $20,%xmm1
+ por %xmm6,%xmm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm4,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm7
+ pslld $7,%xmm7
+ psrld $25,%xmm1
+ por %xmm7,%xmm1
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ pshufd $0x93,%xmm1,%xmm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ pshufd $0x4e,%xmm2,%xmm2
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ pshufd $0x39,%xmm3,%xmm3
+
+ sub $2,%r8d
+ jnz .Ldoubleround
+
+ RET
+SYM_FUNC_END(chacha_permute)
+
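The per-line comments in chacha_permute spell out the standard ChaCha quarter-round; in scalar form, one double round over the 16-word state could be written as below (a reference sketch for comparison, not the kernel implementation):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

static void quarterround(uint32_t x[16], int a, int b, int c, int d)
{
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
}

static void doubleround(uint32_t x[16])
{
	/* column round: the first half of .Ldoubleround */
	quarterround(x, 0, 4, 8, 12);
	quarterround(x, 1, 5, 9, 13);
	quarterround(x, 2, 6, 10, 14);
	quarterround(x, 3, 7, 11, 15);
	/* diagonal round: the second half, after the pshufd rotations */
	quarterround(x, 0, 5, 10, 15);
	quarterround(x, 1, 6, 11, 12);
	quarterround(x, 2, 7, 8, 13);
	quarterround(x, 3, 4, 9, 14);
}

In the vectorized version the diagonal round is realized by the pshufd rotations of rows 1-3 between the two halves, so the same column-wise code can be reused.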
+SYM_FUNC_START(chacha_block_xor_ssse3)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 1 data block output, o
+ # %rdx: up to 1 data block input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+ FRAME_BEGIN
+
+ # x0..3 = s0..3
+ movdqu 0x00(%rdi),%xmm0
+ movdqu 0x10(%rdi),%xmm1
+ movdqu 0x20(%rdi),%xmm2
+ movdqu 0x30(%rdi),%xmm3
+ movdqa %xmm0,%xmm8
+ movdqa %xmm1,%xmm9
+ movdqa %xmm2,%xmm10
+ movdqa %xmm3,%xmm11
+
+ mov %rcx,%rax
+ call chacha_permute
+
+ # o0 = i0 ^ (x0 + s0)
+ paddd %xmm8,%xmm0
+ cmp $0x10,%rax
+ jl .Lxorpart
+ movdqu 0x00(%rdx),%xmm4
+ pxor %xmm4,%xmm0
+ movdqu %xmm0,0x00(%rsi)
+ # o1 = i1 ^ (x1 + s1)
+ paddd %xmm9,%xmm1
+ movdqa %xmm1,%xmm0
+ cmp $0x20,%rax
+ jl .Lxorpart
+ movdqu 0x10(%rdx),%xmm0
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x10(%rsi)
+ # o2 = i2 ^ (x2 + s2)
+ paddd %xmm10,%xmm2
+ movdqa %xmm2,%xmm0
+ cmp $0x30,%rax
+ jl .Lxorpart
+ movdqu 0x20(%rdx),%xmm0
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,0x20(%rsi)
+ # o3 = i3 ^ (x3 + s3)
+ paddd %xmm11,%xmm3
+ movdqa %xmm3,%xmm0
+ cmp $0x40,%rax
+ jl .Lxorpart
+ movdqu 0x30(%rdx),%xmm0
+ pxor %xmm3,%xmm0
+ movdqu %xmm0,0x30(%rsi)
+
+.Ldone:
+ FRAME_END
+ RET
+
+.Lxorpart:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea 8(%rsp),%r10
+ sub $0x10,%rsp
+ and $~31,%rsp
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ pxor 0x00(%rsp),%xmm0
+ movdqa %xmm0,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ lea -8(%r10),%rsp
+ jmp .Ldone
+
+SYM_FUNC_END(chacha_block_xor_ssse3)
+
+SYM_FUNC_START(hchacha_block_ssse3)
+ # %rdi: Input state matrix, s
+ # %rsi: output (8 32-bit words)
+ # %edx: nrounds
+ FRAME_BEGIN
+
+ movdqu 0x00(%rdi),%xmm0
+ movdqu 0x10(%rdi),%xmm1
+ movdqu 0x20(%rdi),%xmm2
+ movdqu 0x30(%rdi),%xmm3
+
+ mov %edx,%r8d
+ call chacha_permute
+
+ movdqu %xmm0,0x00(%rsi)
+ movdqu %xmm3,0x10(%rsi)
+
+ FRAME_END
+ RET
+SYM_FUNC_END(hchacha_block_ssse3)
+
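As the final stores of %xmm0 and %xmm3 show, HChaCha runs the same permutation but returns only state words 0-3 and 12-15 and never adds the original state back in. Reusing doubleround() from the sketch above (again illustrative only), a rough scalar model would be:

static void hchacha_model(const uint32_t state[16],
			  uint32_t out[8], int nrounds)
{
	uint32_t x[16];

	for (int i = 0; i < 16; i++)
		x[i] = state[i];

	for (int r = 0; r < nrounds; r += 2)
		doubleround(x);

	for (int i = 0; i < 4; i++) {
		out[i] = x[i];			/* words 0..3   (%xmm0) */
		out[i + 4] = x[i + 12];		/* words 12..15 (%xmm3) */
	}
}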
+SYM_FUNC_START(chacha_4block_xor_ssse3)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 4 data blocks output, o
+ # %rdx: up to 4 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+	# This function encrypts four consecutive ChaCha blocks by loading
+	# the state matrix in SSE registers four times. As we need some scratch
+	# registers, we save the first four registers on the stack. The
+	# algorithm performs each operation on the corresponding word of each
+	# state matrix, hence requires no word shuffling. For the final XORing
+	# step we transpose the matrix by interleaving 32- and then 64-bit words,
+ # which allows us to do XOR in SSE registers. 8/16-bit word rotation is
+ # done with the slightly better performing SSSE3 byte shuffling,
+ # 7/12-bit word rotation uses traditional shift+OR.
+
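The transpose mentioned above is the usual interleave-based 4x4 transpose: a 32-bit unpack of rows n and n+1 followed by a 64-bit unpack of the results leaves each 128-bit register holding the four words of one block. A small C model of the punpck sequence (a sketch with illustrative names, not the kernel's code):

#include <stdint.h>

typedef struct { uint32_t w[4]; } v128;	/* model of one xmm register */

static v128 unpack_lo32(v128 a, v128 b)
{ return (v128){ { a.w[0], b.w[0], a.w[1], b.w[1] } }; }

static v128 unpack_hi32(v128 a, v128 b)
{ return (v128){ { a.w[2], b.w[2], a.w[3], b.w[3] } }; }

static v128 unpack_lo64(v128 a, v128 b)
{ return (v128){ { a.w[0], a.w[1], b.w[0], b.w[1] } }; }

static v128 unpack_hi64(v128 a, v128 b)
{ return (v128){ { a.w[2], a.w[3], b.w[2], b.w[3] } }; }

/*
 * r[i].w[c] holds state word i of block c on input; on output out[c]
 * holds the four words of block c contiguously, i.e. the 4x4 transpose,
 * mirroring the punpck{l,h}dq / punpck{l,h}qdq sequence.
 */
static void transpose_blocks(v128 out[4], const v128 r[4])
{
	v128 t0 = unpack_lo32(r[0], r[1]);
	v128 t1 = unpack_hi32(r[0], r[1]);
	v128 t2 = unpack_lo32(r[2], r[3]);
	v128 t3 = unpack_hi32(r[2], r[3]);

	out[0] = unpack_lo64(t0, t2);
	out[1] = unpack_hi64(t0, t2);
	out[2] = unpack_lo64(t1, t3);
	out[3] = unpack_hi64(t1, t3);
}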
+ lea 8(%rsp),%r10
+ sub $0x80,%rsp
+ and $~63,%rsp
+ mov %rcx,%rax
+
+ # x0..15[0-3] = s0..3[0..3]
+ movq 0x00(%rdi),%xmm1
+ pshufd $0x00,%xmm1,%xmm0
+ pshufd $0x55,%xmm1,%xmm1
+ movq 0x08(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ movq 0x10(%rdi),%xmm5
+ pshufd $0x00,%xmm5,%xmm4
+ pshufd $0x55,%xmm5,%xmm5
+ movq 0x18(%rdi),%xmm7
+ pshufd $0x00,%xmm7,%xmm6
+ pshufd $0x55,%xmm7,%xmm7
+ movq 0x20(%rdi),%xmm9
+ pshufd $0x00,%xmm9,%xmm8
+ pshufd $0x55,%xmm9,%xmm9
+ movq 0x28(%rdi),%xmm11
+ pshufd $0x00,%xmm11,%xmm10
+ pshufd $0x55,%xmm11,%xmm11
+ movq 0x30(%rdi),%xmm13
+ pshufd $0x00,%xmm13,%xmm12
+ pshufd $0x55,%xmm13,%xmm13
+ movq 0x38(%rdi),%xmm15
+ pshufd $0x00,%xmm15,%xmm14
+ pshufd $0x55,%xmm15,%xmm15
+ # x0..3 on stack
+ movdqa %xmm0,0x00(%rsp)
+ movdqa %xmm1,0x10(%rsp)
+ movdqa %xmm2,0x20(%rsp)
+ movdqa %xmm3,0x30(%rsp)
+
+ movdqa CTRINC(%rip),%xmm1
+ movdqa ROT8(%rip),%xmm2
+ movdqa ROT16(%rip),%xmm3
+
+ # x12 += counter values 0-3
+ paddd %xmm1,%xmm12
+
+.Ldoubleround4:
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ movdqa 0x00(%rsp),%xmm0
+ paddd %xmm4,%xmm0
+ movdqa %xmm0,0x00(%rsp)
+ pxor %xmm0,%xmm12
+ pshufb %xmm3,%xmm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ movdqa 0x10(%rsp),%xmm0
+ paddd %xmm5,%xmm0
+ movdqa %xmm0,0x10(%rsp)
+ pxor %xmm0,%xmm13
+ pshufb %xmm3,%xmm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ movdqa 0x20(%rsp),%xmm0
+ paddd %xmm6,%xmm0
+ movdqa %xmm0,0x20(%rsp)
+ pxor %xmm0,%xmm14
+ pshufb %xmm3,%xmm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ movdqa 0x30(%rsp),%xmm0
+ paddd %xmm7,%xmm0
+ movdqa %xmm0,0x30(%rsp)
+ pxor %xmm0,%xmm15
+ pshufb %xmm3,%xmm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm4
+ por %xmm0,%xmm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm5
+ por %xmm0,%xmm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm6
+ por %xmm0,%xmm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm7
+ por %xmm0,%xmm7
+
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ movdqa 0x00(%rsp),%xmm0
+ paddd %xmm4,%xmm0
+ movdqa %xmm0,0x00(%rsp)
+ pxor %xmm0,%xmm12
+ pshufb %xmm2,%xmm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ movdqa 0x10(%rsp),%xmm0
+ paddd %xmm5,%xmm0
+ movdqa %xmm0,0x10(%rsp)
+ pxor %xmm0,%xmm13
+ pshufb %xmm2,%xmm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ movdqa 0x20(%rsp),%xmm0
+ paddd %xmm6,%xmm0
+ movdqa %xmm0,0x20(%rsp)
+ pxor %xmm0,%xmm14
+ pshufb %xmm2,%xmm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ movdqa 0x30(%rsp),%xmm0
+ paddd %xmm7,%xmm0
+ movdqa %xmm0,0x30(%rsp)
+ pxor %xmm0,%xmm15
+ pshufb %xmm2,%xmm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm4
+ por %xmm0,%xmm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm5
+ por %xmm0,%xmm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm6
+ por %xmm0,%xmm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm7
+ por %xmm0,%xmm7
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ movdqa 0x00(%rsp),%xmm0
+ paddd %xmm5,%xmm0
+ movdqa %xmm0,0x00(%rsp)
+ pxor %xmm0,%xmm15
+ pshufb %xmm3,%xmm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ movdqa 0x10(%rsp),%xmm0
+ paddd %xmm6,%xmm0
+ movdqa %xmm0,0x10(%rsp)
+ pxor %xmm0,%xmm12
+ pshufb %xmm3,%xmm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ movdqa 0x20(%rsp),%xmm0
+ paddd %xmm7,%xmm0
+ movdqa %xmm0,0x20(%rsp)
+ pxor %xmm0,%xmm13
+ pshufb %xmm3,%xmm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ movdqa 0x30(%rsp),%xmm0
+ paddd %xmm4,%xmm0
+ movdqa %xmm0,0x30(%rsp)
+ pxor %xmm0,%xmm14
+ pshufb %xmm3,%xmm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ paddd %xmm15,%xmm10
+ pxor %xmm10,%xmm5
+ movdqa %xmm5,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm5
+ por %xmm0,%xmm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ paddd %xmm12,%xmm11
+ pxor %xmm11,%xmm6
+ movdqa %xmm6,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm6
+ por %xmm0,%xmm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ paddd %xmm13,%xmm8
+ pxor %xmm8,%xmm7
+ movdqa %xmm7,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm7
+ por %xmm0,%xmm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ paddd %xmm14,%xmm9
+ pxor %xmm9,%xmm4
+ movdqa %xmm4,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm4
+ por %xmm0,%xmm4
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ movdqa 0x00(%rsp),%xmm0
+ paddd %xmm5,%xmm0
+ movdqa %xmm0,0x00(%rsp)
+ pxor %xmm0,%xmm15
+ pshufb %xmm2,%xmm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ movdqa 0x10(%rsp),%xmm0
+ paddd %xmm6,%xmm0
+ movdqa %xmm0,0x10(%rsp)
+ pxor %xmm0,%xmm12
+ pshufb %xmm2,%xmm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ movdqa 0x20(%rsp),%xmm0
+ paddd %xmm7,%xmm0
+ movdqa %xmm0,0x20(%rsp)
+ pxor %xmm0,%xmm13
+ pshufb %xmm2,%xmm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ movdqa 0x30(%rsp),%xmm0
+ paddd %xmm4,%xmm0
+ movdqa %xmm0,0x30(%rsp)
+ pxor %xmm0,%xmm14
+ pshufb %xmm2,%xmm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ paddd %xmm15,%xmm10
+ pxor %xmm10,%xmm5
+ movdqa %xmm5,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm5
+ por %xmm0,%xmm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ paddd %xmm12,%xmm11
+ pxor %xmm11,%xmm6
+ movdqa %xmm6,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm6
+ por %xmm0,%xmm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ paddd %xmm13,%xmm8
+ pxor %xmm8,%xmm7
+ movdqa %xmm7,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm7
+ por %xmm0,%xmm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ paddd %xmm14,%xmm9
+ pxor %xmm9,%xmm4
+ movdqa %xmm4,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm4
+ por %xmm0,%xmm4
+
+ sub $2,%r8d
+ jnz .Ldoubleround4
+
+ # x0[0-3] += s0[0]
+ # x1[0-3] += s0[1]
+ movq 0x00(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd 0x00(%rsp),%xmm2
+ movdqa %xmm2,0x00(%rsp)
+ paddd 0x10(%rsp),%xmm3
+ movdqa %xmm3,0x10(%rsp)
+ # x2[0-3] += s0[2]
+ # x3[0-3] += s0[3]
+ movq 0x08(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd 0x20(%rsp),%xmm2
+ movdqa %xmm2,0x20(%rsp)
+ paddd 0x30(%rsp),%xmm3
+ movdqa %xmm3,0x30(%rsp)
+
+ # x4[0-3] += s1[0]
+ # x5[0-3] += s1[1]
+ movq 0x10(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ # x6[0-3] += s1[2]
+ # x7[0-3] += s1[3]
+ movq 0x18(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm6
+ paddd %xmm3,%xmm7
+
+ # x8[0-3] += s2[0]
+ # x9[0-3] += s2[1]
+ movq 0x20(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm8
+ paddd %xmm3,%xmm9
+ # x10[0-3] += s2[2]
+ # x11[0-3] += s2[3]
+ movq 0x28(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm10
+ paddd %xmm3,%xmm11
+
+ # x12[0-3] += s3[0]
+ # x13[0-3] += s3[1]
+ movq 0x30(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm12
+ paddd %xmm3,%xmm13
+ # x14[0-3] += s3[2]
+ # x15[0-3] += s3[3]
+ movq 0x38(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm14
+ paddd %xmm3,%xmm15
+
+ # x12 += counter values 0-3
+ paddd %xmm1,%xmm12
+
+ # interleave 32-bit words in state n, n+1
+ movdqa 0x00(%rsp),%xmm0
+ movdqa 0x10(%rsp),%xmm1
+ movdqa %xmm0,%xmm2
+ punpckldq %xmm1,%xmm2
+ punpckhdq %xmm1,%xmm0
+ movdqa %xmm2,0x00(%rsp)
+ movdqa %xmm0,0x10(%rsp)
+ movdqa 0x20(%rsp),%xmm0
+ movdqa 0x30(%rsp),%xmm1
+ movdqa %xmm0,%xmm2
+ punpckldq %xmm1,%xmm2
+ punpckhdq %xmm1,%xmm0
+ movdqa %xmm2,0x20(%rsp)
+ movdqa %xmm0,0x30(%rsp)
+ movdqa %xmm4,%xmm0
+ punpckldq %xmm5,%xmm4
+ punpckhdq %xmm5,%xmm0
+ movdqa %xmm0,%xmm5
+ movdqa %xmm6,%xmm0
+ punpckldq %xmm7,%xmm6
+ punpckhdq %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ movdqa %xmm8,%xmm0
+ punpckldq %xmm9,%xmm8
+ punpckhdq %xmm9,%xmm0
+ movdqa %xmm0,%xmm9
+ movdqa %xmm10,%xmm0
+ punpckldq %xmm11,%xmm10
+ punpckhdq %xmm11,%xmm0
+ movdqa %xmm0,%xmm11
+ movdqa %xmm12,%xmm0
+ punpckldq %xmm13,%xmm12
+ punpckhdq %xmm13,%xmm0
+ movdqa %xmm0,%xmm13
+ movdqa %xmm14,%xmm0
+ punpckldq %xmm15,%xmm14
+ punpckhdq %xmm15,%xmm0
+ movdqa %xmm0,%xmm15
+
+ # interleave 64-bit words in state n, n+2
+ movdqa 0x00(%rsp),%xmm0
+ movdqa 0x20(%rsp),%xmm1
+ movdqa %xmm0,%xmm2
+ punpcklqdq %xmm1,%xmm2
+ punpckhqdq %xmm1,%xmm0
+ movdqa %xmm2,0x00(%rsp)
+ movdqa %xmm0,0x20(%rsp)
+ movdqa 0x10(%rsp),%xmm0
+ movdqa 0x30(%rsp),%xmm1
+ movdqa %xmm0,%xmm2
+ punpcklqdq %xmm1,%xmm2
+ punpckhqdq %xmm1,%xmm0
+ movdqa %xmm2,0x10(%rsp)
+ movdqa %xmm0,0x30(%rsp)
+ movdqa %xmm4,%xmm0
+ punpcklqdq %xmm6,%xmm4
+ punpckhqdq %xmm6,%xmm0
+ movdqa %xmm0,%xmm6
+ movdqa %xmm5,%xmm0
+ punpcklqdq %xmm7,%xmm5
+ punpckhqdq %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ movdqa %xmm8,%xmm0
+ punpcklqdq %xmm10,%xmm8
+ punpckhqdq %xmm10,%xmm0
+ movdqa %xmm0,%xmm10
+ movdqa %xmm9,%xmm0
+ punpcklqdq %xmm11,%xmm9
+ punpckhqdq %xmm11,%xmm0
+ movdqa %xmm0,%xmm11
+ movdqa %xmm12,%xmm0
+ punpcklqdq %xmm14,%xmm12
+ punpckhqdq %xmm14,%xmm0
+ movdqa %xmm0,%xmm14
+ movdqa %xmm13,%xmm0
+ punpcklqdq %xmm15,%xmm13
+ punpckhqdq %xmm15,%xmm0
+ movdqa %xmm0,%xmm15
+
+ # xor with corresponding input, write to output
+ movdqa 0x00(%rsp),%xmm0
+ cmp $0x10,%rax
+ jl .Lxorpart4
+ movdqu 0x00(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x00(%rsi)
+
+ movdqu %xmm4,%xmm0
+ cmp $0x20,%rax
+ jl .Lxorpart4
+ movdqu 0x10(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x10(%rsi)
+
+ movdqu %xmm8,%xmm0
+ cmp $0x30,%rax
+ jl .Lxorpart4
+ movdqu 0x20(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x20(%rsi)
+
+ movdqu %xmm12,%xmm0
+ cmp $0x40,%rax
+ jl .Lxorpart4
+ movdqu 0x30(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x30(%rsi)
+
+ movdqa 0x20(%rsp),%xmm0
+ cmp $0x50,%rax
+ jl .Lxorpart4
+ movdqu 0x40(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x40(%rsi)
+
+ movdqu %xmm6,%xmm0
+ cmp $0x60,%rax
+ jl .Lxorpart4
+ movdqu 0x50(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x50(%rsi)
+
+ movdqu %xmm10,%xmm0
+ cmp $0x70,%rax
+ jl .Lxorpart4
+ movdqu 0x60(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x60(%rsi)
+
+ movdqu %xmm14,%xmm0
+ cmp $0x80,%rax
+ jl .Lxorpart4
+ movdqu 0x70(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x70(%rsi)
+
+ movdqa 0x10(%rsp),%xmm0
+ cmp $0x90,%rax
+ jl .Lxorpart4
+ movdqu 0x80(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x80(%rsi)
+
+ movdqu %xmm5,%xmm0
+ cmp $0xa0,%rax
+ jl .Lxorpart4
+ movdqu 0x90(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x90(%rsi)
+
+ movdqu %xmm9,%xmm0
+ cmp $0xb0,%rax
+ jl .Lxorpart4
+ movdqu 0xa0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xa0(%rsi)
+
+ movdqu %xmm13,%xmm0
+ cmp $0xc0,%rax
+ jl .Lxorpart4
+ movdqu 0xb0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xb0(%rsi)
+
+ movdqa 0x30(%rsp),%xmm0
+ cmp $0xd0,%rax
+ jl .Lxorpart4
+ movdqu 0xc0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xc0(%rsi)
+
+ movdqu %xmm7,%xmm0
+ cmp $0xe0,%rax
+ jl .Lxorpart4
+ movdqu 0xd0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xd0(%rsi)
+
+ movdqu %xmm11,%xmm0
+ cmp $0xf0,%rax
+ jl .Lxorpart4
+ movdqu 0xe0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xe0(%rsi)
+
+ movdqu %xmm15,%xmm0
+ cmp $0x100,%rax
+ jl .Lxorpart4
+ movdqu 0xf0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xf0(%rsi)
+
+.Ldone4:
+ lea -8(%r10),%rsp
+ RET
+
+.Lxorpart4:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone4
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ pxor 0x00(%rsp),%xmm0
+ movdqa %xmm0,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ jmp .Ldone4
+
+SYM_FUNC_END(chacha_4block_xor_ssse3)
diff --git a/lib/crypto/x86/chacha.h b/lib/crypto/x86/chacha.h
new file mode 100644
index 000000000000..10cf8f1c569d
--- /dev/null
+++ b/lib/crypto/x86/chacha.h
@@ -0,0 +1,176 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ChaCha and HChaCha functions (x86_64 optimized)
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <asm/simd.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+
+asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void hchacha_block_ssse3(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds);
+
+asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+
+asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx512vl(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state,
+ u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
+
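+/*
+ * Number of blocks the counter must be advanced by after one of the
+ * *block_xor_* routines has processed 'len' bytes with an at most
+ * 'maxblocks'-block call: the length is capped at 'maxblocks' blocks and
+ * a trailing partial block still advances the counter by one.  For
+ * example, len == 100 with maxblocks == 2 yields 2.
+ */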
+static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
+{
+ len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
+ return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
+}
+
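+/*
+ * Process as much as possible with the widest routine available.  Whole
+ * groups of blocks are handled in the loops; the remainder, including any
+ * final partial block, is passed as a byte length to a single call, since
+ * the assembly routines handle short and partial inputs themselves, and
+ * the counter is then advanced with chacha_advance().
+ */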
+static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ if (static_branch_likely(&chacha_use_avx512vl)) {
+ while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+ chacha_8block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ bytes -= CHACHA_BLOCK_SIZE * 8;
+ src += CHACHA_BLOCK_SIZE * 8;
+ dst += CHACHA_BLOCK_SIZE * 8;
+ state->x[12] += 8;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 4) {
+ chacha_8block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ state->x[12] += chacha_advance(bytes, 8);
+ return;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 2) {
+ chacha_4block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ state->x[12] += chacha_advance(bytes, 4);
+ return;
+ }
+ if (bytes) {
+ chacha_2block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ state->x[12] += chacha_advance(bytes, 2);
+ return;
+ }
+ }
+
+ if (static_branch_likely(&chacha_use_avx2)) {
+ while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+ chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+ bytes -= CHACHA_BLOCK_SIZE * 8;
+ src += CHACHA_BLOCK_SIZE * 8;
+ dst += CHACHA_BLOCK_SIZE * 8;
+ state->x[12] += 8;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 4) {
+ chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+ state->x[12] += chacha_advance(bytes, 8);
+ return;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 2) {
+ chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
+ state->x[12] += chacha_advance(bytes, 4);
+ return;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE) {
+ chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
+ state->x[12] += chacha_advance(bytes, 2);
+ return;
+ }
+ }
+
+ while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+ chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+ bytes -= CHACHA_BLOCK_SIZE * 4;
+ src += CHACHA_BLOCK_SIZE * 4;
+ dst += CHACHA_BLOCK_SIZE * 4;
+ state->x[12] += 4;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE) {
+ chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+ state->x[12] += chacha_advance(bytes, 4);
+ return;
+ }
+ if (bytes) {
+ chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
+ state->x[12]++;
+ }
+}
+
+static void hchacha_block_arch(const struct chacha_state *state,
+ u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+ if (!static_branch_likely(&chacha_use_simd)) {
+ hchacha_block_generic(state, out, nrounds);
+ } else {
+ kernel_fpu_begin();
+ hchacha_block_ssse3(state, out, nrounds);
+ kernel_fpu_end();
+ }
+}
+
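+/*
+ * Requests of at most one block take the generic path, where the SIMD
+ * setup cost is not worthwhile.  Longer requests are processed in chunks
+ * of at most 4 KiB so that each kernel_fpu_begin()/kernel_fpu_end()
+ * section stays short.
+ */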
+static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
+ const u8 *src, unsigned int bytes, int nrounds)
+{
+ if (!static_branch_likely(&chacha_use_simd) ||
+ bytes <= CHACHA_BLOCK_SIZE)
+ return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ kernel_fpu_begin();
+ chacha_dosimd(state, dst, src, todo, nrounds);
+ kernel_fpu_end();
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
+}
+
+#define chacha_mod_init_arch chacha_mod_init_arch
+static void chacha_mod_init_arch(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_SSSE3))
+ return;
+
+ static_branch_enable(&chacha_use_simd);
+
+ if (boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
+ static_branch_enable(&chacha_use_avx2);
+
+ if (boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
+ static_branch_enable(&chacha_use_avx512vl);
+ }
+}
diff --git a/lib/crypto/x86/curve25519.h b/lib/crypto/x86/curve25519.h
new file mode 100644
index 000000000000..5c0b8408852d
--- /dev/null
+++ b/lib/crypto/x86/curve25519.h
@@ -0,0 +1,1613 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation
+ */
+
+#include <linux/types.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+
+static __always_inline u64 eq_mask(u64 a, u64 b)
+{
+ u64 x = a ^ b;
+ u64 minus_x = ~x + (u64)1U;
+ u64 x_or_minus_x = x | minus_x;
+ u64 xnx = x_or_minus_x >> (u32)63U;
+ return xnx - (u64)1U;
+}
+
+static __always_inline u64 gte_mask(u64 a, u64 b)
+{
+ u64 x = a;
+ u64 y = b;
+ u64 x_xor_y = x ^ y;
+ u64 x_sub_y = x - y;
+ u64 x_sub_y_xor_y = x_sub_y ^ y;
+ u64 q = x_xor_y | x_sub_y_xor_y;
+ u64 x_xor_q = x ^ q;
+ u64 x_xor_q_ = x_xor_q >> (u32)63U;
+ return x_xor_q_ - (u64)1U;
+}
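
These two helpers are branch-free mask generators: eq_mask() relies on the fact that x | (0 - x) has its top bit set exactly when x != 0, and gte_mask() extracts the borrow of a - b without a data-dependent branch. As a reading aid only (the *_ref names below are ours, not part of the patch), their semantics are simply:

	/* Illustrative reference semantics of the branch-free masks above. */
	static u64 eq_mask_ref(u64 a, u64 b)
	{
		return a == b ? ~(u64)0 : 0;	/* all ones iff a == b */
	}

	static u64 gte_mask_ref(u64 a, u64 b)
	{
		return a >= b ? ~(u64)0 : 0;	/* all ones iff a >= b */
	}
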
+
+/* Computes the addition of the four-limb element f1 with the 64-bit value f2
+ * and returns the carry (if any) */
+static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
+{
+ u64 carry_r;
+
+ asm volatile(
+ /* Clear registers to propagate the carry bit */
+ " xor %%r8d, %%r8d;"
+ " xor %%r9d, %%r9d;"
+ " xor %%r10d, %%r10d;"
+ " xor %%r11d, %%r11d;"
+ " xor %k1, %k1;"
+
+ /* Begin addition chain */
+ " addq 0(%3), %0;"
+ " movq %0, 0(%2);"
+ " adcxq 8(%3), %%r8;"
+ " movq %%r8, 8(%2);"
+ " adcxq 16(%3), %%r9;"
+ " movq %%r9, 16(%2);"
+ " adcxq 24(%3), %%r10;"
+ " movq %%r10, 24(%2);"
+
+ /* Return the carry bit in a register */
+ " adcx %%r11, %1;"
+ : "+&r"(f2), "=&r"(carry_r)
+ : "r"(out), "r"(f1)
+ : "%r8", "%r9", "%r10", "%r11", "memory", "cc");
+
+ return carry_r;
+}
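
The adcx-based chain above is a plain 256-bit addition of a 64-bit scalar. A rough portable sketch of what it computes (add_scalar_ref is a hypothetical name; unsigned __int128 stands in for the hardware carry chain):

	static u64 add_scalar_ref(u64 *out, const u64 *f1, u64 f2)
	{
		unsigned __int128 acc = (unsigned __int128)f1[0] + f2;
		int i;

		out[0] = (u64)acc;
		for (i = 1; i < 4; i++) {
			acc = (acc >> 64) + f1[i];	/* propagate the carry */
			out[i] = (u64)acc;
		}
		return (u64)(acc >> 64);		/* carry out of limb 3 */
	}
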
+
+/* Computes the field addition of two field elements */
+static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
+{
+ asm volatile(
+ /* Compute the raw addition of f1 + f2 */
+ " movq 0(%0), %%r8;"
+ " addq 0(%2), %%r8;"
+ " movq 8(%0), %%r9;"
+ " adcxq 8(%2), %%r9;"
+ " movq 16(%0), %%r10;"
+ " adcxq 16(%2), %%r10;"
+ " movq 24(%0), %%r11;"
+ " adcxq 24(%2), %%r11;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute carry*38 */
+ " mov $0, %%rax;"
+ " mov $38, %0;"
+ " cmovc %0, %%rax;"
+
+ /* Step 2: Add carry*38 to the original sum */
+ " xor %%ecx, %%ecx;"
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%1);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%1);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %0, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%1);"
+ : "+&r"(f2)
+ : "r"(out), "r"(f1)
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
+}
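
The constant 38 used for the wrap-around comes directly from the field prime: with p = 2^255 - 19 we have 2^255 ≡ 19 (mod p), hence 2^256 ≡ 2 * 19 = 38 (mod p). A carry out of the four-limb (256-bit) sum therefore represents one multiple of 2^256 and is folded back in as +38 in Step 2; Step 3 repeats the fold for the rare case where that addition itself carries out again, which cannot happen a third time.
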
+
+/* Computes the field subtraction of two field elements */
+static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
+{
+ asm volatile(
+ /* Compute the raw subtraction of f1-f2 */
+ " movq 0(%1), %%r8;"
+ " subq 0(%2), %%r8;"
+ " movq 8(%1), %%r9;"
+ " sbbq 8(%2), %%r9;"
+ " movq 16(%1), %%r10;"
+ " sbbq 16(%2), %%r10;"
+ " movq 24(%1), %%r11;"
+ " sbbq 24(%2), %%r11;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute carry*38 */
+ " mov $0, %%rax;"
+ " mov $38, %%rcx;"
+ " cmovc %%rcx, %%rax;"
+
+ /* Step 2: Subtract carry*38 from the original difference */
+ " sub %%rax, %%r8;"
+ " sbb $0, %%r9;"
+ " sbb $0, %%r10;"
+ " sbb $0, %%r11;"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rcx, %%rax;"
+ " sub %%rax, %%r8;"
+
+ /* Store the result */
+ " movq %%r8, 0(%0);"
+ " movq %%r9, 8(%0);"
+ " movq %%r10, 16(%0);"
+ " movq %%r11, 24(%0);"
+ :
+ : "r"(out), "r"(f1), "r"(f2)
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
+}
+
+/* Computes a field multiplication: out <- f1 * f2
+ * Uses the 8-element buffer tmp for intermediate results */
+static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
+{
+ asm volatile(
+
+ /* Compute the raw multiplication: tmp <- src1 * src2 */
+
+ /* Compute src1[0] * src2 */
+ " movq 0(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " movq %%r8, 0(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 8(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+
+ /* Compute src1[1] * src2 */
+ " movq 8(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 8(%2), %%r8;"
+ " movq %%r8, 8(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 16(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
+ /* Compute src1[2] * src2 */
+ " movq 16(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 16(%2), %%r8;"
+ " movq %%r8, 16(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 24(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
+ /* Compute src1[3] * src2 */
+ " movq 24(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 24(%2), %%r8;"
+ " movq %%r8, 24(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 32(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " movq %%rbx, 40(%2);"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 48(%2);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 56(%2);"
+
+ /* Line up pointers */
+ " mov %2, %0;"
+ " mov %3, %2;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 32(%0), %%r8, %%r13;"
+ " xor %k1, %k1;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
+ " adcx %%rbx, %%r10;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%0), %%r11;"
+ " adcx %1, %%rax;"
+ " adox %1, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %1, %%r9;"
+ " movq %%r9, 8(%2);"
+ " adcx %1, %%r10;"
+ " movq %%r10, 16(%2);"
+ " adcx %1, %%r11;"
+ " movq %%r11, 24(%2);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%2);"
+ : "+&r"(f1), "+&r"(f2), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
+ "%r14", "memory", "cc");
+}
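
The interleaved mulx/adcx/adox code above is dense; as a purely illustrative reference (fmul_ref is a hypothetical name, and unsigned __int128 replaces the dual carry chains), the same computation is a schoolbook 4x4 multiplication followed by the 2^256 ≡ 38 (mod p) fold described earlier:

	static void fmul_ref(u64 *out, const u64 *f1, const u64 *f2)
	{
		u64 t[8] = { 0 };
		unsigned __int128 v;
		u64 carry;
		int i, j;

		/* Schoolbook multiply: t = f1 * f2, a 512-bit result. */
		for (i = 0; i < 4; i++) {
			carry = 0;
			for (j = 0; j < 4; j++) {
				v = (unsigned __int128)f1[i] * f2[j] +
				    t[i + j] + carry;
				t[i + j] = (u64)v;
				carry = (u64)(v >> 64);
			}
			t[i + 4] = carry;
		}

		/* Fold the high 256 bits back in: 2^256 == 38 (mod p). */
		carry = 0;
		for (i = 0; i < 4; i++) {
			v = (unsigned __int128)t[i + 4] * 38 + t[i] + carry;
			out[i] = (u64)v;
			carry = (u64)(v >> 64);
		}

		/* Any remaining carry is folded again; at most two rounds. */
		while (carry) {
			v = (unsigned __int128)carry * 38 + out[0];
			out[0] = (u64)v;
			carry = (u64)(v >> 64);
			for (i = 1; i < 4; i++) {
				v = (unsigned __int128)out[i] + carry;
				out[i] = (u64)v;
				carry = (u64)(v >> 64);
			}
		}
		/* As in the asm version, the result fits in 256 bits but is
		 * not necessarily canonical; store_felem() handles that. */
	}
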
+
+/* Computes two field multiplications:
+ * out[0] <- f1[0] * f2[0]
+ * out[1] <- f1[1] * f2[1]
+ * Uses the 16-element buffer tmp for intermediate results */
+static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
+{
+ asm volatile(
+
+ /* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */
+
+ /* Compute src1[0] * src2 */
+ " movq 0(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " movq %%r8, 0(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 8(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+
+ /* Compute src1[1] * src2 */
+ " movq 8(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 8(%2), %%r8;"
+ " movq %%r8, 8(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 16(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
+ /* Compute src1[2] * src2 */
+ " movq 16(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 16(%2), %%r8;"
+ " movq %%r8, 16(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 24(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
+ /* Compute src1[3] * src2 */
+ " movq 24(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 24(%2), %%r8;"
+ " movq %%r8, 24(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 32(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " movq %%rbx, 40(%2);"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 48(%2);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 56(%2);"
+
+ /* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */
+
+ /* Compute src1[0] * src2 */
+ " movq 32(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " movq %%r8, 64(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 72(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+
+ /* Compute src1[1] * src2 */
+ " movq 40(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 72(%2), %%r8;"
+ " movq %%r8, 72(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 80(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
+ /* Compute src1[2] * src2 */
+ " movq 48(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 80(%2), %%r8;"
+ " movq %%r8, 80(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 88(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
+ /* Compute src1[3] * src2 */
+ " movq 56(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 88(%2), %%r8;"
+ " movq %%r8, 88(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 96(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " movq %%rbx, 104(%2);"
+ " mov $0, %%r8;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 112(%2);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 120(%2);"
+
+ /* Line up pointers */
+ " mov %2, %0;"
+ " mov %3, %2;"
+
+ /* Wrap the results back into the field */
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 32(%0), %%r8, %%r13;"
+ " xor %k1, %k1;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
+ " adcx %%rbx, %%r10;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%0), %%r11;"
+ " adcx %1, %%rax;"
+ " adox %1, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %1, %%r9;"
+ " movq %%r9, 8(%2);"
+ " adcx %1, %%r10;"
+ " movq %%r10, 16(%2);"
+ " adcx %1, %%r11;"
+ " movq %%r11, 24(%2);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%2);"
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 96(%0), %%r8, %%r13;"
+ " xor %k1, %k1;"
+ " adoxq 64(%0), %%r8;"
+ " mulxq 104(%0), %%r9, %%rbx;"
+ " adcx %%r13, %%r9;"
+ " adoxq 72(%0), %%r9;"
+ " mulxq 112(%0), %%r10, %%r13;"
+ " adcx %%rbx, %%r10;"
+ " adoxq 80(%0), %%r10;"
+ " mulxq 120(%0), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 88(%0), %%r11;"
+ " adcx %1, %%rax;"
+ " adox %1, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %1, %%r9;"
+ " movq %%r9, 40(%2);"
+ " adcx %1, %%r10;"
+ " movq %%r10, 48(%2);"
+ " adcx %1, %%r11;"
+ " movq %%r11, 56(%2);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 32(%2);"
+ : "+&r"(f1), "+&r"(f2), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
+ "%r14", "memory", "cc");
+}
+
+/* Computes the field multiplication of the four-limb element f1 with the
+ * 64-bit scalar f2. Requires f2 to be smaller than 2^17 */
+static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
+{
+ register u64 f2_r asm("rdx") = f2;
+
+ asm volatile(
+ /* Compute the raw multiplication of f1*f2 */
+ " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
+ " mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
+ " add %%rcx, %%r9;"
+ " mov $0, %%rcx;"
+ " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
+ " adcx %%rbx, %%r10;"
+ " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
+ " adcx %%r13, %%r11;"
+ " adcx %%rcx, %%rax;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute carry*38 */
+ " mov $38, %%rdx;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%1);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%1);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%1);"
+ : "+&r"(f2_r)
+ : "r"(out), "r"(f1)
+ : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13",
+ "memory", "cc");
+}
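
The f2 < 2^17 precondition keeps the reduction simple: the full product f1 * f2 is below 2^256 * 2^17, so the fifth limb (rax) is below 2^17 and 38 * rax is below 2^23, small enough that the single fold in Steps 1-3 behaves like the small folds in fadd() and one conditional +38 at the end suffices. In this file the only scalar ever passed is 121665, comfortably below 2^17 = 131072.
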
+
+/* Conditionally swaps the contents of the two 8-limb buffers p1 and p2 in
+ * constant time: p1 <- bit ? p2 : p1 and p2 <- bit ? p1 : p2 */
+static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
+{
+ asm volatile(
+ /* Transfer bit into CF flag */
+ " add $18446744073709551615, %0;"
+
+ /* cswap p1[0], p2[0] */
+ " movq 0(%1), %%r8;"
+ " movq 0(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 0(%1);"
+ " movq %%r9, 0(%2);"
+
+ /* cswap p1[1], p2[1] */
+ " movq 8(%1), %%r8;"
+ " movq 8(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 8(%1);"
+ " movq %%r9, 8(%2);"
+
+ /* cswap p1[2], p2[2] */
+ " movq 16(%1), %%r8;"
+ " movq 16(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 16(%1);"
+ " movq %%r9, 16(%2);"
+
+ /* cswap p1[3], p2[3] */
+ " movq 24(%1), %%r8;"
+ " movq 24(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 24(%1);"
+ " movq %%r9, 24(%2);"
+
+ /* cswap p1[4], p2[4] */
+ " movq 32(%1), %%r8;"
+ " movq 32(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 32(%1);"
+ " movq %%r9, 32(%2);"
+
+ /* cswap p1[5], p2[5] */
+ " movq 40(%1), %%r8;"
+ " movq 40(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 40(%1);"
+ " movq %%r9, 40(%2);"
+
+ /* cswap p1[6], p2[6] */
+ " movq 48(%1), %%r8;"
+ " movq 48(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 48(%1);"
+ " movq %%r9, 48(%2);"
+
+ /* cswap p1[7], p2[7] */
+ " movq 56(%1), %%r8;"
+ " movq 56(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 56(%1);"
+ " movq %%r9, 56(%2);"
+ : "+&r"(bit)
+ : "r"(p1), "r"(p2)
+ : "%r8", "%r9", "%r10", "memory", "cc");
+}
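
An illustrative portable equivalent of the conditional swap above (cswap2_ref is our name, and it drops the const qualifiers that the asm version writes through anyway; the mask trick keeps it free of data-dependent branches and memory accesses):

	static void cswap2_ref(u64 bit, u64 *p1, u64 *p2)
	{
		u64 mask = 0 - bit;	/* all ones if bit == 1, else zero */
		int i;

		for (i = 0; i < 8; i++) {
			u64 d = mask & (p1[i] ^ p2[i]);

			p1[i] ^= d;
			p2[i] ^= d;
		}
	}
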
+
+/* Computes the square of a field element: out <- f * f
+ * Uses the 8-element buffer tmp for intermediate results */
+static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
+{
+ asm volatile(
+ /* Compute the raw multiplication: tmp <- f * f */
+
+ /* Step 1: Compute all partial products */
+ " movq 0(%0), %%rdx;" /* f[0] */
+ " mulxq 8(%0), %%r8, %%r14;"
+ " xor %%r15d, %%r15d;" /* f[1]*f[0] */
+ " mulxq 16(%0), %%r9, %%r10;"
+ " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 24(%0), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 24(%0), %%rdx;" /* f[3] */
+ " mulxq 8(%0), %%r11, %%rbx;"
+ " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%0), %%rax, %%r13;"
+ " adcx %%rax, %%rbx;" /* f[2]*f[3] */
+ " movq 8(%0), %%rdx;"
+ " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 16(%0), %%rax, %%rcx;"
+ " mov $0, %%r14;" /* f[2]*f[1] */
+
+ /* Step 2: Compute two parallel carry chains */
+ " xor %%r15d, %%r15d;"
+ " adox %%rax, %%r10;"
+ " adcx %%r8, %%r8;"
+ " adox %%rcx, %%r11;"
+ " adcx %%r9, %%r9;"
+ " adox %%r15, %%rbx;"
+ " adcx %%r10, %%r10;"
+ " adox %%r15, %%r13;"
+ " adcx %%r11, %%r11;"
+ " adox %%r15, %%r14;"
+ " adcx %%rbx, %%rbx;"
+ " adcx %%r13, %%r13;"
+ " adcx %%r14, %%r14;"
+
+ /* Step 3: Compute intermediate squares */
+ " movq 0(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 0(%1);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 8(%1);"
+ " movq 8(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 16(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 24(%1);"
+ " movq 16(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 32(%1);"
+ " adcx %%rcx, %%rbx;"
+ " movq %%rbx, 40(%1);"
+ " movq 24(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 48(%1);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 56(%1);"
+
+ /* Line up pointers */
+ " mov %1, %0;"
+ " mov %2, %1;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 32(%0), %%r8, %%r13;"
+ " xor %%ecx, %%ecx;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
+ " adcx %%rbx, %%r10;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%0), %%r11;"
+ " adcx %%rcx, %%rax;"
+ " adox %%rcx, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%1);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%1);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%1);"
+ : "+&r"(f), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
+ "%r13", "%r14", "%r15", "memory", "cc");
+}
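
The speedup over fmul() comes from the symmetry of squaring: writing f = f0 + f1*2^64 + f2*2^128 + f3*2^192,

	f^2 = sum(f_i^2 * 2^(128*i)) + 2 * sum over i < j of (f_i * f_j * 2^(64*(i+j))),

so Step 1 computes the six cross products f_i * f_j once each, Step 2 folds them into one row and doubles it by adding every accumulator register to itself, and Step 3 adds the four diagonal squares, cutting the mulx count from 16 in fmul() to 10. The reduction that follows is the same 38-fold as everywhere else.
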
+
+/* Computes two field squarings:
+ * out[0] <- f[0] * f[0]
+ * out[1] <- f[1] * f[1]
+ * Uses the 16-element buffer tmp for intermediate results */
+static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
+{
+ asm volatile(
+ /* Step 1: Compute all partial products */
+ " movq 0(%0), %%rdx;" /* f[0] */
+ " mulxq 8(%0), %%r8, %%r14;"
+ " xor %%r15d, %%r15d;" /* f[1]*f[0] */
+ " mulxq 16(%0), %%r9, %%r10;"
+ " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 24(%0), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 24(%0), %%rdx;" /* f[3] */
+ " mulxq 8(%0), %%r11, %%rbx;"
+ " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%0), %%rax, %%r13;"
+ " adcx %%rax, %%rbx;" /* f[2]*f[3] */
+ " movq 8(%0), %%rdx;"
+ " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 16(%0), %%rax, %%rcx;"
+ " mov $0, %%r14;" /* f[2]*f[1] */
+
+ /* Step 2: Compute two parallel carry chains */
+ " xor %%r15d, %%r15d;"
+ " adox %%rax, %%r10;"
+ " adcx %%r8, %%r8;"
+ " adox %%rcx, %%r11;"
+ " adcx %%r9, %%r9;"
+ " adox %%r15, %%rbx;"
+ " adcx %%r10, %%r10;"
+ " adox %%r15, %%r13;"
+ " adcx %%r11, %%r11;"
+ " adox %%r15, %%r14;"
+ " adcx %%rbx, %%rbx;"
+ " adcx %%r13, %%r13;"
+ " adcx %%r14, %%r14;"
+
+ /* Step 3: Compute intermediate squares */
+ " movq 0(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 0(%1);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 8(%1);"
+ " movq 8(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 16(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 24(%1);"
+ " movq 16(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 32(%1);"
+ " adcx %%rcx, %%rbx;"
+ " movq %%rbx, 40(%1);"
+ " movq 24(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 48(%1);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 56(%1);"
+
+ /* Step 1: Compute all partial products */
+ " movq 32(%0), %%rdx;" /* f[0] */
+ " mulxq 40(%0), %%r8, %%r14;"
+ " xor %%r15d, %%r15d;" /* f[1]*f[0] */
+ " mulxq 48(%0), %%r9, %%r10;"
+ " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 56(%0), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 56(%0), %%rdx;" /* f[3] */
+ " mulxq 40(%0), %%r11, %%rbx;"
+ " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 48(%0), %%rax, %%r13;"
+ " adcx %%rax, %%rbx;" /* f[2]*f[3] */
+ " movq 40(%0), %%rdx;"
+ " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 48(%0), %%rax, %%rcx;"
+ " mov $0, %%r14;" /* f[2]*f[1] */
+
+ /* Step 2: Compute two parallel carry chains */
+ " xor %%r15d, %%r15d;"
+ " adox %%rax, %%r10;"
+ " adcx %%r8, %%r8;"
+ " adox %%rcx, %%r11;"
+ " adcx %%r9, %%r9;"
+ " adox %%r15, %%rbx;"
+ " adcx %%r10, %%r10;"
+ " adox %%r15, %%r13;"
+ " adcx %%r11, %%r11;"
+ " adox %%r15, %%r14;"
+ " adcx %%rbx, %%rbx;"
+ " adcx %%r13, %%r13;"
+ " adcx %%r14, %%r14;"
+
+ /* Step 3: Compute intermediate squares */
+ " movq 32(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 64(%1);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 72(%1);"
+ " movq 40(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 80(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 88(%1);"
+ " movq 48(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 96(%1);"
+ " adcx %%rcx, %%rbx;"
+ " movq %%rbx, 104(%1);"
+ " movq 56(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 112(%1);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 120(%1);"
+
+ /* Line up pointers */
+ " mov %1, %0;"
+ " mov %2, %1;"
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 32(%0), %%r8, %%r13;"
+ " xor %%ecx, %%ecx;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
+ " adcx %%rbx, %%r10;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%0), %%r11;"
+ " adcx %%rcx, %%rax;"
+ " adox %%rcx, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%1);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%1);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%1);"
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 96(%0), %%r8, %%r13;"
+ " xor %%ecx, %%ecx;"
+ " adoxq 64(%0), %%r8;"
+ " mulxq 104(%0), %%r9, %%rbx;"
+ " adcx %%r13, %%r9;"
+ " adoxq 72(%0), %%r9;"
+ " mulxq 112(%0), %%r10, %%r13;"
+ " adcx %%rbx, %%r10;"
+ " adoxq 80(%0), %%r10;"
+ " mulxq 120(%0), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 88(%0), %%r11;"
+ " adcx %%rcx, %%rax;"
+ " adox %%rcx, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 40(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 48(%1);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 56(%1);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 32(%1);"
+ : "+&r"(f), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
+ "%r13", "%r14", "%r15", "memory", "cc");
+}
+
+static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
+{
+ u64 *nq = p01_tmp1;
+ u64 *nq_p1 = p01_tmp1 + (u32)8U;
+ u64 *tmp1 = p01_tmp1 + (u32)16U;
+ u64 *x1 = q;
+ u64 *x2 = nq;
+ u64 *z2 = nq + (u32)4U;
+ u64 *z3 = nq_p1 + (u32)4U;
+ u64 *a = tmp1;
+ u64 *b = tmp1 + (u32)4U;
+ u64 *ab = tmp1;
+ u64 *dc = tmp1 + (u32)8U;
+ u64 *x3;
+ u64 *z31;
+ u64 *d0;
+ u64 *c0;
+ u64 *a1;
+ u64 *b1;
+ u64 *d;
+ u64 *c;
+ u64 *ab1;
+ u64 *dc1;
+ fadd(a, x2, z2);
+ fsub(b, x2, z2);
+ x3 = nq_p1;
+ z31 = nq_p1 + (u32)4U;
+ d0 = dc;
+ c0 = dc + (u32)4U;
+ fadd(c0, x3, z31);
+ fsub(d0, x3, z31);
+ fmul2(dc, dc, ab, tmp2);
+ fadd(x3, d0, c0);
+ fsub(z31, d0, c0);
+ a1 = tmp1;
+ b1 = tmp1 + (u32)4U;
+ d = tmp1 + (u32)8U;
+ c = tmp1 + (u32)12U;
+ ab1 = tmp1;
+ dc1 = tmp1 + (u32)8U;
+ fsqr2(dc1, ab1, tmp2);
+ fsqr2(nq_p1, nq_p1, tmp2);
+ a1[0U] = c[0U];
+ a1[1U] = c[1U];
+ a1[2U] = c[2U];
+ a1[3U] = c[3U];
+ fsub(c, d, c);
+ fmul_scalar(b1, c, (u64)121665U);
+ fadd(b1, b1, d);
+ fmul2(nq, dc1, ab1, tmp2);
+ fmul(z3, z3, x1, tmp2);
+}
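
This routine is the standard combined differential addition and doubling step of the X25519 Montgomery ladder (RFC 7748, section 5), with a24 = (486662 - 2) / 4 = 121665. In the notation of the code, with (x2:z2) the accumulator, (x3:z3) the companion point and x1 the base x-coordinate, one step computes:

	A  = x2 + z2		B  = x2 - z2
	C  = x3 + z3		D  = x3 - z3
	DA = D * A		CB = C * B
	x3' = (DA + CB)^2
	z3' = x1 * (DA - CB)^2
	AA = A^2		BB = B^2
	E  = AA - BB
	x2' = AA * BB
	z2' = E * (AA + 121665 * E)

The operations are paired up so that the two-lane helpers fmul2() and fsqr2() can each perform two field operations per call.
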
+
+static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2)
+{
+ u64 *x2 = nq;
+ u64 *z2 = nq + (u32)4U;
+ u64 *a = tmp1;
+ u64 *b = tmp1 + (u32)4U;
+ u64 *d = tmp1 + (u32)8U;
+ u64 *c = tmp1 + (u32)12U;
+ u64 *ab = tmp1;
+ u64 *dc = tmp1 + (u32)8U;
+ fadd(a, x2, z2);
+ fsub(b, x2, z2);
+ fsqr2(dc, ab, tmp2);
+ a[0U] = c[0U];
+ a[1U] = c[1U];
+ a[2U] = c[2U];
+ a[3U] = c[3U];
+ fsub(c, d, c);
+ fmul_scalar(b, c, (u64)121665U);
+ fadd(b, b, d);
+ fmul2(nq, dc, ab, tmp2);
+}
+
+static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
+{
+ u64 tmp2[16U] = { 0U };
+ u64 p01_tmp1_swap[33U] = { 0U };
+ u64 *p0 = p01_tmp1_swap;
+ u64 *p01 = p01_tmp1_swap;
+ u64 *p03 = p01;
+ u64 *p11 = p01 + (u32)8U;
+ u64 *x0;
+ u64 *z0;
+ u64 *p01_tmp1;
+ u64 *p01_tmp11;
+ u64 *nq10;
+ u64 *nq_p11;
+ u64 *swap1;
+ u64 sw0;
+ u64 *nq1;
+ u64 *tmp1;
+ memcpy(p11, init1, (u32)8U * sizeof(init1[0U]));
+ x0 = p03;
+ z0 = p03 + (u32)4U;
+ x0[0U] = (u64)1U;
+ x0[1U] = (u64)0U;
+ x0[2U] = (u64)0U;
+ x0[3U] = (u64)0U;
+ z0[0U] = (u64)0U;
+ z0[1U] = (u64)0U;
+ z0[2U] = (u64)0U;
+ z0[3U] = (u64)0U;
+ p01_tmp1 = p01_tmp1_swap;
+ p01_tmp11 = p01_tmp1_swap;
+ nq10 = p01_tmp1_swap;
+ nq_p11 = p01_tmp1_swap + (u32)8U;
+ swap1 = p01_tmp1_swap + (u32)32U;
+ cswap2((u64)1U, nq10, nq_p11);
+ point_add_and_double(init1, p01_tmp11, tmp2);
+ swap1[0U] = (u64)1U;
+ {
+ u32 i;
+ for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) {
+ u64 *p01_tmp12 = p01_tmp1_swap;
+ u64 *swap2 = p01_tmp1_swap + (u32)32U;
+ u64 *nq2 = p01_tmp12;
+ u64 *nq_p12 = p01_tmp12 + (u32)8U;
+ u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U);
+ u64 sw = swap2[0U] ^ bit;
+ cswap2(sw, nq2, nq_p12);
+ point_add_and_double(init1, p01_tmp12, tmp2);
+ swap2[0U] = bit;
+ }
+ }
+ sw0 = swap1[0U];
+ cswap2(sw0, nq10, nq_p11);
+ nq1 = p01_tmp1;
+ tmp1 = p01_tmp1 + (u32)16U;
+ point_double(nq1, tmp1, tmp2);
+ point_double(nq1, tmp1, tmp2);
+ point_double(nq1, tmp1, tmp2);
+ memcpy(out, p0, (u32)8U * sizeof(p0[0U]));
+
+ memzero_explicit(tmp2, sizeof(tmp2));
+ memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap));
+}
+
+static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
+{
+ u32 i;
+ fsqr(o, inp, tmp);
+ for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U)
+ fsqr(o, o, tmp);
+}
+
+static void finv(u64 *o, const u64 *i, u64 *tmp)
+{
+ u64 t1[16U] = { 0U };
+ u64 *a0 = t1;
+ u64 *b = t1 + (u32)4U;
+ u64 *c = t1 + (u32)8U;
+ u64 *t00 = t1 + (u32)12U;
+ u64 *tmp1 = tmp;
+ u64 *a;
+ u64 *t0;
+ fsquare_times(a0, i, tmp1, (u32)1U);
+ fsquare_times(t00, a0, tmp1, (u32)2U);
+ fmul(b, t00, i, tmp);
+ fmul(a0, b, a0, tmp);
+ fsquare_times(t00, a0, tmp1, (u32)1U);
+ fmul(b, t00, b, tmp);
+ fsquare_times(t00, b, tmp1, (u32)5U);
+ fmul(b, t00, b, tmp);
+ fsquare_times(t00, b, tmp1, (u32)10U);
+ fmul(c, t00, b, tmp);
+ fsquare_times(t00, c, tmp1, (u32)20U);
+ fmul(t00, t00, c, tmp);
+ fsquare_times(t00, t00, tmp1, (u32)10U);
+ fmul(b, t00, b, tmp);
+ fsquare_times(t00, b, tmp1, (u32)50U);
+ fmul(c, t00, b, tmp);
+ fsquare_times(t00, c, tmp1, (u32)100U);
+ fmul(t00, t00, c, tmp);
+ fsquare_times(t00, t00, tmp1, (u32)50U);
+ fmul(t00, t00, b, tmp);
+ fsquare_times(t00, t00, tmp1, (u32)5U);
+ a = t1;
+ t0 = t1 + (u32)12U;
+ fmul(o, t0, a, tmp);
+}
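
finv() is a Fermat inversion: for nonzero z in GF(p), z^-1 = z^(p-2) = z^(2^255 - 21). The fixed sequence above is an addition chain for that exponent costing 254 squarings (the fsquare_times() counts sum to 1+2+1+5+10+20+10+50+100+50+5 = 254) and 11 multiplications, independent of the input value, so the inversion runs in constant time.
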
+
+static void store_felem(u64 *b, u64 *f)
+{
+ u64 f30 = f[3U];
+ u64 top_bit0 = f30 >> (u32)63U;
+ u64 f31;
+ u64 top_bit;
+ u64 f0;
+ u64 f1;
+ u64 f2;
+ u64 f3;
+ u64 m0;
+ u64 m1;
+ u64 m2;
+ u64 m3;
+ u64 mask;
+ u64 f0_;
+ u64 f1_;
+ u64 f2_;
+ u64 f3_;
+ u64 o0;
+ u64 o1;
+ u64 o2;
+ u64 o3;
+ f[3U] = f30 & (u64)0x7fffffffffffffffU;
+ add_scalar(f, f, (u64)19U * top_bit0);
+ f31 = f[3U];
+ top_bit = f31 >> (u32)63U;
+ f[3U] = f31 & (u64)0x7fffffffffffffffU;
+ add_scalar(f, f, (u64)19U * top_bit);
+ f0 = f[0U];
+ f1 = f[1U];
+ f2 = f[2U];
+ f3 = f[3U];
+ m0 = gte_mask(f0, (u64)0xffffffffffffffedU);
+ m1 = eq_mask(f1, (u64)0xffffffffffffffffU);
+ m2 = eq_mask(f2, (u64)0xffffffffffffffffU);
+ m3 = eq_mask(f3, (u64)0x7fffffffffffffffU);
+ mask = ((m0 & m1) & m2) & m3;
+ f0_ = f0 - (mask & (u64)0xffffffffffffffedU);
+ f1_ = f1 - (mask & (u64)0xffffffffffffffffU);
+ f2_ = f2 - (mask & (u64)0xffffffffffffffffU);
+ f3_ = f3 - (mask & (u64)0x7fffffffffffffffU);
+ o0 = f0_;
+ o1 = f1_;
+ o2 = f2_;
+ o3 = f3_;
+ b[0U] = o0;
+ b[1U] = o1;
+ b[2U] = o2;
+ b[3U] = o3;
+}
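
store_felem() converts the internal, possibly non-canonical representation into the unique value in [0, p). The two top-bit folds leave a value congruent to the input mod p and strictly below 2^255; the eq_mask()/gte_mask() comparison then detects the remaining 19 out-of-range values, i.e. f3 == 0x7fffffffffffffff, f2 == f1 == 0xffffffffffffffff and f0 >= 0xffffffffffffffed, and subtracts p = 2^255 - 19 from exactly those, all without branching on secret data.
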
+
+static void encode_point(u8 *o, const u64 *i)
+{
+ const u64 *x = i;
+ const u64 *z = i + (u32)4U;
+ u64 tmp[4U] = { 0U };
+ u64 tmp_w[16U] = { 0U };
+ finv(tmp, z, tmp_w);
+ fmul(tmp, tmp, x, tmp_w);
+ store_felem((u64 *)o, tmp);
+}
+
+static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub)
+{
+ u64 init1[8U] = { 0U };
+ u64 tmp[4U] = { 0U };
+ u64 tmp3;
+ u64 *x;
+ u64 *z;
+ {
+ u32 i;
+ for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) {
+ u64 *os = tmp;
+ const u8 *bj = pub + i * (u32)8U;
+ u64 u = *(u64 *)bj;
+ u64 r = u;
+ u64 x0 = r;
+ os[i] = x0;
+ }
+ }
+ tmp3 = tmp[3U];
+ tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU;
+ x = init1;
+ z = init1 + (u32)4U;
+ z[0U] = (u64)1U;
+ z[1U] = (u64)0U;
+ z[2U] = (u64)0U;
+ z[3U] = (u64)0U;
+ x[0U] = tmp[0U];
+ x[1U] = tmp[1U];
+ x[2U] = tmp[2U];
+ x[3U] = tmp[3U];
+ montgomery_ladder(init1, priv, init1);
+ encode_point(out, init1);
+}
+
+/* The below constants were generated using this sage script:
+ *
+ * #!/usr/bin/env sage
+ * import sys
+ * from sage.all import *
+ * def limbs(n):
+ * n = int(n)
+ * l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64)
+ * return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l
+ * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0])
+ * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0]
+ * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s))
+ * print("static const u64 table_ladder[] = {")
+ * p = ec.lift_x(9)
+ * for i in range(252):
+ * l = (p[0] + p[2]) / (p[0] - p[2])
+ * print(("\t%s" + ("," if i != 251 else "")) % limbs(l))
+ * p = p * 2
+ * print("};")
+ *
+ */
+
+static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL };
+
+static const u64 table_ladder[] = {
+ 0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL,
+ 0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL,
+ 0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL,
+ 0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL,
+ 0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL,
+ 0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL,
+ 0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL,
+ 0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL,
+ 0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL,
+ 0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL,
+ 0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL,
+ 0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL,
+ 0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL,
+ 0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL,
+ 0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL,
+ 0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL,
+ 0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL,
+ 0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL,
+ 0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL,
+ 0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL,
+ 0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL,
+ 0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL,
+ 0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL,
+ 0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL,
+ 0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL,
+ 0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL,
+ 0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL,
+ 0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL,
+ 0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL,
+ 0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL,
+ 0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL,
+ 0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL,
+ 0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL,
+ 0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL,
+ 0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL,
+ 0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL,
+ 0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL,
+ 0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL,
+ 0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL,
+ 0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL,
+ 0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL,
+ 0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL,
+ 0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL,
+ 0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL,
+ 0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL,
+ 0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL,
+ 0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL,
+ 0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL,
+ 0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL,
+ 0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL,
+ 0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL,
+ 0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL,
+ 0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL,
+ 0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL,
+ 0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL,
+ 0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL,
+ 0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL,
+ 0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL,
+ 0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL,
+ 0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL,
+ 0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL,
+ 0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL,
+ 0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL,
+ 0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL,
+ 0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL,
+ 0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL,
+ 0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL,
+ 0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL,
+ 0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL,
+ 0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL,
+ 0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL,
+ 0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL,
+ 0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL,
+ 0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL,
+ 0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL,
+ 0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL,
+ 0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL,
+ 0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL,
+ 0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL,
+ 0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL,
+ 0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL,
+ 0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL,
+ 0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL,
+ 0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL,
+ 0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL,
+ 0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL,
+ 0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL,
+ 0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL,
+ 0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL,
+ 0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL,
+ 0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL,
+ 0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL,
+ 0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL,
+ 0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL,
+ 0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL,
+ 0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL,
+ 0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL,
+ 0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL,
+ 0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL,
+ 0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL,
+ 0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL,
+ 0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL,
+ 0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL,
+ 0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL,
+ 0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL,
+ 0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL,
+ 0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL,
+ 0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL,
+ 0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL,
+ 0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL,
+ 0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL,
+ 0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL,
+ 0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL,
+ 0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL,
+ 0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL,
+ 0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL,
+ 0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL,
+ 0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL,
+ 0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL,
+ 0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL,
+ 0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL,
+ 0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL,
+ 0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL,
+ 0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL,
+ 0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL,
+ 0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL,
+ 0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL,
+ 0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL,
+ 0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL,
+ 0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL,
+ 0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL,
+ 0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL,
+ 0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL,
+ 0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL,
+ 0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL,
+ 0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL,
+ 0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL,
+ 0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL,
+ 0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL,
+ 0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL,
+ 0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL,
+ 0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL,
+ 0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL,
+ 0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL,
+ 0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL,
+ 0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL,
+ 0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL,
+ 0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL,
+ 0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL,
+ 0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL,
+ 0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL,
+ 0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL,
+ 0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL,
+ 0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL,
+ 0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL,
+ 0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL,
+ 0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL,
+ 0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL,
+ 0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL,
+ 0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL,
+ 0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL,
+ 0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL,
+ 0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL,
+ 0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL,
+ 0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL,
+ 0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL,
+ 0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL,
+ 0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL,
+ 0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL,
+ 0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL,
+ 0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL,
+ 0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL,
+ 0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL,
+ 0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL,
+ 0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL,
+ 0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL,
+ 0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL,
+ 0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL,
+ 0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL,
+ 0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL,
+ 0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL,
+ 0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL,
+ 0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL,
+ 0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL,
+ 0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL,
+ 0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL,
+ 0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL,
+ 0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL,
+ 0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL,
+ 0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL,
+ 0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL,
+ 0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL,
+ 0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL,
+ 0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL,
+ 0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL,
+ 0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL,
+ 0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL,
+ 0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL,
+ 0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL,
+ 0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL,
+ 0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL,
+ 0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL,
+ 0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL,
+ 0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL,
+ 0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL,
+ 0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL,
+ 0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL,
+ 0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL,
+ 0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL,
+ 0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL,
+ 0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL,
+ 0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL,
+ 0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL,
+ 0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL,
+ 0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL,
+ 0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL,
+ 0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL,
+ 0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL,
+ 0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL,
+ 0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL,
+ 0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL,
+ 0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL,
+ 0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL,
+ 0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL,
+ 0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL,
+ 0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL,
+ 0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL,
+ 0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL,
+ 0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL,
+ 0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL,
+ 0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL,
+ 0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL,
+ 0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL,
+ 0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL,
+ 0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL,
+ 0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL,
+ 0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL,
+ 0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL,
+ 0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL,
+ 0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL,
+ 0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL,
+ 0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL,
+ 0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL,
+ 0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL,
+ 0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL,
+ 0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL,
+ 0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL,
+ 0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL,
+ 0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL,
+ 0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL,
+ 0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL,
+ 0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL
+};
+
+static void curve25519_ever64_base(u8 *out, const u8 *priv)
+{
+ u64 swap = 1;
+ int i, j, k;
+ u64 tmp[16 + 32 + 4];
+ u64 *x1 = &tmp[0];
+ u64 *z1 = &tmp[4];
+ u64 *x2 = &tmp[8];
+ u64 *z2 = &tmp[12];
+ u64 *xz1 = &tmp[0];
+ u64 *xz2 = &tmp[8];
+ u64 *a = &tmp[0 + 16];
+ u64 *b = &tmp[4 + 16];
+ u64 *c = &tmp[8 + 16];
+ u64 *ab = &tmp[0 + 16];
+ u64 *abcd = &tmp[0 + 16];
+ u64 *ef = &tmp[16 + 16];
+ u64 *efgh = &tmp[16 + 16];
+ u64 *key = &tmp[0 + 16 + 32];
+
+ memcpy(key, priv, 32);
+ ((u8 *)key)[0] &= 248;
+ ((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64;
+
+ x1[0] = 1, x1[1] = x1[2] = x1[3] = 0;
+ z1[0] = 1, z1[1] = z1[2] = z1[3] = 0;
+ z2[0] = 1, z2[1] = z2[2] = z2[3] = 0;
+ memcpy(x2, p_minus_s, sizeof(p_minus_s));
+
+ j = 3;
+ for (i = 0; i < 4; ++i) {
+ while (j < (const int[]){ 64, 64, 64, 63 }[i]) {
+ u64 bit = (key[i] >> j) & 1;
+ k = (64 * i + j - 3);
+ swap = swap ^ bit;
+ cswap2(swap, xz1, xz2);
+ swap = bit;
+ fsub(b, x1, z1);
+ fadd(a, x1, z1);
+ fmul(c, &table_ladder[4 * k], b, ef);
+ fsub(b, a, c);
+ fadd(a, a, c);
+ fsqr2(ab, ab, efgh);
+ fmul2(xz1, xz2, ab, efgh);
+ ++j;
+ }
+ j = 0;
+ }
+
+ point_double(xz1, abcd, efgh);
+ point_double(xz1, abcd, efgh);
+ point_double(xz1, abcd, efgh);
+ encode_point(out, xz1);
+
+ memzero_explicit(tmp, sizeof(tmp));
+}
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx);
+
+static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE])
+{
+ if (static_branch_likely(&curve25519_use_bmi2_adx))
+ curve25519_ever64(mypublic, secret, basepoint);
+ else
+ curve25519_generic(mypublic, secret, basepoint);
+}
+
+static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
+{
+ if (static_branch_likely(&curve25519_use_bmi2_adx))
+ curve25519_ever64_base(pub, secret);
+ else
+ curve25519_generic(pub, secret, curve25519_base_point);
+}
+
+#define curve25519_mod_init_arch curve25519_mod_init_arch
+static void curve25519_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
+ static_branch_enable(&curve25519_use_bmi2_adx);
+}
diff --git a/lib/crypto/x86/poly1305-x86_64-cryptogams.pl b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl
new file mode 100644
index 000000000000..409ec6955733
--- /dev/null
+++ b/lib/crypto/x86/poly1305-x86_64-cryptogams.pl
@@ -0,0 +1,4240 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+#
+# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+#
+# This code is taken from the OpenSSL project but the author, Andy Polyakov,
+# has relicensed it under the licenses specified in the SPDX header above.
+# The original headers, including the original license headers, are
+# included below for completeness.
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements Poly1305 hash for x86_64.
+#
+# March 2015
+#
+# Initial release.
+#
+# December 2016
+#
+# Add AVX512F+VL+BW code path.
+#
+# November 2017
+#
+# Convert the AVX512F+VL+BW code path to pure AVX512F, so that it can be
+# executed even on Knights Landing.  The trigger for this modification was
+# the observation that AVX512 code paths can negatively affect overall
+# Skylake-X system performance.  Since we are likely to suppress the
+# AVX512F capability flag [at least on Skylake-X], the conversion serves
+# as a kind of "investment protection".  Note that the next *lake
+# processor, Cannonlake, has an AVX512IFMA code path to execute...
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone,
+# measured with rdtsc at fixed clock frequency.
+#
+#                 IALU/gcc-4.8(*)  AVX(**)  AVX2   AVX-512
+# P4              4.46/+120%       -
+# Core 2          2.41/+90%        -
+# Westmere        1.88/+120%       -
+# Sandy Bridge    1.39/+140%       1.10
+# Haswell         1.14/+175%       1.11     0.65
+# Skylake[-X]     1.13/+120%       0.96     0.51   [0.35]
+# Silvermont      2.83/+95%        -
+# Knights L       3.60/?           1.65     1.10   0.41(***)
+# Goldmont        1.70/+180%       -
+# VIA Nano        1.82/+150%       -
+# Sledgehammer    1.38/+160%       -
+# Bulldozer       2.30/+130%       0.97
+# Ryzen           1.15/+200%       1.08     1.18
+#
+# (*)   improvement coefficients relative to clang are more modest,
+#       ~50% on most processors; in both cases the comparison is against
+#       __int128 code;
+# (**)  an SSE2 implementation was attempted, but among non-AVX processors
+#       it beat the integer-only code only on older Intel P4 and Core
+#       parts, by 30-50% (less so on newer processors), and was slower on
+#       contemporary ones, e.g. almost 2x slower on Atom; as the former are
+#       naturally disappearing, SSE2 was deemed unnecessary;
+# (***) strangely enough, performance seems to vary from core to core;
+#       the listed result is the best case;
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+$kernel=0; $kernel=1 if (!$flavour && !$output);
+
+if (!$kernel) {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+ die "can't locate x86_64-xlate.pl";
+
+ open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+ *STDOUT=*OUT;
+
+ if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
+ }
+
+ if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
+ $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
+ $avx += 1 if ($1==2.11 && $2>=8);
+ }
+
+ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+ }
+
+ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+ }
+} else {
+ $avx = 4; # The kernel uses ifdefs for this.
+}
+
+sub declare_function() {
+ my ($name, $align, $nargs) = @_;
+ if($kernel) {
+ $code .= "SYM_FUNC_START($name)\n";
+ $code .= ".L$name:\n";
+ } else {
+ $code .= ".globl $name\n";
+ $code .= ".type $name,\@function,$nargs\n";
+ $code .= ".align $align\n";
+ $code .= "$name:\n";
+ }
+}
+
+sub end_function() {
+ my ($name) = @_;
+ if($kernel) {
+ $code .= "SYM_FUNC_END($name)\n";
+ } else {
+ $code .= ".size $name,.-$name\n";
+ }
+}
+
+$code.=<<___ if $kernel;
+#include <linux/linkage.h>
+___
+
+if ($avx) {
+$code.=<<___ if $kernel;
+.section .rodata
+___
+$code.=<<___;
+.align 64
+.Lconst:
+.Lmask24:
+.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
+.L129:
+.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
+.Lmask26:
+.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
+.Lpermd_avx2:
+.long 2,2,2,3,2,0,2,1
+.Lpermd_avx512:
+.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
+
+.L2_44_inp_permd:
+.long 0,1,1,2,2,3,7,7
+.L2_44_inp_shift:
+.quad 0,12,24,64
+.L2_44_mask:
+.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
+.L2_44_shift_rgt:
+.quad 44,44,42,64
+.L2_44_shift_lft:
+.quad 8,8,10,64
+
+.align 64
+.Lx_mask44:
+.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.Lx_mask42:
+.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+___
+}
+$code.=<<___ if (!$kernel);
+.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 16
+___
+
+my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
+my ($mac,$nonce)=($inp,$len); # *_emit arguments
+my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
+my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
+
+sub poly1305_iteration {
+# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
+# output: $h0-$h2 *= $r0-$r1
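+# Added sketch of the underlying math: with h = h0 + 2^64*h1 + 2^128*h2 and
+# r = r0 + 2^64*r1, the product h*r is reduced using 2^130 == 5
+# (mod 2^130-5).  Key clamping clears the low two bits of r1, so
+# s1 = r1 + (r1>>2) is exactly (5/4)*r1; multiplying h1 and h2 by s1 instead
+# of r1 folds those high-weight partial products straight back into the low
+# limbs, and the closing "mov $-4 / and / shr $2 / add" sequence folds
+# whatever is left above bit 130 back in, again multiplied by 5.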
+$code.=<<___;
+ mulq $h0 # h0*r1
+ mov %rax,$d2
+ mov $r0,%rax
+ mov %rdx,$d3
+
+ mulq $h0 # h0*r0
+ mov %rax,$h0 # future $h0
+ mov $r0,%rax
+ mov %rdx,$d1
+
+ mulq $h1 # h1*r0
+ add %rax,$d2
+ mov $s1,%rax
+ adc %rdx,$d3
+
+ mulq $h1 # h1*s1
+ mov $h2,$h1 # borrow $h1
+ add %rax,$h0
+ adc %rdx,$d1
+
+ imulq $s1,$h1 # h2*s1
+ add $h1,$d2
+ mov $d1,$h1
+ adc \$0,$d3
+
+ imulq $r0,$h2 # h2*r0
+ add $d2,$h1
+ mov \$-4,%rax # mask value
+ adc $h2,$d3
+
+ and $d3,%rax # last reduction step
+ mov $d3,$h2
+ shr \$2,$d3
+ and \$3,$h2
+ add $d3,%rax
+ add %rax,$h0
+ adc \$0,$h1
+ adc \$0,$h2
+___
+}
+
+########################################################################
+# The layout of the opaque area is as follows.
+#
+# unsigned __int64 h[3]; # current hash value base 2^64
+# unsigned __int64 r[2]; # key value base 2^64
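+#
+# Note added for reference: the offsets used below are h[] at 0 and r[] at
+# 24/32; the upper 32 bits of h[2] (offset 20) double as the is_base2_26
+# flag, which is why storing a base 2^64 hash value also clears that flag.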
+
+$code.=<<___;
+.text
+___
+$code.=<<___ if (!$kernel);
+.extern OPENSSL_ia32cap_P
+
+.globl poly1305_init_x86_64
+.hidden poly1305_init_x86_64
+.globl poly1305_blocks_x86_64
+.hidden poly1305_blocks_x86_64
+.globl poly1305_emit_x86_64
+.hidden poly1305_emit_x86_64
+___
+&declare_function("poly1305_init_x86_64", 32, 3);
+$code.=<<___;
+ xor %eax,%eax
+ mov %rax,0($ctx) # initialize hash value
+ mov %rax,8($ctx)
+ mov %rax,16($ctx)
+
+ test $inp,$inp
+ je .Lno_key
+___
+$code.=<<___ if (!$kernel);
+ lea poly1305_blocks_x86_64(%rip),%r10
+ lea poly1305_emit_x86_64(%rip),%r11
+___
+$code.=<<___ if (!$kernel && $avx);
+ mov OPENSSL_ia32cap_P+4(%rip),%r9
+ lea poly1305_blocks_avx(%rip),%rax
+ lea poly1305_emit_avx(%rip),%rcx
+ bt \$`60-32`,%r9 # AVX?
+ cmovc %rax,%r10
+ cmovc %rcx,%r11
+___
+$code.=<<___ if (!$kernel && $avx>1);
+ lea poly1305_blocks_avx2(%rip),%rax
+ bt \$`5+32`,%r9 # AVX2?
+ cmovc %rax,%r10
+___
+$code.=<<___ if (!$kernel && $avx>3);
+ mov \$`(1<<31|1<<21|1<<16)`,%rax
+ shr \$32,%r9
+ and %rax,%r9
+ cmp %rax,%r9
+ je .Linit_base2_44
+___
+$code.=<<___;
+ mov \$0x0ffffffc0fffffff,%rax
+ mov \$0x0ffffffc0ffffffc,%rcx
+ and 0($inp),%rax
+ and 8($inp),%rcx
+ mov %rax,24($ctx)
+ mov %rcx,32($ctx)
+___
+$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
+ mov %r10,0(%rdx)
+ mov %r11,8(%rdx)
+___
+$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
+ mov %r10d,0(%rdx)
+ mov %r11d,4(%rdx)
+___
+$code.=<<___;
+ mov \$1,%eax
+.Lno_key:
+ RET
+___
+&end_function("poly1305_init_x86_64");
+
+&declare_function("poly1305_blocks_x86_64", 32, 4);
+$code.=<<___;
+.cfi_startproc
+.Lblocks:
+ shr \$4,$len
+ jz .Lno_data # too short
+
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+ push $ctx
+.cfi_push $ctx
+.Lblocks_body:
+
+ mov $len,%r15 # reassign $len
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ mov 0($ctx),$h0 # load hash value
+ mov 8($ctx),$h1
+ mov 16($ctx),$h2
+
+ mov $s1,$r1
+ shr \$2,$s1
+ mov $r1,%rax
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+ jmp .Loop
+
+.align 32
+.Loop:
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+___
+
+ &poly1305_iteration();
+
+$code.=<<___;
+ mov $r1,%rax
+ dec %r15 # len-=16
+ jnz .Loop
+
+ mov 0(%rsp),$ctx
+.cfi_restore $ctx
+
+ mov $h0,0($ctx) # store hash value
+ mov $h1,8($ctx)
+ mov $h2,16($ctx)
+
+ mov 8(%rsp),%r15
+.cfi_restore %r15
+ mov 16(%rsp),%r14
+.cfi_restore %r14
+ mov 24(%rsp),%r13
+.cfi_restore %r13
+ mov 32(%rsp),%r12
+.cfi_restore %r12
+ mov 40(%rsp),%rbx
+.cfi_restore %rbx
+ lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lno_data:
+.Lblocks_epilogue:
+ RET
+.cfi_endproc
+___
+&end_function("poly1305_blocks_x86_64");
+
+&declare_function("poly1305_emit_x86_64", 32, 3);
+$code.=<<___;
+.Lemit:
+ mov 0($ctx),%r8 # load hash value
+ mov 8($ctx),%r9
+ mov 16($ctx),%r10
+
+ mov %r8,%rax
+ add \$5,%r8 # compare to modulus
+ mov %r9,%rcx
+ adc \$0,%r9
+ adc \$0,%r10
+ shr \$2,%r10 # did 130-bit value overflow?
+ cmovnz %r8,%rax
+ cmovnz %r9,%rcx
+
+ add 0($nonce),%rax # accumulate nonce
+ adc 8($nonce),%rcx
+ mov %rax,0($mac) # write result
+ mov %rcx,8($mac)
+
+ RET
+___
+&end_function("poly1305_emit_x86_64");
+if ($avx) {
+
+########################################################################
+# The layout of the opaque area is as follows.
+#
+# unsigned __int32 h[5]; # current hash value base 2^26
+# unsigned __int32 is_base2_26;
+# unsigned __int64 r[2]; # key value base 2^64
+# unsigned __int64 pad;
+# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
+#
+# where r^n are the base 2^26 digits of the powers of the multiplier key.
+# There are 5 digits, but the last four are interleaved with their
+# multiples of 5, for a total of 9 elements: r0, r1, 5*r1, r2, 5*r2, r3,
+# 5*r3, r4, 5*r4.
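+#
+# Note added for reference: in this representation
+# h = h0 + 2^26*h1 + 2^52*h2 + 2^78*h3 + 2^104*h4, and the 5*r digits are
+# precomputed because 2^130 == 5 (mod 2^130-5): any partial product that
+# lands at weight 2^130 or above is folded back to the bottom multiplied
+# by 5, turning each limb product hi*rj into either hi*rj or hi*(5*rj).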
+
+my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
+ map("%xmm$_",(0..15));
+
+$code.=<<___;
+.type __poly1305_block,\@abi-omnipotent
+.align 32
+__poly1305_block:
+ push $ctx
+___
+ &poly1305_iteration();
+$code.=<<___;
+ pop $ctx
+ RET
+.size __poly1305_block,.-__poly1305_block
+
+.type __poly1305_init_avx,\@abi-omnipotent
+.align 32
+__poly1305_init_avx:
+ push %rbp
+ mov %rsp,%rbp
+ mov $r0,$h0
+ mov $r1,$h1
+ xor $h2,$h2
+
+ lea 48+64($ctx),$ctx # size optimization
+
+ mov $r1,%rax
+ call __poly1305_block # r^2
+
+ mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
+ mov \$0x3ffffff,%edx
+ mov $h0,$d1
+ and $h0#d,%eax
+ mov $r0,$d2
+ and $r0#d,%edx
+ mov %eax,`16*0+0-64`($ctx)
+ shr \$26,$d1
+ mov %edx,`16*0+4-64`($ctx)
+ shr \$26,$d2
+
+ mov \$0x3ffffff,%eax
+ mov \$0x3ffffff,%edx
+ and $d1#d,%eax
+ and $d2#d,%edx
+ mov %eax,`16*1+0-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov %edx,`16*1+4-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ mov %eax,`16*2+0-64`($ctx)
+ shr \$26,$d1
+ mov %edx,`16*2+4-64`($ctx)
+ shr \$26,$d2
+
+ mov $h1,%rax
+ mov $r1,%rdx
+ shl \$12,%rax
+ shl \$12,%rdx
+ or $d1,%rax
+ or $d2,%rdx
+ and \$0x3ffffff,%eax
+ and \$0x3ffffff,%edx
+ mov %eax,`16*3+0-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov %edx,`16*3+4-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ mov %eax,`16*4+0-64`($ctx)
+ mov $h1,$d1
+ mov %edx,`16*4+4-64`($ctx)
+ mov $r1,$d2
+
+ mov \$0x3ffffff,%eax
+ mov \$0x3ffffff,%edx
+ shr \$14,$d1
+ shr \$14,$d2
+ and $d1#d,%eax
+ and $d2#d,%edx
+ mov %eax,`16*5+0-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov %edx,`16*5+4-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ mov %eax,`16*6+0-64`($ctx)
+ shr \$26,$d1
+ mov %edx,`16*6+4-64`($ctx)
+ shr \$26,$d2
+
+ mov $h2,%rax
+ shl \$24,%rax
+ or %rax,$d1
+ mov $d1#d,`16*7+0-64`($ctx)
+ lea ($d1,$d1,4),$d1 # *5
+ mov $d2#d,`16*7+4-64`($ctx)
+ lea ($d2,$d2,4),$d2 # *5
+ mov $d1#d,`16*8+0-64`($ctx)
+ mov $d2#d,`16*8+4-64`($ctx)
+
+ mov $r1,%rax
+ call __poly1305_block # r^3
+
+ mov \$0x3ffffff,%eax # save r^3 base 2^26
+ mov $h0,$d1
+ and $h0#d,%eax
+ shr \$26,$d1
+ mov %eax,`16*0+12-64`($ctx)
+
+ mov \$0x3ffffff,%edx
+ and $d1#d,%edx
+ mov %edx,`16*1+12-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ shr \$26,$d1
+ mov %edx,`16*2+12-64`($ctx)
+
+ mov $h1,%rax
+ shl \$12,%rax
+ or $d1,%rax
+ and \$0x3ffffff,%eax
+ mov %eax,`16*3+12-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov $h1,$d1
+ mov %eax,`16*4+12-64`($ctx)
+
+ mov \$0x3ffffff,%edx
+ shr \$14,$d1
+ and $d1#d,%edx
+ mov %edx,`16*5+12-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ shr \$26,$d1
+ mov %edx,`16*6+12-64`($ctx)
+
+ mov $h2,%rax
+ shl \$24,%rax
+ or %rax,$d1
+ mov $d1#d,`16*7+12-64`($ctx)
+ lea ($d1,$d1,4),$d1 # *5
+ mov $d1#d,`16*8+12-64`($ctx)
+
+ mov $r1,%rax
+ call __poly1305_block # r^4
+
+ mov \$0x3ffffff,%eax # save r^4 base 2^26
+ mov $h0,$d1
+ and $h0#d,%eax
+ shr \$26,$d1
+ mov %eax,`16*0+8-64`($ctx)
+
+ mov \$0x3ffffff,%edx
+ and $d1#d,%edx
+ mov %edx,`16*1+8-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ shr \$26,$d1
+ mov %edx,`16*2+8-64`($ctx)
+
+ mov $h1,%rax
+ shl \$12,%rax
+ or $d1,%rax
+ and \$0x3ffffff,%eax
+ mov %eax,`16*3+8-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov $h1,$d1
+ mov %eax,`16*4+8-64`($ctx)
+
+ mov \$0x3ffffff,%edx
+ shr \$14,$d1
+ and $d1#d,%edx
+ mov %edx,`16*5+8-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ shr \$26,$d1
+ mov %edx,`16*6+8-64`($ctx)
+
+ mov $h2,%rax
+ shl \$24,%rax
+ or %rax,$d1
+ mov $d1#d,`16*7+8-64`($ctx)
+ lea ($d1,$d1,4),$d1 # *5
+ mov $d1#d,`16*8+8-64`($ctx)
+
+ lea -48-64($ctx),$ctx # size [de-]optimization
+ pop %rbp
+ RET
+.size __poly1305_init_avx,.-__poly1305_init_avx
+___
+
+&declare_function("poly1305_blocks_avx", 32, 4);
+$code.=<<___;
+.cfi_startproc
+ mov 20($ctx),%r8d # is_base2_26
+ cmp \$128,$len
+ jae .Lblocks_avx
+ test %r8d,%r8d
+ jz .Lblocks
+
+.Lblocks_avx:
+ and \$-16,$len
+ jz .Lno_data_avx
+
+ vzeroupper
+
+ test %r8d,%r8d
+ jz .Lbase2_64_avx
+
+ test \$31,$len
+ jz .Leven_avx
+
+ push %rbp
+.cfi_push %rbp
+ mov %rsp,%rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lblocks_avx_body:
+
+ mov $len,%r15 # reassign $len
+
+ mov 0($ctx),$d1 # load hash value
+ mov 8($ctx),$d2
+ mov 16($ctx),$h2#d
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ ################################# base 2^26 -> base 2^64
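+ # (added note: the five 26-bit limbs are reassembled as
+ # h = h0 + 2^26*h1 + 2^52*h2 + 2^78*h3 + 2^104*h4 into two full 64-bit
+ # words plus the topmost bits in $h2)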
+ mov $d1#d,$h0#d
+ and \$`-1*(1<<31)`,$d1
+ mov $d2,$r1 # borrow $r1
+ mov $d2#d,$h1#d
+ and \$`-1*(1<<31)`,$d2
+
+ shr \$6,$d1
+ shl \$52,$r1
+ add $d1,$h0
+ shr \$12,$h1
+ shr \$18,$d2
+ add $r1,$h0
+ adc $d2,$h1
+
+ mov $h2,$d1
+ shl \$40,$d1
+ shr \$24,$h2
+ add $d1,$h1
+ adc \$0,$h2 # can be partially reduced...
+
+ mov \$-4,$d2 # ... so reduce
+ mov $h2,$d1
+ and $h2,$d2
+ shr \$2,$d1
+ and \$3,$h2
+ add $d2,$d1 # =*5
+ add $d1,$h0
+ adc \$0,$h1
+ adc \$0,$h2
+
+ mov $s1,$r1
+ mov $s1,%rax
+ shr \$2,$s1
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+
+ call __poly1305_block
+
+ test $padbit,$padbit # if $padbit is zero,
+ jz .Lstore_base2_64_avx # store hash in base 2^64 format
+
+ ################################# base 2^64 -> base 2^26
+ mov $h0,%rax
+ mov $h0,%rdx
+ shr \$52,$h0
+ mov $h1,$r0
+ mov $h1,$r1
+ shr \$26,%rdx
+ and \$0x3ffffff,%rax # h[0]
+ shl \$12,$r0
+ and \$0x3ffffff,%rdx # h[1]
+ shr \$14,$h1
+ or $r0,$h0
+ shl \$24,$h2
+ and \$0x3ffffff,$h0 # h[2]
+ shr \$40,$r1
+ and \$0x3ffffff,$h1 # h[3]
+ or $r1,$h2 # h[4]
+
+ sub \$16,%r15
+ jz .Lstore_base2_26_avx
+
+ vmovd %rax#d,$H0
+ vmovd %rdx#d,$H1
+ vmovd $h0#d,$H2
+ vmovd $h1#d,$H3
+ vmovd $h2#d,$H4
+ jmp .Lproceed_avx
+
+.align 32
+.Lstore_base2_64_avx:
+ mov $h0,0($ctx)
+ mov $h1,8($ctx)
+ mov $h2,16($ctx) # note that is_base2_26 is zeroed
+ jmp .Ldone_avx
+
+.align 16
+.Lstore_base2_26_avx:
+ mov %rax#d,0($ctx) # store hash value base 2^26
+ mov %rdx#d,4($ctx)
+ mov $h0#d,8($ctx)
+ mov $h1#d,12($ctx)
+ mov $h2#d,16($ctx)
+.align 16
+.Ldone_avx:
+ pop %r15
+.cfi_restore %r15
+ pop %r14
+.cfi_restore %r14
+ pop %r13
+.cfi_restore %r13
+ pop %r12
+.cfi_restore %r12
+ pop %rbx
+.cfi_restore %rbx
+ pop %rbp
+.cfi_restore %rbp
+.Lno_data_avx:
+.Lblocks_avx_epilogue:
+ RET
+.cfi_endproc
+
+.align 32
+.Lbase2_64_avx:
+.cfi_startproc
+ push %rbp
+.cfi_push %rbp
+ mov %rsp,%rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lbase2_64_avx_body:
+
+ mov $len,%r15 # reassign $len
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ mov 0($ctx),$h0 # load hash value
+ mov 8($ctx),$h1
+ mov 16($ctx),$h2#d
+
+ mov $s1,$r1
+ mov $s1,%rax
+ shr \$2,$s1
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+
+ test \$31,$len
+ jz .Linit_avx
+
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+ sub \$16,%r15
+
+ call __poly1305_block
+
+.Linit_avx:
+ ################################# base 2^64 -> base 2^26
+ mov $h0,%rax
+ mov $h0,%rdx
+ shr \$52,$h0
+ mov $h1,$d1
+ mov $h1,$d2
+ shr \$26,%rdx
+ and \$0x3ffffff,%rax # h[0]
+ shl \$12,$d1
+ and \$0x3ffffff,%rdx # h[1]
+ shr \$14,$h1
+ or $d1,$h0
+ shl \$24,$h2
+ and \$0x3ffffff,$h0 # h[2]
+ shr \$40,$d2
+ and \$0x3ffffff,$h1 # h[3]
+ or $d2,$h2 # h[4]
+
+ vmovd %rax#d,$H0
+ vmovd %rdx#d,$H1
+ vmovd $h0#d,$H2
+ vmovd $h1#d,$H3
+ vmovd $h2#d,$H4
+ movl \$1,20($ctx) # set is_base2_26
+
+ call __poly1305_init_avx
+
+.Lproceed_avx:
+ mov %r15,$len
+ pop %r15
+.cfi_restore %r15
+ pop %r14
+.cfi_restore %r14
+ pop %r13
+.cfi_restore %r13
+ pop %r12
+.cfi_restore %r12
+ pop %rbx
+.cfi_restore %rbx
+ pop %rbp
+.cfi_restore %rbp
+.Lbase2_64_avx_epilogue:
+ jmp .Ldo_avx
+.cfi_endproc
+
+.align 32
+.Leven_avx:
+.cfi_startproc
+ vmovd 4*0($ctx),$H0 # load hash value
+ vmovd 4*1($ctx),$H1
+ vmovd 4*2($ctx),$H2
+ vmovd 4*3($ctx),$H3
+ vmovd 4*4($ctx),$H4
+
+.Ldo_avx:
+___
+$code.=<<___ if (!$win64);
+ lea 8(%rsp),%r10
+.cfi_def_cfa_register %r10
+ and \$-32,%rsp
+ sub \$-8,%rsp
+ lea -0x58(%rsp),%r11
+ sub \$0x178,%rsp
+___
+$code.=<<___ if ($win64);
+ lea -0xf8(%rsp),%r11
+ sub \$0x218,%rsp
+ vmovdqa %xmm6,0x50(%r11)
+ vmovdqa %xmm7,0x60(%r11)
+ vmovdqa %xmm8,0x70(%r11)
+ vmovdqa %xmm9,0x80(%r11)
+ vmovdqa %xmm10,0x90(%r11)
+ vmovdqa %xmm11,0xa0(%r11)
+ vmovdqa %xmm12,0xb0(%r11)
+ vmovdqa %xmm13,0xc0(%r11)
+ vmovdqa %xmm14,0xd0(%r11)
+ vmovdqa %xmm15,0xe0(%r11)
+.Ldo_avx_body:
+___
+$code.=<<___;
+ sub \$64,$len
+ lea -32($inp),%rax
+ cmovc %rax,$inp
+
+ vmovdqu `16*3`($ctx),$D4 # preload r0^2
+ lea `16*3+64`($ctx),$ctx # size optimization
+ lea .Lconst(%rip),%rcx
+
+ ################################################################
+ # load input
+ vmovdqu 16*2($inp),$T0
+ vmovdqu 16*3($inp),$T1
+ vmovdqa 64(%rcx),$MASK # .Lmask26
+
+ vpsrldq \$6,$T0,$T2 # splat input
+ vpsrldq \$6,$T1,$T3
+ vpunpckhqdq $T1,$T0,$T4 # 4
+ vpunpcklqdq $T1,$T0,$T0 # 0:1
+ vpunpcklqdq $T3,$T2,$T3 # 2:3
+
+ vpsrlq \$40,$T4,$T4 # 4
+ vpsrlq \$26,$T0,$T1
+ vpand $MASK,$T0,$T0 # 0
+ vpsrlq \$4,$T3,$T2
+ vpand $MASK,$T1,$T1 # 1
+ vpsrlq \$30,$T3,$T3
+ vpand $MASK,$T2,$T2 # 2
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+
+ jbe .Lskip_loop_avx
+
+ # expand and copy pre-calculated table to stack
+ vmovdqu `16*1-64`($ctx),$D1
+ vmovdqu `16*2-64`($ctx),$D2
+ vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
+ vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
+ vmovdqa $D3,-0x90(%r11)
+ vmovdqa $D0,0x00(%rsp)
+ vpshufd \$0xEE,$D1,$D4
+ vmovdqu `16*3-64`($ctx),$D0
+ vpshufd \$0x44,$D1,$D1
+ vmovdqa $D4,-0x80(%r11)
+ vmovdqa $D1,0x10(%rsp)
+ vpshufd \$0xEE,$D2,$D3
+ vmovdqu `16*4-64`($ctx),$D1
+ vpshufd \$0x44,$D2,$D2
+ vmovdqa $D3,-0x70(%r11)
+ vmovdqa $D2,0x20(%rsp)
+ vpshufd \$0xEE,$D0,$D4
+ vmovdqu `16*5-64`($ctx),$D2
+ vpshufd \$0x44,$D0,$D0
+ vmovdqa $D4,-0x60(%r11)
+ vmovdqa $D0,0x30(%rsp)
+ vpshufd \$0xEE,$D1,$D3
+ vmovdqu `16*6-64`($ctx),$D0
+ vpshufd \$0x44,$D1,$D1
+ vmovdqa $D3,-0x50(%r11)
+ vmovdqa $D1,0x40(%rsp)
+ vpshufd \$0xEE,$D2,$D4
+ vmovdqu `16*7-64`($ctx),$D1
+ vpshufd \$0x44,$D2,$D2
+ vmovdqa $D4,-0x40(%r11)
+ vmovdqa $D2,0x50(%rsp)
+ vpshufd \$0xEE,$D0,$D3
+ vmovdqu `16*8-64`($ctx),$D2
+ vpshufd \$0x44,$D0,$D0
+ vmovdqa $D3,-0x30(%r11)
+ vmovdqa $D0,0x60(%rsp)
+ vpshufd \$0xEE,$D1,$D4
+ vpshufd \$0x44,$D1,$D1
+ vmovdqa $D4,-0x20(%r11)
+ vmovdqa $D1,0x70(%rsp)
+ vpshufd \$0xEE,$D2,$D3
+ vmovdqa 0x00(%rsp),$D4 # preload r0^2
+ vpshufd \$0x44,$D2,$D2
+ vmovdqa $D3,-0x10(%r11)
+ vmovdqa $D2,0x80(%rsp)
+
+ jmp .Loop_avx
+
+.align 32
+.Loop_avx:
+ ################################################################
+ # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ # \___________________/
+ # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ # \___________________/ \____________________/
+ #
+ # Note that we start with inp[2:3]*r^2.  This is because that
+ # term doesn't depend on the reduction from the previous iteration.
+ ################################################################
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ #
+ # though note that $Tx and $Hx are "reversed" in this section,
+ # and $D4 is preloaded with r0^2...
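+ # (added note: this is Horner's rule split across two SIMD lanes --
+ # even-indexed blocks in one lane, odd-indexed blocks in the other --
+ # with precomputed powers of r keeping the lanes independent until the
+ # final r^2:r^1 multiply merges them)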
+
+ vpmuludq $T0,$D4,$D0 # d0 = h0*r0
+ vpmuludq $T1,$D4,$D1 # d1 = h1*r0
+ vmovdqa $H2,0x20(%r11) # offload hash
+ vpmuludq $T2,$D4,$D2 # d2 = h2*r0
+ vmovdqa 0x10(%rsp),$H2 # r1^2
+ vpmuludq $T3,$D4,$D3 # d3 = h3*r0
+ vpmuludq $T4,$D4,$D4 # d4 = h4*r0
+
+ vmovdqa $H0,0x00(%r11) #
+ vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
+ vmovdqa $H1,0x10(%r11) #
+ vpmuludq $T3,$H2,$H1 # h3*r1
+ vpaddq $H0,$D0,$D0 # d0 += h4*s1
+ vpaddq $H1,$D4,$D4 # d4 += h3*r1
+ vmovdqa $H3,0x30(%r11) #
+ vpmuludq $T2,$H2,$H0 # h2*r1
+ vpmuludq $T1,$H2,$H1 # h1*r1
+ vpaddq $H0,$D3,$D3 # d3 += h2*r1
+ vmovdqa 0x30(%rsp),$H3 # r2^2
+ vpaddq $H1,$D2,$D2 # d2 += h1*r1
+ vmovdqa $H4,0x40(%r11) #
+ vpmuludq $T0,$H2,$H2 # h0*r1
+ vpmuludq $T2,$H3,$H0 # h2*r2
+ vpaddq $H2,$D1,$D1 # d1 += h0*r1
+
+ vmovdqa 0x40(%rsp),$H4 # s2^2
+ vpaddq $H0,$D4,$D4 # d4 += h2*r2
+ vpmuludq $T1,$H3,$H1 # h1*r2
+ vpmuludq $T0,$H3,$H3 # h0*r2
+ vpaddq $H1,$D3,$D3 # d3 += h1*r2
+ vmovdqa 0x50(%rsp),$H2 # r3^2
+ vpaddq $H3,$D2,$D2 # d2 += h0*r2
+ vpmuludq $T4,$H4,$H0 # h4*s2
+ vpmuludq $T3,$H4,$H4 # h3*s2
+ vpaddq $H0,$D1,$D1 # d1 += h4*s2
+ vmovdqa 0x60(%rsp),$H3 # s3^2
+ vpaddq $H4,$D0,$D0 # d0 += h3*s2
+
+ vmovdqa 0x80(%rsp),$H4 # s4^2
+ vpmuludq $T1,$H2,$H1 # h1*r3
+ vpmuludq $T0,$H2,$H2 # h0*r3
+ vpaddq $H1,$D4,$D4 # d4 += h1*r3
+ vpaddq $H2,$D3,$D3 # d3 += h0*r3
+ vpmuludq $T4,$H3,$H0 # h4*s3
+ vpmuludq $T3,$H3,$H1 # h3*s3
+ vpaddq $H0,$D2,$D2 # d2 += h4*s3
+ vmovdqu 16*0($inp),$H0 # load input
+ vpaddq $H1,$D1,$D1 # d1 += h3*s3
+ vpmuludq $T2,$H3,$H3 # h2*s3
+ vpmuludq $T2,$H4,$T2 # h2*s4
+ vpaddq $H3,$D0,$D0 # d0 += h2*s3
+
+ vmovdqu 16*1($inp),$H1 #
+ vpaddq $T2,$D1,$D1 # d1 += h2*s4
+ vpmuludq $T3,$H4,$T3 # h3*s4
+ vpmuludq $T4,$H4,$T4 # h4*s4
+ vpsrldq \$6,$H0,$H2 # splat input
+ vpaddq $T3,$D2,$D2 # d2 += h3*s4
+ vpaddq $T4,$D3,$D3 # d3 += h4*s4
+ vpsrldq \$6,$H1,$H3 #
+ vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
+ vpmuludq $T1,$H4,$T0 # h1*s4
+ vpunpckhqdq $H1,$H0,$H4 # 4
+ vpaddq $T4,$D4,$D4 # d4 += h0*r4
+ vmovdqa -0x90(%r11),$T4 # r0^4
+ vpaddq $T0,$D0,$D0 # d0 += h1*s4
+
+ vpunpcklqdq $H1,$H0,$H0 # 0:1
+ vpunpcklqdq $H3,$H2,$H3 # 2:3
+
+ #vpsrlq \$40,$H4,$H4 # 4
+ vpsrldq \$`40/8`,$H4,$H4 # 4
+ vpsrlq \$26,$H0,$H1
+ vpand $MASK,$H0,$H0 # 0
+ vpsrlq \$4,$H3,$H2
+ vpand $MASK,$H1,$H1 # 1
+ vpand 0(%rcx),$H4,$H4 # .Lmask24
+ vpsrlq \$30,$H3,$H3
+ vpand $MASK,$H2,$H2 # 2
+ vpand $MASK,$H3,$H3 # 3
+ vpor 32(%rcx),$H4,$H4 # padbit, yes, always
+
+ vpaddq 0x00(%r11),$H0,$H0 # add hash value
+ vpaddq 0x10(%r11),$H1,$H1
+ vpaddq 0x20(%r11),$H2,$H2
+ vpaddq 0x30(%r11),$H3,$H3
+ vpaddq 0x40(%r11),$H4,$H4
+
+ lea 16*2($inp),%rax
+ lea 16*4($inp),$inp
+ sub \$64,$len
+ cmovc %rax,$inp
+
+ ################################################################
+ # Now we accumulate (inp[0:1]+hash)*r^4
+ ################################################################
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ vpmuludq $H0,$T4,$T0 # h0*r0
+ vpmuludq $H1,$T4,$T1 # h1*r0
+ vpaddq $T0,$D0,$D0
+ vpaddq $T1,$D1,$D1
+ vmovdqa -0x80(%r11),$T2 # r1^4
+ vpmuludq $H2,$T4,$T0 # h2*r0
+ vpmuludq $H3,$T4,$T1 # h3*r0
+ vpaddq $T0,$D2,$D2
+ vpaddq $T1,$D3,$D3
+ vpmuludq $H4,$T4,$T4 # h4*r0
+ vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
+ vpaddq $T4,$D4,$D4
+
+ vpaddq $T0,$D0,$D0 # d0 += h4*s1
+ vpmuludq $H2,$T2,$T1 # h2*r1
+ vpmuludq $H3,$T2,$T0 # h3*r1
+ vpaddq $T1,$D3,$D3 # d3 += h2*r1
+ vmovdqa -0x60(%r11),$T3 # r2^4
+ vpaddq $T0,$D4,$D4 # d4 += h3*r1
+ vpmuludq $H1,$T2,$T1 # h1*r1
+ vpmuludq $H0,$T2,$T2 # h0*r1
+ vpaddq $T1,$D2,$D2 # d2 += h1*r1
+ vpaddq $T2,$D1,$D1 # d1 += h0*r1
+
+ vmovdqa -0x50(%r11),$T4 # s2^4
+ vpmuludq $H2,$T3,$T0 # h2*r2
+ vpmuludq $H1,$T3,$T1 # h1*r2
+ vpaddq $T0,$D4,$D4 # d4 += h2*r2
+ vpaddq $T1,$D3,$D3 # d3 += h1*r2
+ vmovdqa -0x40(%r11),$T2 # r3^4
+ vpmuludq $H0,$T3,$T3 # h0*r2
+ vpmuludq $H4,$T4,$T0 # h4*s2
+ vpaddq $T3,$D2,$D2 # d2 += h0*r2
+ vpaddq $T0,$D1,$D1 # d1 += h4*s2
+ vmovdqa -0x30(%r11),$T3 # s3^4
+ vpmuludq $H3,$T4,$T4 # h3*s2
+ vpmuludq $H1,$T2,$T1 # h1*r3
+ vpaddq $T4,$D0,$D0 # d0 += h3*s2
+
+ vmovdqa -0x10(%r11),$T4 # s4^4
+ vpaddq $T1,$D4,$D4 # d4 += h1*r3
+ vpmuludq $H0,$T2,$T2 # h0*r3
+ vpmuludq $H4,$T3,$T0 # h4*s3
+ vpaddq $T2,$D3,$D3 # d3 += h0*r3
+ vpaddq $T0,$D2,$D2 # d2 += h4*s3
+ vmovdqu 16*2($inp),$T0 # load input
+ vpmuludq $H3,$T3,$T2 # h3*s3
+ vpmuludq $H2,$T3,$T3 # h2*s3
+ vpaddq $T2,$D1,$D1 # d1 += h3*s3
+ vmovdqu 16*3($inp),$T1 #
+ vpaddq $T3,$D0,$D0 # d0 += h2*s3
+
+ vpmuludq $H2,$T4,$H2 # h2*s4
+ vpmuludq $H3,$T4,$H3 # h3*s4
+ vpsrldq \$6,$T0,$T2 # splat input
+ vpaddq $H2,$D1,$D1 # d1 += h2*s4
+ vpmuludq $H4,$T4,$H4 # h4*s4
+ vpsrldq \$6,$T1,$T3 #
+ vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
+ vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
+ vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
+ vpmuludq $H1,$T4,$H0
+ vpunpckhqdq $T1,$T0,$T4 # 4
+ vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
+ vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
+
+ vpunpcklqdq $T1,$T0,$T0 # 0:1
+ vpunpcklqdq $T3,$T2,$T3 # 2:3
+
+ #vpsrlq \$40,$T4,$T4 # 4
+ vpsrldq \$`40/8`,$T4,$T4 # 4
+ vpsrlq \$26,$T0,$T1
+ vmovdqa 0x00(%rsp),$D4 # preload r0^2
+ vpand $MASK,$T0,$T0 # 0
+ vpsrlq \$4,$T3,$T2
+ vpand $MASK,$T1,$T1 # 1
+ vpand 0(%rcx),$T4,$T4 # .Lmask24
+ vpsrlq \$30,$T3,$T3
+ vpand $MASK,$T2,$T2 # 2
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+
+ ################################################################
+ # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ # and P. Schwabe
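+ # (added note: each limb is reduced only far enough to keep the next
+ # round of 32x32->64 multiplies from overflowing; the h4 carry is folded
+ # into h0 as 5*carry via the add, shift-left-by-2, add sequence below)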
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$D1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H4,$D0
+ vpand $MASK,$H4,$H4
+
+ vpsrlq \$26,$H1,$D1
+ vpand $MASK,$H1,$H1
+ vpaddq $D1,$H2,$H2 # h1 -> h2
+
+ vpaddq $D0,$H0,$H0
+ vpsllq \$2,$D0,$D0
+ vpaddq $D0,$H0,$H0 # h4 -> h0
+
+ vpsrlq \$26,$H2,$D2
+ vpand $MASK,$H2,$H2
+ vpaddq $D2,$H3,$H3 # h2 -> h3
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ ja .Loop_avx
+
+.Lskip_loop_avx:
+ ################################################################
+ # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
+ add \$32,$len
+ jnz .Long_tail_avx
+
+ vpaddq $H2,$T2,$T2
+ vpaddq $H0,$T0,$T0
+ vpaddq $H1,$T1,$T1
+ vpaddq $H3,$T3,$T3
+ vpaddq $H4,$T4,$T4
+
+.Long_tail_avx:
+ vmovdqa $H2,0x20(%r11)
+ vmovdqa $H0,0x00(%r11)
+ vmovdqa $H1,0x10(%r11)
+ vmovdqa $H3,0x30(%r11)
+ vmovdqa $H4,0x40(%r11)
+
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ vpmuludq $T2,$D4,$D2 # d2 = h2*r0
+ vpmuludq $T0,$D4,$D0 # d0 = h0*r0
+ vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
+ vpmuludq $T1,$D4,$D1 # d1 = h1*r0
+ vpmuludq $T3,$D4,$D3 # d3 = h3*r0
+ vpmuludq $T4,$D4,$D4 # d4 = h4*r0
+
+ vpmuludq $T3,$H2,$H0 # h3*r1
+ vpaddq $H0,$D4,$D4 # d4 += h3*r1
+ vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
+ vpmuludq $T2,$H2,$H1 # h2*r1
+ vpaddq $H1,$D3,$D3 # d3 += h2*r1
+ vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
+ vpmuludq $T1,$H2,$H0 # h1*r1
+ vpaddq $H0,$D2,$D2 # d2 += h1*r1
+ vpmuludq $T0,$H2,$H2 # h0*r1
+ vpaddq $H2,$D1,$D1 # d1 += h0*r1
+ vpmuludq $T4,$H3,$H3 # h4*s1
+ vpaddq $H3,$D0,$D0 # d0 += h4*s1
+
+ vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
+ vpmuludq $T2,$H4,$H1 # h2*r2
+ vpaddq $H1,$D4,$D4 # d4 += h2*r2
+ vpmuludq $T1,$H4,$H0 # h1*r2
+ vpaddq $H0,$D3,$D3 # d3 += h1*r2
+ vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
+ vpmuludq $T0,$H4,$H4 # h0*r2
+ vpaddq $H4,$D2,$D2 # d2 += h0*r2
+ vpmuludq $T4,$H2,$H1 # h4*s2
+ vpaddq $H1,$D1,$D1 # d1 += h4*s2
+ vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
+ vpmuludq $T3,$H2,$H2 # h3*s2
+ vpaddq $H2,$D0,$D0 # d0 += h3*s2
+
+ vpmuludq $T1,$H3,$H0 # h1*r3
+ vpaddq $H0,$D4,$D4 # d4 += h1*r3
+ vpmuludq $T0,$H3,$H3 # h0*r3
+ vpaddq $H3,$D3,$D3 # d3 += h0*r3
+ vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
+ vpmuludq $T4,$H4,$H1 # h4*s3
+ vpaddq $H1,$D2,$D2 # d2 += h4*s3
+ vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
+ vpmuludq $T3,$H4,$H0 # h3*s3
+ vpaddq $H0,$D1,$D1 # d1 += h3*s3
+ vpmuludq $T2,$H4,$H4 # h2*s3
+ vpaddq $H4,$D0,$D0 # d0 += h2*s3
+
+ vpmuludq $T0,$H2,$H2 # h0*r4
+ vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
+ vpmuludq $T4,$H3,$H1 # h4*s4
+ vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
+ vpmuludq $T3,$H3,$H0 # h3*s4
+ vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
+ vpmuludq $T2,$H3,$H1 # h2*s4
+ vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
+ vpmuludq $T1,$H3,$H3 # h1*s4
+ vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
+
+ jz .Lshort_tail_avx
+
+ vmovdqu 16*0($inp),$H0 # load input
+ vmovdqu 16*1($inp),$H1
+
+ vpsrldq \$6,$H0,$H2 # splat input
+ vpsrldq \$6,$H1,$H3
+ vpunpckhqdq $H1,$H0,$H4 # 4
+ vpunpcklqdq $H1,$H0,$H0 # 0:1
+ vpunpcklqdq $H3,$H2,$H3 # 2:3
+
+ vpsrlq \$40,$H4,$H4 # 4
+ vpsrlq \$26,$H0,$H1
+ vpand $MASK,$H0,$H0 # 0
+ vpsrlq \$4,$H3,$H2
+ vpand $MASK,$H1,$H1 # 1
+ vpsrlq \$30,$H3,$H3
+ vpand $MASK,$H2,$H2 # 2
+ vpand $MASK,$H3,$H3 # 3
+ vpor 32(%rcx),$H4,$H4 # padbit, yes, always
+
+ vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
+ vpaddq 0x00(%r11),$H0,$H0
+ vpaddq 0x10(%r11),$H1,$H1
+ vpaddq 0x20(%r11),$H2,$H2
+ vpaddq 0x30(%r11),$H3,$H3
+ vpaddq 0x40(%r11),$H4,$H4
+
+ ################################################################
+ # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
+
+ vpmuludq $H0,$T4,$T0 # h0*r0
+ vpaddq $T0,$D0,$D0 # d0 += h0*r0
+ vpmuludq $H1,$T4,$T1 # h1*r0
+ vpaddq $T1,$D1,$D1 # d1 += h1*r0
+ vpmuludq $H2,$T4,$T0 # h2*r0
+ vpaddq $T0,$D2,$D2 # d2 += h2*r0
+ vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
+ vpmuludq $H3,$T4,$T1 # h3*r0
+ vpaddq $T1,$D3,$D3 # d3 += h3*r0
+ vpmuludq $H4,$T4,$T4 # h4*r0
+ vpaddq $T4,$D4,$D4 # d4 += h4*r0
+
+ vpmuludq $H3,$T2,$T0 # h3*r1
+ vpaddq $T0,$D4,$D4 # d4 += h3*r1
+ vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
+ vpmuludq $H2,$T2,$T1 # h2*r1
+ vpaddq $T1,$D3,$D3 # d3 += h2*r1
+ vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
+ vpmuludq $H1,$T2,$T0 # h1*r1
+ vpaddq $T0,$D2,$D2 # d2 += h1*r1
+ vpmuludq $H0,$T2,$T2 # h0*r1
+ vpaddq $T2,$D1,$D1 # d1 += h0*r1
+ vpmuludq $H4,$T3,$T3 # h4*s1
+ vpaddq $T3,$D0,$D0 # d0 += h4*s1
+
+ vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
+ vpmuludq $H2,$T4,$T1 # h2*r2
+ vpaddq $T1,$D4,$D4 # d4 += h2*r2
+ vpmuludq $H1,$T4,$T0 # h1*r2
+ vpaddq $T0,$D3,$D3 # d3 += h1*r2
+ vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
+ vpmuludq $H0,$T4,$T4 # h0*r2
+ vpaddq $T4,$D2,$D2 # d2 += h0*r2
+ vpmuludq $H4,$T2,$T1 # h4*s2
+ vpaddq $T1,$D1,$D1 # d1 += h4*s2
+ vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
+ vpmuludq $H3,$T2,$T2 # h3*s2
+ vpaddq $T2,$D0,$D0 # d0 += h3*s2
+
+ vpmuludq $H1,$T3,$T0 # h1*r3
+ vpaddq $T0,$D4,$D4 # d4 += h1*r3
+ vpmuludq $H0,$T3,$T3 # h0*r3
+ vpaddq $T3,$D3,$D3 # d3 += h0*r3
+ vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
+ vpmuludq $H4,$T4,$T1 # h4*s3
+ vpaddq $T1,$D2,$D2 # d2 += h4*s3
+ vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
+ vpmuludq $H3,$T4,$T0 # h3*s3
+ vpaddq $T0,$D1,$D1 # d1 += h3*s3
+ vpmuludq $H2,$T4,$T4 # h2*s3
+ vpaddq $T4,$D0,$D0 # d0 += h2*s3
+
+ vpmuludq $H0,$T2,$T2 # h0*r4
+ vpaddq $T2,$D4,$D4 # d4 += h0*r4
+ vpmuludq $H4,$T3,$T1 # h4*s4
+ vpaddq $T1,$D3,$D3 # d3 += h4*s4
+ vpmuludq $H3,$T3,$T0 # h3*s4
+ vpaddq $T0,$D2,$D2 # d2 += h3*s4
+ vpmuludq $H2,$T3,$T1 # h2*s4
+ vpaddq $T1,$D1,$D1 # d1 += h2*s4
+ vpmuludq $H1,$T3,$T3 # h1*s4
+ vpaddq $T3,$D0,$D0 # d0 += h1*s4
+
+.Lshort_tail_avx:
+ ################################################################
+ # horizontal addition
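+ # (added note: the two 64-bit halves of each vector are summed so that
+ # the two parallel accumulator lanes collapse into a single Poly1305
+ # state before the final carry propagation)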
+
+ vpsrldq \$8,$D4,$T4
+ vpsrldq \$8,$D3,$T3
+ vpsrldq \$8,$D1,$T1
+ vpsrldq \$8,$D0,$T0
+ vpsrldq \$8,$D2,$T2
+ vpaddq $T3,$D3,$D3
+ vpaddq $T4,$D4,$D4
+ vpaddq $T0,$D0,$D0
+ vpaddq $T1,$D1,$D1
+ vpaddq $T2,$D2,$D2
+
+ ################################################################
+ # lazy reduction
+
+ vpsrlq \$26,$D3,$H3
+ vpand $MASK,$D3,$D3
+ vpaddq $H3,$D4,$D4 # h3 -> h4
+
+ vpsrlq \$26,$D0,$H0
+ vpand $MASK,$D0,$D0
+ vpaddq $H0,$D1,$D1 # h0 -> h1
+
+ vpsrlq \$26,$D4,$H4
+ vpand $MASK,$D4,$D4
+
+ vpsrlq \$26,$D1,$H1
+ vpand $MASK,$D1,$D1
+ vpaddq $H1,$D2,$D2 # h1 -> h2
+
+ vpaddq $H4,$D0,$D0
+ vpsllq \$2,$H4,$H4
+ vpaddq $H4,$D0,$D0 # h4 -> h0
+
+ vpsrlq \$26,$D2,$H2
+ vpand $MASK,$D2,$D2
+ vpaddq $H2,$D3,$D3 # h2 -> h3
+
+ vpsrlq \$26,$D0,$H0
+ vpand $MASK,$D0,$D0
+ vpaddq $H0,$D1,$D1 # h0 -> h1
+
+ vpsrlq \$26,$D3,$H3
+ vpand $MASK,$D3,$D3
+ vpaddq $H3,$D4,$D4 # h3 -> h4
+
+ vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
+ vmovd $D1,`4*1-48-64`($ctx)
+ vmovd $D2,`4*2-48-64`($ctx)
+ vmovd $D3,`4*3-48-64`($ctx)
+ vmovd $D4,`4*4-48-64`($ctx)
+___
+$code.=<<___ if ($win64);
+ vmovdqa 0x50(%r11),%xmm6
+ vmovdqa 0x60(%r11),%xmm7
+ vmovdqa 0x70(%r11),%xmm8
+ vmovdqa 0x80(%r11),%xmm9
+ vmovdqa 0x90(%r11),%xmm10
+ vmovdqa 0xa0(%r11),%xmm11
+ vmovdqa 0xb0(%r11),%xmm12
+ vmovdqa 0xc0(%r11),%xmm13
+ vmovdqa 0xd0(%r11),%xmm14
+ vmovdqa 0xe0(%r11),%xmm15
+ lea 0xf8(%r11),%rsp
+.Ldo_avx_epilogue:
+___
+$code.=<<___ if (!$win64);
+ lea -8(%r10),%rsp
+.cfi_def_cfa_register %rsp
+___
+$code.=<<___;
+ vzeroupper
+ RET
+.cfi_endproc
+___
+&end_function("poly1305_blocks_avx");
+
+&declare_function("poly1305_emit_avx", 32, 3);
+$code.=<<___;
+ cmpl \$0,20($ctx) # is_base2_26?
+ je .Lemit
+
+ mov 0($ctx),%eax # load hash value base 2^26
+ mov 4($ctx),%ecx
+ mov 8($ctx),%r8d
+ mov 12($ctx),%r11d
+ mov 16($ctx),%r10d
+
+ shl \$26,%rcx # base 2^26 -> base 2^64
+ mov %r8,%r9
+ shl \$52,%r8
+ add %rcx,%rax
+ shr \$12,%r9
+ add %rax,%r8 # h0
+ adc \$0,%r9
+
+ shl \$14,%r11
+ mov %r10,%rax
+ shr \$24,%r10
+ add %r11,%r9
+ shl \$40,%rax
+ add %rax,%r9 # h1
+ adc \$0,%r10 # h2
+
+ mov %r10,%rax # could be partially reduced, so reduce
+ mov %r10,%rcx
+ and \$3,%r10
+ shr \$2,%rax
+ and \$-4,%rcx
+ add %rcx,%rax
+ add %rax,%r8
+ adc \$0,%r9
+ adc \$0,%r10
+
+ mov %r8,%rax
+ add \$5,%r8 # compare to modulus
+ mov %r9,%rcx
+ adc \$0,%r9
+ adc \$0,%r10
+ shr \$2,%r10 # did 130-bit value overflow?
+ cmovnz %r8,%rax
+ cmovnz %r9,%rcx
+
+ add 0($nonce),%rax # accumulate nonce
+ adc 8($nonce),%rcx
+ mov %rax,0($mac) # write result
+ mov %rcx,8($mac)
+
+ RET
+___
+&end_function("poly1305_emit_avx");
+
+if ($avx>1) {
+
+my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
+ map("%ymm$_",(0..15));
+my $S4=$MASK;
+
+sub poly1305_blocks_avxN {
+ my ($avx512) = @_;
+ my $suffix = $avx512 ? "_avx512" : "";
+$code.=<<___;
+.cfi_startproc
+ mov 20($ctx),%r8d # is_base2_26
+ cmp \$128,$len
+ jae .Lblocks_avx2$suffix
+ test %r8d,%r8d
+ jz .Lblocks
+
+.Lblocks_avx2$suffix:
+ and \$-16,$len
+ jz .Lno_data_avx2$suffix
+
+ vzeroupper
+
+ test %r8d,%r8d
+ jz .Lbase2_64_avx2$suffix
+
+ test \$63,$len
+ jz .Leven_avx2$suffix
+
+ push %rbp
+.cfi_push %rbp
+ mov %rsp,%rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lblocks_avx2_body$suffix:
+
+ mov $len,%r15 # reassign $len
+
+ mov 0($ctx),$d1 # load hash value
+ mov 8($ctx),$d2
+ mov 16($ctx),$h2#d
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ ################################# base 2^26 -> base 2^64
+ mov $d1#d,$h0#d
+ and \$`-1*(1<<31)`,$d1
+ mov $d2,$r1 # borrow $r1
+ mov $d2#d,$h1#d
+ and \$`-1*(1<<31)`,$d2
+
+ shr \$6,$d1
+ shl \$52,$r1
+ add $d1,$h0
+ shr \$12,$h1
+ shr \$18,$d2
+ add $r1,$h0
+ adc $d2,$h1
+
+ mov $h2,$d1
+ shl \$40,$d1
+ shr \$24,$h2
+ add $d1,$h1
+ adc \$0,$h2 # can be partially reduced...
+
+ mov \$-4,$d2 # ... so reduce
+ mov $h2,$d1
+ and $h2,$d2
+ shr \$2,$d1
+ and \$3,$h2
+ add $d2,$d1 # =*5
+ add $d1,$h0
+ adc \$0,$h1
+ adc \$0,$h2
+
+ mov $s1,$r1
+ mov $s1,%rax
+ shr \$2,$s1
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+
+.Lbase2_26_pre_avx2$suffix:
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+ sub \$16,%r15
+
+ call __poly1305_block
+ mov $r1,%rax
+
+ test \$63,%r15
+ jnz .Lbase2_26_pre_avx2$suffix
+
+ test $padbit,$padbit # if $padbit is zero,
+ jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format
+
+ ################################# base 2^64 -> base 2^26
+ mov $h0,%rax
+ mov $h0,%rdx
+ shr \$52,$h0
+ mov $h1,$r0
+ mov $h1,$r1
+ shr \$26,%rdx
+ and \$0x3ffffff,%rax # h[0]
+ shl \$12,$r0
+ and \$0x3ffffff,%rdx # h[1]
+ shr \$14,$h1
+ or $r0,$h0
+ shl \$24,$h2
+ and \$0x3ffffff,$h0 # h[2]
+ shr \$40,$r1
+ and \$0x3ffffff,$h1 # h[3]
+ or $r1,$h2 # h[4]
+
+ test %r15,%r15
+ jz .Lstore_base2_26_avx2$suffix
+
+ vmovd %rax#d,%x#$H0
+ vmovd %rdx#d,%x#$H1
+ vmovd $h0#d,%x#$H2
+ vmovd $h1#d,%x#$H3
+ vmovd $h2#d,%x#$H4
+ jmp .Lproceed_avx2$suffix
+
+.align 32
+.Lstore_base2_64_avx2$suffix:
+ mov $h0,0($ctx)
+ mov $h1,8($ctx)
+ mov $h2,16($ctx) # note that is_base2_26 is zeroed
+ jmp .Ldone_avx2$suffix
+
+.align 16
+.Lstore_base2_26_avx2$suffix:
+ mov %rax#d,0($ctx) # store hash value base 2^26
+ mov %rdx#d,4($ctx)
+ mov $h0#d,8($ctx)
+ mov $h1#d,12($ctx)
+ mov $h2#d,16($ctx)
+.align 16
+.Ldone_avx2$suffix:
+ pop %r15
+.cfi_restore %r15
+ pop %r14
+.cfi_restore %r14
+ pop %r13
+.cfi_restore %r13
+ pop %r12
+.cfi_restore %r12
+ pop %rbx
+.cfi_restore %rbx
+ pop %rbp
+.cfi_restore %rbp
+.Lno_data_avx2$suffix:
+.Lblocks_avx2_epilogue$suffix:
+ RET
+.cfi_endproc
+
+.align 32
+.Lbase2_64_avx2$suffix:
+.cfi_startproc
+ push %rbp
+.cfi_push %rbp
+ mov %rsp,%rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lbase2_64_avx2_body$suffix:
+
+ mov $len,%r15 # reassign $len
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ mov 0($ctx),$h0 # load hash value
+ mov 8($ctx),$h1
+ mov 16($ctx),$h2#d
+
+ mov $s1,$r1
+ mov $s1,%rax
+ shr \$2,$s1
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+
+ test \$63,$len
+ jz .Linit_avx2$suffix
+
+.Lbase2_64_pre_avx2$suffix:
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+ sub \$16,%r15
+
+ call __poly1305_block
+ mov $r1,%rax
+
+ test \$63,%r15
+ jnz .Lbase2_64_pre_avx2$suffix
+
+.Linit_avx2$suffix:
+ ################################# base 2^64 -> base 2^26
+ mov $h0,%rax
+ mov $h0,%rdx
+ shr \$52,$h0
+ mov $h1,$d1
+ mov $h1,$d2
+ shr \$26,%rdx
+ and \$0x3ffffff,%rax # h[0]
+ shl \$12,$d1
+ and \$0x3ffffff,%rdx # h[1]
+ shr \$14,$h1
+ or $d1,$h0
+ shl \$24,$h2
+ and \$0x3ffffff,$h0 # h[2]
+ shr \$40,$d2
+ and \$0x3ffffff,$h1 # h[3]
+ or $d2,$h2 # h[4]
+
+ vmovd %rax#d,%x#$H0
+ vmovd %rdx#d,%x#$H1
+ vmovd $h0#d,%x#$H2
+ vmovd $h1#d,%x#$H3
+ vmovd $h2#d,%x#$H4
+ movl \$1,20($ctx) # set is_base2_26
+
+ call __poly1305_init_avx
+
+.Lproceed_avx2$suffix:
+ mov %r15,$len # restore $len
+___
+$code.=<<___ if (!$kernel);
+ mov OPENSSL_ia32cap_P+8(%rip),%r9d
+ mov \$`(1<<31|1<<30|1<<16)`,%r11d
+___
+$code.=<<___;
+ pop %r15
+.cfi_restore %r15
+ pop %r14
+.cfi_restore %r14
+ pop %r13
+.cfi_restore %r13
+ pop %r12
+.cfi_restore %r12
+ pop %rbx
+.cfi_restore %rbx
+ pop %rbp
+.cfi_restore %rbp
+.Lbase2_64_avx2_epilogue$suffix:
+ jmp .Ldo_avx2$suffix
+.cfi_endproc
+
+.align 32
+.Leven_avx2$suffix:
+.cfi_startproc
+___
+$code.=<<___ if (!$kernel);
+ mov OPENSSL_ia32cap_P+8(%rip),%r9d
+___
+$code.=<<___;
+ vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
+ vmovd 4*1($ctx),%x#$H1
+ vmovd 4*2($ctx),%x#$H2
+ vmovd 4*3($ctx),%x#$H3
+ vmovd 4*4($ctx),%x#$H4
+
+.Ldo_avx2$suffix:
+___
+$code.=<<___ if (!$kernel && $avx>2);
+ cmp \$512,$len
+ jb .Lskip_avx512
+ and %r11d,%r9d
+ test \$`1<<16`,%r9d # check for AVX512F
+ jnz .Lblocks_avx512
+.Lskip_avx512$suffix:
+___
+$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
+ cmp \$512,$len
+ jae .Lblocks_avx512
+___
+$code.=<<___ if (!$win64);
+ lea 8(%rsp),%r10
+.cfi_def_cfa_register %r10
+ sub \$0x128,%rsp
+___
+$code.=<<___ if ($win64);
+ lea 8(%rsp),%r10
+ sub \$0x1c8,%rsp
+ vmovdqa %xmm6,-0xb0(%r10)
+ vmovdqa %xmm7,-0xa0(%r10)
+ vmovdqa %xmm8,-0x90(%r10)
+ vmovdqa %xmm9,-0x80(%r10)
+ vmovdqa %xmm10,-0x70(%r10)
+ vmovdqa %xmm11,-0x60(%r10)
+ vmovdqa %xmm12,-0x50(%r10)
+ vmovdqa %xmm13,-0x40(%r10)
+ vmovdqa %xmm14,-0x30(%r10)
+ vmovdqa %xmm15,-0x20(%r10)
+.Ldo_avx2_body$suffix:
+___
+$code.=<<___;
+ lea .Lconst(%rip),%rcx
+ lea 48+64($ctx),$ctx # size optimization
+ vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
+
+ # expand and copy pre-calculated table to stack
+ vmovdqu `16*0-64`($ctx),%x#$T2
+ and \$-512,%rsp
+ vmovdqu `16*1-64`($ctx),%x#$T3
+ vmovdqu `16*2-64`($ctx),%x#$T4
+ vmovdqu `16*3-64`($ctx),%x#$D0
+ vmovdqu `16*4-64`($ctx),%x#$D1
+ vmovdqu `16*5-64`($ctx),%x#$D2
+ lea 0x90(%rsp),%rax # size optimization
+ vmovdqu `16*6-64`($ctx),%x#$D3
+ vpermd $T2,$T0,$T2 # 00003412 -> 14243444
+ vmovdqu `16*7-64`($ctx),%x#$D4
+ vpermd $T3,$T0,$T3
+ vmovdqu `16*8-64`($ctx),%x#$MASK
+ vpermd $T4,$T0,$T4
+ vmovdqa $T2,0x00(%rsp)
+ vpermd $D0,$T0,$D0
+ vmovdqa $T3,0x20-0x90(%rax)
+ vpermd $D1,$T0,$D1
+ vmovdqa $T4,0x40-0x90(%rax)
+ vpermd $D2,$T0,$D2
+ vmovdqa $D0,0x60-0x90(%rax)
+ vpermd $D3,$T0,$D3
+ vmovdqa $D1,0x80-0x90(%rax)
+ vpermd $D4,$T0,$D4
+ vmovdqa $D2,0xa0-0x90(%rax)
+ vpermd $MASK,$T0,$MASK
+ vmovdqa $D3,0xc0-0x90(%rax)
+ vmovdqa $D4,0xe0-0x90(%rax)
+ vmovdqa $MASK,0x100-0x90(%rax)
+ vmovdqa 64(%rcx),$MASK # .Lmask26
+
+ ################################################################
+ # load input
+ vmovdqu 16*0($inp),%x#$T0
+ vmovdqu 16*1($inp),%x#$T1
+ vinserti128 \$1,16*2($inp),$T0,$T0
+ vinserti128 \$1,16*3($inp),$T1,$T1
+ lea 16*4($inp),$inp
+
+ vpsrldq \$6,$T0,$T2 # splat input
+ vpsrldq \$6,$T1,$T3
+ vpunpckhqdq $T1,$T0,$T4 # 4
+ vpunpcklqdq $T3,$T2,$T2 # 2:3
+ vpunpcklqdq $T1,$T0,$T0 # 0:1
+
+ vpsrlq \$30,$T2,$T3
+ vpsrlq \$4,$T2,$T2
+ vpsrlq \$26,$T0,$T1
+ vpsrlq \$40,$T4,$T4 # 4
+ vpand $MASK,$T2,$T2 # 2
+ vpand $MASK,$T0,$T0 # 0
+ vpand $MASK,$T1,$T1 # 1
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+
+ vpaddq $H2,$T2,$H2 # accumulate input
+ sub \$64,$len
+ jz .Ltail_avx2$suffix
+ jmp .Loop_avx2$suffix
+
+.align 32
+.Loop_avx2$suffix:
+ ################################################################
+ # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
+ # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
+ # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
+ # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
+ # \________/\__________/
+ ################################################################
+ #vpaddq $H2,$T2,$H2 # accumulate input
+ vpaddq $H0,$T0,$H0
+ vmovdqa `32*0`(%rsp),$T0 # r0^4
+ vpaddq $H1,$T1,$H1
+ vmovdqa `32*1`(%rsp),$T1 # r1^4
+ vpaddq $H3,$T3,$H3
+ vmovdqa `32*3`(%rsp),$T2 # r2^4
+ vpaddq $H4,$T4,$H4
+ vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
+ vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
+
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ #
+ # however, as h2 is "chronologically" the first one available, the
+ # corresponding operations are pulled up, so the order becomes
+ #
+ # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
+ # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
+ # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
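+ #
+ # (added note: every lane is multiplied by r^4 here because blocks i and
+ # i+4 are exactly four applications of r apart; the per-lane powers
+ # r^4..r^1 are only applied once, in .Ltail_avx2)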
+
+ vpmuludq $H2,$T0,$D2 # d2 = h2*r0
+ vpmuludq $H2,$T1,$D3 # d3 = h2*r1
+ vpmuludq $H2,$T2,$D4 # d4 = h2*r2
+ vpmuludq $H2,$T3,$D0 # d0 = h2*s3
+ vpmuludq $H2,$S4,$D1 # d1 = h2*s4
+
+ vpmuludq $H0,$T1,$T4 # h0*r1
+ vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
+ vpaddq $T4,$D1,$D1 # d1 += h0*r1
+ vpaddq $H2,$D2,$D2 # d2 += h1*r1
+ vpmuludq $H3,$T1,$T4 # h3*r1
+ vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
+ vpaddq $T4,$D4,$D4 # d4 += h3*r1
+ vpaddq $H2,$D0,$D0 # d0 += h4*s1
+ vmovdqa `32*4-0x90`(%rax),$T1 # s2
+
+ vpmuludq $H0,$T0,$T4 # h0*r0
+ vpmuludq $H1,$T0,$H2 # h1*r0
+ vpaddq $T4,$D0,$D0 # d0 += h0*r0
+ vpaddq $H2,$D1,$D1 # d1 += h1*r0
+ vpmuludq $H3,$T0,$T4 # h3*r0
+ vpmuludq $H4,$T0,$H2 # h4*r0
+ vmovdqu 16*0($inp),%x#$T0 # load input
+ vpaddq $T4,$D3,$D3 # d3 += h3*r0
+ vpaddq $H2,$D4,$D4 # d4 += h4*r0
+ vinserti128 \$1,16*2($inp),$T0,$T0
+
+ vpmuludq $H3,$T1,$T4 # h3*s2
+ vpmuludq $H4,$T1,$H2 # h4*s2
+ vmovdqu 16*1($inp),%x#$T1
+ vpaddq $T4,$D0,$D0 # d0 += h3*s2
+ vpaddq $H2,$D1,$D1 # d1 += h4*s2
+ vmovdqa `32*5-0x90`(%rax),$H2 # r3
+ vpmuludq $H1,$T2,$T4 # h1*r2
+ vpmuludq $H0,$T2,$T2 # h0*r2
+ vpaddq $T4,$D3,$D3 # d3 += h1*r2
+ vpaddq $T2,$D2,$D2 # d2 += h0*r2
+ vinserti128 \$1,16*3($inp),$T1,$T1
+ lea 16*4($inp),$inp
+
+ vpmuludq $H1,$H2,$T4 # h1*r3
+ vpmuludq $H0,$H2,$H2 # h0*r3
+ vpsrldq \$6,$T0,$T2 # splat input
+ vpaddq $T4,$D4,$D4 # d4 += h1*r3
+ vpaddq $H2,$D3,$D3 # d3 += h0*r3
+ vpmuludq $H3,$T3,$T4 # h3*s3
+ vpmuludq $H4,$T3,$H2 # h4*s3
+ vpsrldq \$6,$T1,$T3
+ vpaddq $T4,$D1,$D1 # d1 += h3*s3
+ vpaddq $H2,$D2,$D2 # d2 += h4*s3
+ vpunpckhqdq $T1,$T0,$T4 # 4
+
+ vpmuludq $H3,$S4,$H3 # h3*s4
+ vpmuludq $H4,$S4,$H4 # h4*s4
+ vpunpcklqdq $T1,$T0,$T0 # 0:1
+ vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
+ vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
+ vpunpcklqdq $T3,$T2,$T3 # 2:3
+ vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
+ vpmuludq $H1,$S4,$H0 # h1*s4
+ vmovdqa 64(%rcx),$MASK # .Lmask26
+ vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
+ vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
+
+ ################################################################
+ # lazy reduction (interleaved with tail of input splat)
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$D1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H4,$D4
+ vpand $MASK,$H4,$H4
+
+ vpsrlq \$4,$T3,$T2
+
+ vpsrlq \$26,$H1,$D1
+ vpand $MASK,$H1,$H1
+ vpaddq $D1,$H2,$H2 # h1 -> h2
+
+ vpaddq $D4,$H0,$H0
+ vpsllq \$2,$D4,$D4
+ vpaddq $D4,$H0,$H0 # h4 -> h0
+
+ vpand $MASK,$T2,$T2 # 2
+ vpsrlq \$26,$T0,$T1
+
+ vpsrlq \$26,$H2,$D2
+ vpand $MASK,$H2,$H2
+ vpaddq $D2,$H3,$H3 # h2 -> h3
+
+ vpaddq $T2,$H2,$H2 # modulo-scheduled
+ vpsrlq \$30,$T3,$T3
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$40,$T4,$T4 # 4
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpand $MASK,$T0,$T0 # 0
+ vpand $MASK,$T1,$T1 # 1
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+
+ sub \$64,$len
+ jnz .Loop_avx2$suffix
+
+ .byte 0x66,0x90
+.Ltail_avx2$suffix:
+ ################################################################
+ # while the multiplications above were by r^4 in all lanes, in the last
+ # iteration we multiply the least significant lane by r^4 and the most
+ # significant one by r, so this is a copy of the above except that the
+ # references to the precomputed table are displaced by 4...
+
+ #vpaddq $H2,$T2,$H2 # accumulate input
+ vpaddq $H0,$T0,$H0
+ vmovdqu `32*0+4`(%rsp),$T0 # r0^4
+ vpaddq $H1,$T1,$H1
+ vmovdqu `32*1+4`(%rsp),$T1 # r1^4
+ vpaddq $H3,$T3,$H3
+ vmovdqu `32*3+4`(%rsp),$T2 # r2^4
+ vpaddq $H4,$T4,$H4
+ vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
+ vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
+
+ vpmuludq $H2,$T0,$D2 # d2 = h2*r0
+ vpmuludq $H2,$T1,$D3 # d3 = h2*r1
+ vpmuludq $H2,$T2,$D4 # d4 = h2*r2
+ vpmuludq $H2,$T3,$D0 # d0 = h2*s3
+ vpmuludq $H2,$S4,$D1 # d1 = h2*s4
+
+ vpmuludq $H0,$T1,$T4 # h0*r1
+ vpmuludq $H1,$T1,$H2 # h1*r1
+ vpaddq $T4,$D1,$D1 # d1 += h0*r1
+ vpaddq $H2,$D2,$D2 # d2 += h1*r1
+ vpmuludq $H3,$T1,$T4 # h3*r1
+ vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
+ vpaddq $T4,$D4,$D4 # d4 += h3*r1
+ vpaddq $H2,$D0,$D0 # d0 += h4*s1
+
+ vpmuludq $H0,$T0,$T4 # h0*r0
+ vpmuludq $H1,$T0,$H2 # h1*r0
+ vpaddq $T4,$D0,$D0 # d0 += h0*r0
+ vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
+ vpaddq $H2,$D1,$D1 # d1 += h1*r0
+ vpmuludq $H3,$T0,$T4 # h3*r0
+ vpmuludq $H4,$T0,$H2 # h4*r0
+ vpaddq $T4,$D3,$D3 # d3 += h3*r0
+ vpaddq $H2,$D4,$D4 # d4 += h4*r0
+
+ vpmuludq $H3,$T1,$T4 # h3*s2
+ vpmuludq $H4,$T1,$H2 # h4*s2
+ vpaddq $T4,$D0,$D0 # d0 += h3*s2
+ vpaddq $H2,$D1,$D1 # d1 += h4*s2
+ vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
+ vpmuludq $H1,$T2,$T4 # h1*r2
+ vpmuludq $H0,$T2,$T2 # h0*r2
+ vpaddq $T4,$D3,$D3 # d3 += h1*r2
+ vpaddq $T2,$D2,$D2 # d2 += h0*r2
+
+ vpmuludq $H1,$H2,$T4 # h1*r3
+ vpmuludq $H0,$H2,$H2 # h0*r3
+ vpaddq $T4,$D4,$D4 # d4 += h1*r3
+ vpaddq $H2,$D3,$D3 # d3 += h0*r3
+ vpmuludq $H3,$T3,$T4 # h3*s3
+ vpmuludq $H4,$T3,$H2 # h4*s3
+ vpaddq $T4,$D1,$D1 # d1 += h3*s3
+ vpaddq $H2,$D2,$D2 # d2 += h4*s3
+
+ vpmuludq $H3,$S4,$H3 # h3*s4
+ vpmuludq $H4,$S4,$H4 # h4*s4
+ vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
+ vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
+ vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
+ vpmuludq $H1,$S4,$H0 # h1*s4
+ vmovdqa 64(%rcx),$MASK # .Lmask26
+ vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
+ vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
+
+ ################################################################
+ # horizontal addition
+
+ vpsrldq \$8,$D1,$T1
+ vpsrldq \$8,$H2,$T2
+ vpsrldq \$8,$H3,$T3
+ vpsrldq \$8,$H4,$T4
+ vpsrldq \$8,$H0,$T0
+ vpaddq $T1,$D1,$D1
+ vpaddq $T2,$H2,$H2
+ vpaddq $T3,$H3,$H3
+ vpaddq $T4,$H4,$H4
+ vpaddq $T0,$H0,$H0
+
+ vpermq \$0x2,$H3,$T3
+ vpermq \$0x2,$H4,$T4
+ vpermq \$0x2,$H0,$T0
+ vpermq \$0x2,$D1,$T1
+ vpermq \$0x2,$H2,$T2
+ vpaddq $T3,$H3,$H3
+ vpaddq $T4,$H4,$H4
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$D1,$D1
+ vpaddq $T2,$H2,$H2
+
+ ################################################################
+ # lazy reduction
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$D1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H4,$D4
+ vpand $MASK,$H4,$H4
+
+ vpsrlq \$26,$H1,$D1
+ vpand $MASK,$H1,$H1
+ vpaddq $D1,$H2,$H2 # h1 -> h2
+
+ vpaddq $D4,$H0,$H0
+ vpsllq \$2,$D4,$D4
+ vpaddq $D4,$H0,$H0 # h4 -> h0
+
+ vpsrlq \$26,$H2,$D2
+ vpand $MASK,$H2,$H2
+ vpaddq $D2,$H3,$H3 # h2 -> h3
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
+ vmovd %x#$H1,`4*1-48-64`($ctx)
+ vmovd %x#$H2,`4*2-48-64`($ctx)
+ vmovd %x#$H3,`4*3-48-64`($ctx)
+ vmovd %x#$H4,`4*4-48-64`($ctx)
+___
+$code.=<<___ if ($win64);
+ vmovdqa -0xb0(%r10),%xmm6
+ vmovdqa -0xa0(%r10),%xmm7
+ vmovdqa -0x90(%r10),%xmm8
+ vmovdqa -0x80(%r10),%xmm9
+ vmovdqa -0x70(%r10),%xmm10
+ vmovdqa -0x60(%r10),%xmm11
+ vmovdqa -0x50(%r10),%xmm12
+ vmovdqa -0x40(%r10),%xmm13
+ vmovdqa -0x30(%r10),%xmm14
+ vmovdqa -0x20(%r10),%xmm15
+ lea -8(%r10),%rsp
+.Ldo_avx2_epilogue$suffix:
+___
+$code.=<<___ if (!$win64);
+ lea -8(%r10),%rsp
+.cfi_def_cfa_register %rsp
+___
+$code.=<<___;
+ vzeroupper
+ RET
+.cfi_endproc
+___
+if($avx > 2 && $avx512) {
+my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
+my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
+my $PADBIT="%zmm30";
+
+map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
+map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
+map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
+map(s/%y/%z/,($MASK));
+
+$code.=<<___;
+.cfi_startproc
+.Lblocks_avx512:
+ mov \$15,%eax
+ kmovw %eax,%k2
+___
+$code.=<<___ if (!$win64);
+ lea 8(%rsp),%r10
+.cfi_def_cfa_register %r10
+ sub \$0x128,%rsp
+___
+$code.=<<___ if ($win64);
+ lea 8(%rsp),%r10
+ sub \$0x1c8,%rsp
+ vmovdqa %xmm6,-0xb0(%r10)
+ vmovdqa %xmm7,-0xa0(%r10)
+ vmovdqa %xmm8,-0x90(%r10)
+ vmovdqa %xmm9,-0x80(%r10)
+ vmovdqa %xmm10,-0x70(%r10)
+ vmovdqa %xmm11,-0x60(%r10)
+ vmovdqa %xmm12,-0x50(%r10)
+ vmovdqa %xmm13,-0x40(%r10)
+ vmovdqa %xmm14,-0x30(%r10)
+ vmovdqa %xmm15,-0x20(%r10)
+.Ldo_avx512_body:
+___
+$code.=<<___;
+ lea .Lconst(%rip),%rcx
+ lea 48+64($ctx),$ctx # size optimization
+ vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
+
+ # expand pre-calculated table
+ vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
+ and \$-512,%rsp
+ vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
+ mov \$0x20,%rax
+ vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
+ vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
+ vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
+ vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
+ vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
+ vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
+ vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
+ vpermd $D0,$T2,$R0 # 00003412 -> 14243444
+ vpbroadcastq 64(%rcx),$MASK # .Lmask26
+ vpermd $D1,$T2,$R1
+ vpermd $T0,$T2,$S1
+ vpermd $D2,$T2,$R2
+ vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
+ vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
+ vpermd $T1,$T2,$S2
+ vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
+ vpsrlq \$32,$R1,$T1
+ vpermd $D3,$T2,$R3
+ vmovdqa64 $S1,0x40(%rsp){%k2}
+ vpermd $T3,$T2,$S3
+ vpermd $D4,$T2,$R4
+ vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
+ vpermd $T4,$T2,$S4
+ vmovdqa64 $S2,0x80(%rsp){%k2}
+ vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
+ vmovdqa64 $S3,0xc0(%rsp){%k2}
+ vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
+ vmovdqa64 $S4,0x100(%rsp){%k2}
+
+ ################################################################
+ # calculate 5th through 8th powers of the key
+ #
+ # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
+ # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
+ # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
+ # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
+ # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
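+ #
+ # (added note: the 32-bit right shifts of $R0-$R4 extract the digits of
+ # r^1..r^4 into $T0-$T4, which are then multiplied lane-wise by r^4 held
+ # in $R0-$R4/$S1-$S4, with the usual 5*x folding for wrap-around terms;
+ # the resulting r^5..r^8 end up in $D0-$D4 as noted further down)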
+
+ vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
+ vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
+ vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
+ vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
+ vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
+ vpsrlq \$32,$R2,$T2
+
+ vpmuludq $T1,$S4,$M0
+ vpmuludq $T1,$R0,$M1
+ vpmuludq $T1,$R1,$M2
+ vpmuludq $T1,$R2,$M3
+ vpmuludq $T1,$R3,$M4
+ vpsrlq \$32,$R3,$T3
+ vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
+ vpaddq $M1,$D1,$D1 # d1 += r1'*r0
+ vpaddq $M2,$D2,$D2 # d2 += r1'*r1
+ vpaddq $M3,$D3,$D3 # d3 += r1'*r2
+ vpaddq $M4,$D4,$D4 # d4 += r1'*r3
+
+ vpmuludq $T2,$S3,$M0
+ vpmuludq $T2,$S4,$M1
+ vpmuludq $T2,$R1,$M3
+ vpmuludq $T2,$R2,$M4
+ vpmuludq $T2,$R0,$M2
+ vpsrlq \$32,$R4,$T4
+ vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
+ vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
+ vpaddq $M3,$D3,$D3 # d3 += r2'*r1
+ vpaddq $M4,$D4,$D4 # d4 += r2'*r2
+ vpaddq $M2,$D2,$D2 # d2 += r2'*r0
+
+ vpmuludq $T3,$S2,$M0
+ vpmuludq $T3,$R0,$M3
+ vpmuludq $T3,$R1,$M4
+ vpmuludq $T3,$S3,$M1
+ vpmuludq $T3,$S4,$M2
+ vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
+ vpaddq $M3,$D3,$D3 # d3 += r3'*r0
+ vpaddq $M4,$D4,$D4 # d4 += r3'*r1
+ vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
+ vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
+
+ vpmuludq $T4,$S4,$M3
+ vpmuludq $T4,$R0,$M4
+ vpmuludq $T4,$S1,$M0
+ vpmuludq $T4,$S2,$M1
+ vpmuludq $T4,$S3,$M2
+	vpaddq		$M3,$D3,$D3		# d3 += r4'*5*r4
+	vpaddq		$M4,$D4,$D4		# d4 += r4'*r0
+	vpaddq		$M0,$D0,$D0		# d0 += r4'*5*r1
+	vpaddq		$M1,$D1,$D1		# d1 += r4'*5*r2
+	vpaddq		$M2,$D2,$D2		# d2 += r4'*5*r3
+
+ ################################################################
+ # load input
+ vmovdqu64 16*0($inp),%z#$T3
+ vmovdqu64 16*4($inp),%z#$T4
+ lea 16*8($inp),$inp
+
+ ################################################################
+ # lazy reduction
+
+ vpsrlq \$26,$D3,$M3
+ vpandq $MASK,$D3,$D3
+ vpaddq $M3,$D4,$D4 # d3 -> d4
+
+ vpsrlq \$26,$D0,$M0
+ vpandq $MASK,$D0,$D0
+ vpaddq $M0,$D1,$D1 # d0 -> d1
+
+ vpsrlq \$26,$D4,$M4
+ vpandq $MASK,$D4,$D4
+
+ vpsrlq \$26,$D1,$M1
+ vpandq $MASK,$D1,$D1
+ vpaddq $M1,$D2,$D2 # d1 -> d2
+
+ vpaddq $M4,$D0,$D0
+ vpsllq \$2,$M4,$M4
+ vpaddq $M4,$D0,$D0 # d4 -> d0
+
+ vpsrlq \$26,$D2,$M2
+ vpandq $MASK,$D2,$D2
+ vpaddq $M2,$D3,$D3 # d2 -> d3
+
+ vpsrlq \$26,$D0,$M0
+ vpandq $MASK,$D0,$D0
+ vpaddq $M0,$D1,$D1 # d0 -> d1
+
+ vpsrlq \$26,$D3,$M3
+ vpandq $MASK,$D3,$D3
+ vpaddq $M3,$D4,$D4 # d3 -> d4
+
+ ################################################################
+ # at this point we have 14243444 in $R0-$S4 and 05060708 in
+ # $D0-$D4, ...
+
+ vpunpcklqdq $T4,$T3,$T0 # transpose input
+ vpunpckhqdq $T4,$T3,$T4
+
+ # ... since input 64-bit lanes are ordered as 73625140, we could
+ # "vperm" it to 76543210 (here and in each loop iteration), *or*
+ # we could just flow along, hence the goal for $R0-$S4 is
+ # 1858286838784888 ...
+
+ vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
+ mov \$0x7777,%eax
+ kmovw %eax,%k1
+
+ vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
+ vpermd $R1,$M0,$R1
+ vpermd $R2,$M0,$R2
+ vpermd $R3,$M0,$R3
+ vpermd $R4,$M0,$R4
+
+ vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
+ vpermd $D1,$M0,${R1}{%k1}
+ vpermd $D2,$M0,${R2}{%k1}
+ vpermd $D3,$M0,${R3}{%k1}
+ vpermd $D4,$M0,${R4}{%k1}
+
+ vpslld \$2,$R1,$S1 # *5
+ vpslld \$2,$R2,$S2
+ vpslld \$2,$R3,$S3
+ vpslld \$2,$R4,$S4
+ vpaddd $R1,$S1,$S1
+ vpaddd $R2,$S2,$S2
+ vpaddd $R3,$S3,$S3
+ vpaddd $R4,$S4,$S4
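+	# $S1-$S4 = 5*$R1..5*$R4, the pre-multiplied-by-5 key limbs used
+	# wherever a product wraps past 2^130 (2^130 == 5 mod 2^130-5)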
+
+ vpbroadcastq 32(%rcx),$PADBIT # .L129
+
+ vpsrlq \$52,$T0,$T2 # splat input
+ vpsllq \$12,$T4,$T3
+ vporq $T3,$T2,$T2
+ vpsrlq \$26,$T0,$T1
+ vpsrlq \$14,$T4,$T3
+ vpsrlq \$40,$T4,$T4 # 4
+ vpandq $MASK,$T2,$T2 # 2
+ vpandq $MASK,$T0,$T0 # 0
+ #vpandq $MASK,$T1,$T1 # 1
+ #vpandq $MASK,$T3,$T3 # 3
+ #vporq $PADBIT,$T4,$T4 # padbit, yes, always
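+	# the above splits each 16-byte block into five 26-bit limbs (at bit
+	# offsets 0/26/52/78/104) and ORs the 2^128 pad bit into the top limb;
+	# the commented-out steps are deferred into the loop body below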
+
+ vpaddq $H2,$T2,$H2 # accumulate input
+ sub \$192,$len
+ jbe .Ltail_avx512
+ jmp .Loop_avx512
+
+.align 32
+.Loop_avx512:
+ ################################################################
+ # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
+ # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
+ # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
+ # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
+ # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
+ # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
+ # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
+ # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
+ # \________/\___________/
+ ################################################################
+ #vpaddq $H2,$T2,$H2 # accumulate input
+
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ #
+	# however, as h2 is "chronologically" the first one available, pull
+	# the corresponding operations up, so it's
+ #
+ # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
+ # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
+ # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
+ # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
+ # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
+
+ vpmuludq $H2,$R1,$D3 # d3 = h2*r1
+ vpaddq $H0,$T0,$H0
+ vpmuludq $H2,$R2,$D4 # d4 = h2*r2
+ vpandq $MASK,$T1,$T1 # 1
+ vpmuludq $H2,$S3,$D0 # d0 = h2*s3
+ vpandq $MASK,$T3,$T3 # 3
+ vpmuludq $H2,$S4,$D1 # d1 = h2*s4
+ vporq $PADBIT,$T4,$T4 # padbit, yes, always
+ vpmuludq $H2,$R0,$D2 # d2 = h2*r0
+ vpaddq $H1,$T1,$H1 # accumulate input
+ vpaddq $H3,$T3,$H3
+ vpaddq $H4,$T4,$H4
+
+ vmovdqu64 16*0($inp),$T3 # load input
+ vmovdqu64 16*4($inp),$T4
+ lea 16*8($inp),$inp
+ vpmuludq $H0,$R3,$M3
+ vpmuludq $H0,$R4,$M4
+ vpmuludq $H0,$R0,$M0
+ vpmuludq $H0,$R1,$M1
+ vpaddq $M3,$D3,$D3 # d3 += h0*r3
+ vpaddq $M4,$D4,$D4 # d4 += h0*r4
+ vpaddq $M0,$D0,$D0 # d0 += h0*r0
+ vpaddq $M1,$D1,$D1 # d1 += h0*r1
+
+ vpmuludq $H1,$R2,$M3
+ vpmuludq $H1,$R3,$M4
+ vpmuludq $H1,$S4,$M0
+ vpmuludq $H0,$R2,$M2
+ vpaddq $M3,$D3,$D3 # d3 += h1*r2
+ vpaddq $M4,$D4,$D4 # d4 += h1*r3
+ vpaddq $M0,$D0,$D0 # d0 += h1*s4
+ vpaddq $M2,$D2,$D2 # d2 += h0*r2
+
+ vpunpcklqdq $T4,$T3,$T0 # transpose input
+ vpunpckhqdq $T4,$T3,$T4
+
+ vpmuludq $H3,$R0,$M3
+ vpmuludq $H3,$R1,$M4
+ vpmuludq $H1,$R0,$M1
+ vpmuludq $H1,$R1,$M2
+ vpaddq $M3,$D3,$D3 # d3 += h3*r0
+ vpaddq $M4,$D4,$D4 # d4 += h3*r1
+ vpaddq $M1,$D1,$D1 # d1 += h1*r0
+ vpaddq $M2,$D2,$D2 # d2 += h1*r1
+
+ vpmuludq $H4,$S4,$M3
+ vpmuludq $H4,$R0,$M4
+ vpmuludq $H3,$S2,$M0
+ vpmuludq $H3,$S3,$M1
+ vpaddq $M3,$D3,$D3 # d3 += h4*s4
+ vpmuludq $H3,$S4,$M2
+ vpaddq $M4,$D4,$D4 # d4 += h4*r0
+ vpaddq $M0,$D0,$D0 # d0 += h3*s2
+ vpaddq $M1,$D1,$D1 # d1 += h3*s3
+ vpaddq $M2,$D2,$D2 # d2 += h3*s4
+
+ vpmuludq $H4,$S1,$M0
+ vpmuludq $H4,$S2,$M1
+ vpmuludq $H4,$S3,$M2
+ vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
+	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
+	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
+
+ ################################################################
+ # lazy reduction (interleaved with input splat)
+
+ vpsrlq \$52,$T0,$T2 # splat input
+ vpsllq \$12,$T4,$T3
+
+ vpsrlq \$26,$D3,$H3
+ vpandq $MASK,$D3,$D3
+ vpaddq $H3,$D4,$H4 # h3 -> h4
+
+ vporq $T3,$T2,$T2
+
+ vpsrlq \$26,$H0,$D0
+ vpandq $MASK,$H0,$H0
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpandq $MASK,$T2,$T2 # 2
+
+ vpsrlq \$26,$H4,$D4
+ vpandq $MASK,$H4,$H4
+
+ vpsrlq \$26,$H1,$D1
+ vpandq $MASK,$H1,$H1
+ vpaddq $D1,$H2,$H2 # h1 -> h2
+
+ vpaddq $D4,$H0,$H0
+ vpsllq \$2,$D4,$D4
+ vpaddq $D4,$H0,$H0 # h4 -> h0
+
+ vpaddq $T2,$H2,$H2 # modulo-scheduled
+ vpsrlq \$26,$T0,$T1
+
+ vpsrlq \$26,$H2,$D2
+ vpandq $MASK,$H2,$H2
+ vpaddq $D2,$D3,$H3 # h2 -> h3
+
+ vpsrlq \$14,$T4,$T3
+
+ vpsrlq \$26,$H0,$D0
+ vpandq $MASK,$H0,$H0
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$40,$T4,$T4 # 4
+
+ vpsrlq \$26,$H3,$D3
+ vpandq $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpandq $MASK,$T0,$T0 # 0
+ #vpandq $MASK,$T1,$T1 # 1
+ #vpandq $MASK,$T3,$T3 # 3
+ #vporq $PADBIT,$T4,$T4 # padbit, yes, always
+
+ sub \$128,$len
+ ja .Loop_avx512
+
+.Ltail_avx512:
+ ################################################################
+	# while the above multiplications were by r^8 in all lanes, in the
+	# last iteration we multiply the least significant lane by r^8 and
+	# the most significant one by r, which is why the table gets shifted...
+
+ vpsrlq \$32,$R0,$R0 # 0105020603070408
+ vpsrlq \$32,$R1,$R1
+ vpsrlq \$32,$R2,$R2
+ vpsrlq \$32,$S3,$S3
+ vpsrlq \$32,$S4,$S4
+ vpsrlq \$32,$R3,$R3
+ vpsrlq \$32,$R4,$R4
+ vpsrlq \$32,$S1,$S1
+ vpsrlq \$32,$S2,$S2
+
+ ################################################################
+ # load either next or last 64 byte of input
+ lea ($inp,$len),$inp
+
+ #vpaddq $H2,$T2,$H2 # accumulate input
+ vpaddq $H0,$T0,$H0
+
+ vpmuludq $H2,$R1,$D3 # d3 = h2*r1
+ vpmuludq $H2,$R2,$D4 # d4 = h2*r2
+ vpmuludq $H2,$S3,$D0 # d0 = h2*s3
+ vpandq $MASK,$T1,$T1 # 1
+ vpmuludq $H2,$S4,$D1 # d1 = h2*s4
+ vpandq $MASK,$T3,$T3 # 3
+ vpmuludq $H2,$R0,$D2 # d2 = h2*r0
+ vporq $PADBIT,$T4,$T4 # padbit, yes, always
+ vpaddq $H1,$T1,$H1 # accumulate input
+ vpaddq $H3,$T3,$H3
+ vpaddq $H4,$T4,$H4
+
+ vmovdqu 16*0($inp),%x#$T0
+ vpmuludq $H0,$R3,$M3
+ vpmuludq $H0,$R4,$M4
+ vpmuludq $H0,$R0,$M0
+ vpmuludq $H0,$R1,$M1
+ vpaddq $M3,$D3,$D3 # d3 += h0*r3
+ vpaddq $M4,$D4,$D4 # d4 += h0*r4
+ vpaddq $M0,$D0,$D0 # d0 += h0*r0
+ vpaddq $M1,$D1,$D1 # d1 += h0*r1
+
+ vmovdqu 16*1($inp),%x#$T1
+ vpmuludq $H1,$R2,$M3
+ vpmuludq $H1,$R3,$M4
+ vpmuludq $H1,$S4,$M0
+ vpmuludq $H0,$R2,$M2
+ vpaddq $M3,$D3,$D3 # d3 += h1*r2
+ vpaddq $M4,$D4,$D4 # d4 += h1*r3
+ vpaddq $M0,$D0,$D0 # d0 += h1*s4
+ vpaddq $M2,$D2,$D2 # d2 += h0*r2
+
+ vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
+ vpmuludq $H3,$R0,$M3
+ vpmuludq $H3,$R1,$M4
+ vpmuludq $H1,$R0,$M1
+ vpmuludq $H1,$R1,$M2
+ vpaddq $M3,$D3,$D3 # d3 += h3*r0
+ vpaddq $M4,$D4,$D4 # d4 += h3*r1
+ vpaddq $M1,$D1,$D1 # d1 += h1*r0
+ vpaddq $M2,$D2,$D2 # d2 += h1*r1
+
+ vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
+ vpmuludq $H4,$S4,$M3
+ vpmuludq $H4,$R0,$M4
+ vpmuludq $H3,$S2,$M0
+ vpmuludq $H3,$S3,$M1
+ vpmuludq $H3,$S4,$M2
+ vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
+ vpaddq $M4,$D4,$D4 # d4 += h4*r0
+ vpaddq $M0,$D0,$D0 # d0 += h3*s2
+ vpaddq $M1,$D1,$D1 # d1 += h3*s3
+ vpaddq $M2,$D2,$D2 # d2 += h3*s4
+
+ vpmuludq $H4,$S1,$M0
+ vpmuludq $H4,$S2,$M1
+ vpmuludq $H4,$S3,$M2
+ vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
+	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
+	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
+
+ ################################################################
+ # horizontal addition
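+	# (sum the per-lane accumulators: swap neighbouring qwords, pull the
+	# opposite 128-bit halves down, fold in the upper 256 bits, and keep
+	# only lane 0 of each result via the %k3 mask)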
+
+ mov \$1,%eax
+ vpermq \$0xb1,$H3,$D3
+ vpermq \$0xb1,$D4,$H4
+ vpermq \$0xb1,$H0,$D0
+ vpermq \$0xb1,$H1,$D1
+ vpermq \$0xb1,$H2,$D2
+ vpaddq $D3,$H3,$H3
+ vpaddq $D4,$H4,$H4
+ vpaddq $D0,$H0,$H0
+ vpaddq $D1,$H1,$H1
+ vpaddq $D2,$H2,$H2
+
+ kmovw %eax,%k3
+ vpermq \$0x2,$H3,$D3
+ vpermq \$0x2,$H4,$D4
+ vpermq \$0x2,$H0,$D0
+ vpermq \$0x2,$H1,$D1
+ vpermq \$0x2,$H2,$D2
+ vpaddq $D3,$H3,$H3
+ vpaddq $D4,$H4,$H4
+ vpaddq $D0,$H0,$H0
+ vpaddq $D1,$H1,$H1
+ vpaddq $D2,$H2,$H2
+
+ vextracti64x4 \$0x1,$H3,%y#$D3
+ vextracti64x4 \$0x1,$H4,%y#$D4
+ vextracti64x4 \$0x1,$H0,%y#$D0
+ vextracti64x4 \$0x1,$H1,%y#$D1
+ vextracti64x4 \$0x1,$H2,%y#$D2
+ vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
+ vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
+ vpaddq $D0,$H0,${H0}{%k3}{z}
+ vpaddq $D1,$H1,${H1}{%k3}{z}
+ vpaddq $D2,$H2,${H2}{%k3}{z}
+___
+map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
+map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
+$code.=<<___;
+ ################################################################
+ # lazy reduction (interleaved with input splat)
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpsrldq \$6,$T0,$T2 # splat input
+ vpsrldq \$6,$T1,$T3
+ vpunpckhqdq $T1,$T0,$T4 # 4
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpunpcklqdq $T3,$T2,$T2 # 2:3
+ vpunpcklqdq $T1,$T0,$T0 # 0:1
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H4,$D4
+ vpand $MASK,$H4,$H4
+
+ vpsrlq \$26,$H1,$D1
+ vpand $MASK,$H1,$H1
+ vpsrlq \$30,$T2,$T3
+ vpsrlq \$4,$T2,$T2
+ vpaddq $D1,$H2,$H2 # h1 -> h2
+
+ vpaddq $D4,$H0,$H0
+ vpsllq \$2,$D4,$D4
+ vpsrlq \$26,$T0,$T1
+ vpsrlq \$40,$T4,$T4 # 4
+ vpaddq $D4,$H0,$H0 # h4 -> h0
+
+ vpsrlq \$26,$H2,$D2
+ vpand $MASK,$H2,$H2
+ vpand $MASK,$T2,$T2 # 2
+ vpand $MASK,$T0,$T0 # 0
+ vpaddq $D2,$H3,$H3 # h2 -> h3
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
+ vpand $MASK,$T1,$T1 # 1
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
+ add \$64,$len
+ jnz .Ltail_avx2$suffix
+
+ vpsubq $T2,$H2,$H2 # undo input accumulation
+ vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
+ vmovd %x#$H1,`4*1-48-64`($ctx)
+ vmovd %x#$H2,`4*2-48-64`($ctx)
+ vmovd %x#$H3,`4*3-48-64`($ctx)
+ vmovd %x#$H4,`4*4-48-64`($ctx)
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movdqa -0xb0(%r10),%xmm6
+ movdqa -0xa0(%r10),%xmm7
+ movdqa -0x90(%r10),%xmm8
+ movdqa -0x80(%r10),%xmm9
+ movdqa -0x70(%r10),%xmm10
+ movdqa -0x60(%r10),%xmm11
+ movdqa -0x50(%r10),%xmm12
+ movdqa -0x40(%r10),%xmm13
+ movdqa -0x30(%r10),%xmm14
+ movdqa -0x20(%r10),%xmm15
+ lea -8(%r10),%rsp
+.Ldo_avx512_epilogue:
+___
+$code.=<<___ if (!$win64);
+ lea -8(%r10),%rsp
+.cfi_def_cfa_register %rsp
+___
+$code.=<<___;
+ RET
+.cfi_endproc
+___
+
+}
+
+}
+
+&declare_function("poly1305_blocks_avx2", 32, 4);
+poly1305_blocks_avxN(0);
+&end_function("poly1305_blocks_avx2");
+
+#######################################################################
+if ($avx>2) {
+# On entry the input length is divisible by 64. But since the inner loop
+# processes 128 bytes per iteration, cases where the length is not divisible
+# by 128 are handled by passing the tail 64 bytes to .Ltail_avx2. For this
+# reason the stack layout is kept identical to poly1305_blocks_avx2. If not
+# for this tail, we wouldn't even have to allocate a stack frame...
+
+&declare_function("poly1305_blocks_avx512", 32, 4);
+poly1305_blocks_avxN(1);
+&end_function("poly1305_blocks_avx512");
+
+if (!$kernel && $avx>3) {
+########################################################################
+# VPMADD52 version using 2^44 radix.
+#
+# One can argue that base 2^52 would be more natural. Well, even though
+# some operations would be more natural, one has to recognize a couple of
+# things. First, base 2^52 provides no advantage over base 2^44 if you
+# look at the number of multiply-and-accumulate operations. Secondly, it
+# makes it impossible to pre-compute the multiples of 5 [referred to as
+# s[]/sN in reference implementations], which means that more such
+# operations would have to be performed in the inner loop, which in turn
+# makes the critical path longer. In other words, even though base 2^44
+# reduction might look less elegant, the overall critical path is
+# actually shorter...
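+#
+# Concretely, the 130-bit values are kept as h = h0 + h1*2^44 + h2*2^88 with
+# 44/44/42-bit limbs (hence the .Lx_mask44/.Lx_mask42 constants below), and
+# s1 = 20*r1, s2 = 20*r2 are the pre-scaled key limbs that fold the
+# reduction modulo 2^130-5 into the multiplication itself.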
+
+########################################################################
+# The layout of the opaque area is as follows.
+#
+# unsigned __int64 h[3]; # current hash value base 2^44
+# unsigned __int64 s[2]; # key value*20 base 2^44
+# unsigned __int64 r[3]; # key value base 2^44
+# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
+# # r^n positions reflect
+# # placement in register, not
+# # memory, R[3] is R[1]*20
+
+$code.=<<___;
+.type poly1305_init_base2_44,\@function,3
+.align 32
+poly1305_init_base2_44:
+ xor %eax,%eax
+ mov %rax,0($ctx) # initialize hash value
+ mov %rax,8($ctx)
+ mov %rax,16($ctx)
+
+.Linit_base2_44:
+ lea poly1305_blocks_vpmadd52(%rip),%r10
+ lea poly1305_emit_base2_44(%rip),%r11
+
+ mov \$0x0ffffffc0fffffff,%rax
+ mov \$0x0ffffffc0ffffffc,%rcx
+ and 0($inp),%rax
+ mov \$0x00000fffffffffff,%r8
+ and 8($inp),%rcx
+ mov \$0x00000fffffffffff,%r9
+ and %rax,%r8
+ shrd \$44,%rcx,%rax
+ mov %r8,40($ctx) # r0
+ and %r9,%rax
+ shr \$24,%rcx
+ mov %rax,48($ctx) # r1
+ lea (%rax,%rax,4),%rax # *5
+ mov %rcx,56($ctx) # r2
+ shl \$2,%rax # magic <<2
+ lea (%rcx,%rcx,4),%rcx # *5
+ shl \$2,%rcx # magic <<2
+ mov %rax,24($ctx) # s1
+ mov %rcx,32($ctx) # s2
+ movq \$-1,64($ctx) # write impossible value
+___
+$code.=<<___ if ($flavour !~ /elf32/);
+ mov %r10,0(%rdx)
+ mov %r11,8(%rdx)
+___
+$code.=<<___ if ($flavour =~ /elf32/);
+ mov %r10d,0(%rdx)
+ mov %r11d,4(%rdx)
+___
+$code.=<<___;
+ mov \$1,%eax
+ RET
+.size poly1305_init_base2_44,.-poly1305_init_base2_44
+___
+{
+my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
+my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
+my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
+
+$code.=<<___;
+.type poly1305_blocks_vpmadd52,\@function,4
+.align 32
+poly1305_blocks_vpmadd52:
+ shr \$4,$len
+ jz .Lno_data_vpmadd52 # too short
+
+ shl \$40,$padbit
+ mov 64($ctx),%r8 # peek on power of the key
+
+	# if the powers of the key are not calculated yet, process up to 3
+	# blocks with this single-block subroutine, otherwise ensure that
+	# the length is divisible by 2 blocks and pass the rest down to the
+	# next subroutine...
+
+ mov \$3,%rax
+ mov \$1,%r10
+ cmp \$4,$len # is input long
+ cmovae %r10,%rax
+ test %r8,%r8 # is power value impossible?
+ cmovns %r10,%rax
+
+ and $len,%rax # is input of favourable length?
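+	# %rax is now the number of leading blocks to run through the
+	# single-block loop below, leaving an even block count for the
+	# 2x/4x paths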
+ jz .Lblocks_vpmadd52_4x
+
+ sub %rax,$len
+ mov \$7,%r10d
+ mov \$1,%r11d
+ kmovw %r10d,%k7
+ lea .L2_44_inp_permd(%rip),%r10
+ kmovw %r11d,%k1
+
+ vmovq $padbit,%x#$PAD
+ vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
+ vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
+ vpermq \$0xcf,$PAD,$PAD
+ vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
+
+ vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
+ vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
+ vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
+ vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
+
+ vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
+ vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
+
+ jmp .Loop_vpmadd52
+
+.align 32
+.Loop_vpmadd52:
+ vmovdqu32 0($inp),%x#$T0 # load input as ----3210
+ lea 16($inp),$inp
+
+ vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
+ vpsrlvq $inp_shift,$T0,$T0
+ vpandq $reduc_mask,$T0,$T0
+ vporq $PAD,$T0,$T0
+
+ vpaddq $T0,$Dlo,$Dlo # accumulate input
+
+ vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
+ vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
+ vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
+
+ vpxord $Dlo,$Dlo,$Dlo
+ vpxord $Dhi,$Dhi,$Dhi
+
+ vpmadd52luq $r2r1r0,$H0,$Dlo
+ vpmadd52huq $r2r1r0,$H0,$Dhi
+
+ vpmadd52luq $r1r0s2,$H1,$Dlo
+ vpmadd52huq $r1r0s2,$H1,$Dhi
+
+ vpmadd52luq $r0s2s1,$H2,$Dlo
+ vpmadd52huq $r0s2s1,$H2,$Dhi
+
+ vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
+ vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
+ vpandq $reduc_mask,$Dlo,$Dlo
+
+ vpaddq $T0,$Dhi,$Dhi
+
+ vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
+
+ vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
+
+ vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
+ vpandq $reduc_mask,$Dlo,$Dlo
+
+ vpermq \$0b10010011,$T0,$T0
+
+ vpaddq $T0,$Dlo,$Dlo
+
+ vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
+
+ vpaddq $T0,$Dlo,$Dlo
+ vpsllq \$2,$T0,$T0
+
+ vpaddq $T0,$Dlo,$Dlo
+
+ dec %rax # len-=16
+ jnz .Loop_vpmadd52
+
+ vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
+
+ test $len,$len
+ jnz .Lblocks_vpmadd52_4x
+
+.Lno_data_vpmadd52:
+ RET
+.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
+___
+}
+{
+########################################################################
+# As implied by its name, the 4x subroutine processes 4 blocks in parallel
+# (but it also handles lengths of 4*n+2 blocks). It takes key powers up to
+# the 4th and works in 256-bit %ymm registers.
+
+my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
+my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
+my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
+
+$code.=<<___;
+.type poly1305_blocks_vpmadd52_4x,\@function,4
+.align 32
+poly1305_blocks_vpmadd52_4x:
+ shr \$4,$len
+ jz .Lno_data_vpmadd52_4x # too short
+
+ shl \$40,$padbit
+ mov 64($ctx),%r8 # peek on power of the key
+
+.Lblocks_vpmadd52_4x:
+ vpbroadcastq $padbit,$PAD
+
+ vmovdqa64 .Lx_mask44(%rip),$mask44
+ mov \$5,%eax
+ vmovdqa64 .Lx_mask42(%rip),$mask42
+ kmovw %eax,%k1 # used in 2x path
+
+ test %r8,%r8 # is power value impossible?
+ js .Linit_vpmadd52 # if it is, then init R[4]
+
+ vmovq 0($ctx),%x#$H0 # load current hash value
+ vmovq 8($ctx),%x#$H1
+ vmovq 16($ctx),%x#$H2
+
+ test \$3,$len # is length 4*n+2?
+ jnz .Lblocks_vpmadd52_2x_do
+
+.Lblocks_vpmadd52_4x_do:
+ vpbroadcastq 64($ctx),$R0 # load 4th power of the key
+ vpbroadcastq 96($ctx),$R1
+ vpbroadcastq 128($ctx),$R2
+ vpbroadcastq 160($ctx),$S1
+
+.Lblocks_vpmadd52_4x_key_loaded:
+ vpsllq \$2,$R2,$S2 # S2 = R2*5*4
+ vpaddq $R2,$S2,$S2
+ vpsllq \$2,$S2,$S2
+
+ test \$7,$len # is len 8*n?
+ jz .Lblocks_vpmadd52_8x
+
+ vmovdqu64 16*0($inp),$T2 # load data
+ vmovdqu64 16*2($inp),$T3
+ lea 16*4($inp),$inp
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+
+ # at this point 64-bit lanes are ordered as 3-1-2-0
+
+ vpsrlq \$24,$T3,$T2 # splat the data
+ vporq $PAD,$T2,$T2
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
+
+ sub \$4,$len
+ jz .Ltail_vpmadd52_4x
+ jmp .Loop_vpmadd52_4x
+ ud2
+
+.align 32
+.Linit_vpmadd52:
+ vmovq 24($ctx),%x#$S1 # load key
+ vmovq 56($ctx),%x#$H2
+ vmovq 32($ctx),%x#$S2
+ vmovq 40($ctx),%x#$R0
+ vmovq 48($ctx),%x#$R1
+
+ vmovdqa $R0,$H0
+ vmovdqa $R1,$H1
+ vmovdqa $H2,$R2
+
+ mov \$2,%eax
+
+.Lmul_init_vpmadd52:
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$R0,$D2hi
+
+ vpmadd52luq $H0,$R0,$D0lo
+ vpmadd52huq $H0,$R0,$D0hi
+ vpmadd52luq $H0,$R1,$D1lo
+ vpmadd52huq $H0,$R1,$D1hi
+ vpmadd52luq $H0,$R2,$D2lo
+ vpmadd52huq $H0,$R2,$D2hi
+
+ vpmadd52luq $H1,$S2,$D0lo
+ vpmadd52huq $H1,$S2,$D0hi
+ vpmadd52luq $H1,$R0,$D1lo
+ vpmadd52huq $H1,$R0,$D1hi
+ vpmadd52luq $H1,$R1,$D2lo
+ vpmadd52huq $H1,$R1,$D2hi
+
+ ################################################################
+ # partial reduction
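+	# each vpmadd52{lo,hi}uq pair leaves the product split into lo (bits
+	# 0-51) and hi (bits 52-103, already shifted right by 52).  The hi
+	# halves are shifted left by 8 (or by 10 for the 42-bit top limb) to
+	# line up with the next limb boundary, carries propagate through the
+	# 44/44/42-bit limbs, and the carry out of the top limb is folded back
+	# into the bottom one times 5 (add, shift by 2, add), as 2^130 == 5.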
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+
+ dec %eax
+ jz .Ldone_init_vpmadd52
+
+ vpunpcklqdq $R1,$H1,$R1 # 1,2
+ vpbroadcastq %x#$H1,%x#$H1 # 2,2
+ vpunpcklqdq $R2,$H2,$R2
+ vpbroadcastq %x#$H2,%x#$H2
+ vpunpcklqdq $R0,$H0,$R0
+ vpbroadcastq %x#$H0,%x#$H0
+
+ vpsllq \$2,$R1,$S1 # S1 = R1*5*4
+ vpsllq \$2,$R2,$S2 # S2 = R2*5*4
+ vpaddq $R1,$S1,$S1
+ vpaddq $R2,$S2,$S2
+ vpsllq \$2,$S1,$S1
+ vpsllq \$2,$S2,$S2
+
+ jmp .Lmul_init_vpmadd52
+ ud2
+
+.align 32
+.Ldone_init_vpmadd52:
+ vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
+ vinserti128 \$1,%x#$R2,$H2,$R2
+ vinserti128 \$1,%x#$R0,$H0,$R0
+
+ vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
+ vpermq \$0b11011000,$R2,$R2
+ vpermq \$0b11011000,$R0,$R0
+
+ vpsllq \$2,$R1,$S1 # S1 = R1*5*4
+ vpaddq $R1,$S1,$S1
+ vpsllq \$2,$S1,$S1
+
+ vmovq 0($ctx),%x#$H0 # load current hash value
+ vmovq 8($ctx),%x#$H1
+ vmovq 16($ctx),%x#$H2
+
+ test \$3,$len # is length 4*n+2?
+ jnz .Ldone_init_vpmadd52_2x
+
+ vmovdqu64 $R0,64($ctx) # save key powers
+ vpbroadcastq %x#$R0,$R0 # broadcast 4th power
+ vmovdqu64 $R1,96($ctx)
+ vpbroadcastq %x#$R1,$R1
+ vmovdqu64 $R2,128($ctx)
+ vpbroadcastq %x#$R2,$R2
+ vmovdqu64 $S1,160($ctx)
+ vpbroadcastq %x#$S1,$S1
+
+ jmp .Lblocks_vpmadd52_4x_key_loaded
+ ud2
+
+.align 32
+.Ldone_init_vpmadd52_2x:
+ vmovdqu64 $R0,64($ctx) # save key powers
+ vpsrldq \$8,$R0,$R0 # 0-1-0-2
+ vmovdqu64 $R1,96($ctx)
+ vpsrldq \$8,$R1,$R1
+ vmovdqu64 $R2,128($ctx)
+ vpsrldq \$8,$R2,$R2
+ vmovdqu64 $S1,160($ctx)
+ vpsrldq \$8,$S1,$S1
+ jmp .Lblocks_vpmadd52_2x_key_loaded
+ ud2
+
+.align 32
+.Lblocks_vpmadd52_2x_do:
+ vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
+ vmovdqu64 160+8($ctx),${S1}{%k1}{z}
+ vmovdqu64 64+8($ctx),${R0}{%k1}{z}
+ vmovdqu64 96+8($ctx),${R1}{%k1}{z}
+
+.Lblocks_vpmadd52_2x_key_loaded:
+ vmovdqu64 16*0($inp),$T2 # load data
+ vpxorq $T3,$T3,$T3
+ lea 16*2($inp),$inp
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+
+ # at this point 64-bit lanes are ordered as x-1-x-0
+
+ vpsrlq \$24,$T3,$T2 # splat the data
+ vporq $PAD,$T2,$T2
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
+
+ jmp .Ltail_vpmadd52_2x
+ ud2
+
+.align 32
+.Loop_vpmadd52_4x:
+ #vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$H1,$H1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$R0,$D2hi
+
+ vmovdqu64 16*0($inp),$T2 # load data
+ vmovdqu64 16*2($inp),$T3
+ lea 16*4($inp),$inp
+ vpmadd52luq $H0,$R0,$D0lo
+ vpmadd52huq $H0,$R0,$D0hi
+ vpmadd52luq $H0,$R1,$D1lo
+ vpmadd52huq $H0,$R1,$D1hi
+ vpmadd52luq $H0,$R2,$D2lo
+ vpmadd52huq $H0,$R2,$D2hi
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+ vpmadd52luq $H1,$S2,$D0lo
+ vpmadd52huq $H1,$S2,$D0hi
+ vpmadd52luq $H1,$R0,$D1lo
+ vpmadd52huq $H1,$R0,$D1hi
+ vpmadd52luq $H1,$R1,$D2lo
+ vpmadd52huq $H1,$R1,$D2hi
+
+ ################################################################
+ # partial reduction (interleaved with data splat)
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpsrlq \$24,$T3,$T2
+ vporq $PAD,$T2,$T2
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+
+ sub \$4,$len # len-=64
+ jnz .Loop_vpmadd52_4x
+
+.Ltail_vpmadd52_4x:
+ vmovdqu64 128($ctx),$R2 # load all key powers
+ vmovdqu64 160($ctx),$S1
+ vmovdqu64 64($ctx),$R0
+ vmovdqu64 96($ctx),$R1
+
+.Ltail_vpmadd52_2x:
+ vpsllq \$2,$R2,$S2 # S2 = R2*5*4
+ vpaddq $R2,$S2,$S2
+ vpsllq \$2,$S2,$S2
+
+ #vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$H1,$H1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$R0,$D2hi
+
+ vpmadd52luq $H0,$R0,$D0lo
+ vpmadd52huq $H0,$R0,$D0hi
+ vpmadd52luq $H0,$R1,$D1lo
+ vpmadd52huq $H0,$R1,$D1hi
+ vpmadd52luq $H0,$R2,$D2lo
+ vpmadd52huq $H0,$R2,$D2hi
+
+ vpmadd52luq $H1,$S2,$D0lo
+ vpmadd52huq $H1,$S2,$D0hi
+ vpmadd52luq $H1,$R0,$D1lo
+ vpmadd52huq $H1,$R0,$D1hi
+ vpmadd52luq $H1,$R1,$D2lo
+ vpmadd52huq $H1,$R1,$D2hi
+
+ ################################################################
+ # horizontal addition
+
+ mov \$1,%eax
+ kmovw %eax,%k1
+ vpsrldq \$8,$D0lo,$T0
+ vpsrldq \$8,$D0hi,$H0
+ vpsrldq \$8,$D1lo,$T1
+ vpsrldq \$8,$D1hi,$H1
+ vpaddq $T0,$D0lo,$D0lo
+ vpaddq $H0,$D0hi,$D0hi
+ vpsrldq \$8,$D2lo,$T2
+ vpsrldq \$8,$D2hi,$H2
+ vpaddq $T1,$D1lo,$D1lo
+ vpaddq $H1,$D1hi,$D1hi
+ vpermq \$0x2,$D0lo,$T0
+ vpermq \$0x2,$D0hi,$H0
+ vpaddq $T2,$D2lo,$D2lo
+ vpaddq $H2,$D2hi,$D2hi
+
+ vpermq \$0x2,$D1lo,$T1
+ vpermq \$0x2,$D1hi,$H1
+ vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
+ vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
+ vpermq \$0x2,$D2lo,$T2
+ vpermq \$0x2,$D2hi,$H2
+ vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
+ vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
+ vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
+ vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
+
+ ################################################################
+ # partial reduction
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+ # at this point $len is
+ # either 4*n+2 or 0...
+ sub \$2,$len # len-=32
+ ja .Lblocks_vpmadd52_4x_do
+
+ vmovq %x#$H0,0($ctx)
+ vmovq %x#$H1,8($ctx)
+ vmovq %x#$H2,16($ctx)
+ vzeroall
+
+.Lno_data_vpmadd52_4x:
+ RET
+.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
+___
+}
+{
+########################################################################
+# As implied by its name, the 8x subroutine processes 8 blocks in parallel...
+# This is an intermediate version, as it's used only in cases when the input
+# length is either 8*n, 8*n+1 or 8*n+2...
+
+my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
+my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
+my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
+my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
+
+$code.=<<___;
+.type poly1305_blocks_vpmadd52_8x,\@function,4
+.align 32
+poly1305_blocks_vpmadd52_8x:
+ shr \$4,$len
+ jz .Lno_data_vpmadd52_8x # too short
+
+ shl \$40,$padbit
+ mov 64($ctx),%r8 # peek on power of the key
+
+ vmovdqa64 .Lx_mask44(%rip),$mask44
+ vmovdqa64 .Lx_mask42(%rip),$mask42
+
+ test %r8,%r8 # is power value impossible?
+ js .Linit_vpmadd52 # if it is, then init R[4]
+
+ vmovq 0($ctx),%x#$H0 # load current hash value
+ vmovq 8($ctx),%x#$H1
+ vmovq 16($ctx),%x#$H2
+
+.Lblocks_vpmadd52_8x:
+ ################################################################
+	# first we calculate more key powers
+
+ vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
+ vmovdqu64 160($ctx),$S1
+ vmovdqu64 64($ctx),$R0
+ vmovdqu64 96($ctx),$R1
+
+ vpsllq \$2,$R2,$S2 # S2 = R2*5*4
+ vpaddq $R2,$S2,$S2
+ vpsllq \$2,$S2,$S2
+
+ vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
+ vpbroadcastq %x#$R0,$RR0
+ vpbroadcastq %x#$R1,$RR1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $RR2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $RR2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $RR2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $RR2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $RR2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $RR2,$R0,$D2hi
+
+ vpmadd52luq $RR0,$R0,$D0lo
+ vpmadd52huq $RR0,$R0,$D0hi
+ vpmadd52luq $RR0,$R1,$D1lo
+ vpmadd52huq $RR0,$R1,$D1hi
+ vpmadd52luq $RR0,$R2,$D2lo
+ vpmadd52huq $RR0,$R2,$D2hi
+
+ vpmadd52luq $RR1,$S2,$D0lo
+ vpmadd52huq $RR1,$S2,$D0hi
+ vpmadd52luq $RR1,$R0,$D1lo
+ vpmadd52huq $RR1,$R0,$D1hi
+ vpmadd52luq $RR1,$R1,$D2lo
+ vpmadd52huq $RR1,$R1,$D2hi
+
+ ################################################################
+ # partial reduction
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$RR0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$RR1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$RR2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $D2hi,$RR0,$RR0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$RR0,$RR0
+
+ vpsrlq \$44,$RR0,$tmp # additional step
+ vpandq $mask44,$RR0,$RR0
+
+ vpaddq $tmp,$RR1,$RR1
+
+ ################################################################
+	# At this point Rx holds the 1324 powers, RRx the 5768 powers, and
+	# the goal is 15263748, which reflects how the data is loaded...
+
+ vpunpcklqdq $R2,$RR2,$T2 # 3748
+ vpunpckhqdq $R2,$RR2,$R2 # 1526
+ vpunpcklqdq $R0,$RR0,$T0
+ vpunpckhqdq $R0,$RR0,$R0
+ vpunpcklqdq $R1,$RR1,$T1
+ vpunpckhqdq $R1,$RR1,$R1
+___
+######## switch to %zmm
+map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
+map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
+map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
+map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
+
+$code.=<<___;
+ vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
+ vshufi64x2 \$0x44,$R0,$T0,$RR0
+ vshufi64x2 \$0x44,$R1,$T1,$RR1
+
+ vmovdqu64 16*0($inp),$T2 # load data
+ vmovdqu64 16*4($inp),$T3
+ lea 16*8($inp),$inp
+
+ vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
+ vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
+ vpaddq $RR2,$SS2,$SS2
+ vpaddq $RR1,$SS1,$SS1
+ vpsllq \$2,$SS2,$SS2
+ vpsllq \$2,$SS1,$SS1
+
+ vpbroadcastq $padbit,$PAD
+ vpbroadcastq %x#$mask44,$mask44
+ vpbroadcastq %x#$mask42,$mask42
+
+ vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
+ vpbroadcastq %x#$SS2,$S2
+ vpbroadcastq %x#$RR0,$R0
+ vpbroadcastq %x#$RR1,$R1
+ vpbroadcastq %x#$RR2,$R2
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+
+ # at this point 64-bit lanes are ordered as 73625140
+
+ vpsrlq \$24,$T3,$T2 # splat the data
+ vporq $PAD,$T2,$T2
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
+
+ sub \$8,$len
+ jz .Ltail_vpmadd52_8x
+ jmp .Loop_vpmadd52_8x
+
+.align 32
+.Loop_vpmadd52_8x:
+ #vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$H1,$H1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$R0,$D2hi
+
+ vmovdqu64 16*0($inp),$T2 # load data
+ vmovdqu64 16*4($inp),$T3
+ lea 16*8($inp),$inp
+ vpmadd52luq $H0,$R0,$D0lo
+ vpmadd52huq $H0,$R0,$D0hi
+ vpmadd52luq $H0,$R1,$D1lo
+ vpmadd52huq $H0,$R1,$D1hi
+ vpmadd52luq $H0,$R2,$D2lo
+ vpmadd52huq $H0,$R2,$D2hi
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+ vpmadd52luq $H1,$S2,$D0lo
+ vpmadd52huq $H1,$S2,$D0hi
+ vpmadd52luq $H1,$R0,$D1lo
+ vpmadd52huq $H1,$R0,$D1hi
+ vpmadd52luq $H1,$R1,$D2lo
+ vpmadd52huq $H1,$R1,$D2hi
+
+ ################################################################
+ # partial reduction (interleaved with data splat)
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpsrlq \$24,$T3,$T2
+ vporq $PAD,$T2,$T2
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+
+ sub \$8,$len # len-=128
+ jnz .Loop_vpmadd52_8x
+
+.Ltail_vpmadd52_8x:
+ #vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$H1,$H1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$SS1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$SS1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$SS2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$SS2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$RR0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$RR0,$D2hi
+
+ vpmadd52luq $H0,$RR0,$D0lo
+ vpmadd52huq $H0,$RR0,$D0hi
+ vpmadd52luq $H0,$RR1,$D1lo
+ vpmadd52huq $H0,$RR1,$D1hi
+ vpmadd52luq $H0,$RR2,$D2lo
+ vpmadd52huq $H0,$RR2,$D2hi
+
+ vpmadd52luq $H1,$SS2,$D0lo
+ vpmadd52huq $H1,$SS2,$D0hi
+ vpmadd52luq $H1,$RR0,$D1lo
+ vpmadd52huq $H1,$RR0,$D1hi
+ vpmadd52luq $H1,$RR1,$D2lo
+ vpmadd52huq $H1,$RR1,$D2hi
+
+ ################################################################
+ # horizontal addition
+
+ mov \$1,%eax
+ kmovw %eax,%k1
+ vpsrldq \$8,$D0lo,$T0
+ vpsrldq \$8,$D0hi,$H0
+ vpsrldq \$8,$D1lo,$T1
+ vpsrldq \$8,$D1hi,$H1
+ vpaddq $T0,$D0lo,$D0lo
+ vpaddq $H0,$D0hi,$D0hi
+ vpsrldq \$8,$D2lo,$T2
+ vpsrldq \$8,$D2hi,$H2
+ vpaddq $T1,$D1lo,$D1lo
+ vpaddq $H1,$D1hi,$D1hi
+ vpermq \$0x2,$D0lo,$T0
+ vpermq \$0x2,$D0hi,$H0
+ vpaddq $T2,$D2lo,$D2lo
+ vpaddq $H2,$D2hi,$D2hi
+
+ vpermq \$0x2,$D1lo,$T1
+ vpermq \$0x2,$D1hi,$H1
+ vpaddq $T0,$D0lo,$D0lo
+ vpaddq $H0,$D0hi,$D0hi
+ vpermq \$0x2,$D2lo,$T2
+ vpermq \$0x2,$D2hi,$H2
+ vpaddq $T1,$D1lo,$D1lo
+ vpaddq $H1,$D1hi,$D1hi
+ vextracti64x4 \$1,$D0lo,%y#$T0
+ vextracti64x4 \$1,$D0hi,%y#$H0
+ vpaddq $T2,$D2lo,$D2lo
+ vpaddq $H2,$D2hi,$D2hi
+
+ vextracti64x4 \$1,$D1lo,%y#$T1
+ vextracti64x4 \$1,$D1hi,%y#$H1
+ vextracti64x4 \$1,$D2lo,%y#$T2
+ vextracti64x4 \$1,$D2hi,%y#$H2
+___
+######## switch back to %ymm
+map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
+map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
+map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
+
+$code.=<<___;
+ vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
+ vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
+ vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
+ vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
+ vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
+ vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
+
+ ################################################################
+ # partial reduction
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+
+ ################################################################
+
+ vmovq %x#$H0,0($ctx)
+ vmovq %x#$H1,8($ctx)
+ vmovq %x#$H2,16($ctx)
+ vzeroall
+
+.Lno_data_vpmadd52_8x:
+ RET
+.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
+___
+}
+$code.=<<___;
+.type poly1305_emit_base2_44,\@function,3
+.align 32
+poly1305_emit_base2_44:
+ mov 0($ctx),%r8 # load hash value
+ mov 8($ctx),%r9
+ mov 16($ctx),%r10
+
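+	# recombine the 44/44/42-bit limbs into a 130-bit value spread over
+	# %r8:%r9:%r10 in base 2^64 before the final reduction below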
+ mov %r9,%rax
+ shr \$20,%r9
+ shl \$44,%rax
+ mov %r10,%rcx
+ shr \$40,%r10
+ shl \$24,%rcx
+
+ add %rax,%r8
+ adc %rcx,%r9
+ adc \$0,%r10
+
+ mov %r8,%rax
+ add \$5,%r8 # compare to modulus
+ mov %r9,%rcx
+ adc \$0,%r9
+ adc \$0,%r10
+ shr \$2,%r10 # did 130-bit value overflow?
+ cmovnz %r8,%rax
+ cmovnz %r9,%rcx
+
+ add 0($nonce),%rax # accumulate nonce
+ adc 8($nonce),%rcx
+ mov %rax,0($mac) # write result
+ mov %rcx,8($mac)
+
+ RET
+.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
+___
+} } }
+}
+
+if (!$kernel)
+{ # chacha20-poly1305 helpers
+my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
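+#
+# xor128_encrypt_n_pad xors the input with the keystream at $otp, writes the
+# result to $out, stores that same result back into the $otp buffer and
+# zero-pads it to a 16-byte boundary; the decrypt variant instead stores the
+# ciphertext (the original input) into $otp.  Either way $otp ends up holding
+# the padded data that the caller presumably feeds to Poly1305, and the
+# advanced $otp pointer is returned in %rax.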
+$code.=<<___;
+.globl xor128_encrypt_n_pad
+.type xor128_encrypt_n_pad,\@abi-omnipotent
+.align 16
+xor128_encrypt_n_pad:
+ sub $otp,$inp
+ sub $otp,$out
+ mov $len,%r10 # put len aside
+ shr \$4,$len # len / 16
+ jz .Ltail_enc
+ nop
+.Loop_enc_xmm:
+ movdqu ($inp,$otp),%xmm0
+ pxor ($otp),%xmm0
+ movdqu %xmm0,($out,$otp)
+ movdqa %xmm0,($otp)
+ lea 16($otp),$otp
+ dec $len
+ jnz .Loop_enc_xmm
+
+ and \$15,%r10 # len % 16
+ jz .Ldone_enc
+
+.Ltail_enc:
+ mov \$16,$len
+ sub %r10,$len
+ xor %eax,%eax
+.Loop_enc_byte:
+ mov ($inp,$otp),%al
+ xor ($otp),%al
+ mov %al,($out,$otp)
+ mov %al,($otp)
+ lea 1($otp),$otp
+ dec %r10
+ jnz .Loop_enc_byte
+
+ xor %eax,%eax
+.Loop_enc_pad:
+ mov %al,($otp)
+ lea 1($otp),$otp
+ dec $len
+ jnz .Loop_enc_pad
+
+.Ldone_enc:
+ mov $otp,%rax
+ RET
+.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
+
+.globl xor128_decrypt_n_pad
+.type xor128_decrypt_n_pad,\@abi-omnipotent
+.align 16
+xor128_decrypt_n_pad:
+ sub $otp,$inp
+ sub $otp,$out
+ mov $len,%r10 # put len aside
+ shr \$4,$len # len / 16
+ jz .Ltail_dec
+ nop
+.Loop_dec_xmm:
+ movdqu ($inp,$otp),%xmm0
+ movdqa ($otp),%xmm1
+ pxor %xmm0,%xmm1
+ movdqu %xmm1,($out,$otp)
+ movdqa %xmm0,($otp)
+ lea 16($otp),$otp
+ dec $len
+ jnz .Loop_dec_xmm
+
+ pxor %xmm1,%xmm1
+ and \$15,%r10 # len % 16
+ jz .Ldone_dec
+
+.Ltail_dec:
+ mov \$16,$len
+ sub %r10,$len
+ xor %eax,%eax
+ xor %r11d,%r11d
+.Loop_dec_byte:
+ mov ($inp,$otp),%r11b
+ mov ($otp),%al
+ xor %r11b,%al
+ mov %al,($out,$otp)
+ mov %r11b,($otp)
+ lea 1($otp),$otp
+ dec %r10
+ jnz .Loop_dec_byte
+
+ xor %eax,%eax
+.Loop_dec_pad:
+ mov %al,($otp)
+ lea 1($otp),$otp
+ dec $len
+ jnz .Loop_dec_pad
+
+.Ldone_dec:
+ mov $otp,%rax
+ RET
+.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
+___
+}
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<.Lprologue
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+ jae .Lcommon_seh_tail
+
+ lea 48(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+ jmp .Lcommon_seh_tail
+.size se_handler,.-se_handler
+
+.type avx_handler,\@abi-omnipotent
+.align 16
+avx_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 208($context),%rax # pull context->R11
+
+ lea 0x50(%rax),%rsi
+ lea 0xf8(%rax),%rax
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ RET
+.size avx_handler,.-avx_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_poly1305_init_x86_64
+ .rva .LSEH_end_poly1305_init_x86_64
+ .rva .LSEH_info_poly1305_init_x86_64
+
+ .rva .LSEH_begin_poly1305_blocks_x86_64
+ .rva .LSEH_end_poly1305_blocks_x86_64
+ .rva .LSEH_info_poly1305_blocks_x86_64
+
+ .rva .LSEH_begin_poly1305_emit_x86_64
+ .rva .LSEH_end_poly1305_emit_x86_64
+ .rva .LSEH_info_poly1305_emit_x86_64
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_poly1305_blocks_avx
+ .rva .Lbase2_64_avx
+ .rva .LSEH_info_poly1305_blocks_avx_1
+
+ .rva .Lbase2_64_avx
+ .rva .Leven_avx
+ .rva .LSEH_info_poly1305_blocks_avx_2
+
+ .rva .Leven_avx
+ .rva .LSEH_end_poly1305_blocks_avx
+ .rva .LSEH_info_poly1305_blocks_avx_3
+
+ .rva .LSEH_begin_poly1305_emit_avx
+ .rva .LSEH_end_poly1305_emit_avx
+ .rva .LSEH_info_poly1305_emit_avx
+___
+$code.=<<___ if ($avx>1);
+ .rva .LSEH_begin_poly1305_blocks_avx2
+ .rva .Lbase2_64_avx2
+ .rva .LSEH_info_poly1305_blocks_avx2_1
+
+ .rva .Lbase2_64_avx2
+ .rva .Leven_avx2
+ .rva .LSEH_info_poly1305_blocks_avx2_2
+
+ .rva .Leven_avx2
+ .rva .LSEH_end_poly1305_blocks_avx2
+ .rva .LSEH_info_poly1305_blocks_avx2_3
+___
+$code.=<<___ if ($avx>2);
+ .rva .LSEH_begin_poly1305_blocks_avx512
+ .rva .LSEH_end_poly1305_blocks_avx512
+ .rva .LSEH_info_poly1305_blocks_avx512
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_poly1305_init_x86_64:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
+
+.LSEH_info_poly1305_blocks_x86_64:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lblocks_body,.Lblocks_epilogue
+
+.LSEH_info_poly1305_emit_x86_64:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
+___
+$code.=<<___ if ($avx);
+.LSEH_info_poly1305_blocks_avx_1:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx_2:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx_3:
+ .byte 9,0,0,0
+ .rva avx_handler
+ .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_emit_avx:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
+___
+$code.=<<___ if ($avx>1);
+.LSEH_info_poly1305_blocks_avx2_1:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx2_2:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx2_3:
+ .byte 9,0,0,0
+ .rva avx_handler
+ .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
+___
+$code.=<<___ if ($avx>2);
+.LSEH_info_poly1305_blocks_avx512:
+ .byte 9,0,0,0
+ .rva avx_handler
+ .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
+___
+}
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/\/\// and !/^$/);
+ print;
+}
+close SELF;
+
+foreach (split('\n',$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+ s/%r([a-z]+)#d/%e$1/g;
+ s/%r([0-9]+)#d/%r$1d/g;
+ s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
+
+ if ($kernel) {
+ s/(^\.type.*),[0-9]+$/\1/;
+ s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
+ next if /^\.cfi.*/;
+ }
+
+ print $_,"\n";
+}
+close STDOUT;
diff --git a/lib/crypto/x86/poly1305.h b/lib/crypto/x86/poly1305.h
new file mode 100644
index 000000000000..ee92e3740a78
--- /dev/null
+++ b/lib/crypto/x86/poly1305.h
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <asm/cpu_device_id.h>
+#include <asm/fpu/api.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+
+struct poly1305_arch_internal {
+ union {
+ struct {
+ u32 h[5];
+ u32 is_base2_26;
+ };
+ u64 hs[3];
+ };
+ u64 r[2];
+ u64 pad;
+ struct { u32 r2, r1, r4, r3; } rn[9];
+};
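+
+/*
+ * The union gives two views of the same accumulator: five 26-bit limbs h[]
+ * plus the is_base2_26 flag (the layout the AVX code works in), or three
+ * 64-bit words hs[] (the layout the scalar code and convert_to_base2_64()
+ * below work in).
+ */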
+
+/*
+ * The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
+ * the unfortunate situation of using AVX and then having to go back to scalar
+ * -- because the user is silly and has called the update function from two
+ * separate contexts -- then we need to convert back to the original base before
+ * proceeding. It is possible to reason that the initial reduction below is
+ * sufficient given the implementation invariants. However, for an avoidance of
+ * doubt and because this is not performance critical, we do the full reduction
+ * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py
+ */
+static void convert_to_base2_64(void *ctx)
+{
+ struct poly1305_arch_internal *state = ctx;
+ u32 cy;
+
+ if (!state->is_base2_26)
+ return;
+
+ cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
+ cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
+ cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
+ cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
+ state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
+ state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
+ state->hs[2] = state->h[4] >> 24;
+ /* Unsigned Less Than: branchlessly produces 1 if a < b, else 0. */
+#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
+ cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
+ state->hs[2] &= 3;
+ state->hs[0] += cy;
+ state->hs[1] += (cy = ULT(state->hs[0], cy));
+ state->hs[2] += ULT(state->hs[1], cy);
+#undef ULT
+ state->is_base2_26 = 0;
+}
+
+asmlinkage void poly1305_init_x86_64(struct poly1305_block_state *state,
+ const u8 raw_key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx,
+ const u8 *inp,
+ const size_t len, const u32 padbit);
+asmlinkage void poly1305_emit_x86_64(const struct poly1305_state *ctx,
+ u8 mac[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
+asmlinkage void poly1305_emit_avx(const struct poly1305_state *ctx,
+ u8 mac[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
+asmlinkage void poly1305_blocks_avx(struct poly1305_arch_internal *ctx,
+ const u8 *inp, const size_t len,
+ const u32 padbit);
+asmlinkage void poly1305_blocks_avx2(struct poly1305_arch_internal *ctx,
+ const u8 *inp, const size_t len,
+ const u32 padbit);
+asmlinkage void poly1305_blocks_avx512(struct poly1305_arch_internal *ctx,
+ const u8 *inp,
+ const size_t len, const u32 padbit);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
+
+static void poly1305_block_init(struct poly1305_block_state *state,
+ const u8 raw_key[POLY1305_BLOCK_SIZE])
+{
+ poly1305_init_x86_64(state, raw_key);
+}
+
+static void poly1305_blocks(struct poly1305_block_state *state, const u8 *inp,
+ unsigned int len, u32 padbit)
+{
+ struct poly1305_arch_internal *ctx =
+ container_of(&state->h.h, struct poly1305_arch_internal, h);
+
+ /* SIMD disables preemption, so relax after processing each page. */
+ BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
+ SZ_4K % POLY1305_BLOCK_SIZE);
+
+ /*
+ * The AVX implementations have significant setup overhead (e.g. key
+ * power computation, kernel FPU enabling) which makes them slower for
+ * short messages. Fall back to the scalar implementation for messages
+ * shorter than 288 bytes, unless the AVX-specific key setup has already
+ * been performed (indicated by ctx->is_base2_26).
+ */
+ if (!static_branch_likely(&poly1305_use_avx) ||
+ (len < POLY1305_BLOCK_SIZE * 18 && !ctx->is_base2_26) ||
+ unlikely(!irq_fpu_usable())) {
+ convert_to_base2_64(ctx);
+ poly1305_blocks_x86_64(ctx, inp, len, padbit);
+ return;
+ }
+
+ do {
+ const unsigned int bytes = min(len, SZ_4K);
+
+ kernel_fpu_begin();
+ if (static_branch_likely(&poly1305_use_avx512))
+ poly1305_blocks_avx512(ctx, inp, bytes, padbit);
+ else if (static_branch_likely(&poly1305_use_avx2))
+ poly1305_blocks_avx2(ctx, inp, bytes, padbit);
+ else
+ poly1305_blocks_avx(ctx, inp, bytes, padbit);
+ kernel_fpu_end();
+
+ len -= bytes;
+ inp += bytes;
+ } while (len);
+}
+
+static void poly1305_emit(const struct poly1305_state *ctx,
+ u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4])
+{
+ if (!static_branch_likely(&poly1305_use_avx))
+ poly1305_emit_x86_64(ctx, mac, nonce);
+ else
+ poly1305_emit_avx(ctx, mac, nonce);
+}
+
+#define poly1305_mod_init_arch poly1305_mod_init_arch
+static void poly1305_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_AVX) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+ static_branch_enable(&poly1305_use_avx);
+ if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+ static_branch_enable(&poly1305_use_avx2);
+ if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX512F) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
+ /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
+ boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X)
+ static_branch_enable(&poly1305_use_avx512);
+}
diff --git a/lib/crypto/x86/polyval-pclmul-avx.S b/lib/crypto/x86/polyval-pclmul-avx.S
new file mode 100644
index 000000000000..7f739465ad35
--- /dev/null
+++ b/lib/crypto/x86/polyval-pclmul-avx.S
@@ -0,0 +1,319 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2021 Google LLC
+ */
+/*
+ * This is an efficient implementation of POLYVAL using Intel PCLMULQDQ-NI
+ * instructions. It works on 8 blocks at a time, by precomputing the first 8
+ * key powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
+ * allows us to split finite field multiplication into two steps.
+ *
+ * In the first step, we consider h^i, m_i as normal polynomials of degree less
+ * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
+ * is simply polynomial multiplication.
+ *
+ * In the second step, we compute the reduction of p(x) modulo the finite field
+ * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
+ * multiplication is finite field multiplication. The advantage is that the
+ * two-step process only requires 1 finite field reduction for every 8
+ * polynomial multiplications. Further parallelism is gained by interleaving the
+ * multiplications and polynomial reductions.
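+ *
+ * Across strides the accumulator is carried Horner-style: for each group of
+ * 8 blocks the update is effectively
+ *   SUM <- (SUM + m_0)*h^8 + m_1*h^7 + ... + m_7*h^1
+ * which is why the first schoolbook iteration of a stride can simply XOR SUM
+ * into m_0 rather than performing a separate multiplication by h^8.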
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define STRIDE_BLOCKS 8
+
+#define GSTAR %xmm7
+#define PL %xmm8
+#define PH %xmm9
+#define TMP_XMM %xmm11
+#define LO %xmm12
+#define HI %xmm13
+#define MI %xmm14
+#define SUM %xmm15
+
+#define ACCUMULATOR %rdi
+#define KEY_POWERS %rsi
+#define MSG %rdx
+#define BLOCKS_LEFT %rcx
+#define TMP %rax
+
+.section .rodata.cst16.gstar, "aM", @progbits, 16
+.align 16
+
+.Lgstar:
+ .quad 0xc200000000000000, 0xc200000000000000
+
+.text
+
+/*
+ * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
+ * count pointed to by MSG and KEY_POWERS.
+ */
+.macro schoolbook1 count
+ .set i, 0
+ .rept (\count)
+ schoolbook1_iteration i 0
+	.set	i, (i + 1)
+ .endr
+.endm
+
+/*
+ * Computes the product of two 128-bit polynomials at the memory locations
+ * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
+ * the 256-bit product into LO, MI, HI.
+ *
+ * Given:
+ * X = [X_1 : X_0]
+ * Y = [Y_1 : Y_0]
+ *
+ * We compute:
+ * LO += X_0 * Y_0
+ * MI += X_0 * Y_1 + X_1 * Y_0
+ * HI += X_1 * Y_1
+ *
+ * Later, the 256-bit result can be extracted as:
+ * [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ * This step is done when computing the polynomial reduction for efficiency
+ * reasons.
+ *
+ * If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an
+ * extra multiplication of SUM and h^8.
+ */
+.macro schoolbook1_iteration i xor_sum
+ movups (16*\i)(MSG), %xmm0
+ .if (\i == 0 && \xor_sum == 1)
+ pxor SUM, %xmm0
+ .endif
+ vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2
+ vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1
+ vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3
+ vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4
+ vpxor %xmm2, MI, MI
+ vpxor %xmm1, LO, LO
+ vpxor %xmm4, HI, HI
+ vpxor %xmm3, MI, MI
+.endm
+
+/*
+ * Performs the same computation as schoolbook1_iteration, except we expect the
+ * arguments to already be loaded into xmm0 and xmm1 and we set the result
+ * registers LO, MI, and HI directly rather than XOR'ing into them.
+ */
+.macro schoolbook1_noload
+ vpclmulqdq $0x01, %xmm0, %xmm1, MI
+ vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
+ vpclmulqdq $0x00, %xmm0, %xmm1, LO
+ vpclmulqdq $0x11, %xmm0, %xmm1, HI
+ vpxor %xmm2, MI, MI
+.endm
+
+/*
+ * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
+ * the result in PL, PH.
+ * [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ */
+.macro schoolbook2
+ vpslldq $8, MI, PL
+ vpsrldq $8, MI, PH
+ pxor LO, PL
+ pxor HI, PH
+.endm
+
+/*
+ * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
+ *
+ * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
+ * x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
+ * product of two 128-bit polynomials in Montgomery form. We need to reduce it
+ * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
+ * of x^128, this product has two extra factors of x^128. To get it back into
+ * Montgomery form, we need to remove one of these factors by dividing by x^128.
+ *
+ * To accomplish both of these goals, we add multiples of g(x) that cancel out
+ * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
+ * bits are zero, the polynomial division by x^128 can be done by right shifting.
+ *
+ * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
+ * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
+ * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
+ * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
+ * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
+ * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
+ *
+ * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
+ * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
+ * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
+ * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
+ * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
+ *
+ * So our final computation is:
+ * T = T_1 : T_0 = g*(x) * P_0
+ * V = V_1 : V_0 = g*(x) * (P_1 + T_0)
+ * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
+ *
+ * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
+ * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
+ * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
+ */
+.macro montgomery_reduction dest
+ vpclmulqdq $0x00, PL, GSTAR, TMP_XMM # TMP_XMM = T_1 : T_0 = P_0 * g*(x)
+ pshufd $0b01001110, TMP_XMM, TMP_XMM # TMP_XMM = T_0 : T_1
+ pxor PL, TMP_XMM # TMP_XMM = P_1 + T_0 : P_0 + T_1
+ pxor TMP_XMM, PH # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
+ pclmulqdq $0x11, GSTAR, TMP_XMM # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
+ vpxor TMP_XMM, PH, \dest
+.endm
+
+/*
+ * Compute schoolbook multiplication for 8 blocks
+ * m_0h^8 + ... + m_7h^1
+ *
+ * If reduce is set, also computes the montgomery reduction of the
+ * previous full_stride call and XORs with the first message block.
+ * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
+ * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
+ */
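+/*
+ * Note that the reduction of the previous stride's PH : PL is deliberately
+ * spread across the schoolbook iterations below (one reduction instruction
+ * per iteration) so that its latency overlaps with the multiplications.
+ */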
+.macro full_stride reduce
+ pxor LO, LO
+ pxor HI, HI
+ pxor MI, MI
+
+ schoolbook1_iteration 7 0
+ .if \reduce
+ vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 6 0
+ .if \reduce
+ pshufd $0b01001110, TMP_XMM, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 5 0
+ .if \reduce
+ pxor PL, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 4 0
+ .if \reduce
+ pxor TMP_XMM, PH
+ .endif
+
+ schoolbook1_iteration 3 0
+ .if \reduce
+ pclmulqdq $0x11, GSTAR, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 2 0
+ .if \reduce
+ vpxor TMP_XMM, PH, SUM
+ .endif
+
+ schoolbook1_iteration 1 0
+
+ schoolbook1_iteration 0 1
+
+ addq $(8*16), MSG
+ schoolbook2
+.endm
+
+/*
+ * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
+ */
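+/*
+ * The first leftover block is multiplied by h^BLOCKS_LEFT (with the running
+ * SUM folded in); the remaining BLOCKS_LEFT - 1 blocks are then handled by
+ * binary decomposition.  E.g. for BLOCKS_LEFT = 6: one block up front, then
+ * the remaining 5 = 4 + 1 blocks via the "test $4" and "test $1" paths.
+ */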
+.macro partial_stride
+ mov BLOCKS_LEFT, TMP
+ shlq $4, TMP
+ addq $(16*STRIDE_BLOCKS), KEY_POWERS
+ subq TMP, KEY_POWERS
+
+ movups (MSG), %xmm0
+ pxor SUM, %xmm0
+ movups (KEY_POWERS), %xmm1
+ schoolbook1_noload
+ dec BLOCKS_LEFT
+ addq $16, MSG
+ addq $16, KEY_POWERS
+
+ test $4, BLOCKS_LEFT
+ jz .Lpartial4BlocksDone
+ schoolbook1 4
+ addq $(4*16), MSG
+ addq $(4*16), KEY_POWERS
+.Lpartial4BlocksDone:
+ test $2, BLOCKS_LEFT
+ jz .Lpartial2BlocksDone
+ schoolbook1 2
+ addq $(2*16), MSG
+ addq $(2*16), KEY_POWERS
+.Lpartial2BlocksDone:
+ test $1, BLOCKS_LEFT
+ jz .LpartialDone
+ schoolbook1 1
+.LpartialDone:
+ schoolbook2
+ montgomery_reduction SUM
+.endm
+
+/*
+ * Computes a = a * b * x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * void polyval_mul_pclmul_avx(struct polyval_elem *a,
+ * const struct polyval_elem *b);
+ */
+SYM_FUNC_START(polyval_mul_pclmul_avx)
+ FRAME_BEGIN
+ vmovdqa .Lgstar(%rip), GSTAR
+ movups (%rdi), %xmm0
+ movups (%rsi), %xmm1
+ schoolbook1_noload
+ schoolbook2
+ montgomery_reduction SUM
+ movups SUM, (%rdi)
+ FRAME_END
+ RET
+SYM_FUNC_END(polyval_mul_pclmul_avx)
+
+/*
+ * Perform polynomial evaluation as specified by POLYVAL. This computes:
+ * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
+ * where n=nblocks, h is the hash key, and m_i are the message blocks.
+ *
+ * rdi - pointer to the accumulator
+ * rsi - pointer to precomputed key powers h^8 ... h^1
+ * rdx - pointer to message blocks
+ * rcx - number of blocks to hash
+ *
+ * void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
+ * const struct polyval_key *key,
+ * const u8 *data, size_t nblocks);
+ */
+SYM_FUNC_START(polyval_blocks_pclmul_avx)
+ FRAME_BEGIN
+ vmovdqa .Lgstar(%rip), GSTAR
+ movups (ACCUMULATOR), SUM
+ subq $STRIDE_BLOCKS, BLOCKS_LEFT
+ js .LstrideLoopExit
+ full_stride 0
+ subq $STRIDE_BLOCKS, BLOCKS_LEFT
+ js .LstrideLoopExitReduce
+.LstrideLoop:
+ full_stride 1
+ subq $STRIDE_BLOCKS, BLOCKS_LEFT
+ jns .LstrideLoop
+.LstrideLoopExitReduce:
+ montgomery_reduction SUM
+.LstrideLoopExit:
+ add $STRIDE_BLOCKS, BLOCKS_LEFT
+ jz .LskipPartial
+ partial_stride
+.LskipPartial:
+ movups SUM, (ACCUMULATOR)
+ FRAME_END
+ RET
+SYM_FUNC_END(polyval_blocks_pclmul_avx)
diff --git a/lib/crypto/x86/polyval.h b/lib/crypto/x86/polyval.h
new file mode 100644
index 000000000000..ef8797521420
--- /dev/null
+++ b/lib/crypto/x86/polyval.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * POLYVAL library functions, x86_64 optimized
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <linux/cpufeature.h>
+
+#define NUM_H_POWERS 8
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmul_avx);
+
+asmlinkage void polyval_mul_pclmul_avx(struct polyval_elem *a,
+ const struct polyval_elem *b);
+asmlinkage void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
+ const struct polyval_key *key,
+ const u8 *data, size_t nblocks);
+
+static void polyval_preparekey_arch(struct polyval_key *key,
+ const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+ static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS);
+ memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE);
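+	/*
+	 * After the loops below, h_powers[i] holds h^(NUM_H_POWERS - i):
+	 * index NUM_H_POWERS - 1 is h^1 (the raw key) and index 0 is h^8,
+	 * the order expected by polyval_blocks_pclmul_avx().
+	 */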
+ if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+ kernel_fpu_begin();
+ for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+ key->h_powers[i] = key->h_powers[i + 1];
+ polyval_mul_pclmul_avx(
+ &key->h_powers[i],
+ &key->h_powers[NUM_H_POWERS - 1]);
+ }
+ kernel_fpu_end();
+ } else {
+ for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+ key->h_powers[i] = key->h_powers[i + 1];
+ polyval_mul_generic(&key->h_powers[i],
+ &key->h_powers[NUM_H_POWERS - 1]);
+ }
+ }
+}
+
+static void polyval_mul_arch(struct polyval_elem *acc,
+ const struct polyval_key *key)
+{
+ if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+ kernel_fpu_begin();
+ polyval_mul_pclmul_avx(acc, &key->h_powers[NUM_H_POWERS - 1]);
+ kernel_fpu_end();
+ } else {
+ polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]);
+ }
+}
+
+static void polyval_blocks_arch(struct polyval_elem *acc,
+ const struct polyval_key *key,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+ do {
+ /* Allow rescheduling every 4 KiB. */
+ size_t n = min_t(size_t, nblocks,
+ 4096 / POLYVAL_BLOCK_SIZE);
+
+ kernel_fpu_begin();
+ polyval_blocks_pclmul_avx(acc, key, data, n);
+ kernel_fpu_end();
+ data += n * POLYVAL_BLOCK_SIZE;
+ nblocks -= n;
+ } while (nblocks);
+ } else {
+ polyval_blocks_generic(acc, &key->h_powers[NUM_H_POWERS - 1],
+ data, nblocks);
+ }
+}
+
+#define polyval_mod_init_arch polyval_mod_init_arch
+static void polyval_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ) &&
+ boot_cpu_has(X86_FEATURE_AVX))
+ static_branch_enable(&have_pclmul_avx);
+}
diff --git a/lib/crypto/x86/sha1-avx2-asm.S b/lib/crypto/x86/sha1-avx2-asm.S
new file mode 100644
index 000000000000..91b2dc6db179
--- /dev/null
+++ b/lib/crypto/x86/sha1-avx2-asm.S
@@ -0,0 +1,697 @@
+/*
+ * Implement fast SHA-1 with AVX2 instructions. (x86_64)
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Ilya Albrekht <ilya.albrekht@intel.com>
+ * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
+ * Ronen Zohar <ronen.zohar@intel.com>
+ * Chandramouli Narayanan <mouli@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
+ *
+ * This implementation is based on the previous SSSE3 release:
+ * Visit http://software.intel.com/en-us/articles/
+ * and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
+ *
+ * void sha1_transform_avx2(struct sha1_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+
+#include <linux/linkage.h>
+
+#define CTX %rdi /* arg1 */
+#define BUF %rsi /* arg2 */
+#define CNT %rdx /* arg3 */
+
+#define REG_A %ecx
+#define REG_B %esi
+#define REG_C %edi
+#define REG_D %eax
+#define REG_E %edx
+#define REG_TB %ebx
+#define REG_TA %r12d
+#define REG_RA %rcx
+#define REG_RB %rsi
+#define REG_RC %rdi
+#define REG_RD %rax
+#define REG_RE %rdx
+#define REG_RTA %r12
+#define REG_RTB %rbx
+#define REG_T1 %r11d
+#define xmm_mov vmovups
+#define avx2_zeroupper vzeroupper
+#define RND_F1 1
+#define RND_F2 2
+#define RND_F3 3
+
+.macro REGALLOC
+ .set A, REG_A
+ .set B, REG_B
+ .set C, REG_C
+ .set D, REG_D
+ .set E, REG_E
+ .set TB, REG_TB
+ .set TA, REG_TA
+
+ .set RA, REG_RA
+ .set RB, REG_RB
+ .set RC, REG_RC
+ .set RD, REG_RD
+ .set RE, REG_RE
+
+ .set RTA, REG_RTA
+ .set RTB, REG_RTB
+
+ .set T1, REG_T1
+.endm
+
+#define HASH_PTR %r9
+#define BLOCKS_CTR %r8
+#define BUFFER_PTR %r10
+#define BUFFER_PTR2 %r13
+
+#define PRECALC_BUF %r14
+#define WK_BUF %r15
+
+#define W_TMP %xmm0
+#define WY_TMP %ymm0
+#define WY_TMP2 %ymm9
+
+# AVX2 variables
+#define WY0 %ymm3
+#define WY4 %ymm5
+#define WY08 %ymm7
+#define WY12 %ymm8
+#define WY16 %ymm12
+#define WY20 %ymm13
+#define WY24 %ymm14
+#define WY28 %ymm15
+
+#define YMM_SHUFB_BSWAP %ymm10
+
+/*
+ * Keep 2 iterations precalculated at a time:
+ * - 80 DWORDs per iteration * 2
+ */
+#define W_SIZE (80*2*2 +16)
+
+#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
+#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF)
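+
+/*
+ * Each 32-byte store made by the precalc macros covers one group of 4
+ * rounds: the low 16 bytes hold W+K for the first pipelined block and the
+ * high 16 bytes hold W+K for the second, which is why WK(t) selects a
+ * 16-byte half with ((t)/80)*16.
+ */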
+
+
+.macro UPDATE_HASH hash, val
+ add \hash, \val
+ mov \val, \hash
+.endm
+
+.macro PRECALC_RESET_WY
+ .set WY_00, WY0
+ .set WY_04, WY4
+ .set WY_08, WY08
+ .set WY_12, WY12
+ .set WY_16, WY16
+ .set WY_20, WY20
+ .set WY_24, WY24
+ .set WY_28, WY28
+ .set WY_32, WY_00
+.endm
+
+.macro PRECALC_ROTATE_WY
+ /* Rotate macros */
+ .set WY_32, WY_28
+ .set WY_28, WY_24
+ .set WY_24, WY_20
+ .set WY_20, WY_16
+ .set WY_16, WY_12
+ .set WY_12, WY_08
+ .set WY_08, WY_04
+ .set WY_04, WY_00
+ .set WY_00, WY_32
+
+ /* Define register aliases */
+ .set WY, WY_00
+ .set WY_minus_04, WY_04
+ .set WY_minus_08, WY_08
+ .set WY_minus_12, WY_12
+ .set WY_minus_16, WY_16
+ .set WY_minus_20, WY_20
+ .set WY_minus_24, WY_24
+ .set WY_minus_28, WY_28
+ .set WY_minus_32, WY
+.endm
+
+.macro PRECALC_00_15
+ .if (i == 0) # Initialize and rotate registers
+ PRECALC_RESET_WY
+ PRECALC_ROTATE_WY
+ .endif
+
+ /* message scheduling pre-compute for rounds 0-15 */
+ .if ((i & 7) == 0)
+ /*
+ * blended AVX2 and ALU instruction scheduling
+ * 1 vector iteration per 8 rounds
+ */
+ vmovdqu (i * 2)(BUFFER_PTR), W_TMP
+ .elseif ((i & 7) == 1)
+ vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
+ WY_TMP, WY_TMP
+ .elseif ((i & 7) == 2)
+ vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
+ .elseif ((i & 7) == 4)
+ vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
+ .elseif ((i & 7) == 7)
+ vmovdqu WY_TMP, PRECALC_WK(i&~7)
+
+ PRECALC_ROTATE_WY
+ .endif
+.endm
+
+.macro PRECALC_16_31
+ /*
+ * message scheduling pre-compute for rounds 16-31
+ * calculating last 32 w[i] values in 8 XMM registers
+ * pre-calculate K+w[i] values and store to mem
+ * for later load by ALU add instruction
+ *
+ * "brute force" vectorization for rounds 16-31 only
+ * due to w[i]->w[i-3] dependency
+ */
+ .if ((i & 7) == 0)
+ /*
+ * blended AVX2 and ALU instruction scheduling
+ * 1 vector iteration per 8 rounds
+ */
+ /* w[i-14] */
+ vpalignr $8, WY_minus_16, WY_minus_12, WY
+ vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */
+ .elseif ((i & 7) == 1)
+ vpxor WY_minus_08, WY, WY
+ vpxor WY_minus_16, WY_TMP, WY_TMP
+ .elseif ((i & 7) == 2)
+ vpxor WY_TMP, WY, WY
+ vpslldq $12, WY, WY_TMP2
+ .elseif ((i & 7) == 3)
+ vpslld $1, WY, WY_TMP
+ vpsrld $31, WY, WY
+ .elseif ((i & 7) == 4)
+ vpor WY, WY_TMP, WY_TMP
+ vpslld $2, WY_TMP2, WY
+ .elseif ((i & 7) == 5)
+ vpsrld $30, WY_TMP2, WY_TMP2
+ vpxor WY, WY_TMP, WY_TMP
+ .elseif ((i & 7) == 7)
+ vpxor WY_TMP2, WY_TMP, WY
+ vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
+ vmovdqu WY_TMP, PRECALC_WK(i&~7)
+
+ PRECALC_ROTATE_WY
+ .endif
+.endm
+
+.macro PRECALC_32_79
+ /*
+ * in SHA-1 specification:
+ * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
+	 * instead we compute the equivalent:
+ * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+ * allows more efficient vectorization
+ * since w[i]=>w[i-3] dependency is broken
+ */
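+	/*
+	 * With this form w[i] depends only on values at least 6 positions
+	 * back, so the four consecutive w[] values in each 128-bit lane can
+	 * be computed with no dependencies among themselves.
+	 */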
+
+ .if ((i & 7) == 0)
+ /*
+ * blended AVX2 and ALU instruction scheduling
+ * 1 vector iteration per 8 rounds
+ */
+ vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
+ .elseif ((i & 7) == 1)
+ /* W is W_minus_32 before xor */
+ vpxor WY_minus_28, WY, WY
+ .elseif ((i & 7) == 2)
+ vpxor WY_minus_16, WY_TMP, WY_TMP
+ .elseif ((i & 7) == 3)
+ vpxor WY_TMP, WY, WY
+ .elseif ((i & 7) == 4)
+ vpslld $2, WY, WY_TMP
+ .elseif ((i & 7) == 5)
+ vpsrld $30, WY, WY
+ vpor WY, WY_TMP, WY
+ .elseif ((i & 7) == 7)
+ vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
+ vmovdqu WY_TMP, PRECALC_WK(i&~7)
+
+ PRECALC_ROTATE_WY
+ .endif
+.endm
+
+.macro PRECALC r, s
+ .set i, \r
+
+ .if (i < 40)
+ .set K_XMM, 32*0
+ .elseif (i < 80)
+ .set K_XMM, 32*1
+ .elseif (i < 120)
+ .set K_XMM, 32*2
+ .else
+ .set K_XMM, 32*3
+ .endif
+
+ .if (i<32)
+ PRECALC_00_15 \s
+ .elseif (i<64)
+ PRECALC_16_31 \s
+ .elseif (i < 160)
+ PRECALC_32_79 \s
+ .endif
+.endm
+
+.macro ROTATE_STATE
+ .set T_REG, E
+ .set E, D
+ .set D, C
+ .set C, B
+ .set B, TB
+ .set TB, A
+ .set A, T_REG
+
+ .set T_REG, RE
+ .set RE, RD
+ .set RD, RC
+ .set RC, RB
+ .set RB, RTB
+ .set RTB, RA
+ .set RA, T_REG
+.endm
+
+/* RND_FUN dispatches on the ROUND_FUNC value saved by the RR macro */
+
+.macro RND_FUN f, r
+ .if (\f == RND_F1)
+ ROUND_F1 \r
+ .elseif (\f == RND_F2)
+ ROUND_F2 \r
+ .elseif (\f == RND_F3)
+ ROUND_F3 \r
+ .endif
+.endm
+
+.macro RR r
+ .set round_id, (\r % 80)
+
+ .if (round_id == 0) /* Precalculate F for first round */
+ .set ROUND_FUNC, RND_F1
+ mov B, TB
+
+ rorx $(32-30), B, B /* b>>>2 */
+ andn D, TB, T1
+ and C, TB
+ xor T1, TB
+ .endif
+
+ RND_FUN ROUND_FUNC, \r
+ ROTATE_STATE
+
+ .if (round_id == 18)
+ .set ROUND_FUNC, RND_F2
+ .elseif (round_id == 38)
+ .set ROUND_FUNC, RND_F3
+ .elseif (round_id == 58)
+ .set ROUND_FUNC, RND_F2
+ .endif
+
+ .set round_id, ( (\r+1) % 80)
+
+ RND_FUN ROUND_FUNC, (\r+1)
+ ROTATE_STATE
+.endm
+
+.macro ROUND_F1 r
+ add WK(\r), E
+
+ andn C, A, T1 /* ~b&d */
+ lea (RE,RTB), E /* Add F from the previous round */
+
+ rorx $(32-5), A, TA /* T2 = A >>> 5 */
+ rorx $(32-30),A, TB /* b>>>2 for next round */
+
+ PRECALC (\r) /* msg scheduling for next 2 blocks */
+
+ /*
+ * Calculate F for the next round
+ * (b & c) ^ andn[b, d]
+ */
+ and B, A /* b&c */
+ xor T1, A /* F1 = (b&c) ^ (~b&d) */
+
+ lea (RE,RTA), E /* E += A >>> 5 */
+.endm
+
+.macro ROUND_F2 r
+ add WK(\r), E
+ lea (RE,RTB), E /* Add F from the previous round */
+
+ /* Calculate F for the next round */
+ rorx $(32-5), A, TA /* T2 = A >>> 5 */
+ .if ((round_id) < 79)
+ rorx $(32-30), A, TB /* b>>>2 for next round */
+ .endif
+ PRECALC (\r) /* msg scheduling for next 2 blocks */
+
+ .if ((round_id) < 79)
+ xor B, A
+ .endif
+
+ add TA, E /* E += A >>> 5 */
+
+ .if ((round_id) < 79)
+ xor C, A
+ .endif
+.endm
+
+.macro ROUND_F3 r
+ add WK(\r), E
+ PRECALC (\r) /* msg scheduling for next 2 blocks */
+
+ lea (RE,RTB), E /* Add F from the previous round */
+
+ mov B, T1
+ or A, T1
+
+ rorx $(32-5), A, TA /* T2 = A >>> 5 */
+ rorx $(32-30), A, TB /* b>>>2 for next round */
+
+ /* Calculate F for the next round
+ * (b and c) or (d and (b or c))
+ */
+ and C, T1
+ and B, A
+ or T1, A
+
+ add TA, E /* E += A >>> 5 */
+
+.endm
+
+/* Add the constant \d to \a only if \b >= \c (uses RTA as a temp):
+ *	\a += (\b >= \c) ? \d : 0
+ */
+.macro ADD_IF_GE a, b, c, d
+ mov \a, RTA
+ add $\d, RTA
+ cmp $\c, \b
+ cmovge RTA, \a
+.endm
+
+/*
+ * This macro implements 80 rounds of SHA-1 for multiple blocks with
+ * software pipelining
+ */
+.macro SHA1_PIPELINED_MAIN_BODY
+
+ REGALLOC
+
+ mov (HASH_PTR), A
+ mov 4(HASH_PTR), B
+ mov 8(HASH_PTR), C
+ mov 12(HASH_PTR), D
+ mov 16(HASH_PTR), E
+
+ mov %rsp, PRECALC_BUF
+ lea (2*4*80+32)(%rsp), WK_BUF
+
+ # Precalc WK for first 2 blocks
+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
+ .set i, 0
+ .rept 160
+ PRECALC i
+ .set i, i + 1
+ .endr
+
+ /* Go to next block if needed */
+ ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
+ xchg WK_BUF, PRECALC_BUF
+
+ .align 32
+.L_loop:
+	/*
+	 * The code loops through more than one block; BLOCKS_CTR reaching
+	 * zero signals that the last block has been processed.
+	 */
+ test BLOCKS_CTR, BLOCKS_CTR
+ jnz .L_begin
+ .align 32
+ jmp .L_end
+ .align 32
+.L_begin:
+
+ /*
+ * Do first block
+ * rounds: 0,2,4,6,8
+ */
+ .set j, 0
+ .rept 5
+ RR j
+ .set j, j+2
+ .endr
+
+ /*
+ * rounds:
+ * 10,12,14,16,18
+ * 20,22,24,26,28
+ * 30,32,34,36,38
+ * 40,42,44,46,48
+ * 50,52,54,56,58
+ */
+ .rept 25
+ RR j
+ .set j, j+2
+ .endr
+
+ /* Update Counter */
+ sub $1, BLOCKS_CTR
+ /* Move to the next block only if needed*/
+ ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
+ /*
+ * rounds
+ * 60,62,64,66,68
+ * 70,72,74,76,78
+ */
+ .rept 10
+ RR j
+ .set j, j+2
+ .endr
+
+ UPDATE_HASH (HASH_PTR), A
+ UPDATE_HASH 4(HASH_PTR), TB
+ UPDATE_HASH 8(HASH_PTR), C
+ UPDATE_HASH 12(HASH_PTR), D
+ UPDATE_HASH 16(HASH_PTR), E
+
+ test BLOCKS_CTR, BLOCKS_CTR
+ jz .L_loop
+
+ mov TB, B
+
+ /* Process second block */
+ /*
+ * rounds
+ * 0+80, 2+80, 4+80, 6+80, 8+80
+ * 10+80,12+80,14+80,16+80,18+80
+ */
+
+ .set j, 0
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ /*
+ * rounds
+ * 20+80,22+80,24+80,26+80,28+80
+ * 30+80,32+80,34+80,36+80,38+80
+ */
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ /*
+ * rounds
+ * 40+80,42+80,44+80,46+80,48+80
+ * 50+80,52+80,54+80,56+80,58+80
+ */
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ /* update counter */
+ sub $1, BLOCKS_CTR
+ /* Move to the next block only if needed*/
+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
+
+ /*
+ * rounds
+ * 60+80,62+80,64+80,66+80,68+80
+ * 70+80,72+80,74+80,76+80,78+80
+ */
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ UPDATE_HASH (HASH_PTR), A
+ UPDATE_HASH 4(HASH_PTR), TB
+ UPDATE_HASH 8(HASH_PTR), C
+ UPDATE_HASH 12(HASH_PTR), D
+ UPDATE_HASH 16(HASH_PTR), E
+
+ /* Reset state for AVX2 reg permutation */
+ mov A, TA
+ mov TB, A
+ mov C, TB
+ mov E, C
+ mov D, B
+ mov TA, D
+
+ REGALLOC
+
+ xchg WK_BUF, PRECALC_BUF
+
+ jmp .L_loop
+
+ .align 32
+.L_end:
+
+.endm
+/*
+ * This macro implements the SHA-1 function body for several 64-byte blocks
+ * param: function name
+ */
+.macro SHA1_VECTOR_ASM name
+ SYM_FUNC_START(\name)
+
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ RESERVE_STACK = (W_SIZE*4 + 8+24)
+
+ /* Align stack */
+ push %rbp
+ mov %rsp, %rbp
+ and $~(0x20-1), %rsp
+ sub $RESERVE_STACK, %rsp
+
+ avx2_zeroupper
+
+ /* Setup initial values */
+ mov CTX, HASH_PTR
+ mov BUF, BUFFER_PTR
+
+ mov BUF, BUFFER_PTR2
+ mov CNT, BLOCKS_CTR
+
+ xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
+
+ SHA1_PIPELINED_MAIN_BODY
+
+ avx2_zeroupper
+
+ mov %rbp, %rsp
+ pop %rbp
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+
+ RET
+
+ SYM_FUNC_END(\name)
+.endm
+
+.section .rodata
+
+#define K1 0x5a827999
+#define K2 0x6ed9eba1
+#define K3 0x8f1bbcdc
+#define K4 0xca62c1d6
+
+.align 128
+K_XMM_AR:
+ .long K1, K1, K1, K1
+ .long K1, K1, K1, K1
+ .long K2, K2, K2, K2
+ .long K2, K2, K2, K2
+ .long K3, K3, K3, K3
+ .long K3, K3, K3, K3
+ .long K4, K4, K4, K4
+ .long K4, K4, K4, K4
+
+BSWAP_SHUFB_CTL:
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x08090a0b
+ .long 0x0c0d0e0f
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x08090a0b
+ .long 0x0c0d0e0f
+.text
+
+SHA1_VECTOR_ASM sha1_transform_avx2
diff --git a/lib/crypto/x86/sha1-ni-asm.S b/lib/crypto/x86/sha1-ni-asm.S
new file mode 100644
index 000000000000..428f9b960594
--- /dev/null
+++ b/lib/crypto/x86/sha1-ni-asm.S
@@ -0,0 +1,152 @@
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 update function
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define STATE_PTR %rdi /* 1st arg */
+#define DATA_PTR %rsi /* 2nd arg */
+#define NUM_BLKS %rdx /* 3rd arg */
+
+#define ABCD %xmm0
+#define E0 %xmm1 /* Need two E's b/c they ping pong */
+#define E1 %xmm2
+#define MSG0 %xmm3
+#define MSG1 %xmm4
+#define MSG2 %xmm5
+#define MSG3 %xmm6
+#define SHUF_MASK %xmm7
+#define ABCD_SAVED %xmm8
+#define E0_SAVED %xmm9
+
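+/*
+ * Do 4 of the 80 SHA-1 rounds: sha1rnds4 selects the round function and
+ * constant from \i / 20, while the sha1msg1/pxor/sha1msg2 steps advance the
+ * message schedule for a group of 4 message words used in later rounds.
+ */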
+.macro do_4rounds i, m0, m1, m2, m3, e0, e1
+.if \i < 16
+ movdqu \i*4(DATA_PTR), \m0
+ pshufb SHUF_MASK, \m0
+.endif
+.if \i == 0
+ paddd \m0, \e0
+.else
+ sha1nexte \m0, \e0
+.endif
+ movdqa ABCD, \e1
+.if \i >= 12 && \i < 76
+ sha1msg2 \m0, \m1
+.endif
+ sha1rnds4 $\i / 20, \e0, ABCD
+.if \i >= 4 && \i < 68
+ sha1msg1 \m0, \m3
+.endif
+.if \i >= 8 && \i < 72
+ pxor \m0, \m2
+.endif
+.endm
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 block function
+ *
+ * This function takes a pointer to the current SHA-1 state, a pointer to the
+ * input data, and the number of 64-byte blocks to process. The number of
+ * blocks to process is assumed to be nonzero. Once all blocks have been
+ * processed, the state is updated with the new state. This function only
+ * processes complete blocks. State initialization, buffering of partial
+ * blocks, and digest finalization are expected to be handled elsewhere.
+ *
+ * void sha1_ni_transform(struct sha1_block_state *state,
+ * const u8 *data, size_t nblocks)
+ */
+.text
+SYM_FUNC_START(sha1_ni_transform)
+
+ /* Load the initial state from STATE_PTR. */
+ pxor E0, E0
+ pinsrd $3, 16(STATE_PTR), E0
+ movdqu (STATE_PTR), ABCD
+ pshufd $0x1B, ABCD, ABCD
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+
+.Lnext_block:
+ /* Save the state for addition after the rounds. */
+ movdqa E0, E0_SAVED
+ movdqa ABCD, ABCD_SAVED
+
+.irp i, 0, 16, 32, 48, 64
+ do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3, E0, E1
+ do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0, E1, E0
+ do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1, E0, E1
+ do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2, E1, E0
+.endr
+
+ /* Add the previous state (before the rounds) to the current state. */
+ sha1nexte E0_SAVED, E0
+ paddd ABCD_SAVED, ABCD
+
+ /* Advance to the next block, or break if there are no more blocks. */
+ add $64, DATA_PTR
+ dec NUM_BLKS
+ jnz .Lnext_block
+
+ /* Store the new state to STATE_PTR. */
+ pextrd $3, E0, 16(STATE_PTR)
+ pshufd $0x1B, ABCD, ABCD
+ movdqu ABCD, (STATE_PTR)
+
+ RET
+SYM_FUNC_END(sha1_ni_transform)
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x000102030405060708090a0b0c0d0e0f
diff --git a/lib/crypto/x86/sha1-ssse3-and-avx.S b/lib/crypto/x86/sha1-ssse3-and-avx.S
new file mode 100644
index 000000000000..78b48cb09c5b
--- /dev/null
+++ b/lib/crypto/x86/sha1-ssse3-and-avx.S
@@ -0,0 +1,551 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
+ * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
+ * processors. CPUs supporting Intel(R) AVX extensions will get an additional
+ * boost.
+ *
+ * This work was inspired by the vectorized implementation of Dean Gaudet.
+ * Additional information on it can be found at:
+ * http://www.arctic.org/~dean/crypto/sha1.html
+ *
+ * It was improved upon with more efficient vectorization of the message
+ * scheduling. This implementation has also been optimized for all current and
+ * several future generations of Intel CPUs.
+ *
+ * See this article for more information about the implementation details:
+ * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
+ *
+ * Copyright (C) 2010, Intel Corp.
+ * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
+ * Ronen Zohar <ronen.zohar@intel.com>
+ *
+ * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
+ * Author: Mathias Krause <minipli@googlemail.com>
+ */
+
+#include <linux/linkage.h>
+
+#define CTX %rdi // arg1
+#define BUF %rsi // arg2
+#define CNT %rdx // arg3
+
+#define REG_A %ecx
+#define REG_B %esi
+#define REG_C %edi
+#define REG_D %r12d
+#define REG_E %edx
+
+#define REG_T1 %eax
+#define REG_T2 %ebx
+
+#define K_BASE %r8
+#define HASH_PTR %r9
+#define BUFFER_PTR %r10
+#define BUFFER_END %r11
+
+#define W_TMP1 %xmm0
+#define W_TMP2 %xmm9
+
+#define W0 %xmm1
+#define W4 %xmm2
+#define W8 %xmm3
+#define W12 %xmm4
+#define W16 %xmm5
+#define W20 %xmm6
+#define W24 %xmm7
+#define W28 %xmm8
+
+#define XMM_SHUFB_BSWAP %xmm10
+
+/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */
+#define WK(t) (((t) & 15) * 4)(%rsp)
+#define W_PRECALC_AHEAD 16
+
+/*
+ * This macro implements the SHA-1 function body, processing the input one
+ * 64-byte block at a time
+ * param: function name
+ */
+.macro SHA1_VECTOR_ASM name
+ SYM_FUNC_START(\name)
+
+ push %rbx
+ push %r12
+ push %rbp
+ mov %rsp, %rbp
+
+ sub $64, %rsp # allocate workspace
+ and $~15, %rsp # align stack
+
+ mov CTX, HASH_PTR
+ mov BUF, BUFFER_PTR
+
+ shl $6, CNT # multiply by 64
+ add BUF, CNT
+ mov CNT, BUFFER_END
+
+ lea K_XMM_AR(%rip), K_BASE
+ xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP
+
+ SHA1_PIPELINED_MAIN_BODY
+
+ # cleanup workspace
+ mov $8, %ecx
+ mov %rsp, %rdi
+ xor %eax, %eax
+ rep stosq
+
+ mov %rbp, %rsp # deallocate workspace
+ pop %rbp
+ pop %r12
+ pop %rbx
+ RET
+
+ SYM_FUNC_END(\name)
+.endm
+
+/*
+ * This macro implements 80 rounds of SHA-1 per 64-byte block, for all blocks
+ */
+.macro SHA1_PIPELINED_MAIN_BODY
+ INIT_REGALLOC
+
+ mov (HASH_PTR), A
+ mov 4(HASH_PTR), B
+ mov 8(HASH_PTR), C
+ mov 12(HASH_PTR), D
+ mov 16(HASH_PTR), E
+
+ .set i, 0
+ .rept W_PRECALC_AHEAD
+ W_PRECALC i
+ .set i, (i+1)
+ .endr
+
+.align 4
+1:
+ RR F1,A,B,C,D,E,0
+ RR F1,D,E,A,B,C,2
+ RR F1,B,C,D,E,A,4
+ RR F1,E,A,B,C,D,6
+ RR F1,C,D,E,A,B,8
+
+ RR F1,A,B,C,D,E,10
+ RR F1,D,E,A,B,C,12
+ RR F1,B,C,D,E,A,14
+ RR F1,E,A,B,C,D,16
+ RR F1,C,D,E,A,B,18
+
+ RR F2,A,B,C,D,E,20
+ RR F2,D,E,A,B,C,22
+ RR F2,B,C,D,E,A,24
+ RR F2,E,A,B,C,D,26
+ RR F2,C,D,E,A,B,28
+
+ RR F2,A,B,C,D,E,30
+ RR F2,D,E,A,B,C,32
+ RR F2,B,C,D,E,A,34
+ RR F2,E,A,B,C,D,36
+ RR F2,C,D,E,A,B,38
+
+ RR F3,A,B,C,D,E,40
+ RR F3,D,E,A,B,C,42
+ RR F3,B,C,D,E,A,44
+ RR F3,E,A,B,C,D,46
+ RR F3,C,D,E,A,B,48
+
+ RR F3,A,B,C,D,E,50
+ RR F3,D,E,A,B,C,52
+ RR F3,B,C,D,E,A,54
+ RR F3,E,A,B,C,D,56
+ RR F3,C,D,E,A,B,58
+
+ add $64, BUFFER_PTR # move to the next 64-byte block
+ cmp BUFFER_END, BUFFER_PTR # if the current is the last one use
+ cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun
+
+ RR F4,A,B,C,D,E,60
+ RR F4,D,E,A,B,C,62
+ RR F4,B,C,D,E,A,64
+ RR F4,E,A,B,C,D,66
+ RR F4,C,D,E,A,B,68
+
+ RR F4,A,B,C,D,E,70
+ RR F4,D,E,A,B,C,72
+ RR F4,B,C,D,E,A,74
+ RR F4,E,A,B,C,D,76
+ RR F4,C,D,E,A,B,78
+
+ UPDATE_HASH (HASH_PTR), A
+ UPDATE_HASH 4(HASH_PTR), B
+ UPDATE_HASH 8(HASH_PTR), C
+ UPDATE_HASH 12(HASH_PTR), D
+ UPDATE_HASH 16(HASH_PTR), E
+
+ RESTORE_RENAMED_REGS
+ cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end
+ jne 1b
+.endm
+
+.macro INIT_REGALLOC
+ .set A, REG_A
+ .set B, REG_B
+ .set C, REG_C
+ .set D, REG_D
+ .set E, REG_E
+ .set T1, REG_T1
+ .set T2, REG_T2
+.endm
+
+.macro RESTORE_RENAMED_REGS
+ # order is important (REG_C is where it should be)
+ mov B, REG_B
+ mov D, REG_D
+ mov A, REG_A
+ mov E, REG_E
+.endm
+
+.macro SWAP_REG_NAMES a, b
+ .set _T, \a
+ .set \a, \b
+ .set \b, _T
+.endm
+
+.macro F1 b, c, d
+ mov \c, T1
+ SWAP_REG_NAMES \c, T1
+ xor \d, T1
+ and \b, T1
+ xor \d, T1
+.endm
+
+.macro F2 b, c, d
+ mov \d, T1
+ SWAP_REG_NAMES \d, T1
+ xor \c, T1
+ xor \b, T1
+.endm
+
+.macro F3 b, c, d
+ mov \c, T1
+ SWAP_REG_NAMES \c, T1
+ mov \b, T2
+ or \b, T1
+ and \c, T2
+ and \d, T1
+ or T2, T1
+.endm
+
+.macro F4 b, c, d
+ F2 \b, \c, \d
+.endm
+
+.macro UPDATE_HASH hash, val
+ add \hash, \val
+ mov \val, \hash
+.endm
+
+/*
+ * RR does two rounds of SHA-1 back to back with W[] pre-calc
+ * t1 = F(b, c, d); e += w(i)
+ * e += t1; b <<= 30; d += w(i+1);
+ * t1 = F(a, b, c);
+ * d += t1; a <<= 5;
+ * e += a;
+ * t1 = e; a >>= 7;
+ * t1 <<= 5;
+ * d += t1;
+ */
+.macro RR F, a, b, c, d, e, round
+ add WK(\round), \e
+ \F \b, \c, \d # t1 = F(b, c, d);
+ W_PRECALC (\round + W_PRECALC_AHEAD)
+ rol $30, \b
+ add T1, \e
+ add WK(\round + 1), \d
+
+ \F \a, \b, \c
+ W_PRECALC (\round + W_PRECALC_AHEAD + 1)
+ rol $5, \a
+ add \a, \e
+ add T1, \d
+	ror	$7, \a			# ((a <<r 5) >>r 7) => a <<r 30
+
+ mov \e, T1
+ SWAP_REG_NAMES \e, T1
+
+ rol $5, T1
+ add T1, \d
+
+ # write: \a, \b
+ # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
+.endm
+
+.macro W_PRECALC r
+ .set i, \r
+
+ .if (i < 20)
+ .set K_XMM, 0
+ .elseif (i < 40)
+ .set K_XMM, 16
+ .elseif (i < 60)
+ .set K_XMM, 32
+ .elseif (i < 80)
+ .set K_XMM, 48
+ .endif
+
+ .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
+ .set i, ((\r) % 80) # pre-compute for the next iteration
+ .if (i == 0)
+ W_PRECALC_RESET
+ .endif
+ W_PRECALC_00_15
+ .elseif (i<32)
+ W_PRECALC_16_31
+ .elseif (i < 80) // rounds 32-79
+ W_PRECALC_32_79
+ .endif
+.endm
+
+.macro W_PRECALC_RESET
+ .set W, W0
+ .set W_minus_04, W4
+ .set W_minus_08, W8
+ .set W_minus_12, W12
+ .set W_minus_16, W16
+ .set W_minus_20, W20
+ .set W_minus_24, W24
+ .set W_minus_28, W28
+ .set W_minus_32, W
+.endm
+
+.macro W_PRECALC_ROTATE
+ .set W_minus_32, W_minus_28
+ .set W_minus_28, W_minus_24
+ .set W_minus_24, W_minus_20
+ .set W_minus_20, W_minus_16
+ .set W_minus_16, W_minus_12
+ .set W_minus_12, W_minus_08
+ .set W_minus_08, W_minus_04
+ .set W_minus_04, W
+ .set W, W_minus_32
+.endm
+
+.macro W_PRECALC_SSSE3
+
+.macro W_PRECALC_00_15
+ W_PRECALC_00_15_SSSE3
+.endm
+.macro W_PRECALC_16_31
+ W_PRECALC_16_31_SSSE3
+.endm
+.macro W_PRECALC_32_79
+ W_PRECALC_32_79_SSSE3
+.endm
+
+/* message scheduling pre-compute for rounds 0-15 */
+.macro W_PRECALC_00_15_SSSE3
+ .if ((i & 3) == 0)
+ movdqu (i*4)(BUFFER_PTR), W_TMP1
+ .elseif ((i & 3) == 1)
+ pshufb XMM_SHUFB_BSWAP, W_TMP1
+ movdqa W_TMP1, W
+ .elseif ((i & 3) == 2)
+ paddd (K_BASE), W_TMP1
+ .elseif ((i & 3) == 3)
+ movdqa W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+/* message scheduling pre-compute for rounds 16-31
+ *
+ * - calculating last 32 w[i] values in 8 XMM registers
+ * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
+ * instruction
+ *
+ * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
+ * dependency, but improves for 32-79
+ */
+.macro W_PRECALC_16_31_SSSE3
+ # blended scheduling of vector and scalar instruction streams, one 4-wide
+ # vector iteration / 4 scalar rounds
+ .if ((i & 3) == 0)
+ movdqa W_minus_12, W
+ palignr $8, W_minus_16, W # w[i-14]
+ movdqa W_minus_04, W_TMP1
+ psrldq $4, W_TMP1 # w[i-3]
+ pxor W_minus_08, W
+ .elseif ((i & 3) == 1)
+ pxor W_minus_16, W_TMP1
+ pxor W_TMP1, W
+ movdqa W, W_TMP2
+ movdqa W, W_TMP1
+ pslldq $12, W_TMP2
+ .elseif ((i & 3) == 2)
+ psrld $31, W
+ pslld $1, W_TMP1
+ por W, W_TMP1
+ movdqa W_TMP2, W
+ psrld $30, W_TMP2
+ pslld $2, W
+ .elseif ((i & 3) == 3)
+ pxor W, W_TMP1
+ pxor W_TMP2, W_TMP1
+ movdqa W_TMP1, W
+ paddd K_XMM(K_BASE), W_TMP1
+ movdqa W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+/* message scheduling pre-compute for rounds 32-79
+ *
+ * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
+ * instead we compute the equivalent: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+ * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
+ */
+.macro W_PRECALC_32_79_SSSE3
+ .if ((i & 3) == 0)
+ movdqa W_minus_04, W_TMP1
+ pxor W_minus_28, W # W is W_minus_32 before xor
+ palignr $8, W_minus_08, W_TMP1
+ .elseif ((i & 3) == 1)
+ pxor W_minus_16, W
+ pxor W_TMP1, W
+ movdqa W, W_TMP1
+ .elseif ((i & 3) == 2)
+ psrld $30, W
+ pslld $2, W_TMP1
+ por W, W_TMP1
+ .elseif ((i & 3) == 3)
+ movdqa W_TMP1, W
+ paddd K_XMM(K_BASE), W_TMP1
+ movdqa W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+.endm // W_PRECALC_SSSE3
+
+
+#define K1 0x5a827999
+#define K2 0x6ed9eba1
+#define K3 0x8f1bbcdc
+#define K4 0xca62c1d6
+
+.section .rodata
+.align 16
+
+K_XMM_AR:
+ .long K1, K1, K1, K1
+ .long K2, K2, K2, K2
+ .long K3, K3, K3, K3
+ .long K4, K4, K4, K4
+
+BSWAP_SHUFB_CTL:
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x08090a0b
+ .long 0x0c0d0e0f
+
+
+.section .text
+
+W_PRECALC_SSSE3
+.macro xmm_mov a, b
+ movdqu \a,\b
+.endm
+
+/*
+ * SSSE3 optimized implementation:
+ *
+ * void sha1_transform_ssse3(struct sha1_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+SHA1_VECTOR_ASM sha1_transform_ssse3
+
+.macro W_PRECALC_AVX
+
+.purgem W_PRECALC_00_15
+.macro W_PRECALC_00_15
+ W_PRECALC_00_15_AVX
+.endm
+.purgem W_PRECALC_16_31
+.macro W_PRECALC_16_31
+ W_PRECALC_16_31_AVX
+.endm
+.purgem W_PRECALC_32_79
+.macro W_PRECALC_32_79
+ W_PRECALC_32_79_AVX
+.endm
+
+.macro W_PRECALC_00_15_AVX
+ .if ((i & 3) == 0)
+ vmovdqu (i*4)(BUFFER_PTR), W_TMP1
+ .elseif ((i & 3) == 1)
+ vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
+ .elseif ((i & 3) == 2)
+ vpaddd (K_BASE), W, W_TMP1
+ .elseif ((i & 3) == 3)
+ vmovdqa W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+.macro W_PRECALC_16_31_AVX
+ .if ((i & 3) == 0)
+ vpalignr $8, W_minus_16, W_minus_12, W # w[i-14]
+ vpsrldq $4, W_minus_04, W_TMP1 # w[i-3]
+ vpxor W_minus_08, W, W
+ vpxor W_minus_16, W_TMP1, W_TMP1
+ .elseif ((i & 3) == 1)
+ vpxor W_TMP1, W, W
+ vpslldq $12, W, W_TMP2
+ vpslld $1, W, W_TMP1
+ .elseif ((i & 3) == 2)
+ vpsrld $31, W, W
+ vpor W, W_TMP1, W_TMP1
+ vpslld $2, W_TMP2, W
+ vpsrld $30, W_TMP2, W_TMP2
+ .elseif ((i & 3) == 3)
+ vpxor W, W_TMP1, W_TMP1
+ vpxor W_TMP2, W_TMP1, W
+ vpaddd K_XMM(K_BASE), W, W_TMP1
+ vmovdqu W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+.macro W_PRECALC_32_79_AVX
+ .if ((i & 3) == 0)
+ vpalignr $8, W_minus_08, W_minus_04, W_TMP1
+ vpxor W_minus_28, W, W # W is W_minus_32 before xor
+ .elseif ((i & 3) == 1)
+ vpxor W_minus_16, W_TMP1, W_TMP1
+ vpxor W_TMP1, W, W
+ .elseif ((i & 3) == 2)
+ vpslld $2, W, W_TMP1
+ vpsrld $30, W, W
+ vpor W, W_TMP1, W
+ .elseif ((i & 3) == 3)
+ vpaddd K_XMM(K_BASE), W, W_TMP1
+ vmovdqu W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+.endm // W_PRECALC_AVX
+
+W_PRECALC_AVX
+.purgem xmm_mov
+.macro xmm_mov a, b
+ vmovdqu \a,\b
+.endm
+
+
+/* AVX optimized implementation:
+ * void sha1_transform_avx(struct sha1_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+SHA1_VECTOR_ASM sha1_transform_avx
diff --git a/lib/crypto/x86/sha1.h b/lib/crypto/x86/sha1.h
new file mode 100644
index 000000000000..c48a0131fd12
--- /dev/null
+++ b/lib/crypto/x86/sha1.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-1 optimized for x86_64
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <linux/static_call.h>
+
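+/*
+ * This static call is updated to the best SHA-1 implementation for the CPU
+ * by sha1_mod_init_arch() below; it defaults to the generic implementation.
+ */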
+DEFINE_STATIC_CALL(sha1_blocks_x86, sha1_blocks_generic);
+
+#define DEFINE_X86_SHA1_FN(c_fn, asm_fn) \
+ asmlinkage void asm_fn(struct sha1_block_state *state, \
+ const u8 *data, size_t nblocks); \
+ static void c_fn(struct sha1_block_state *state, \
+ const u8 *data, size_t nblocks) \
+ { \
+ if (likely(irq_fpu_usable())) { \
+ kernel_fpu_begin(); \
+ asm_fn(state, data, nblocks); \
+ kernel_fpu_end(); \
+ } else { \
+ sha1_blocks_generic(state, data, nblocks); \
+ } \
+ }
+
+DEFINE_X86_SHA1_FN(sha1_blocks_ssse3, sha1_transform_ssse3);
+DEFINE_X86_SHA1_FN(sha1_blocks_avx, sha1_transform_avx);
+DEFINE_X86_SHA1_FN(sha1_blocks_ni, sha1_ni_transform);
+
+#define SHA1_AVX2_BLOCK_OPTSIZE	4	/* use AVX2 only for at least 4 64-byte blocks */
+
+asmlinkage void sha1_transform_avx2(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks);
+static void sha1_blocks_avx2(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ if (likely(irq_fpu_usable())) {
+ kernel_fpu_begin();
+ /* Select the optimal transform based on the number of blocks */
+ if (nblocks >= SHA1_AVX2_BLOCK_OPTSIZE)
+ sha1_transform_avx2(state, data, nblocks);
+ else
+ sha1_transform_avx(state, data, nblocks);
+ kernel_fpu_end();
+ } else {
+ sha1_blocks_generic(state, data, nblocks);
+ }
+}
+
+static void sha1_blocks(struct sha1_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ static_call(sha1_blocks_x86)(state, data, nblocks);
+}
+
+#define sha1_mod_init_arch sha1_mod_init_arch
+static void sha1_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
+ static_call_update(sha1_blocks_x86, sha1_blocks_ni);
+ } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ NULL) &&
+ boot_cpu_has(X86_FEATURE_AVX)) {
+ if (boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_BMI1) &&
+ boot_cpu_has(X86_FEATURE_BMI2))
+ static_call_update(sha1_blocks_x86, sha1_blocks_avx2);
+ else
+ static_call_update(sha1_blocks_x86, sha1_blocks_avx);
+ } else if (boot_cpu_has(X86_FEATURE_SSSE3)) {
+ static_call_update(sha1_blocks_x86, sha1_blocks_ssse3);
+ }
+}
diff --git a/lib/crypto/x86/sha256-avx-asm.S b/lib/crypto/x86/sha256-avx-asm.S
new file mode 100644
index 000000000000..c1aceb3ba3a3
--- /dev/null
+++ b/lib/crypto/x86/sha256-avx-asm.S
@@ -0,0 +1,493 @@
+########################################################################
+# Implement fast SHA-256 with AVX1 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 1 block at a time, with 4 lanes per block
+########################################################################
+
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define VMOVDQ vmovdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
+
+
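+# MY_ROR amount, reg
+# Rotate reg right by amount: shld of a register with itself is a left
+# rotate, and a left rotate by (32 - amount) equals a right rotate by amount.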
+.macro MY_ROR p1 p2
+ shld $(32-(\p1)), \p2, \p2
+.endm
+
+################################
+
+# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+# Load xmm with mem and byte swap each dword
+.macro COPY_XMM_AND_BSWAP p1 p2 p3
+ VMOVDQ \p2, \p1
+ vpshufb \p3, \p1, \p1
+.endm
+
+################################
+
+X0 = %xmm4
+X1 = %xmm5
+X2 = %xmm6
+X3 = %xmm7
+
+XTMP0 = %xmm0
+XTMP1 = %xmm1
+XTMP2 = %xmm2
+XTMP3 = %xmm3
+XTMP4 = %xmm8
+XFER = %xmm9
+XTMP5 = %xmm11
+
+SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
+SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %xmm13
+
+NUM_BLKS = %rdx # 3rd arg
+INP = %rsi # 2nd arg
+CTX = %rdi # 1st arg
+
+SRND = %rsi # clobbers INP
+c = %ecx
+d = %r8d
+e = %edx
+TBL = %r12
+a = %eax
+b = %ebx
+
+f = %r9d
+g = %r10d
+h = %r11d
+
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_XFER_SIZE = 16
+_XMM_SAVE_SIZE = 0
+
+_INP_END = 0
+_INP = _INP_END + _INP_END_SIZE
+_XFER = _INP + _INP_SIZE
+_XMM_SAVE = _XFER + _XFER_SIZE
+STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+X_ = X0
+X0 = X1
+X1 = X2
+X2 = X3
+X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+ ## compute s0 four at a time and s1 two at a time
+ ## compute W[-16] + W[-7] 4 at a time
+
+ mov e, y0 # y0 = e
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ xor g, y2 # y2 = f^g
+ vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ ## compute s0
+ vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR	6, y0		# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y0, y2 # y2 = S1 + CH
+ add _XFER(%rsp), y2 # y2 = k + w + S1 + CH
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpsrld $7, XTMP1, XTMP2
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ vpslld $(32-7), XTMP1, XTMP3
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] MY_ROR 7
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ vpsrld $18, XTMP1, XTMP2 #
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+	MY_ROR	6, y0		# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ vpslld $(32-18), XTMP1, XTMP1
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ vpxor XTMP1, XTMP3, XTMP3 #
+ add y0, y2 # y2 = S1 + CH
+ add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ ## compute low s1
+ vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
+ xor g, y2 # y2 = f^g
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA}
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+	MY_ROR	6, y0		# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ vpxor XTMP3, XTMP2, XTMP2 #
+ add y0, y2 # y2 = S1 + CH
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ ## compute high s1
+ vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ mov e, y0 # y0 = e
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ xor g, y2 # y2 = f^g
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC}
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR	6, y0		# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ vpxor XTMP3, XTMP2, XTMP2
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y0, y2 # y2 = S1 + CH
+ add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ rotate_Xs
+.endm
+
+## input is [rsp + _XFER + \round * 4]
+.macro DO_ROUND round
+ mov e, y0 # y0 = e
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ and e, y2 # y2 = (f^g)&e
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR	6, y0		# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ add y0, y2 # y2 = S1 + CH
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ offset = \round * 4 + _XFER #
+ add offset(%rsp), y2 # y2 = k + w + S1 + CH
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+.endm
+
+########################################################################
+## void sha256_transform_avx(struct sha256_block_state *state,
+## const u8 *data, size_t nblocks);
+########################################################################
+.text
+SYM_FUNC_START(sha256_transform_avx)
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbp
+ movq %rsp, %rbp
+
+ subq $STACK_SIZE, %rsp # allocate stack space
+ and $~15, %rsp # align stack pointer
+
+ shl $6, NUM_BLKS # convert to bytes
+ add INP, NUM_BLKS # pointer to end of data
+ mov NUM_BLKS, _INP_END(%rsp)
+
+ ## load initial digest
+ mov 4*0(CTX), a
+ mov 4*1(CTX), b
+ mov 4*2(CTX), c
+ mov 4*3(CTX), d
+ mov 4*4(CTX), e
+ mov 4*5(CTX), f
+ mov 4*6(CTX), g
+ mov 4*7(CTX), h
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ vmovdqa _SHUF_00BA(%rip), SHUF_00BA
+ vmovdqa _SHUF_DC00(%rip), SHUF_DC00
+.Lloop0:
+ lea K256(%rip), TBL
+
+ ## byte swap first 16 dwords
+ COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
+
+ mov INP, _INP(%rsp)
+
+ ## schedule 48 input dwords, by doing 3 rounds of 16 each
+ mov $3, SRND
+.align 16
+.Lloop1:
+ vpaddd (TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd 1*16(TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd 2*16(TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd 3*16(TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ add $4*16, TBL
+ FOUR_ROUNDS_AND_SCHED
+
+ sub $1, SRND
+ jne .Lloop1
+
+ mov $2, SRND
+.Lloop2:
+ vpaddd (TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ vpaddd 1*16(TBL), X1, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ add $2*16, TBL
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ vmovdqa X2, X0
+ vmovdqa X3, X1
+
+ sub $1, SRND
+ jne .Lloop2
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ mov _INP(%rsp), INP
+ add $64, INP
+ cmp _INP_END(%rsp), INP
+ jne .Lloop0
+
+ mov %rbp, %rsp
+ popq %rbp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ RET
+SYM_FUNC_END(sha256_transform_avx)
+
+.section .rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203
+
+.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
+.align 16
+# shuffle xBxA -> 00BA
+_SHUF_00BA:
+ .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
+.align 16
+# shuffle xDxC -> DC00
+_SHUF_DC00:
+ .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/lib/crypto/x86/sha256-avx2-asm.S b/lib/crypto/x86/sha256-avx2-asm.S
new file mode 100644
index 000000000000..eb8836fb9695
--- /dev/null
+++ b/lib/crypto/x86/sha256-avx2-asm.S
@@ -0,0 +1,770 @@
+########################################################################
+# Implement fast SHA-256 with AVX2 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 2 blocks at a time, with 4 lanes per block
+########################################################################
+
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define VMOVDQ vmovdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
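+# For example, "addm (4*0)(CTX), a" becomes "add (4*0)(CTX), a" followed by
+# "mov a, (4*0)(CTX)": the digest word in memory is updated in place and the
+# register keeps the new value.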
+
+################################
+
+X0 = %ymm4
+X1 = %ymm5
+X2 = %ymm6
+X3 = %ymm7
+
+# XMM versions of above
+XWORD0 = %xmm4
+XWORD1 = %xmm5
+XWORD2 = %xmm6
+XWORD3 = %xmm7
+
+XTMP0 = %ymm0
+XTMP1 = %ymm1
+XTMP2 = %ymm2
+XTMP3 = %ymm3
+XTMP4 = %ymm8
+XFER = %ymm9
+XTMP5 = %ymm11
+
+SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
+SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %ymm13
+
+X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
+
+NUM_BLKS = %rdx # 3rd arg
+INP = %rsi # 2nd arg
+CTX = %rdi # 1st arg
+c = %ecx
+d = %r8d
+e = %edx # clobbers NUM_BLKS
+y3 = %esi # clobbers INP
+
+SRND = CTX # SRND is same register as CTX
+
+a = %eax
+b = %ebx
+f = %r9d
+g = %r10d
+h = %r11d
+old_h = %r11d
+
+T1 = %r12d
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
+_XMM_SAVE_SIZE = 0
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_CTX_SIZE = 8
+
+_XFER = 0
+_XMM_SAVE = _XFER + _XFER_SIZE
+_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
+_INP = _INP_END + _INP_END_SIZE
+_CTX = _INP + _INP_SIZE
+STACK_SIZE = _CTX + _CTX_SIZE
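+# For reference, with the sizes above: _XFER occupies bytes 0..511 (W+K
+# values for 64 rounds of two interleaved blocks), _INP_END sits at offset
+# 512, _INP at 520 and _CTX at 528, so STACK_SIZE = 536 bytes before the
+# 32-byte alignment of %rsp done in sha256_transform_rorx.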
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+ X_ = X0
+ X0 = X1
+ X1 = X2
+ X2 = X3
+ X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+ old_h = h
+ TMP_ = h
+ h = g
+ g = f
+ f = e
+ e = d
+ d = c
+ c = b
+ b = a
+ a = TMP_
+.endm
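+# Note: old_h is set to the register that was h before the rotation, which is
+# the same register the rotation renames to a.  DO_4ROUNDS below relies on
+# this to defer a round's final two additions ("add y2, old_h" and
+# "add y3, old_h") into the start of the following round.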
+
+.macro FOUR_ROUNDS_AND_SCHED disp
+################################### RND N + 0 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+
+ addl \disp(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+ vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
+ mov f, y2 # y2 = f # CH
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ xor g, y2 # y2 = f^g # CH
+ vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+
+ and e, y2 # y2 = (f^g)&e # CH
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ add h, d # d = k + w + h + d # --
+
+ and b, y3 # y3 = (a|c)&b # MAJA
+ vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ vpsrld $7, XTMP1, XTMP2
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+
+ add y0, y2 # y2 = S1 + CH # --
+ vpslld $(32-7), XTMP1, XTMP3
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
+
+ vpsrld $18, XTMP1, XTMP2
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+
+ ROTATE_ARGS
+
+################################### RND N + 1 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ offset = \disp + 1*4
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
+ mov f, y2 # y2 = f # CH
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+
+ vpslld $(32-18), XTMP1, XTMP1
+ and b, y3 # y3 = (a|c)&b # MAJA
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+
+ vpxor XTMP1, XTMP3, XTMP3
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
+ vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
+
+
+ ROTATE_ARGS
+
+################################### RND N + 2 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ offset = \disp + 2*4
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ or c, y3 # y3 = a|c # MAJA
+ mov f, y2 # y2 = f # CH
+ xor g, y2 # y2 = f^g # CH
+
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
+ and e, y2 # y2 = (f^g)&e # CH
+
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ vpxor XTMP3, XTMP2, XTMP2
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a ,T1 # T1 = (a >> 2) # S0
+ vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+ vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1,h # h = k + w + h + S0 # --
+ add y2,d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3,h # h = t1 + S0 + MAJ # --
+
+
+ ROTATE_ARGS
+
+################################### RND N + 3 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ offset = \disp + 3*4
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
+ mov f, y2 # y2 = f # CH
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ vpxor XTMP3, XTMP2, XTMP2
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ add y0, y2 # y2 = S1 + CH # --
+
+ vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
+
+ vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+
+ add y1, h # h = k + w + h + S0 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ ROTATE_ARGS
+ rotate_Xs
+.endm
+
+.macro DO_4ROUNDS disp
+################################### RND N + 0 ###########################
+
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ addl \disp(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ ROTATE_ARGS
+
+################################### RND N + 1 ###########################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ offset = 4*1 + \disp
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ ROTATE_ARGS
+
+################################### RND N + 2 ##############################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ offset = 4*2 + \disp
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ ROTATE_ARGS
+
+################################### RND N + 3 ###########################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ offset = 4*3 + \disp
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ ROTATE_ARGS
+
+.endm
+
+########################################################################
+## void sha256_transform_rorx(struct sha256_block_state *state,
+## const u8 *data, size_t nblocks);
+########################################################################
+.text
+SYM_FUNC_START(sha256_transform_rorx)
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ push %rbp
+ mov %rsp, %rbp
+
+ subq $STACK_SIZE, %rsp
+ and $-32, %rsp # align rsp to 32 byte boundary
+
+ shl $6, NUM_BLKS # convert to bytes
+ lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
+ mov NUM_BLKS, _INP_END(%rsp)
+
+ cmp NUM_BLKS, INP
+ je .Lonly_one_block
+
+ ## load initial digest
+ mov (CTX), a
+ mov 4*1(CTX), b
+ mov 4*2(CTX), c
+ mov 4*3(CTX), d
+ mov 4*4(CTX), e
+ mov 4*5(CTX), f
+ mov 4*6(CTX), g
+ mov 4*7(CTX), h
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ vmovdqa _SHUF_00BA(%rip), SHUF_00BA
+ vmovdqa _SHUF_DC00(%rip), SHUF_DC00
+
+ mov CTX, _CTX(%rsp)
+
+.Lloop0:
+ ## Load first 16 dwords from two blocks
+ VMOVDQ 0*32(INP),XTMP0
+ VMOVDQ 1*32(INP),XTMP1
+ VMOVDQ 2*32(INP),XTMP2
+ VMOVDQ 3*32(INP),XTMP3
+
+ ## byte swap data
+ vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
+ vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
+ vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
+ vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3
+
+ ## transpose data into high/low halves
+ vperm2i128 $0x20, XTMP2, XTMP0, X0
+ vperm2i128 $0x31, XTMP2, XTMP0, X1
+ vperm2i128 $0x20, XTMP3, XTMP1, X2
+ vperm2i128 $0x31, XTMP3, XTMP1, X3
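+ ## After this transpose the low 128 bits of X0..X3 hold words 0-3, 4-7,
+ ## 8-11 and 12-15 of the first block, and the high 128 bits hold the same
+ ## words of the second block.  The second block's scheduled W+K values
+ ## therefore end up in the upper ("+16") half of each 32-byte _XFER slot,
+ ## which is what .Lloop3 consumes later.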
+
+.Llast_block_enter:
+ add $64, INP
+ mov INP, _INP(%rsp)
+
+ ## schedule 48 input dwords, by doing 3 rounds of 16 each
+ xor SRND, SRND
+
+.align 16
+.Lloop1:
+ leaq K256+0*32(%rip), INP ## reuse INP as scratch reg
+ vpaddd (INP, SRND), X0, XFER
+ vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED (_XFER + 0*32)
+
+ leaq K256+1*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
+ vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED (_XFER + 1*32)
+
+ leaq K256+2*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
+ vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED (_XFER + 2*32)
+
+ leaq K256+3*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
+ vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED (_XFER + 3*32)
+
+ add $4*32, SRND
+ cmp $3*4*32, SRND
+ jb .Lloop1
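+ ## SRND counts bytes of _XFER filled so far: each pass through .Lloop1
+ ## stores 4*32 bytes and performs 16 rounds, so three passes (3*4*32 = 384
+ ## bytes) cover rounds 0-47; .Lloop2 then finishes rounds 48-63, stopping
+ ## at 4*4*32 = 512 = _XFER_SIZE.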
+
+.Lloop2:
+ ## Do last 16 rounds with no scheduling
+ leaq K256+0*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
+ vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
+ DO_4ROUNDS (_XFER + 0*32)
+
+ leaq K256+1*32(%rip), INP
+ vpaddd (INP, SRND), X1, XFER
+ vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
+ DO_4ROUNDS (_XFER + 1*32)
+ add $2*32, SRND
+
+ vmovdqa X2, X0
+ vmovdqa X3, X1
+
+ cmp $4*4*32, SRND
+ jb .Lloop2
+
+ mov _CTX(%rsp), CTX
+ mov _INP(%rsp), INP
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ cmp _INP_END(%rsp), INP
+ ja .Ldone_hash
+
+ #### Do second block using previously scheduled results
+ xor SRND, SRND
+.align 16
+.Lloop3:
+ DO_4ROUNDS (_XFER + 0*32 + 16)
+ DO_4ROUNDS (_XFER + 1*32 + 16)
+ add $2*32, SRND
+ cmp $4*4*32, SRND
+ jb .Lloop3
+
+ mov _CTX(%rsp), CTX
+ mov _INP(%rsp), INP
+ add $64, INP
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ cmp _INP_END(%rsp), INP
+ jb .Lloop0
+ ja .Ldone_hash
+
+.Ldo_last_block:
+ VMOVDQ 0*16(INP),XWORD0
+ VMOVDQ 1*16(INP),XWORD1
+ VMOVDQ 2*16(INP),XWORD2
+ VMOVDQ 3*16(INP),XWORD3
+
+ vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
+ vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
+ vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
+ vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3
+
+ jmp .Llast_block_enter
+
+.Lonly_one_block:
+
+ ## load initial digest
+ mov (4*0)(CTX),a
+ mov (4*1)(CTX),b
+ mov (4*2)(CTX),c
+ mov (4*3)(CTX),d
+ mov (4*4)(CTX),e
+ mov (4*5)(CTX),f
+ mov (4*6)(CTX),g
+ mov (4*7)(CTX),h
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ vmovdqa _SHUF_00BA(%rip), SHUF_00BA
+ vmovdqa _SHUF_DC00(%rip), SHUF_DC00
+
+ mov CTX, _CTX(%rsp)
+ jmp .Ldo_last_block
+
+.Ldone_hash:
+
+ mov %rbp, %rsp
+ pop %rbp
+
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ vzeroupper
+ RET
+SYM_FUNC_END(sha256_transform_rorx)
+
+.section .rodata.cst512.K256, "aM", @progbits, 512
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
+
+# shuffle xBxA -> 00BA
+.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
+.align 32
+_SHUF_00BA:
+ .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+# shuffle xDxC -> DC00
+.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
+.align 32
+_SHUF_DC00:
+ .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/lib/crypto/x86/sha256-ni-asm.S b/lib/crypto/x86/sha256-ni-asm.S
new file mode 100644
index 000000000000..de5f707e7ef7
--- /dev/null
+++ b/lib/crypto/x86/sha256-ni-asm.S
@@ -0,0 +1,559 @@
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 update function
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define STATE_PTR %rdi /* 1st arg */
+#define DATA_PTR %rsi /* 2nd arg */
+#define NUM_BLKS %rdx /* 3rd arg */
+
+#define SHA256CONSTANTS %rax
+
+#define MSG %xmm0 /* sha256rnds2 implicit operand */
+#define STATE0 %xmm1
+#define STATE1 %xmm2
+#define MSG0 %xmm3
+#define MSG1 %xmm4
+#define MSG2 %xmm5
+#define MSG3 %xmm6
+#define TMP %xmm7
+
+#define SHUF_MASK %xmm8
+
+#define ABEF_SAVE %xmm9
+#define CDGH_SAVE %xmm10
+
+.macro do_4rounds i, m0, m1, m2, m3
+.if \i < 16
+ movdqu \i*4(DATA_PTR), \m0
+ pshufb SHUF_MASK, \m0
+.endif
+ movdqa (\i-32)*4(SHA256CONSTANTS), MSG
+ paddd \m0, MSG
+ sha256rnds2 STATE0, STATE1
+.if \i >= 12 && \i < 60
+ movdqa \m0, TMP
+ palignr $4, \m3, TMP
+ paddd TMP, \m1
+ sha256msg2 \m0, \m1
+.endif
+ punpckhqdq MSG, MSG
+ sha256rnds2 STATE1, STATE0
+.if \i >= 4 && \i < 52
+ sha256msg1 \m0, \m3
+.endif
+.endm
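+/*
+ * In the macro above, the first four invocations (i = 0, 4, 8, 12) load and
+ * byte-swap the 16 raw message words; sha256msg1 runs for i in [4, 52) and
+ * sha256msg2 for i in [12, 60), which together extend the message schedule
+ * to W[16..63] four words at a time, so the final rounds need no new
+ * schedule work.
+ */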
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 block function
+ *
+ * This function takes a pointer to the current SHA-256 state, a pointer to the
+ * input data, and the number of 64-byte blocks to process. Once all blocks
+ * have been processed, the state is updated with the new state. This function
+ * only processes complete blocks. State initialization, buffering of partial
+ * blocks, and digest finalization is expected to be handled elsewhere.
+ *
+ * void sha256_ni_transform(struct sha256_block_state *state,
+ * const u8 *data, size_t nblocks);
+ */
+.text
+SYM_FUNC_START(sha256_ni_transform)
+
+ shl $6, NUM_BLKS /* convert to bytes */
+ add DATA_PTR, NUM_BLKS /* pointer to end of data */
+
+ /*
+ * load initial hash values
+ * Need to reorder these appropriately
+ * DCBA, HGFE -> ABEF, CDGH
+ */
+ movdqu 0*16(STATE_PTR), STATE0 /* DCBA */
+ movdqu 1*16(STATE_PTR), STATE1 /* HGFE */
+
+ movdqa STATE0, TMP
+ punpcklqdq STATE1, STATE0 /* FEBA */
+ punpckhqdq TMP, STATE1 /* DCHG */
+ pshufd $0x1B, STATE0, STATE0 /* ABEF */
+ pshufd $0xB1, STATE1, STATE1 /* CDGH */
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+ lea K256+32*4(%rip), SHA256CONSTANTS
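+ /*
+ * Biasing SHA256CONSTANTS by 32*4 = 128 bytes pairs with the (\i-32)*4
+ * displacement in do_4rounds: the net address is still K256 + 4*i, i.e.
+ * K[i..i+3], while the displacement stays in [-128, 112] and so fits in
+ * a signed byte, which allows shorter movdqa encodings.
+ */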
+
+.Lloop0:
+ /* Save hash values for addition after rounds */
+ movdqa STATE0, ABEF_SAVE
+ movdqa STATE1, CDGH_SAVE
+
+.irp i, 0, 16, 32, 48
+ do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3
+ do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0
+ do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1
+ do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2
+.endr
+
+ /* Add current hash values with previously saved */
+ paddd ABEF_SAVE, STATE0
+ paddd CDGH_SAVE, STATE1
+
+ /* Increment data pointer and loop if more to process */
+ add $64, DATA_PTR
+ cmp NUM_BLKS, DATA_PTR
+ jne .Lloop0
+
+ /* Write hash values back in the correct order */
+ movdqa STATE0, TMP
+ punpcklqdq STATE1, STATE0 /* GHEF */
+ punpckhqdq TMP, STATE1 /* ABCD */
+ pshufd $0xB1, STATE0, STATE0 /* HGFE */
+ pshufd $0x1B, STATE1, STATE1 /* DCBA */
+
+ movdqu STATE1, 0*16(STATE_PTR)
+ movdqu STATE0, 1*16(STATE_PTR)
+
+ RET
+SYM_FUNC_END(sha256_ni_transform)
+
+#undef STATE_PTR
+#undef DATA_PTR
+#undef NUM_BLKS
+#undef SHA256CONSTANTS
+#undef MSG
+#undef STATE0
+#undef STATE1
+#undef MSG0
+#undef MSG1
+#undef MSG2
+#undef MSG3
+#undef TMP
+#undef SHUF_MASK
+#undef ABEF_SAVE
+#undef CDGH_SAVE
+
+// parameters for sha256_ni_finup2x()
+#define CTX %rdi
+#define DATA1 %rsi
+#define DATA2 %rdx
+#define LEN %ecx
+#define LEN8 %cl
+#define LEN64 %rcx
+#define OUT1 %r8
+#define OUT2 %r9
+
+// other scalar variables
+#define SHA256CONSTANTS %rax
+#define COUNT %r10
+#define COUNT32 %r10d
+#define FINAL_STEP %r11d
+
+// rbx is used as a temporary.
+
+#define MSG %xmm0 // sha256rnds2 implicit operand
+#define STATE0_A %xmm1
+#define STATE1_A %xmm2
+#define STATE0_B %xmm3
+#define STATE1_B %xmm4
+#define TMP_A %xmm5
+#define TMP_B %xmm6
+#define MSG0_A %xmm7
+#define MSG1_A %xmm8
+#define MSG2_A %xmm9
+#define MSG3_A %xmm10
+#define MSG0_B %xmm11
+#define MSG1_B %xmm12
+#define MSG2_B %xmm13
+#define MSG3_B %xmm14
+#define SHUF_MASK %xmm15
+
+#define OFFSETOF_STATE 0 // offsetof(struct __sha256_ctx, state)
+#define OFFSETOF_BYTECOUNT 32 // offsetof(struct __sha256_ctx, bytecount)
+#define OFFSETOF_BUF 40 // offsetof(struct __sha256_ctx, buf)
+
+// Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a and m0_b
+// contain the current 4 message schedule words for the first and second message
+// respectively.
+//
+// If not all the message schedule words have been computed yet, then this also
+// computes 4 more message schedule words for each message. m1_a-m3_a contain
+// the next 3 groups of 4 message schedule words for the first message, and
+// likewise m1_b-m3_b for the second. After consuming the current value of
+// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
+// likewise for *_b. This means that the next (m0_a, m1_a, m2_a, m3_a) is the
+// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
+// cycle through the registers accordingly.
+.macro do_4rounds_2x i, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b
+ movdqa (\i-32)*4(SHA256CONSTANTS), TMP_A
+ movdqa TMP_A, TMP_B
+ paddd \m0_a, TMP_A
+ paddd \m0_b, TMP_B
+.if \i < 48
+ sha256msg1 \m1_a, \m0_a
+ sha256msg1 \m1_b, \m0_b
+.endif
+ movdqa TMP_A, MSG
+ sha256rnds2 STATE0_A, STATE1_A
+ movdqa TMP_B, MSG
+ sha256rnds2 STATE0_B, STATE1_B
+ pshufd $0x0E, TMP_A, MSG
+ sha256rnds2 STATE1_A, STATE0_A
+ pshufd $0x0E, TMP_B, MSG
+ sha256rnds2 STATE1_B, STATE0_B
+.if \i < 48
+ movdqa \m3_a, TMP_A
+ movdqa \m3_b, TMP_B
+ palignr $4, \m2_a, TMP_A
+ palignr $4, \m2_b, TMP_B
+ paddd TMP_A, \m0_a
+ paddd TMP_B, \m0_b
+ sha256msg2 \m3_a, \m0_a
+ sha256msg2 \m3_b, \m0_b
+.endif
+.endm
+
+//
+// void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
+// const u8 *data1, const u8 *data2, int len,
+// u8 out1[SHA256_DIGEST_SIZE],
+// u8 out2[SHA256_DIGEST_SIZE]);
+//
+// This function computes the SHA-256 digests of two messages |data1| and
+// |data2| that are both |len| bytes long, starting from the initial context
+// |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
+//
+// The instructions for the two SHA-256 operations are interleaved. On many
+// CPUs, this is almost twice as fast as hashing each message individually due
+// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
+//
+SYM_FUNC_START(sha256_ni_finup2x)
+ // Allocate 128 bytes of stack space, 16-byte aligned.
+ push %rbx
+ push %rbp
+ mov %rsp, %rbp
+ sub $128, %rsp
+ and $~15, %rsp
+
+ // Load the shuffle mask for swapping the endianness of 32-bit words.
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+
+ // Set up pointer to the round constants.
+ lea K256+32*4(%rip), SHA256CONSTANTS
+
+ // Initially we're not processing the final blocks.
+ xor FINAL_STEP, FINAL_STEP
+
+ // Load the initial state from ctx->state.
+ movdqu OFFSETOF_STATE+0*16(CTX), STATE0_A // DCBA
+ movdqu OFFSETOF_STATE+1*16(CTX), STATE1_A // HGFE
+ movdqa STATE0_A, TMP_A
+ punpcklqdq STATE1_A, STATE0_A // FEBA
+ punpckhqdq TMP_A, STATE1_A // DCHG
+ pshufd $0x1B, STATE0_A, STATE0_A // ABEF
+ pshufd $0xB1, STATE1_A, STATE1_A // CDGH
+
+ // Load ctx->bytecount. Take the mod 64 of it to get the number of
+ // bytes that are buffered in ctx->buf. Also save it in a register with
+ // LEN added to it.
+ mov LEN, LEN // zero-extend LEN into LEN64 for the lea below
+ mov OFFSETOF_BYTECOUNT(CTX), %rbx
+ lea (%rbx, LEN64, 1), COUNT
+ and $63, %ebx
+ jz .Lfinup2x_enter_loop // No bytes buffered?
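+ // For example, with ctx->bytecount == 100 and len == 200: COUNT is 300
+ // (the total message length later used for the padding) and %ebx is
+ // 100 mod 64 = 36 bytes already buffered in ctx->buf.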
+
+ // %ebx bytes (1 to 63) are currently buffered in ctx->buf. Load them
+ // followed by the first 64 - %ebx bytes of data. Since LEN >= 64, we
+ // just load 64 bytes from each of ctx->buf, DATA1, and DATA2
+ // unconditionally and rearrange the data as needed.
+
+ movdqu OFFSETOF_BUF+0*16(CTX), MSG0_A
+ movdqu OFFSETOF_BUF+1*16(CTX), MSG1_A
+ movdqu OFFSETOF_BUF+2*16(CTX), MSG2_A
+ movdqu OFFSETOF_BUF+3*16(CTX), MSG3_A
+ movdqa MSG0_A, 0*16(%rsp)
+ movdqa MSG1_A, 1*16(%rsp)
+ movdqa MSG2_A, 2*16(%rsp)
+ movdqa MSG3_A, 3*16(%rsp)
+
+ movdqu 0*16(DATA1), MSG0_A
+ movdqu 1*16(DATA1), MSG1_A
+ movdqu 2*16(DATA1), MSG2_A
+ movdqu 3*16(DATA1), MSG3_A
+ movdqu MSG0_A, 0*16(%rsp,%rbx)
+ movdqu MSG1_A, 1*16(%rsp,%rbx)
+ movdqu MSG2_A, 2*16(%rsp,%rbx)
+ movdqu MSG3_A, 3*16(%rsp,%rbx)
+ movdqa 0*16(%rsp), MSG0_A
+ movdqa 1*16(%rsp), MSG1_A
+ movdqa 2*16(%rsp), MSG2_A
+ movdqa 3*16(%rsp), MSG3_A
+
+ movdqu 0*16(DATA2), MSG0_B
+ movdqu 1*16(DATA2), MSG1_B
+ movdqu 2*16(DATA2), MSG2_B
+ movdqu 3*16(DATA2), MSG3_B
+ movdqu MSG0_B, 0*16(%rsp,%rbx)
+ movdqu MSG1_B, 1*16(%rsp,%rbx)
+ movdqu MSG2_B, 2*16(%rsp,%rbx)
+ movdqu MSG3_B, 3*16(%rsp,%rbx)
+ movdqa 0*16(%rsp), MSG0_B
+ movdqa 1*16(%rsp), MSG1_B
+ movdqa 2*16(%rsp), MSG2_B
+ movdqa 3*16(%rsp), MSG3_B
+
+ sub $64, %rbx // rbx = buffered - 64
+ sub %rbx, DATA1 // DATA1 += 64 - buffered
+ sub %rbx, DATA2 // DATA2 += 64 - buffered
+ add %ebx, LEN // LEN += buffered - 64
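+ // Continuing the example above: with 36 bytes buffered, %rbx is now -28,
+ // so DATA1 and DATA2 advance by 28 bytes (the amount of fresh data merged
+ // into this first combined block) and LEN shrinks by the same 28.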
+ movdqa STATE0_A, STATE0_B
+ movdqa STATE1_A, STATE1_B
+ jmp .Lfinup2x_loop_have_data
+
+.Lfinup2x_enter_loop:
+ sub $64, LEN
+ movdqa STATE0_A, STATE0_B
+ movdqa STATE1_A, STATE1_B
+.Lfinup2x_loop:
+ // Load the next two data blocks.
+ movdqu 0*16(DATA1), MSG0_A
+ movdqu 0*16(DATA2), MSG0_B
+ movdqu 1*16(DATA1), MSG1_A
+ movdqu 1*16(DATA2), MSG1_B
+ movdqu 2*16(DATA1), MSG2_A
+ movdqu 2*16(DATA2), MSG2_B
+ movdqu 3*16(DATA1), MSG3_A
+ movdqu 3*16(DATA2), MSG3_B
+ add $64, DATA1
+ add $64, DATA2
+.Lfinup2x_loop_have_data:
+ // Convert the words of the data blocks from big endian.
+ pshufb SHUF_MASK, MSG0_A
+ pshufb SHUF_MASK, MSG0_B
+ pshufb SHUF_MASK, MSG1_A
+ pshufb SHUF_MASK, MSG1_B
+ pshufb SHUF_MASK, MSG2_A
+ pshufb SHUF_MASK, MSG2_B
+ pshufb SHUF_MASK, MSG3_A
+ pshufb SHUF_MASK, MSG3_B
+.Lfinup2x_loop_have_bswapped_data:
+
+ // Save the original state for each block.
+ movdqa STATE0_A, 0*16(%rsp)
+ movdqa STATE0_B, 1*16(%rsp)
+ movdqa STATE1_A, 2*16(%rsp)
+ movdqa STATE1_B, 3*16(%rsp)
+
+ // Do the SHA-256 rounds on each block.
+.irp i, 0, 16, 32, 48
+ do_4rounds_2x (\i + 0), MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
+ MSG0_B, MSG1_B, MSG2_B, MSG3_B
+ do_4rounds_2x (\i + 4), MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
+ MSG1_B, MSG2_B, MSG3_B, MSG0_B
+ do_4rounds_2x (\i + 8), MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
+ MSG2_B, MSG3_B, MSG0_B, MSG1_B
+ do_4rounds_2x (\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
+ MSG3_B, MSG0_B, MSG1_B, MSG2_B
+.endr
+
+ // Add the original state for each block.
+ paddd 0*16(%rsp), STATE0_A
+ paddd 1*16(%rsp), STATE0_B
+ paddd 2*16(%rsp), STATE1_A
+ paddd 3*16(%rsp), STATE1_B
+
+ // Update LEN and loop back if more blocks remain.
+ sub $64, LEN
+ jge .Lfinup2x_loop
+
+ // Check if any final blocks need to be handled.
+ // FINAL_STEP = 2: all done
+ // FINAL_STEP = 1: need to do count-only padding block
+ // FINAL_STEP = 0: need to do the block with 0x80 padding byte
+ cmp $1, FINAL_STEP
+ jg .Lfinup2x_done
+ je .Lfinup2x_finalize_countonly
+ add $64, LEN
+ jz .Lfinup2x_finalize_blockaligned
+
+ // Not block-aligned; 1 <= LEN <= 63 data bytes remain. Pad the block.
+ // To do this, write the padding starting with the 0x80 byte to
+ // &sp[64]. Then for each message, copy the last 64 data bytes to sp
+ // and load from &sp[64 - LEN] to get the needed padding block. This
+ // code relies on the data buffers being >= 64 bytes in length.
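+ // For example, with LEN == 40: %ebx = 24, the DATA pointers back up to
+ // the last 64 data bytes, and the block loaded from &sp[24] is the final
+ // 40 data bytes, the 0x80 byte, 15 zero bytes, and (since LEN < 56) the
+ // 8-byte big-endian bit count stored at sp[80].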
+ mov $64, %ebx
+ sub LEN, %ebx // ebx = 64 - LEN
+ sub %rbx, DATA1 // DATA1 -= 64 - LEN
+ sub %rbx, DATA2 // DATA2 -= 64 - LEN
+ mov $0x80, FINAL_STEP // using FINAL_STEP as a temporary
+ movd FINAL_STEP, MSG0_A
+ pxor MSG1_A, MSG1_A
+ movdqa MSG0_A, 4*16(%rsp)
+ movdqa MSG1_A, 5*16(%rsp)
+ movdqa MSG1_A, 6*16(%rsp)
+ movdqa MSG1_A, 7*16(%rsp)
+ cmp $56, LEN
+ jge 1f // will COUNT spill into its own block?
+ shl $3, COUNT
+ bswap COUNT
+ mov COUNT, 56(%rsp,%rbx)
+ mov $2, FINAL_STEP // won't need count-only block
+ jmp 2f
+1:
+ mov $1, FINAL_STEP // will need count-only block
+2:
+ movdqu 0*16(DATA1), MSG0_A
+ movdqu 1*16(DATA1), MSG1_A
+ movdqu 2*16(DATA1), MSG2_A
+ movdqu 3*16(DATA1), MSG3_A
+ movdqa MSG0_A, 0*16(%rsp)
+ movdqa MSG1_A, 1*16(%rsp)
+ movdqa MSG2_A, 2*16(%rsp)
+ movdqa MSG3_A, 3*16(%rsp)
+ movdqu 0*16(%rsp,%rbx), MSG0_A
+ movdqu 1*16(%rsp,%rbx), MSG1_A
+ movdqu 2*16(%rsp,%rbx), MSG2_A
+ movdqu 3*16(%rsp,%rbx), MSG3_A
+
+ movdqu 0*16(DATA2), MSG0_B
+ movdqu 1*16(DATA2), MSG1_B
+ movdqu 2*16(DATA2), MSG2_B
+ movdqu 3*16(DATA2), MSG3_B
+ movdqa MSG0_B, 0*16(%rsp)
+ movdqa MSG1_B, 1*16(%rsp)
+ movdqa MSG2_B, 2*16(%rsp)
+ movdqa MSG3_B, 3*16(%rsp)
+ movdqu 0*16(%rsp,%rbx), MSG0_B
+ movdqu 1*16(%rsp,%rbx), MSG1_B
+ movdqu 2*16(%rsp,%rbx), MSG2_B
+ movdqu 3*16(%rsp,%rbx), MSG3_B
+ jmp .Lfinup2x_loop_have_data
+
+ // Prepare a padding block, either:
+ //
+ // {0x80, 0, 0, 0, ..., count (as __be64)}
+ // This is for a block aligned message.
+ //
+ // { 0, 0, 0, 0, ..., count (as __be64)}
+ // This is for a message whose length mod 64 is >= 56.
+ //
+ // Pre-swap the endianness of the words.
+.Lfinup2x_finalize_countonly:
+ pxor MSG0_A, MSG0_A
+ jmp 1f
+
+.Lfinup2x_finalize_blockaligned:
+ mov $0x80000000, %ebx
+ movd %ebx, MSG0_A
+1:
+ pxor MSG1_A, MSG1_A
+ pxor MSG2_A, MSG2_A
+ // ror $29 rotates (bytecount << 3), i.e. the bit count, by 32: its two
+ // 32-bit halves are swapped to match the word-swapped form the rounds
+ // expect, since this path skips the pshufb byte-swap step.
+ ror $29, COUNT
+ movq COUNT, MSG3_A
+ pslldq $8, MSG3_A
+ movdqa MSG0_A, MSG0_B
+ pxor MSG1_B, MSG1_B
+ pxor MSG2_B, MSG2_B
+ movdqa MSG3_A, MSG3_B
+ mov $2, FINAL_STEP
+ jmp .Lfinup2x_loop_have_bswapped_data
+
+.Lfinup2x_done:
+ // Write the two digests with all bytes in the correct order.
+ movdqa STATE0_A, TMP_A
+ movdqa STATE0_B, TMP_B
+ punpcklqdq STATE1_A, STATE0_A // GHEF
+ punpcklqdq STATE1_B, STATE0_B
+ punpckhqdq TMP_A, STATE1_A // ABCD
+ punpckhqdq TMP_B, STATE1_B
+ pshufd $0xB1, STATE0_A, STATE0_A // HGFE
+ pshufd $0xB1, STATE0_B, STATE0_B
+ pshufd $0x1B, STATE1_A, STATE1_A // DCBA
+ pshufd $0x1B, STATE1_B, STATE1_B
+ pshufb SHUF_MASK, STATE0_A
+ pshufb SHUF_MASK, STATE0_B
+ pshufb SHUF_MASK, STATE1_A
+ pshufb SHUF_MASK, STATE1_B
+ movdqu STATE0_A, 1*16(OUT1)
+ movdqu STATE0_B, 1*16(OUT2)
+ movdqu STATE1_A, 0*16(OUT1)
+ movdqu STATE1_B, 0*16(OUT2)
+
+ mov %rbp, %rsp
+ pop %rbp
+ pop %rbx
+ RET
+SYM_FUNC_END(sha256_ni_finup2x)
+
+.section .rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203
diff --git a/lib/crypto/x86/sha256-ssse3-asm.S b/lib/crypto/x86/sha256-ssse3-asm.S
new file mode 100644
index 000000000000..383b8eec7ebe
--- /dev/null
+++ b/lib/crypto/x86/sha256-ssse3-asm.S
@@ -0,0 +1,505 @@
+########################################################################
+# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define MOVDQ movdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
+
+################################
+
+# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+# Load xmm with mem and byte swap each dword
+.macro COPY_XMM_AND_BSWAP p1 p2 p3
+ MOVDQ \p2, \p1
+ pshufb \p3, \p1
+.endm
+
+################################
+
+X0 = %xmm4
+X1 = %xmm5
+X2 = %xmm6
+X3 = %xmm7
+
+XTMP0 = %xmm0
+XTMP1 = %xmm1
+XTMP2 = %xmm2
+XTMP3 = %xmm3
+XTMP4 = %xmm8
+XFER = %xmm9
+
+SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
+SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %xmm12
+
+NUM_BLKS = %rdx # 3rd arg
+INP = %rsi # 2nd arg
+CTX = %rdi # 1st arg
+
+SRND = %rsi # clobbers INP
+c = %ecx
+d = %r8d
+e = %edx
+TBL = %r12
+a = %eax
+b = %ebx
+
+f = %r9d
+g = %r10d
+h = %r11d
+
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_XFER_SIZE = 16
+_XMM_SAVE_SIZE = 0
+
+_INP_END = 0
+_INP = _INP_END + _INP_END_SIZE
+_XFER = _INP + _INP_SIZE
+_XMM_SAVE = _XFER + _XFER_SIZE
+STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+X_ = X0
+X0 = X1
+X1 = X2
+X2 = X3
+X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+ ## compute s0 four at a time and s1 two at a time
+ ## compute W[-16] + W[-7] 4 at a time
+ movdqa X3, XTMP0
+ mov e, y0 # y0 = e
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ palignr $4, X2, XTMP0 # XTMP0 = W[-7]
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ movdqa X1, XTMP1
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ xor g, y2 # y2 = f^g
+ paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16]
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ ## compute s0
+ palignr $4, X0, XTMP1 # XTMP1 = W[-15]
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ movdqa XTMP1, XTMP2 # XTMP2 = W[-15]
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y0, y2 # y2 = S1 + CH
+ add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH
+ movdqa XTMP1, XTMP3 # XTMP3 = W[-15]
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pslld $(32-7), XTMP1 #
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ psrld $7, XTMP2 #
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ #
+ ROTATE_ARGS #
+ movdqa XTMP3, XTMP2 # XTMP2 = W[-15]
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ movdqa XTMP3, XTMP4 # XTMP4 = W[-15]
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ pslld $(32-18), XTMP3 #
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ psrld $18, XTMP2 #
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ pxor XTMP3, XTMP1
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ psrld $3, XTMP4 # XTMP4 = W[-15] >> 3
+ add y0, y2 # y2 = S1 + CH
+ add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pxor XTMP4, XTMP1 # XTMP1 = s0
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ ## compute low s1
+ pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA}
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA}
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
+ xor g, y2 # y2 = f^g
+ psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ pxor XTMP3, XTMP2
+ add y0, y2 # y2 = S1 + CH
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ ## compute high s1
+ pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ #
+ ROTATE_ARGS #
+ movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC}
+ mov e, y0 # y0 = e
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ movdqa XTMP2, X0 # X0 = W[-2] {DDCC}
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ xor g, y2 # y2 = f^g
+ psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ psrld $10, X0 # X0 = W[-2] >> 10 {DDCC}
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ pxor XTMP3, XTMP2 #
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y0, y2 # y2 = S1 + CH
+ add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ pxor XTMP2, X0 # X0 = s1 {xDxC}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pshufb SHUF_DC00, X0 # X0 = s1 {DC00}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ rotate_Xs
+.endm
+
+## input is [rsp + _XFER + %1 * 4]
+.macro DO_ROUND round
+ mov e, y0 # y0 = e
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+ xor a, y1 # y1 = a ^ (a >> (22-13))
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ and e, y2 # y2 = (f^g)&e
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ add y0, y2 # y2 = S1 + CH
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ offset = \round * 4 + _XFER
+ add offset(%rsp), y2 # y2 = k + w + S1 + CH
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+.endm
+
+########################################################################
+## void sha256_transform_ssse3(struct sha256_block_state *state,
+## const u8 *data, size_t nblocks);
+########################################################################
+.text
+SYM_FUNC_START(sha256_transform_ssse3)
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbp
+ mov %rsp, %rbp
+
+ subq $STACK_SIZE, %rsp
+ and $~15, %rsp
+
+ shl $6, NUM_BLKS # convert to bytes
+ add INP, NUM_BLKS
+ mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data
+
+ ## load initial digest
+ mov 4*0(CTX), a
+ mov 4*1(CTX), b
+ mov 4*2(CTX), c
+ mov 4*3(CTX), d
+ mov 4*4(CTX), e
+ mov 4*5(CTX), f
+ mov 4*6(CTX), g
+ mov 4*7(CTX), h
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ movdqa _SHUF_00BA(%rip), SHUF_00BA
+ movdqa _SHUF_DC00(%rip), SHUF_DC00
+
+.Lloop0:
+ lea K256(%rip), TBL
+
+ ## byte swap first 16 dwords
+ COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
+
+ mov INP, _INP(%rsp)
+
+ ## schedule 48 input dwords, by doing 3 rounds of 16 each
+ mov $3, SRND
+.align 16
+.Lloop1:
+ movdqa (TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa 1*16(TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa 2*16(TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa 3*16(TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ add $4*16, TBL
+ FOUR_ROUNDS_AND_SCHED
+
+ sub $1, SRND
+ jne .Lloop1
+
+ mov $2, SRND
+.Lloop2:
+ paddd (TBL), X0
+ movdqa X0, _XFER(%rsp)
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+ paddd 1*16(TBL), X1
+ movdqa X1, _XFER(%rsp)
+ add $2*16, TBL
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ movdqa X2, X0
+ movdqa X3, X1
+
+ sub $1, SRND
+ jne .Lloop2
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ mov _INP(%rsp), INP
+ add $64, INP
+ cmp _INP_END(%rsp), INP
+ jne .Lloop0
+
+ mov %rbp, %rsp
+ popq %rbp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+
+ RET
+SYM_FUNC_END(sha256_transform_ssse3)
+
+.section .rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203
+
+.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
+.align 16
+# shuffle xBxA -> 00BA
+_SHUF_00BA:
+ .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
+.align 16
+# shuffle xDxC -> DC00
+_SHUF_DC00:
+ .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/lib/crypto/x86/sha256.h b/lib/crypto/x86/sha256.h
new file mode 100644
index 000000000000..38e33b22a092
--- /dev/null
+++ b/lib/crypto/x86/sha256.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 optimized for x86_64
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <linux/static_call.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha_ni);
+
+DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_blocks_generic);
+
+#define DEFINE_X86_SHA256_FN(c_fn, asm_fn) \
+ asmlinkage void asm_fn(struct sha256_block_state *state, \
+ const u8 *data, size_t nblocks); \
+ static void c_fn(struct sha256_block_state *state, const u8 *data, \
+ size_t nblocks) \
+ { \
+ if (likely(irq_fpu_usable())) { \
+ kernel_fpu_begin(); \
+ asm_fn(state, data, nblocks); \
+ kernel_fpu_end(); \
+ } else { \
+ sha256_blocks_generic(state, data, nblocks); \
+ } \
+ }
+
+DEFINE_X86_SHA256_FN(sha256_blocks_ssse3, sha256_transform_ssse3);
+DEFINE_X86_SHA256_FN(sha256_blocks_avx, sha256_transform_avx);
+DEFINE_X86_SHA256_FN(sha256_blocks_avx2, sha256_transform_rorx);
+DEFINE_X86_SHA256_FN(sha256_blocks_ni, sha256_ni_transform);
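+/*
+ * For reference, the first DEFINE_X86_SHA256_FN() line above expands to
+ * (whitespace aside):
+ *
+ *	asmlinkage void sha256_transform_ssse3(struct sha256_block_state *state,
+ *					       const u8 *data, size_t nblocks);
+ *	static void sha256_blocks_ssse3(struct sha256_block_state *state,
+ *					const u8 *data, size_t nblocks)
+ *	{
+ *		if (likely(irq_fpu_usable())) {
+ *			kernel_fpu_begin();
+ *			sha256_transform_ssse3(state, data, nblocks);
+ *			kernel_fpu_end();
+ *		} else {
+ *			sha256_blocks_generic(state, data, nblocks);
+ *		}
+ *	}
+ */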
+
+static void sha256_blocks(struct sha256_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ static_call(sha256_blocks_x86)(state, data, nblocks);
+}
+
+static_assert(offsetof(struct __sha256_ctx, state) == 0);
+static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
+static_assert(offsetof(struct __sha256_ctx, buf) == 40);
+asmlinkage void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, int len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE]);
+
+#define sha256_finup_2x_arch sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+ const u8 *data1, const u8 *data2, size_t len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE])
+{
+ /*
+ * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
+ * Further limit len to 65536 to avoid spending too long with preemption
+ * disabled. (Of course, in practice len is nearly always 4096 anyway.)
+ */
+ if (static_branch_likely(&have_sha_ni) && len >= SHA256_BLOCK_SIZE &&
+ len <= 65536 && likely(irq_fpu_usable())) {
+ kernel_fpu_begin();
+ sha256_ni_finup2x(ctx, data1, data2, len, out1, out2);
+ kernel_fpu_end();
+ kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
+ kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
+ return true;
+ }
+ return false;
+}
+
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+ return static_key_enabled(&have_sha_ni);
+}
+
+#define sha256_mod_init_arch sha256_mod_init_arch
+static void sha256_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
+ static_call_update(sha256_blocks_x86, sha256_blocks_ni);
+ static_branch_enable(&have_sha_ni);
+ } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ NULL) &&
+ boot_cpu_has(X86_FEATURE_AVX)) {
+ if (boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_BMI2))
+ static_call_update(sha256_blocks_x86,
+ sha256_blocks_avx2);
+ else
+ static_call_update(sha256_blocks_x86,
+ sha256_blocks_avx);
+ } else if (boot_cpu_has(X86_FEATURE_SSSE3)) {
+ static_call_update(sha256_blocks_x86, sha256_blocks_ssse3);
+ }
+}
diff --git a/lib/crypto/x86/sha512-avx-asm.S b/lib/crypto/x86/sha512-avx-asm.S
new file mode 100644
index 000000000000..7732aa8fd850
--- /dev/null
+++ b/lib/crypto/x86/sha512-avx-asm.S
@@ -0,0 +1,420 @@
+########################################################################
+# Implement fast SHA-512 with AVX instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# David Cote <david.m.cote@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+# ARG1
+digest = %rdi
+# ARG2
+msg = %rsi
+# ARG3
+msglen = %rdx
+T1 = %rcx
+T2 = %r8
+a_64 = %r9
+b_64 = %r10
+c_64 = %r11
+d_64 = %r12
+e_64 = %r13
+f_64 = %r14
+g_64 = %r15
+h_64 = %rbx
+tmp0 = %rax
+
+# Local variables (stack frame)
+
+# Message Schedule
+W_SIZE = 80*8
+# W[t] + K[t] | W[t+1] + K[t+1]
+WK_SIZE = 2*8
+
+frame_W = 0
+frame_WK = frame_W + W_SIZE
+frame_size = frame_WK + WK_SIZE
+
+# Useful QWORD "arrays" for simpler memory references
+# MSG, DIGEST, K_t, W_t are arrays
+# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
+
+# Input message (arg1)
+#define MSG(i) 8*i(msg)
+
+# Output Digest (arg2)
+#define DIGEST(i) 8*i(digest)
+
+# SHA Constants (static mem)
+#define K_t(i) 8*i+K512(%rip)
+
+# Message Schedule (stack frame)
+#define W_t(i) 8*i+frame_W(%rsp)
+
+# W[t]+K[t] (stack frame)
+#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
+
+.macro RotateState
+ # Rotate symbols a..h right
+ TMP = h_64
+ h_64 = g_64
+ g_64 = f_64
+ f_64 = e_64
+ e_64 = d_64
+ d_64 = c_64
+ c_64 = b_64
+ b_64 = a_64
+ a_64 = TMP
+.endm
+
+.macro RORQ p1 p2
+ # shld is faster than ror on Sandybridge
+ shld $(64-\p2), \p1, \p1
+.endm
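+# (shld $(64-n), x, x shifts x left by 64-n while shifting in the top bits of
+# x itself, i.e. it rotates x right by n, so RORQ x, n has the same effect as
+# ror $n, x.)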
+
+.macro SHA512_Round rnd
+	# Compute round t
+ mov f_64, T1 # T1 = f
+ mov e_64, tmp0 # tmp = e
+ xor g_64, T1 # T1 = f ^ g
+ RORQ tmp0, 23 # 41 # tmp = e ror 23
+ and e_64, T1 # T1 = (f ^ g) & e
+ xor e_64, tmp0 # tmp = (e ror 23) ^ e
+ xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+ idx = \rnd
+ add WK_2(idx), T1 # W[t] + K[t] from message scheduler
+ RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4
+ xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
+ mov a_64, T2 # T2 = a
+ add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
+ RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+ add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+ mov a_64, tmp0 # tmp = a
+ xor c_64, T2 # T2 = a ^ c
+ and c_64, tmp0 # tmp = a & c
+ and b_64, T2 # T2 = (a ^ c) & b
+ xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+ mov a_64, tmp0 # tmp = a
+ RORQ tmp0, 5 # 39 # tmp = a ror 5
+ xor a_64, tmp0 # tmp = (a ror 5) ^ a
+ add T1, d_64 # e(next_state) = d + T1
+ RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6
+ xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
+ lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
+ RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+	add	tmp0, h_64	# a(next_state) = T1 + Maj(a,b,c) + S0(a)
+ RotateState
+.endm
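+
+# In FIPS 180-4 terms, each round above computes
+#	T1 = h + Sigma1(e) + Ch(e,f,g) + (K[t] + W[t])
+#	T2 = Sigma0(a) + Maj(a,b,c)
+# and then e' = d + T1 and a' = T1 + T2, where
+#	Sigma1(e) = (e ror 14) ^ (e ror 18) ^ (e ror 41)
+#	Sigma0(a) = (a ror 28) ^ (a ror 34) ^ (a ror 39)
+# The nested rotate/xor chains used here are equivalent:
+#	((((e ror 23) ^ e) ror 4) ^ e) ror 14 = (e ror 41) ^ (e ror 18) ^ (e ror 14)
+#	((((a ror 5) ^ a) ror 6) ^ a) ror 28 = (a ror 39) ^ (a ror 34) ^ (a ror 28)
+# and let all three rotations be accumulated in the single temporary tmp0.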
+
+.macro SHA512_2Sched_2Round_avx rnd
+ # Compute rounds t-2 and t-1
+ # Compute message schedule QWORDS t and t+1
+
+ # Two rounds are computed based on the values for K[t-2]+W[t-2] and
+ # K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+ # scheduler.
+ # The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
+ # They are then added to their respective SHA512 constants at
+ # [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)]
+	# For brevity, the comments following vectored instructions only refer to
+	# the first of a pair of QWORDS.
+	# E.g. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
+ # The computation of the message schedule and the rounds are tightly
+ # stitched to take advantage of instruction-level parallelism.
+
+ idx = \rnd - 2
+ vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2]
+ idx = \rnd - 15
+ vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
+ mov f_64, T1
+ vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61
+ mov e_64, tmp0
+ vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1
+ xor g_64, T1
+ RORQ tmp0, 23 # 41
+ vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19
+ and e_64, T1
+ xor e_64, tmp0
+ vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19
+ xor g_64, T1
+ idx = \rnd
+	add	WK_2(idx), T1
+ vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8
+ RORQ tmp0, 4 # 18
+ vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6
+ xor e_64, tmp0
+ mov a_64, T2
+ add h_64, T1
+ vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8
+ RORQ tmp0, 14 # 14
+ add tmp0, T1
+ vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7
+ mov a_64, tmp0
+ xor c_64, T2
+ vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3
+ and c_64, tmp0
+ and b_64, T2
+ vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3
+ xor tmp0, T2
+ mov a_64, tmp0
+ vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63
+ RORQ tmp0, 5 # 39
+ vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63
+ xor a_64, tmp0
+ add T1, d_64
+ RORQ tmp0, 6 # 34
+ xor a_64, tmp0
+ vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^
+ # W[t-15]>>7 ^ W[t-15]<<63
+ lea (T1, T2), h_64
+ RORQ tmp0, 28 # 28
+ vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<25
+ add tmp0, h_64
+ RotateState
+ vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^
+ # W[t-2]<<25
+ mov f_64, T1
+ vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2])
+ mov e_64, tmp0
+ xor g_64, T1
+ idx = \rnd - 16
+ vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16]
+ idx = \rnd - 7
+ vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
+ RORQ tmp0, 23 # 41
+ and e_64, T1
+ xor e_64, tmp0
+ xor g_64, T1
+ vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56
+ idx = \rnd + 1
+ add WK_2(idx), T1
+ vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15])
+ RORQ tmp0, 4 # 18
+ vpaddq %xmm6, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
+ xor e_64, tmp0
+ vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] +
+ # s0(W[t-15]) + W[t-16]
+ mov a_64, T2
+ add h_64, T1
+ RORQ tmp0, 14 # 14
+ add tmp0, T1
+ idx = \rnd
+ vmovdqa %xmm0, W_t(idx) # Store W[t]
+ vpaddq K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t]
+ vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
+ mov a_64, tmp0
+ xor c_64, T2
+ and c_64, tmp0
+ and b_64, T2
+ xor tmp0, T2
+ mov a_64, tmp0
+ RORQ tmp0, 5 # 39
+ xor a_64, tmp0
+ add T1, d_64
+ RORQ tmp0, 6 # 34
+ xor a_64, tmp0
+ lea (T1, T2), h_64
+ RORQ tmp0, 28 # 28
+ add tmp0, h_64
+ RotateState
+.endm
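+
+# The schedule step above implements the SHA-512 message expansion
+#	W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
+# with sigma1(x) = (x ror 19) ^ (x ror 61) ^ (x >> 6) and
+#      sigma0(x) = (x ror 1) ^ (x ror 8) ^ (x >> 7).
+# Each rotate is decomposed into a pair of shifts, e.g. (x ror 61) =
+# (x >> 61) | (x << 3); the two halves have no overlapping bits, so the vector
+# code can combine them (and the plain shifts) with vpxor alone.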
+
+########################################################################
+# void sha512_transform_avx(struct sha512_block_state *state,
+# const u8 *data, size_t nblocks);
+# Purpose: Updates the SHA512 digest stored at "state" with the message
+# stored in "data".
+# The size of the message pointed to by "data" must be an integer multiple
+# of SHA512 message blocks.
+# "nblocks" is the message length in SHA512 blocks. Must be >= 1.
+########################################################################
+SYM_FUNC_START(sha512_transform_avx)
+
+ # Save GPRs
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ # Allocate Stack Space
+ push %rbp
+ mov %rsp, %rbp
+ sub $frame_size, %rsp
+ and $~(0x20 - 1), %rsp
+
+.Lupdateblock:
+
+ # Load state variables
+ mov DIGEST(0), a_64
+ mov DIGEST(1), b_64
+ mov DIGEST(2), c_64
+ mov DIGEST(3), d_64
+ mov DIGEST(4), e_64
+ mov DIGEST(5), f_64
+ mov DIGEST(6), g_64
+ mov DIGEST(7), h_64
+
+ t = 0
+ .rept 80/2 + 1
+ # (80 rounds) / (2 rounds/iteration) + (1 iteration)
+ # +1 iteration because the scheduler leads hashing by 1 iteration
+ .if t < 2
+ # BSWAP 2 QWORDS
+ vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1
+ vmovdqu MSG(t), %xmm0
+ vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
+ vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
+ vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
+ vmovdqa %xmm0, WK_2(t) # Store into WK for rounds
+ .elseif t < 16
+		# BSWAP 2 QWORDS # Compute 2 Rounds
+ vmovdqu MSG(t), %xmm0
+ vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
+ SHA512_Round t-2 # Round t-2
+ vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
+ vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
+ SHA512_Round t-1 # Round t-1
+		vmovdqa	%xmm0, WK_2(t)	# Store W[t]+K[t] into WK
+ .elseif t < 79
+		# Schedule 2 QWORDS # Compute 2 Rounds
+ SHA512_2Sched_2Round_avx t
+ .else
+ # Compute 2 Rounds
+ SHA512_Round t-2
+ SHA512_Round t-1
+ .endif
+ t = t+2
+ .endr
+
+ # Update digest
+ add a_64, DIGEST(0)
+ add b_64, DIGEST(1)
+ add c_64, DIGEST(2)
+ add d_64, DIGEST(3)
+ add e_64, DIGEST(4)
+ add f_64, DIGEST(5)
+ add g_64, DIGEST(6)
+ add h_64, DIGEST(7)
+
+ # Advance to next message block
+ add $16*8, msg
+ dec msglen
+ jnz .Lupdateblock
+
+ # Restore Stack Pointer
+ mov %rbp, %rsp
+ pop %rbp
+
+ # Restore GPRs
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+
+ RET
+SYM_FUNC_END(sha512_transform_avx)
+
+########################################################################
+### Binary Data
+
+.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
+.align 16
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+ .octa 0x08090a0b0c0d0e0f0001020304050607
+
+# Mergeable 640-byte rodata section. This allows the linker to merge the
+# table with an identical 640-byte fragment of another rodata section
+# (if such a section exists).
+.section .rodata.cst640.K512, "aM", @progbits, 640
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
diff --git a/lib/crypto/x86/sha512-avx2-asm.S b/lib/crypto/x86/sha512-avx2-asm.S
new file mode 100644
index 000000000000..22bdbfd899d0
--- /dev/null
+++ b/lib/crypto/x86/sha512-avx2-asm.S
@@ -0,0 +1,748 @@
+########################################################################
+# Implement fast SHA-512 with AVX2 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# David Cote <david.m.cote@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules one block at a time, with 4 lanes per block
+########################################################################
+
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+Y_0 = %ymm4
+Y_1 = %ymm5
+Y_2 = %ymm6
+Y_3 = %ymm7
+
+YTMP0 = %ymm0
+YTMP1 = %ymm1
+YTMP2 = %ymm2
+YTMP3 = %ymm3
+YTMP4 = %ymm8
+XFER = YTMP0
+
+BYTE_FLIP_MASK = %ymm9
+
+# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
+CTX1 = %rdi
+CTX2 = %r12
+# 2nd arg
+INP = %rsi
+# 3rd arg
+NUM_BLKS = %rdx
+
+c = %rcx
+d = %r8
+e = %rdx
+y3 = %rsi
+
+TBL = %rdi # clobbers CTX1
+
+a = %rax
+b = %rbx
+
+f = %r9
+g = %r10
+h = %r11
+old_h = %r11
+
+T1 = %r12 # clobbers CTX2
+y0 = %r13
+y1 = %r14
+y2 = %r15
+
+# Local variables (stack frame)
+XFER_SIZE = 4*8
+SRND_SIZE = 1*8
+INP_SIZE = 1*8
+INPEND_SIZE = 1*8
+CTX_SIZE = 1*8
+
+frame_XFER = 0
+frame_SRND = frame_XFER + XFER_SIZE
+frame_INP = frame_SRND + SRND_SIZE
+frame_INPEND = frame_INP + INP_SIZE
+frame_CTX = frame_INPEND + INPEND_SIZE
+frame_size = frame_CTX + CTX_SIZE
+
+## assume buffers not aligned
+#define VMOVDQ vmovdqu
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
+
+
+# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
+# Load ymm with mem and byte swap each qword
+.macro COPY_YMM_AND_BSWAP p1 p2 p3
+ VMOVDQ \p2, \p1
+ vpshufb \p3, \p1, \p1
+.endm
+# rotate_Ys
+# Rotate values of symbols Y0...Y3
+.macro rotate_Ys
+ Y_ = Y_0
+ Y_0 = Y_1
+ Y_1 = Y_2
+ Y_2 = Y_3
+ Y_3 = Y_
+.endm
+
+# RotateState
+.macro RotateState
+ # Rotate symbols a..h right
+ old_h = h
+ TMP_ = h
+ h = g
+ g = f
+ f = e
+ e = d
+ d = c
+ c = b
+ b = a
+ a = TMP_
+.endm
+
+# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL
+# YDST = {YSRC1, YSRC2} >> RVAL*8
+.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
+ vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI}
+	vpalignr	$\RVAL, \YSRC2, \YDST, \YDST	# YDST = {YSRC1, YSRC2} >> RVAL*8
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+################################### RND N + 0 #########################################
+
+ # Extract w[t-7]
+ MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
+ # Calculate w[t-16] + w[t-7]
+ vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
+ # Extract w[t-15]
+ MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]
+
+ # Calculate sigma0
+
+ # Calculate w[t-15] ror 1
+ vpsrlq $1, YTMP1, YTMP2
+ vpsllq $(64-1), YTMP1, YTMP3
+ vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
+ # Calculate w[t-15] shr 7
+ vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+	add	frame_XFER(%rsp), h	# h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+ mov f, y2 # y2 = f # CH
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ xor g, y2 # y2 = f^g # CH
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+
+ and e, y2 # y2 = (f^g)&e # CH
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ add h, d # d = k + w + h + d # --
+
+ and b, y3 # y3 = (a|c)&b # MAJA
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+
+ add y0, y2 # y2 = S1 + CH # --
+	or	T1, y3	# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+################################### RND N + 1 #########################################
+
+ # Calculate w[t-15] ror 8
+ vpsrlq $8, YTMP1, YTMP2
+ vpsllq $(64-8), YTMP1, YTMP1
+ vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
+ # XOR the three components
+ vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
+ vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0
+
+
+ # Add three components, w[t-16], w[t-7] and sigma0
+ vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
+ # Move to appropriate lanes for calculating w[16] and w[17]
+ vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
+ # Move to appropriate lanes for calculating w[18] and w[19]
+ vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
+
+ # Calculate w[16] and w[17] in both 128 bit lanes
+
+ # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
+ vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
+ vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
+
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ mov f, y2 # y2 = f # CH
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+
+ and b, y3 # y3 = (a|c)&b # MAJA
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+	or	T1, y3	# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+
+################################### RND N + 2 #########################################
+
+ vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
+ vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
+ vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
+ vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
+ # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
+
+	# Add sigma1 to the other components to get w[16] and w[17]
+ vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]}
+
+ # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
+ vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
+
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ or c, y3 # y3 = a|c # MAJA
+ mov f, y2 # y2 = f # CH
+ xor g, y2 # y2 = f^g # CH
+
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+	or	T1, y3	# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
+ add y1, h # h = k + w + h + S0 # --
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+################################### RND N + 3 #########################################
+
+ vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
+ vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
+ vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
+ vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
+ # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
+
+ # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
+ # to newly calculated sigma1 to get w[18] and w[19]
+ vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
+
+	# Form w[19], w[18], w[17], w[16]
+ vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]}
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ mov f, y2 # y2 = f # CH
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ add y0, y2 # y2 = S1 + CH # --
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+	or	T1, y3	# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
+
+ add y1, h # h = k + w + h + S0 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+ rotate_Ys
+.endm
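+
+# The scalar half of the rounds above uses BMI2 rorx, which writes its result
+# to a separate register without touching the flags, so
+#	Sigma1(e) = (e ror 14) ^ (e ror 18) ^ (e ror 41)
+#	Sigma0(a) = (a ror 28) ^ (a ror 34) ^ (a ror 39)
+# are each formed from three independent rotates. The majority function is
+# evaluated in its OR form, MAJ = ((a | c) & b) | (a & c), which is bitwise
+# equal to Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c).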
+
+.macro DO_4ROUNDS
+
+################################### RND N + 0 #########################################
+
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ add h, d # d = k + w + h + d # --
+	or	T1, y3	# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ RotateState
+
+################################### RND N + 1 #########################################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add 8*1+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ add h, d # d = k + w + h + d # --
+	or	T1, y3	# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ RotateState
+
+################################### RND N + 2 #########################################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add 8*2+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ add h, d # d = k + w + h + d # --
+	or	T1, y3	# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ RotateState
+
+################################### RND N + 3 #########################################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add 8*3+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+	or	T1, y3	# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+.endm
+
+########################################################################
+# void sha512_transform_rorx(struct sha512_block_state *state,
+# const u8 *data, size_t nblocks);
+# Purpose: Updates the SHA512 digest stored at "state" with the message
+# stored in "data".
+# The size of the message pointed to by "data" must be an integer multiple
+# of SHA512 message blocks.
+# "nblocks" is the message length in SHA512 blocks. Must be >= 1.
+########################################################################
+SYM_FUNC_START(sha512_transform_rorx)
+
+ # Save GPRs
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ # Allocate Stack Space
+ push %rbp
+ mov %rsp, %rbp
+ sub $frame_size, %rsp
+ and $~(0x20 - 1), %rsp
+
+ shl $7, NUM_BLKS # convert to bytes
+ add INP, NUM_BLKS # pointer to end of data
+ mov NUM_BLKS, frame_INPEND(%rsp)
+
+ ## load initial digest
+ mov 8*0(CTX1), a
+ mov 8*1(CTX1), b
+ mov 8*2(CTX1), c
+ mov 8*3(CTX1), d
+ mov 8*4(CTX1), e
+ mov 8*5(CTX1), f
+ mov 8*6(CTX1), g
+ mov 8*7(CTX1), h
+
+ # save %rdi (CTX) before it gets clobbered
+ mov %rdi, frame_CTX(%rsp)
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+
+.Lloop0:
+ lea K512(%rip), TBL
+
+	## byte swap first 16 qwords
+ COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK
+
+ mov INP, frame_INP(%rsp)
+
+	## schedule 64 message qwords by doing 16 iterations of 4 rounds each
+ movq $4, frame_SRND(%rsp)
+
+.align 16
+.Lloop1:
+ vpaddq (TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddq 1*32(TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddq 2*32(TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddq 3*32(TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ add $(4*32), TBL
+ FOUR_ROUNDS_AND_SCHED
+
+ subq $1, frame_SRND(%rsp)
+ jne .Lloop1
+
+ movq $2, frame_SRND(%rsp)
+.Lloop2:
+ vpaddq (TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ DO_4ROUNDS
+ vpaddq 1*32(TBL), Y_1, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ add $(2*32), TBL
+ DO_4ROUNDS
+
+ vmovdqa Y_2, Y_0
+ vmovdqa Y_3, Y_1
+
+ subq $1, frame_SRND(%rsp)
+ jne .Lloop2
+
+ mov frame_CTX(%rsp), CTX2
+ addm 8*0(CTX2), a
+ addm 8*1(CTX2), b
+ addm 8*2(CTX2), c
+ addm 8*3(CTX2), d
+ addm 8*4(CTX2), e
+ addm 8*5(CTX2), f
+ addm 8*6(CTX2), g
+ addm 8*7(CTX2), h
+
+ mov frame_INP(%rsp), INP
+ add $128, INP
+ cmp frame_INPEND(%rsp), INP
+ jne .Lloop0
+
+ # Restore Stack Pointer
+ mov %rbp, %rsp
+ pop %rbp
+
+ # Restore GPRs
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+
+ vzeroupper
+ RET
+SYM_FUNC_END(sha512_transform_rorx)
+
+########################################################################
+### Binary Data
+
+
+# Mergeable 640-byte rodata section. This allows the linker to merge the
+# table with an identical 640-byte fragment of another rodata section
+# (if such a section exists).
+.section .rodata.cst640.K512, "aM", @progbits, 640
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
+# Mask for byte-swapping the qwords in a YMM register using vpshufb.
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x08090a0b0c0d0e0f0001020304050607
+ .octa 0x18191a1b1c1d1e1f1011121314151617
+
+.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
+.align 32
+MASK_YMM_LO:
+ .octa 0x00000000000000000000000000000000
+ .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
diff --git a/lib/crypto/x86/sha512-ssse3-asm.S b/lib/crypto/x86/sha512-ssse3-asm.S
new file mode 100644
index 000000000000..4cae7445b2a8
--- /dev/null
+++ b/lib/crypto/x86/sha512-ssse3-asm.S
@@ -0,0 +1,419 @@
+########################################################################
+# Implement fast SHA-512 with SSSE3 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# David Cote <david.m.cote@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+# ARG1
+digest = %rdi
+# ARG2
+msg = %rsi
+# ARG3
+msglen = %rdx
+T1 = %rcx
+T2 = %r8
+a_64 = %r9
+b_64 = %r10
+c_64 = %r11
+d_64 = %r12
+e_64 = %r13
+f_64 = %r14
+g_64 = %r15
+h_64 = %rbx
+tmp0 = %rax
+
+# Local variables (stack frame)
+
+W_SIZE = 80*8
+WK_SIZE = 2*8
+
+frame_W = 0
+frame_WK = frame_W + W_SIZE
+frame_size = frame_WK + WK_SIZE
+
+# Useful QWORD "arrays" for simpler memory references
+# MSG, DIGEST, K_t, W_t are arrays
+# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
+
+# Input message (arg1)
+#define MSG(i) 8*i(msg)
+
+# Output Digest (arg2)
+#define DIGEST(i) 8*i(digest)
+
+# SHA Constants (static mem)
+#define K_t(i) 8*i+K512(%rip)
+
+# Message Schedule (stack frame)
+#define W_t(i) 8*i+frame_W(%rsp)
+
+# W[t]+K[t] (stack frame)
+#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
+
+.macro RotateState
+ # Rotate symbols a..h right
+ TMP = h_64
+ h_64 = g_64
+ g_64 = f_64
+ f_64 = e_64
+ e_64 = d_64
+ d_64 = c_64
+ c_64 = b_64
+ b_64 = a_64
+ a_64 = TMP
+.endm
+
+.macro SHA512_Round rnd
+
+	# Compute round t
+ mov f_64, T1 # T1 = f
+ mov e_64, tmp0 # tmp = e
+ xor g_64, T1 # T1 = f ^ g
+ ror $23, tmp0 # 41 # tmp = e ror 23
+ and e_64, T1 # T1 = (f ^ g) & e
+ xor e_64, tmp0 # tmp = (e ror 23) ^ e
+ xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+ idx = \rnd
+ add WK_2(idx), T1 # W[t] + K[t] from message scheduler
+ ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4
+ xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
+ mov a_64, T2 # T2 = a
+ add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
+ ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+ add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+ mov a_64, tmp0 # tmp = a
+ xor c_64, T2 # T2 = a ^ c
+ and c_64, tmp0 # tmp = a & c
+ and b_64, T2 # T2 = (a ^ c) & b
+ xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+ mov a_64, tmp0 # tmp = a
+ ror $5, tmp0 # 39 # tmp = a ror 5
+ xor a_64, tmp0 # tmp = (a ror 5) ^ a
+ add T1, d_64 # e(next_state) = d + T1
+ ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6
+ xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
+ lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
+ ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+	add	tmp0, h_64	# a(next_state) = T1 + Maj(a,b,c) + S0(a)
+ RotateState
+.endm
+
+.macro SHA512_2Sched_2Round_sse rnd
+
+ # Compute rounds t-2 and t-1
+ # Compute message schedule QWORDS t and t+1
+
+ # Two rounds are computed based on the values for K[t-2]+W[t-2] and
+ # K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+ # scheduler.
+	# The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
+	# They are then added to their respective SHA512 constants at
+	# [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)].
+	# For brevity, the comments following vectored instructions only refer to
+	# the first of a pair of QWORDS.
+	# E.g. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
+ # The computation of the message schedule and the rounds are tightly
+ # stitched to take advantage of instruction-level parallelism.
+ # For clarity, integer instructions (for the rounds calculation) are indented
+ # by one tab. Vectored instructions (for the message scheduler) are indented
+ # by two tabs.
+
+ mov f_64, T1
+	idx = \rnd - 2
+ movdqa W_t(idx), %xmm2 # XMM2 = W[t-2]
+ xor g_64, T1
+ and e_64, T1
+ movdqa %xmm2, %xmm0 # XMM0 = W[t-2]
+ xor g_64, T1
+ idx = \rnd
+ add WK_2(idx), T1
+ idx = \rnd - 15
+ movdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
+ mov e_64, tmp0
+ ror $23, tmp0 # 41
+ movdqa %xmm5, %xmm3 # XMM3 = W[t-15]
+ xor e_64, tmp0
+ ror $4, tmp0 # 18
+ psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42
+ xor e_64, tmp0
+ ror $14, tmp0 # 14
+ psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1
+ add tmp0, T1
+ add h_64, T1
+ pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2]
+ mov a_64, T2
+ xor c_64, T2
+ pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15]
+ and b_64, T2
+ mov a_64, tmp0
+ psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13
+ and c_64, tmp0
+ xor tmp0, T2
+ psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6
+ mov a_64, tmp0
+ ror $5, tmp0 # 39
+ pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
+ xor a_64, tmp0
+ ror $6, tmp0 # 34
+ pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
+ xor a_64, tmp0
+ ror $28, tmp0 # 28
+ psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
+ add tmp0, T2
+ add T1, d_64
+ psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
+ lea (T1, T2), h_64
+ RotateState
+ movdqa %xmm2, %xmm1 # XMM1 = W[t-2]
+ mov f_64, T1
+ xor g_64, T1
+ movdqa %xmm5, %xmm4 # XMM4 = W[t-15]
+ and e_64, T1
+ xor g_64, T1
+	psllq	$(64-19)-(64-61), %xmm1	# XMM1 = W[t-2] << 42
+ idx = \rnd + 1
+ add WK_2(idx), T1
+ mov e_64, tmp0
+ psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7
+ ror $23, tmp0 # 41
+ xor e_64, tmp0
+ pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2]
+ ror $4, tmp0 # 18
+ xor e_64, tmp0
+ pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15]
+ ror $14, tmp0 # 14
+ add tmp0, T1
+ psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3
+ add h_64, T1
+ mov a_64, T2
+ psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56
+ xor c_64, T2
+ and b_64, T2
+ pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2])
+ mov a_64, tmp0
+ and c_64, tmp0
+ idx = \rnd - 7
+ movdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
+ xor tmp0, T2
+ pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15])
+ mov a_64, tmp0
+ paddq %xmm3, %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15])
+ ror $5, tmp0 # 39
+	idx = \rnd - 16
+ paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
+ xor a_64, tmp0
+ paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+ ror $6, tmp0 # 34
+ movdqa %xmm0, W_t(\rnd) # Store scheduled qwords
+ xor a_64, tmp0
+ paddq K_t(\rnd), %xmm0 # Compute W[t]+K[t]
+ ror $28, tmp0 # 28
+ idx = \rnd
+ movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
+ add tmp0, T2
+ add T1, d_64
+ lea (T1, T2), h_64
+ RotateState
+.endm
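+
+# SSE2 has no 64-bit vector rotate, so sigma0 and sigma1 are built from
+# incremental shift/xor chains. For sigma1, the right-shift terms accumulate as
+#	((((x >> 42) ^ x) >> 13) ^ x) >> 6 = (x >> 61) ^ (x >> 19) ^ (x >> 6)
+# and the left-shift terms ((x << 42) ^ x) << 3 = (x << 45) ^ (x << 3) supply
+# the wrap-around bits of the two rotates; sigma0 is built the same way from
+# shifts by 1, 8 and 7.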
+
+########################################################################
+# void sha512_transform_ssse3(struct sha512_block_state *state,
+# const u8 *data, size_t nblocks);
+# Purpose: Updates the SHA512 digest stored at "state" with the message
+# stored in "data".
+# The size of the message pointed to by "data" must be an integer multiple
+# of SHA512 message blocks.
+# "nblocks" is the message length in SHA512 blocks. Must be >= 1.
+########################################################################
+SYM_FUNC_START(sha512_transform_ssse3)
+
+ # Save GPRs
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ # Allocate Stack Space
+ push %rbp
+ mov %rsp, %rbp
+ sub $frame_size, %rsp
+ and $~(0x20 - 1), %rsp
+
+.Lupdateblock:
+
+	# Load state variables
+ mov DIGEST(0), a_64
+ mov DIGEST(1), b_64
+ mov DIGEST(2), c_64
+ mov DIGEST(3), d_64
+ mov DIGEST(4), e_64
+ mov DIGEST(5), f_64
+ mov DIGEST(6), g_64
+ mov DIGEST(7), h_64
+
+ t = 0
+ .rept 80/2 + 1
+ # (80 rounds) / (2 rounds/iteration) + (1 iteration)
+ # +1 iteration because the scheduler leads hashing by 1 iteration
+ .if t < 2
+ # BSWAP 2 QWORDS
+ movdqa XMM_QWORD_BSWAP(%rip), %xmm1
+ movdqu MSG(t), %xmm0
+ pshufb %xmm1, %xmm0 # BSWAP
+ movdqa %xmm0, W_t(t) # Store Scheduled Pair
+ paddq K_t(t), %xmm0 # Compute W[t]+K[t]
+ movdqa %xmm0, WK_2(t) # Store into WK for rounds
+ .elseif t < 16
+		# BSWAP 2 QWORDS # Compute 2 Rounds
+ movdqu MSG(t), %xmm0
+ pshufb %xmm1, %xmm0 # BSWAP
+ SHA512_Round t-2 # Round t-2
+ movdqa %xmm0, W_t(t) # Store Scheduled Pair
+ paddq K_t(t), %xmm0 # Compute W[t]+K[t]
+ SHA512_Round t-1 # Round t-1
+ movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK
+ .elseif t < 79
+		# Schedule 2 QWORDS # Compute 2 Rounds
+ SHA512_2Sched_2Round_sse t
+ .else
+ # Compute 2 Rounds
+ SHA512_Round t-2
+ SHA512_Round t-1
+ .endif
+ t = t+2
+ .endr
+
+ # Update digest
+ add a_64, DIGEST(0)
+ add b_64, DIGEST(1)
+ add c_64, DIGEST(2)
+ add d_64, DIGEST(3)
+ add e_64, DIGEST(4)
+ add f_64, DIGEST(5)
+ add g_64, DIGEST(6)
+ add h_64, DIGEST(7)
+
+ # Advance to next message block
+ add $16*8, msg
+ dec msglen
+ jnz .Lupdateblock
+
+ # Restore Stack Pointer
+ mov %rbp, %rsp
+ pop %rbp
+
+ # Restore GPRs
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+
+ RET
+SYM_FUNC_END(sha512_transform_ssse3)
+
+########################################################################
+### Binary Data
+
+.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
+.align 16
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+ .octa 0x08090a0b0c0d0e0f0001020304050607
+
+# Mergeable 640-byte rodata section. This allows the linker to merge the
+# table with an identical 640-byte fragment of another rodata section
+# (if such a section exists).
+.section .rodata.cst640.K512, "aM", @progbits, 640
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
diff --git a/lib/crypto/x86/sha512.h b/lib/crypto/x86/sha512.h
new file mode 100644
index 000000000000..0213c70cedd0
--- /dev/null
+++ b/lib/crypto/x86/sha512.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * x86-optimized SHA-512 block function
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <linux/static_call.h>
+
+DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic);
+
+#define DEFINE_X86_SHA512_FN(c_fn, asm_fn) \
+ asmlinkage void asm_fn(struct sha512_block_state *state, \
+ const u8 *data, size_t nblocks); \
+ static void c_fn(struct sha512_block_state *state, const u8 *data, \
+ size_t nblocks) \
+ { \
+ if (likely(irq_fpu_usable())) { \
+ kernel_fpu_begin(); \
+ asm_fn(state, data, nblocks); \
+ kernel_fpu_end(); \
+ } else { \
+ sha512_blocks_generic(state, data, nblocks); \
+ } \
+ }
+
+DEFINE_X86_SHA512_FN(sha512_blocks_ssse3, sha512_transform_ssse3);
+DEFINE_X86_SHA512_FN(sha512_blocks_avx, sha512_transform_avx);
+DEFINE_X86_SHA512_FN(sha512_blocks_avx2, sha512_transform_rorx);
+
+static void sha512_blocks(struct sha512_block_state *state,
+ const u8 *data, size_t nblocks)
+{
+ static_call(sha512_blocks_x86)(state, data, nblocks);
+}
+
+#define sha512_mod_init_arch sha512_mod_init_arch
+static void sha512_mod_init_arch(void)
+{
+ if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) &&
+ boot_cpu_has(X86_FEATURE_AVX)) {
+ if (boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_BMI2))
+ static_call_update(sha512_blocks_x86,
+ sha512_blocks_avx2);
+ else
+ static_call_update(sha512_blocks_x86,
+ sha512_blocks_avx);
+ } else if (boot_cpu_has(X86_FEATURE_SSSE3)) {
+ static_call_update(sha512_blocks_x86, sha512_blocks_ssse3);
+ }
+}