summaryrefslogtreecommitdiff
path: root/arch/arm64/crypto
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64/crypto')
-rw-r--r--arch/arm64/crypto/.gitignore2
-rw-r--r--arch/arm64/crypto/Kconfig314
-rw-r--r--arch/arm64/crypto/Makefile35
-rw-r--r--arch/arm64/crypto/aes-ce-ccm-core.S40
-rw-r--r--arch/arm64/crypto/aes-ce-ccm-glue.c204
-rw-r--r--arch/arm64/crypto/aes-ce-core.S16
-rw-r--r--arch/arm64/crypto/aes-ce-glue.c10
-rw-r--r--arch/arm64/crypto/aes-ce.S4
-rw-r--r--arch/arm64/crypto/aes-cipher-core.S8
-rw-r--r--arch/arm64/crypto/aes-cipher-glue.c2
-rw-r--r--arch/arm64/crypto/aes-glue-ce.c2
-rw-r--r--arch/arm64/crypto/aes-glue-neon.c1
-rw-r--r--arch/arm64/crypto/aes-glue.c248
-rw-r--r--arch/arm64/crypto/aes-modes.S495
-rw-r--r--arch/arm64/crypto/aes-neon.S6
-rw-r--r--arch/arm64/crypto/aes-neonbs-core.S307
-rw-r--r--arch/arm64/crypto/aes-neonbs-glue.c220
-rw-r--r--arch/arm64/crypto/chacha-neon-core.S209
-rw-r--r--arch/arm64/crypto/chacha-neon-glue.c22
-rw-r--r--arch/arm64/crypto/crct10dif-ce-core.S54
-rw-r--r--arch/arm64/crypto/crct10dif-ce-glue.c30
-rw-r--r--arch/arm64/crypto/ghash-ce-core.S48
-rw-r--r--arch/arm64/crypto/ghash-ce-glue.c601
-rw-r--r--arch/arm64/crypto/nh-neon-core.S7
-rw-r--r--arch/arm64/crypto/nhpoly1305-neon-glue.c22
-rw-r--r--arch/arm64/crypto/poly1305-armv8.pl2
-rw-r--r--arch/arm64/crypto/poly1305-core.S_shipped835
-rw-r--r--arch/arm64/crypto/poly1305-glue.c39
-rw-r--r--arch/arm64/crypto/polyval-ce-core.S361
-rw-r--r--arch/arm64/crypto/polyval-ce-glue.c191
-rw-r--r--arch/arm64/crypto/sha1-ce-core.S51
-rw-r--r--arch/arm64/crypto/sha1-ce-glue.c62
-rw-r--r--arch/arm64/crypto/sha2-ce-core.S46
-rw-r--r--arch/arm64/crypto/sha2-ce-glue.c91
-rw-r--r--arch/arm64/crypto/sha256-core.S_shipped2069
-rw-r--r--arch/arm64/crypto/sha256-glue.c42
-rw-r--r--arch/arm64/crypto/sha3-ce-core.S85
-rw-r--r--arch/arm64/crypto/sha3-ce-glue.c22
-rw-r--r--arch/arm64/crypto/sha512-armv8.pl2
-rw-r--r--arch/arm64/crypto/sha512-ce-core.S37
-rw-r--r--arch/arm64/crypto/sha512-ce-glue.c69
-rw-r--r--arch/arm64/crypto/sha512-core.S_shipped1093
-rw-r--r--arch/arm64/crypto/sha512-glue.c20
-rw-r--r--arch/arm64/crypto/sm3-ce-core.S5
-rw-r--r--arch/arm64/crypto/sm3-ce-glue.c30
-rw-r--r--arch/arm64/crypto/sm3-neon-core.S601
-rw-r--r--arch/arm64/crypto/sm3-neon-glue.c103
-rw-r--r--arch/arm64/crypto/sm4-ce-asm.h209
-rw-r--r--arch/arm64/crypto/sm4-ce-ccm-core.S329
-rw-r--r--arch/arm64/crypto/sm4-ce-ccm-glue.c307
-rw-r--r--arch/arm64/crypto/sm4-ce-cipher-core.S36
-rw-r--r--arch/arm64/crypto/sm4-ce-cipher-glue.c82
-rw-r--r--arch/arm64/crypto/sm4-ce-core.S963
-rw-r--r--arch/arm64/crypto/sm4-ce-gcm-core.S742
-rw-r--r--arch/arm64/crypto/sm4-ce-gcm-glue.c285
-rw-r--r--arch/arm64/crypto/sm4-ce-glue.c791
-rw-r--r--arch/arm64/crypto/sm4-ce.h13
-rw-r--r--arch/arm64/crypto/sm4-neon-core.S566
-rw-r--r--arch/arm64/crypto/sm4-neon-glue.c257
59 files changed, 7489 insertions, 5854 deletions
diff --git a/arch/arm64/crypto/.gitignore b/arch/arm64/crypto/.gitignore
index 879df8781ed5..fcf2d731e6c1 100644
--- a/arch/arm64/crypto/.gitignore
+++ b/arch/arm64/crypto/.gitignore
@@ -1,2 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
sha256-core.S
sha512-core.S
+poly1305-core.S
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index b8eb0453123d..eb7b423ba463 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -1,129 +1,325 @@
# SPDX-License-Identifier: GPL-2.0
-menuconfig ARM64_CRYPTO
- bool "ARM64 Accelerated Cryptographic Algorithms"
- depends on ARM64
+menu "Accelerated Cryptographic Algorithms for CPU (arm64)"
+
+config CRYPTO_GHASH_ARM64_CE
+ tristate "Hash functions: GHASH (ARMv8 Crypto Extensions)"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_HASH
+ select CRYPTO_LIB_AES
+ select CRYPTO_LIB_GF128MUL
+ select CRYPTO_AEAD
help
- Say Y here to choose from a selection of cryptographic algorithms
- implemented using ARM64 specific CPU features or instructions.
+ GCM GHASH function (NIST SP800-38D)
-if ARM64_CRYPTO
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
-config CRYPTO_SHA256_ARM64
- tristate "SHA-224/SHA-256 digest algorithm for arm64"
- select CRYPTO_HASH
+config CRYPTO_NHPOLY1305_NEON
+ tristate "Hash functions: NHPoly1305 (NEON)"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_NHPOLY1305
+ help
+ NHPoly1305 hash function (Adiantum)
-config CRYPTO_SHA512_ARM64
- tristate "SHA-384/SHA-512 digest algorithm for arm64"
+ Architecture: arm64 using:
+ - NEON (Advanced SIMD) extensions
+
+config CRYPTO_POLY1305_NEON
+ tristate "Hash functions: Poly1305 (NEON)"
+ depends on KERNEL_MODE_NEON
select CRYPTO_HASH
+ select CRYPTO_ARCH_HAVE_LIB_POLY1305
+ help
+ Poly1305 authenticator algorithm (RFC7539)
+
+ Architecture: arm64 using:
+ - NEON (Advanced SIMD) extensions
config CRYPTO_SHA1_ARM64_CE
- tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
+ tristate "Hash functions: SHA-1 (ARMv8 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_SHA1
+ help
+ SHA-1 secure hash algorithm (FIPS 180)
+
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
+
+config CRYPTO_SHA256_ARM64
+ tristate "Hash functions: SHA-224 and SHA-256"
+ select CRYPTO_HASH
+ help
+ SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
+
+ Architecture: arm64
config CRYPTO_SHA2_ARM64_CE
- tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
+ tristate "Hash functions: SHA-224 and SHA-256 (ARMv8 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_SHA256_ARM64
+ help
+ SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
+
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
+
+config CRYPTO_SHA512_ARM64
+ tristate "Hash functions: SHA-384 and SHA-512"
+ select CRYPTO_HASH
+ help
+ SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
+
+ Architecture: arm64
config CRYPTO_SHA512_ARM64_CE
- tristate "SHA-384/SHA-512 digest algorithm (ARMv8 Crypto Extensions)"
+ tristate "Hash functions: SHA-384 and SHA-512 (ARMv8 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_SHA512_ARM64
+ help
+ SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
+
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
config CRYPTO_SHA3_ARM64
- tristate "SHA3 digest algorithm (ARMv8.2 Crypto Extensions)"
+ tristate "Hash functions: SHA-3 (ARMv8.2 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_SHA3
+ help
+ SHA-3 secure hash algorithms (FIPS 202)
-config CRYPTO_SM3_ARM64_CE
- tristate "SM3 digest algorithm (ARMv8.2 Crypto Extensions)"
+ Architecture: arm64 using:
+ - ARMv8.2 Crypto Extensions
+
+config CRYPTO_SM3_NEON
+ tristate "Hash functions: SM3 (NEON)"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_SM3
+ help
+ SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012)
-config CRYPTO_SM4_ARM64_CE
- tristate "SM4 symmetric cipher (ARMv8.2 Crypto Extensions)"
- depends on KERNEL_MODE_NEON
- select CRYPTO_ALGAPI
- select CRYPTO_SM4
+ Architecture: arm64 using:
+ - NEON (Advanced SIMD) extensions
-config CRYPTO_GHASH_ARM64_CE
- tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
+config CRYPTO_SM3_ARM64_CE
+ tristate "Hash functions: SM3 (ARMv8.2 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
- select CRYPTO_GF128MUL
- select CRYPTO_LIB_AES
+ select CRYPTO_SM3
+ help
+ SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012)
-config CRYPTO_CRCT10DIF_ARM64_CE
- tristate "CRCT10DIF digest algorithm using PMULL instructions"
- depends on KERNEL_MODE_NEON && CRC_T10DIF
- select CRYPTO_HASH
+ Architecture: arm64 using:
+ - ARMv8.2 Crypto Extensions
+
+config CRYPTO_POLYVAL_ARM64_CE
+ tristate "Hash functions: POLYVAL (ARMv8 Crypto Extensions)"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_POLYVAL
+ help
+ POLYVAL hash function for HCTR2
+
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
config CRYPTO_AES_ARM64
- tristate "AES core cipher using scalar instructions"
+ tristate "Ciphers: AES, modes: ECB, CBC, CTR, CTS, XCTR, XTS"
select CRYPTO_AES
+ help
+ Block ciphers: AES cipher algorithms (FIPS-197)
+ Length-preserving ciphers: AES with ECB, CBC, CTR, CTS,
+ XCTR, and XTS modes
+ AEAD cipher: AES with CBC, ESSIV, and SHA-256
+ for fscrypt and dm-crypt
+
+ Architecture: arm64
config CRYPTO_AES_ARM64_CE
- tristate "AES core cipher using ARMv8 Crypto Extensions"
+ tristate "Ciphers: AES (ARMv8 Crypto Extensions)"
depends on ARM64 && KERNEL_MODE_NEON
select CRYPTO_ALGAPI
select CRYPTO_LIB_AES
+ help
+ Block ciphers: AES cipher algorithms (FIPS-197)
-config CRYPTO_AES_ARM64_CE_CCM
- tristate "AES in CCM mode using ARMv8 Crypto Extensions"
- depends on ARM64 && KERNEL_MODE_NEON
- select CRYPTO_ALGAPI
- select CRYPTO_AES_ARM64_CE
- select CRYPTO_AEAD
- select CRYPTO_LIB_AES
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
config CRYPTO_AES_ARM64_CE_BLK
- tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
+ tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (ARMv8 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_AES_ARM64_CE
- select CRYPTO_AES_ARM64
- select CRYPTO_SIMD
+ help
+ Length-preserving ciphers: AES cipher algorithms (FIPS-197)
+ with block cipher modes:
+ - ECB (Electronic Codebook) mode (NIST SP800-38A)
+ - CBC (Cipher Block Chaining) mode (NIST SP800-38A)
+ - CTR (Counter) mode (NIST SP800-38A)
+ - XTS (XOR Encrypt XOR with ciphertext stealing) mode (NIST SP800-38E
+ and IEEE 1619)
+
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
config CRYPTO_AES_ARM64_NEON_BLK
- tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
+ tristate "Ciphers: AES, modes: ECB/CBC/CTR/XTS (NEON)"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
- select CRYPTO_AES_ARM64
select CRYPTO_LIB_AES
- select CRYPTO_SIMD
+ help
+ Length-preserving ciphers: AES cipher algorithms (FIPS-197)
+ with block cipher modes:
+ - ECB (Electronic Codebook) mode (NIST SP800-38A)
+ - CBC (Cipher Block Chaining) mode (NIST SP800-38A)
+ - CTR (Counter) mode (NIST SP800-38A)
+ - XTS (XOR Encrypt XOR with ciphertext stealing) mode (NIST SP800-38E
+ and IEEE 1619)
+
+ Architecture: arm64 using:
+ - NEON (Advanced SIMD) extensions
config CRYPTO_CHACHA20_NEON
- tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions"
+ tristate "Ciphers: ChaCha (NEON)"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_LIB_CHACHA_GENERIC
select CRYPTO_ARCH_HAVE_LIB_CHACHA
+ help
+ Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
+ stream cipher algorithms
-config CRYPTO_POLY1305_NEON
- tristate "Poly1305 hash function using scalar or NEON instructions"
+ Architecture: arm64 using:
+ - NEON (Advanced SIMD) extensions
+
+config CRYPTO_AES_ARM64_BS
+ tristate "Ciphers: AES, modes: ECB/CBC/CTR/XCTR/XTS modes (bit-sliced NEON)"
depends on KERNEL_MODE_NEON
- select CRYPTO_HASH
- select CRYPTO_ARCH_HAVE_LIB_POLY1305
+ select CRYPTO_SKCIPHER
+ select CRYPTO_AES_ARM64_NEON_BLK
+ select CRYPTO_LIB_AES
+ help
+ Length-preserving ciphers: AES cipher algorithms (FIPS-197)
+ with block cipher modes:
+ - ECB (Electronic Codebook) mode (NIST SP800-38A)
+ - CBC (Cipher Block Chaining) mode (NIST SP800-38A)
+ - CTR (Counter) mode (NIST SP800-38A)
+ - XCTR mode for HCTR2
+ - XTS (XOR Encrypt XOR with ciphertext stealing) mode (NIST SP800-38E
+ and IEEE 1619)
-config CRYPTO_NHPOLY1305_NEON
- tristate "NHPoly1305 hash function using NEON instructions (for Adiantum)"
+ Architecture: arm64 using:
+ - bit-sliced algorithm
+ - NEON (Advanced SIMD) extensions
+
+config CRYPTO_SM4_ARM64_CE
+ tristate "Ciphers: SM4 (ARMv8.2 Crypto Extensions)"
depends on KERNEL_MODE_NEON
- select CRYPTO_NHPOLY1305
+ select CRYPTO_ALGAPI
+ select CRYPTO_SM4
+ help
+ Block ciphers: SM4 cipher algorithms (OSCCA GB/T 32907-2016)
-config CRYPTO_AES_ARM64_BS
- tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
+ Architecture: arm64 using:
+ - ARMv8.2 Crypto Extensions
+ - NEON (Advanced SIMD) extensions
+
+config CRYPTO_SM4_ARM64_CE_BLK
+ tristate "Ciphers: SM4, modes: ECB/CBC/CTR/XTS (ARMv8 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
- select CRYPTO_AES_ARM64_NEON_BLK
- select CRYPTO_AES_ARM64
+ select CRYPTO_SM4
+ help
+ Length-preserving ciphers: SM4 cipher algorithms (OSCCA GB/T 32907-2016)
+ with block cipher modes:
+ - ECB (Electronic Codebook) mode (NIST SP800-38A)
+ - CBC (Cipher Block Chaining) mode (NIST SP800-38A)
+ - CTR (Counter) mode (NIST SP800-38A)
+ - XTS (XOR Encrypt XOR with ciphertext stealing) mode (NIST SP800-38E
+ and IEEE 1619)
+
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
+ - NEON (Advanced SIMD) extensions
+
+config CRYPTO_SM4_ARM64_NEON_BLK
+ tristate "Ciphers: SM4, modes: ECB/CBC/CTR (NEON)"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SM4
+ help
+ Length-preserving ciphers: SM4 cipher algorithms (OSCCA GB/T 32907-2016)
+ with block cipher modes:
+ - ECB (Electronic Codebook) mode (NIST SP800-38A)
+ - CBC (Cipher Block Chaining) mode (NIST SP800-38A)
+ - CTR (Counter) mode (NIST SP800-38A)
+
+ Architecture: arm64 using:
+ - NEON (Advanced SIMD) extensions
+
+config CRYPTO_AES_ARM64_CE_CCM
+ tristate "AEAD cipher: AES in CCM mode (ARMv8 Crypto Extensions)"
+ depends on ARM64 && KERNEL_MODE_NEON
+ select CRYPTO_ALGAPI
+ select CRYPTO_AES_ARM64_CE
+ select CRYPTO_AEAD
select CRYPTO_LIB_AES
- select CRYPTO_SIMD
+ help
+ AEAD cipher: AES cipher algorithms (FIPS-197) with
+ CCM (Counter with Cipher Block Chaining-Message Authentication Code)
+ authenticated encryption mode (NIST SP800-38C)
+
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
+ - NEON (Advanced SIMD) extensions
+
+config CRYPTO_SM4_ARM64_CE_CCM
+ tristate "AEAD cipher: SM4 in CCM mode (ARMv8 Crypto Extensions)"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_ALGAPI
+ select CRYPTO_AEAD
+ select CRYPTO_SM4
+ select CRYPTO_SM4_ARM64_CE_BLK
+ help
+ AEAD cipher: SM4 cipher algorithms (OSCCA GB/T 32907-2016) with
+ CCM (Counter with Cipher Block Chaining-Message Authentication Code)
+ authenticated encryption mode (NIST SP800-38C)
+
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
+ - NEON (Advanced SIMD) extensions
+
+config CRYPTO_SM4_ARM64_CE_GCM
+ tristate "AEAD cipher: SM4 in GCM mode (ARMv8 Crypto Extensions)"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_ALGAPI
+ select CRYPTO_AEAD
+ select CRYPTO_SM4
+ select CRYPTO_SM4_ARM64_CE_BLK
+ help
+ AEAD cipher: SM4 cipher algorithms (OSCCA GB/T 32907-2016) with
+ GCM (Galois/Counter Mode) authenticated encryption mode (NIST SP800-38D)
+
+ Architecture: arm64 using:
+ - ARMv8 Crypto Extensions
+ - PMULL (Polynomial Multiply Long) instructions
+ - NEON (Advanced SIMD) extensions
+
+config CRYPTO_CRCT10DIF_ARM64_CE
+ tristate "CRCT10DIF (PMULL)"
+ depends on KERNEL_MODE_NEON && CRC_T10DIF
+ select CRYPTO_HASH
+ help
+ CRC16 CRC algorithm used for the T10 (SCSI) Data Integrity Field (DIF)
+
+ Architecture: arm64 using
+ - PMULL (Polynomial Multiply Long) instructions
+
+endmenu
-endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index d0901e610df3..fbe64dce66e0 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -17,15 +17,33 @@ sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o
obj-$(CONFIG_CRYPTO_SHA3_ARM64) += sha3-ce.o
sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o
+obj-$(CONFIG_CRYPTO_SM3_NEON) += sm3-neon.o
+sm3-neon-y := sm3-neon-glue.o sm3-neon-core.o
+
obj-$(CONFIG_CRYPTO_SM3_ARM64_CE) += sm3-ce.o
sm3-ce-y := sm3-ce-glue.o sm3-ce-core.o
-obj-$(CONFIG_CRYPTO_SM4_ARM64_CE) += sm4-ce.o
+obj-$(CONFIG_CRYPTO_SM4_ARM64_CE) += sm4-ce-cipher.o
+sm4-ce-cipher-y := sm4-ce-cipher-glue.o sm4-ce-cipher-core.o
+
+obj-$(CONFIG_CRYPTO_SM4_ARM64_CE_BLK) += sm4-ce.o
sm4-ce-y := sm4-ce-glue.o sm4-ce-core.o
+obj-$(CONFIG_CRYPTO_SM4_ARM64_CE_CCM) += sm4-ce-ccm.o
+sm4-ce-ccm-y := sm4-ce-ccm-glue.o sm4-ce-ccm-core.o
+
+obj-$(CONFIG_CRYPTO_SM4_ARM64_CE_GCM) += sm4-ce-gcm.o
+sm4-ce-gcm-y := sm4-ce-gcm-glue.o sm4-ce-gcm-core.o
+
+obj-$(CONFIG_CRYPTO_SM4_ARM64_NEON_BLK) += sm4-neon.o
+sm4-neon-y := sm4-neon-glue.o sm4-neon-core.o
+
obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
+obj-$(CONFIG_CRYPTO_POLYVAL_ARM64_CE) += polyval-ce.o
+polyval-ce-y := polyval-ce-glue.o polyval-ce-core.o
+
obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
@@ -63,24 +81,13 @@ aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
obj-$(CONFIG_CRYPTO_AES_ARM64_BS) += aes-neon-bs.o
aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
-CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS
-
-$(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
- $(call if_changed_rule,cc_o_c)
-
-ifdef REGENERATE_ARM64_CRYPTO
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $(<) void $(@)
-$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv8.pl
+$(obj)/%-core.S: $(src)/%-armv8.pl
$(call cmd,perlasm)
-$(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl
+$(obj)/sha256-core.S: $(src)/sha512-armv8.pl
$(call cmd,perlasm)
-$(src)/sha512-core.S_shipped: $(src)/sha512-armv8.pl
- $(call cmd,perlasm)
-
-endif
-
clean-files += poly1305-core.S sha256-core.S sha512-core.S
diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
index 9add9bbc48d8..b03f7f71f893 100644
--- a/arch/arm64/crypto/aes-ce-ccm-core.S
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -12,22 +12,21 @@
.arch armv8-a+crypto
/*
- * void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
- * u32 *macp, u8 const rk[], u32 rounds);
+ * u32 ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
+ * u32 macp, u8 const rk[], u32 rounds);
*/
-ENTRY(ce_aes_ccm_auth_data)
- ldr w8, [x3] /* leftover from prev round? */
+SYM_FUNC_START(ce_aes_ccm_auth_data)
ld1 {v0.16b}, [x0] /* load mac */
- cbz w8, 1f
- sub w8, w8, #16
+ cbz w3, 1f
+ sub w3, w3, #16
eor v1.16b, v1.16b, v1.16b
0: ldrb w7, [x1], #1 /* get 1 byte of input */
subs w2, w2, #1
- add w8, w8, #1
+ add w3, w3, #1
ins v1.b[0], w7
ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */
beq 8f /* out of input? */
- cbnz w8, 0b
+ cbnz w3, 0b
eor v0.16b, v0.16b, v1.16b
1: ld1 {v3.4s}, [x4] /* load first round key */
prfm pldl1strm, [x1]
@@ -62,7 +61,7 @@ ENTRY(ce_aes_ccm_auth_data)
beq 10f
adds w2, w2, #16
beq 10f
- mov w8, w2
+ mov w3, w2
7: ldrb w7, [x1], #1
umov w6, v0.b[0]
eor w6, w6, w7
@@ -71,23 +70,23 @@ ENTRY(ce_aes_ccm_auth_data)
beq 10f
ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */
b 7b
-8: cbz w8, 91f
- mov w7, w8
- add w8, w8, #16
+8: cbz w3, 91f
+ mov w7, w3
+ add w3, w3, #16
9: ext v1.16b, v1.16b, v1.16b, #1
adds w7, w7, #1
bne 9b
91: eor v0.16b, v0.16b, v1.16b
st1 {v0.16b}, [x0]
-10: str w8, [x3]
+10: mov w0, w3
ret
-ENDPROC(ce_aes_ccm_auth_data)
+SYM_FUNC_END(ce_aes_ccm_auth_data)
/*
* void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[],
* u32 rounds);
*/
-ENTRY(ce_aes_ccm_final)
+SYM_FUNC_START(ce_aes_ccm_final)
ld1 {v3.4s}, [x2], #16 /* load first round key */
ld1 {v0.16b}, [x0] /* load mac */
cmp w3, #12 /* which key size? */
@@ -121,9 +120,10 @@ ENTRY(ce_aes_ccm_final)
eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */
st1 {v0.16b}, [x0] /* store result */
ret
-ENDPROC(ce_aes_ccm_final)
+SYM_FUNC_END(ce_aes_ccm_final)
.macro aes_ccm_do_crypt,enc
+ cbz x2, 5f
ldr x8, [x6, #8] /* load lower ctr */
ld1 {v0.16b}, [x5] /* load mac */
CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */
@@ -212,10 +212,10 @@ CPU_LE( rev x8, x8 )
* u8 const rk[], u32 rounds, u8 mac[],
* u8 ctr[]);
*/
-ENTRY(ce_aes_ccm_encrypt)
+SYM_FUNC_START(ce_aes_ccm_encrypt)
aes_ccm_do_crypt 1
-ENDPROC(ce_aes_ccm_encrypt)
+SYM_FUNC_END(ce_aes_ccm_encrypt)
-ENTRY(ce_aes_ccm_decrypt)
+SYM_FUNC_START(ce_aes_ccm_decrypt)
aes_ccm_do_crypt 0
-ENDPROC(ce_aes_ccm_decrypt)
+SYM_FUNC_END(ce_aes_ccm_decrypt)
diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c
index 541cf9165748..25cd3808ecbe 100644
--- a/arch/arm64/crypto/aes-ce-ccm-glue.c
+++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -6,12 +6,10 @@
*/
#include <asm/neon.h>
-#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/aes.h>
#include <crypto/scatterwalk.h>
#include <crypto/internal/aead.h>
-#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <linux/module.h>
@@ -29,8 +27,8 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
return 6 + ctx->key_length / 4;
}
-asmlinkage void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
- u32 *macp, u32 const rk[], u32 rounds);
+asmlinkage u32 ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
+ u32 macp, u32 const rk[], u32 rounds);
asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
u32 const rk[], u32 rounds, u8 mac[],
@@ -47,14 +45,8 @@ static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key,
unsigned int key_len)
{
struct crypto_aes_ctx *ctx = crypto_aead_ctx(tfm);
- int ret;
- ret = ce_aes_expandkey(ctx, in_key, key_len);
- if (!ret)
- return 0;
-
- tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
+ return ce_aes_expandkey(ctx, in_key, key_len);
}
static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
@@ -102,41 +94,6 @@ static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen)
return 0;
}
-static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
- u32 abytes, u32 *macp)
-{
- if (crypto_simd_usable()) {
- kernel_neon_begin();
- ce_aes_ccm_auth_data(mac, in, abytes, macp, key->key_enc,
- num_rounds(key));
- kernel_neon_end();
- } else {
- if (*macp > 0 && *macp < AES_BLOCK_SIZE) {
- int added = min(abytes, AES_BLOCK_SIZE - *macp);
-
- crypto_xor(&mac[*macp], in, added);
-
- *macp += added;
- in += added;
- abytes -= added;
- }
-
- while (abytes >= AES_BLOCK_SIZE) {
- aes_encrypt(key, mac, mac);
- crypto_xor(mac, in, AES_BLOCK_SIZE);
-
- in += AES_BLOCK_SIZE;
- abytes -= AES_BLOCK_SIZE;
- }
-
- if (abytes > 0) {
- aes_encrypt(key, mac, mac);
- crypto_xor(mac, in, abytes);
- *macp = abytes;
- }
- }
-}
-
static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
@@ -156,7 +113,8 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
ltag.len = 6;
}
- ccm_update_mac(ctx, mac, (u8 *)&ltag, ltag.len, &macp);
+ macp = ce_aes_ccm_auth_data(mac, (u8 *)&ltag, ltag.len, macp,
+ ctx->key_enc, num_rounds(ctx));
scatterwalk_start(&walk, req->src);
do {
@@ -167,8 +125,16 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
scatterwalk_start(&walk, sg_next(walk.sg));
n = scatterwalk_clamp(&walk, len);
}
+ n = min_t(u32, n, SZ_4K); /* yield NEON at least every 4k */
p = scatterwalk_map(&walk);
- ccm_update_mac(ctx, mac, p, n, &macp);
+
+ macp = ce_aes_ccm_auth_data(mac, p, n, macp, ctx->key_enc,
+ num_rounds(ctx));
+
+ if (len / SZ_4K > (len - n) / SZ_4K) {
+ kernel_neon_end();
+ kernel_neon_begin();
+ }
len -= n;
scatterwalk_unmap(p);
@@ -177,54 +143,6 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
} while (len);
}
-static int ccm_crypt_fallback(struct skcipher_walk *walk, u8 mac[], u8 iv0[],
- struct crypto_aes_ctx *ctx, bool enc)
-{
- u8 buf[AES_BLOCK_SIZE];
- int err = 0;
-
- while (walk->nbytes) {
- int blocks = walk->nbytes / AES_BLOCK_SIZE;
- u32 tail = walk->nbytes % AES_BLOCK_SIZE;
- u8 *dst = walk->dst.virt.addr;
- u8 *src = walk->src.virt.addr;
- u32 nbytes = walk->nbytes;
-
- if (nbytes == walk->total && tail > 0) {
- blocks++;
- tail = 0;
- }
-
- do {
- u32 bsize = AES_BLOCK_SIZE;
-
- if (nbytes < AES_BLOCK_SIZE)
- bsize = nbytes;
-
- crypto_inc(walk->iv, AES_BLOCK_SIZE);
- aes_encrypt(ctx, buf, walk->iv);
- aes_encrypt(ctx, mac, mac);
- if (enc)
- crypto_xor(mac, src, bsize);
- crypto_xor_cpy(dst, src, buf, bsize);
- if (!enc)
- crypto_xor(mac, dst, bsize);
- dst += bsize;
- src += bsize;
- nbytes -= bsize;
- } while (--blocks);
-
- err = skcipher_walk_done(walk, tail);
- }
-
- if (!err) {
- aes_encrypt(ctx, buf, iv0);
- aes_encrypt(ctx, mac, mac);
- crypto_xor(mac, buf, AES_BLOCK_SIZE);
- }
- return err;
-}
-
static int ccm_encrypt(struct aead_request *req)
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
@@ -239,47 +157,43 @@ static int ccm_encrypt(struct aead_request *req)
if (err)
return err;
- if (req->assoclen)
- ccm_calculate_auth_mac(req, mac);
-
/* preserve the original iv for the final round */
memcpy(buf, req->iv, AES_BLOCK_SIZE);
err = skcipher_walk_aead_encrypt(&walk, req, false);
- if (crypto_simd_usable()) {
- while (walk.nbytes) {
- u32 tail = walk.nbytes % AES_BLOCK_SIZE;
+ kernel_neon_begin();
+
+ if (req->assoclen)
+ ccm_calculate_auth_mac(req, mac);
- if (walk.nbytes == walk.total)
- tail = 0;
+ while (walk.nbytes) {
+ u32 tail = walk.nbytes % AES_BLOCK_SIZE;
+ bool final = walk.nbytes == walk.total;
- kernel_neon_begin();
- ce_aes_ccm_encrypt(walk.dst.virt.addr,
- walk.src.virt.addr,
- walk.nbytes - tail, ctx->key_enc,
- num_rounds(ctx), mac, walk.iv);
- kernel_neon_end();
+ if (final)
+ tail = 0;
- err = skcipher_walk_done(&walk, tail);
- }
- if (!err) {
- kernel_neon_begin();
- ce_aes_ccm_final(mac, buf, ctx->key_enc,
- num_rounds(ctx));
+ ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes - tail, ctx->key_enc,
+ num_rounds(ctx), mac, walk.iv);
+
+ if (!final)
kernel_neon_end();
- }
- } else {
- err = ccm_crypt_fallback(&walk, mac, buf, ctx, true);
+ err = skcipher_walk_done(&walk, tail);
+ if (!final)
+ kernel_neon_begin();
}
- if (err)
- return err;
+
+ ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
+
+ kernel_neon_end();
/* copy authtag to end of dst */
scatterwalk_map_and_copy(mac, req->dst, req->assoclen + req->cryptlen,
crypto_aead_authsize(aead), 1);
- return 0;
+ return err;
}
static int ccm_decrypt(struct aead_request *req)
@@ -297,41 +211,39 @@ static int ccm_decrypt(struct aead_request *req)
if (err)
return err;
- if (req->assoclen)
- ccm_calculate_auth_mac(req, mac);
-
/* preserve the original iv for the final round */
memcpy(buf, req->iv, AES_BLOCK_SIZE);
err = skcipher_walk_aead_decrypt(&walk, req, false);
- if (crypto_simd_usable()) {
- while (walk.nbytes) {
- u32 tail = walk.nbytes % AES_BLOCK_SIZE;
+ kernel_neon_begin();
- if (walk.nbytes == walk.total)
- tail = 0;
+ if (req->assoclen)
+ ccm_calculate_auth_mac(req, mac);
- kernel_neon_begin();
- ce_aes_ccm_decrypt(walk.dst.virt.addr,
- walk.src.virt.addr,
- walk.nbytes - tail, ctx->key_enc,
- num_rounds(ctx), mac, walk.iv);
- kernel_neon_end();
+ while (walk.nbytes) {
+ u32 tail = walk.nbytes % AES_BLOCK_SIZE;
+ bool final = walk.nbytes == walk.total;
- err = skcipher_walk_done(&walk, tail);
- }
- if (!err) {
- kernel_neon_begin();
- ce_aes_ccm_final(mac, buf, ctx->key_enc,
- num_rounds(ctx));
+ if (final)
+ tail = 0;
+
+ ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes - tail, ctx->key_enc,
+ num_rounds(ctx), mac, walk.iv);
+
+ if (!final)
kernel_neon_end();
- }
- } else {
- err = ccm_crypt_fallback(&walk, mac, buf, ctx, false);
+ err = skcipher_walk_done(&walk, tail);
+ if (!final)
+ kernel_neon_begin();
}
- if (err)
+ ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
+
+ kernel_neon_end();
+
+ if (unlikely(err))
return err;
/* compare calculated auth tag with the stored one */
diff --git a/arch/arm64/crypto/aes-ce-core.S b/arch/arm64/crypto/aes-ce-core.S
index 76a30fe4ba8b..e52e13eb8fdb 100644
--- a/arch/arm64/crypto/aes-ce-core.S
+++ b/arch/arm64/crypto/aes-ce-core.S
@@ -8,7 +8,7 @@
.arch armv8-a+crypto
-ENTRY(__aes_ce_encrypt)
+SYM_FUNC_START(__aes_ce_encrypt)
sub w3, w3, #2
ld1 {v0.16b}, [x2]
ld1 {v1.4s}, [x0], #16
@@ -34,9 +34,9 @@ ENTRY(__aes_ce_encrypt)
eor v0.16b, v0.16b, v3.16b
st1 {v0.16b}, [x1]
ret
-ENDPROC(__aes_ce_encrypt)
+SYM_FUNC_END(__aes_ce_encrypt)
-ENTRY(__aes_ce_decrypt)
+SYM_FUNC_START(__aes_ce_decrypt)
sub w3, w3, #2
ld1 {v0.16b}, [x2]
ld1 {v1.4s}, [x0], #16
@@ -62,23 +62,23 @@ ENTRY(__aes_ce_decrypt)
eor v0.16b, v0.16b, v3.16b
st1 {v0.16b}, [x1]
ret
-ENDPROC(__aes_ce_decrypt)
+SYM_FUNC_END(__aes_ce_decrypt)
/*
* __aes_ce_sub() - use the aese instruction to perform the AES sbox
* substitution on each byte in 'input'
*/
-ENTRY(__aes_ce_sub)
+SYM_FUNC_START(__aes_ce_sub)
dup v1.4s, w0
movi v0.16b, #0
aese v0.16b, v1.16b
umov w0, v0.s[0]
ret
-ENDPROC(__aes_ce_sub)
+SYM_FUNC_END(__aes_ce_sub)
-ENTRY(__aes_ce_invert)
+SYM_FUNC_START(__aes_ce_invert)
ld1 {v0.4s}, [x1]
aesimc v1.16b, v0.16b
st1 {v1.4s}, [x0]
ret
-ENDPROC(__aes_ce_invert)
+SYM_FUNC_END(__aes_ce_invert)
diff --git a/arch/arm64/crypto/aes-ce-glue.c b/arch/arm64/crypto/aes-ce-glue.c
index 6d085dc56c51..e921823ca103 100644
--- a/arch/arm64/crypto/aes-ce-glue.c
+++ b/arch/arm64/crypto/aes-ce-glue.c
@@ -9,9 +9,9 @@
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/aes.h>
+#include <crypto/algapi.h>
#include <crypto/internal/simd.h>
#include <linux/cpufeature.h>
-#include <linux/crypto.h>
#include <linux/module.h>
#include "aes-ce-setkey.h"
@@ -143,14 +143,8 @@ int ce_aes_setkey(struct crypto_tfm *tfm, const u8 *in_key,
unsigned int key_len)
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
- int ret;
- ret = ce_aes_expandkey(ctx, in_key, key_len);
- if (!ret)
- return 0;
-
- tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
+ return ce_aes_expandkey(ctx, in_key, key_len);
}
EXPORT_SYMBOL(ce_aes_setkey);
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
index c132c49c89a8..1dc5bbbfeed2 100644
--- a/arch/arm64/crypto/aes-ce.S
+++ b/arch/arm64/crypto/aes-ce.S
@@ -9,8 +9,8 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
-#define AES_ENTRY(func) ENTRY(ce_ ## func)
-#define AES_ENDPROC(func) ENDPROC(ce_ ## func)
+#define AES_FUNC_START(func) SYM_FUNC_START(ce_ ## func)
+#define AES_FUNC_END(func) SYM_FUNC_END(ce_ ## func)
.arch armv8-a+crypto
diff --git a/arch/arm64/crypto/aes-cipher-core.S b/arch/arm64/crypto/aes-cipher-core.S
index 423d0aebc570..c9d6955f8404 100644
--- a/arch/arm64/crypto/aes-cipher-core.S
+++ b/arch/arm64/crypto/aes-cipher-core.S
@@ -122,11 +122,11 @@ CPU_BE( rev w7, w7 )
ret
.endm
-ENTRY(__aes_arm64_encrypt)
+SYM_FUNC_START(__aes_arm64_encrypt)
do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2
-ENDPROC(__aes_arm64_encrypt)
+SYM_FUNC_END(__aes_arm64_encrypt)
.align 5
-ENTRY(__aes_arm64_decrypt)
+SYM_FUNC_START(__aes_arm64_decrypt)
do_crypt iround, crypto_it_tab, crypto_aes_inv_sbox, 0
-ENDPROC(__aes_arm64_decrypt)
+SYM_FUNC_END(__aes_arm64_decrypt)
diff --git a/arch/arm64/crypto/aes-cipher-glue.c b/arch/arm64/crypto/aes-cipher-glue.c
index 8caf6dfefce8..4ec55e568941 100644
--- a/arch/arm64/crypto/aes-cipher-glue.c
+++ b/arch/arm64/crypto/aes-cipher-glue.c
@@ -6,7 +6,7 @@
*/
#include <crypto/aes.h>
-#include <linux/crypto.h>
+#include <crypto/algapi.h>
#include <linux/module.h>
asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
diff --git a/arch/arm64/crypto/aes-glue-ce.c b/arch/arm64/crypto/aes-glue-ce.c
new file mode 100644
index 000000000000..7d309ceeddf3
--- /dev/null
+++ b/arch/arm64/crypto/aes-glue-ce.c
@@ -0,0 +1,2 @@
+#define USE_V8_CRYPTO_EXTENSIONS
+#include "aes-glue.c"
diff --git a/arch/arm64/crypto/aes-glue-neon.c b/arch/arm64/crypto/aes-glue-neon.c
new file mode 100644
index 000000000000..8ba046321064
--- /dev/null
+++ b/arch/arm64/crypto/aes-glue-neon.c
@@ -0,0 +1 @@
+#include "aes-glue.c"
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
index aa57dc639f77..162787c7aa86 100644
--- a/arch/arm64/crypto/aes-glue.c
+++ b/arch/arm64/crypto/aes-glue.c
@@ -10,7 +10,7 @@
#include <asm/simd.h>
#include <crypto/aes.h>
#include <crypto/ctr.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
@@ -34,10 +34,11 @@
#define aes_essiv_cbc_encrypt ce_aes_essiv_cbc_encrypt
#define aes_essiv_cbc_decrypt ce_aes_essiv_cbc_decrypt
#define aes_ctr_encrypt ce_aes_ctr_encrypt
+#define aes_xctr_encrypt ce_aes_xctr_encrypt
#define aes_xts_encrypt ce_aes_xts_encrypt
#define aes_xts_decrypt ce_aes_xts_decrypt
#define aes_mac_update ce_aes_mac_update
-MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
+MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS/XCTR using ARMv8 Crypto Extensions");
#else
#define MODE "neon"
#define PRIO 200
@@ -50,16 +51,18 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
#define aes_essiv_cbc_encrypt neon_aes_essiv_cbc_encrypt
#define aes_essiv_cbc_decrypt neon_aes_essiv_cbc_decrypt
#define aes_ctr_encrypt neon_aes_ctr_encrypt
+#define aes_xctr_encrypt neon_aes_xctr_encrypt
#define aes_xts_encrypt neon_aes_xts_encrypt
#define aes_xts_decrypt neon_aes_xts_decrypt
#define aes_mac_update neon_aes_mac_update
-MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON");
+MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS/XCTR using ARMv8 NEON");
#endif
-#if defined(USE_V8_CRYPTO_EXTENSIONS) || !defined(CONFIG_CRYPTO_AES_ARM64_BS)
+#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS)
MODULE_ALIAS_CRYPTO("ecb(aes)");
MODULE_ALIAS_CRYPTO("cbc(aes)");
MODULE_ALIAS_CRYPTO("ctr(aes)");
MODULE_ALIAS_CRYPTO("xts(aes)");
+MODULE_ALIAS_CRYPTO("xctr(aes)");
#endif
MODULE_ALIAS_CRYPTO("cts(cbc(aes))");
MODULE_ALIAS_CRYPTO("essiv(cbc(aes),sha256)");
@@ -87,7 +90,10 @@ asmlinkage void aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int bytes, u8 const iv[]);
asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
- int rounds, int blocks, u8 ctr[]);
+ int rounds, int bytes, u8 ctr[]);
+
+asmlinkage void aes_xctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
+ int rounds, int bytes, u8 ctr[], int byte_ctr);
asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[],
int rounds, int bytes, u32 const rk2[], u8 iv[],
@@ -103,9 +109,9 @@ asmlinkage void aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
int rounds, int blocks, u8 iv[],
u32 const rk2[]);
-asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds,
- int blocks, u8 dg[], int enc_before,
- int enc_after);
+asmlinkage int aes_mac_update(u8 const in[], u32 const rk[], int rounds,
+ int blocks, u8 dg[], int enc_before,
+ int enc_after);
struct crypto_aes_xts_ctx {
struct crypto_aes_ctx key1;
@@ -132,13 +138,8 @@ static int skcipher_aes_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
unsigned int key_len)
{
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
- int ret;
-
- ret = aes_expandkey(ctx, in_key, key_len);
- if (ret)
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return ret;
+ return aes_expandkey(ctx, in_key, key_len);
}
static int __maybe_unused xts_set_key(struct crypto_skcipher *tfm,
@@ -155,11 +156,7 @@ static int __maybe_unused xts_set_key(struct crypto_skcipher *tfm,
if (!ret)
ret = aes_expandkey(&ctx->key2, &in_key[key_len / 2],
key_len / 2);
- if (!ret)
- return 0;
-
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
+ return ret;
}
static int __maybe_unused essiv_cbc_set_key(struct crypto_skcipher *tfm,
@@ -167,25 +164,16 @@ static int __maybe_unused essiv_cbc_set_key(struct crypto_skcipher *tfm,
unsigned int key_len)
{
struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
- SHASH_DESC_ON_STACK(desc, ctx->hash);
u8 digest[SHA256_DIGEST_SIZE];
int ret;
ret = aes_expandkey(&ctx->key1, in_key, key_len);
if (ret)
- goto out;
-
- desc->tfm = ctx->hash;
- crypto_shash_digest(desc, in_key, key_len, digest);
+ return ret;
- ret = aes_expandkey(&ctx->key2, digest, sizeof(digest));
- if (ret)
- goto out;
+ crypto_shash_tfm_digest(ctx->hash, in_key, key_len, digest);
- return 0;
-out:
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
+ return aes_expandkey(&ctx->key2, digest, sizeof(digest));
}
static int __maybe_unused ecb_encrypt(struct skcipher_request *req)
@@ -460,66 +448,94 @@ static int __maybe_unused essiv_cbc_decrypt(struct skcipher_request *req)
return err ?: cbc_decrypt_walk(req, &walk);
}
-static int ctr_encrypt(struct skcipher_request *req)
+static int __maybe_unused xctr_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, rounds = 6 + ctx->key_length / 4;
struct skcipher_walk walk;
- int blocks;
+ unsigned int byte_ctr = 0;
err = skcipher_walk_virt(&walk, req, false);
- while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
- kernel_neon_begin();
- aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
- ctx->key_enc, rounds, blocks, walk.iv);
- kernel_neon_end();
- err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
- }
- if (walk.nbytes) {
- u8 __aligned(8) tail[AES_BLOCK_SIZE];
+ while (walk.nbytes > 0) {
+ const u8 *src = walk.src.virt.addr;
unsigned int nbytes = walk.nbytes;
- u8 *tdst = walk.dst.virt.addr;
- u8 *tsrc = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ u8 buf[AES_BLOCK_SIZE];
/*
- * Tell aes_ctr_encrypt() to process a tail block.
+ * If given less than 16 bytes, we must copy the partial block
+ * into a temporary buffer of 16 bytes to avoid out of bounds
+ * reads and writes. Furthermore, this code is somewhat unusual
+ * in that it expects the end of the data to be at the end of
+ * the temporary buffer, rather than the start of the data at
+ * the start of the temporary buffer.
*/
- blocks = -1;
+ if (unlikely(nbytes < AES_BLOCK_SIZE))
+ src = dst = memcpy(buf + sizeof(buf) - nbytes,
+ src, nbytes);
+ else if (nbytes < walk.total)
+ nbytes &= ~(AES_BLOCK_SIZE - 1);
kernel_neon_begin();
- aes_ctr_encrypt(tail, NULL, ctx->key_enc, rounds,
- blocks, walk.iv);
+ aes_xctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes,
+ walk.iv, byte_ctr);
kernel_neon_end();
- crypto_xor_cpy(tdst, tsrc, tail, nbytes);
- err = skcipher_walk_done(&walk, 0);
+
+ if (unlikely(nbytes < AES_BLOCK_SIZE))
+ memcpy(walk.dst.virt.addr,
+ buf + sizeof(buf) - nbytes, nbytes);
+ byte_ctr += nbytes;
+
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
return err;
}
-static void ctr_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
+static int __maybe_unused ctr_encrypt(struct skcipher_request *req)
{
- const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
- unsigned long flags;
-
- /*
- * Temporarily disable interrupts to avoid races where
- * cachelines are evicted when the CPU is interrupted
- * to do something else.
- */
- local_irq_save(flags);
- aes_encrypt(ctx, dst, src);
- local_irq_restore(flags);
-}
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int err, rounds = 6 + ctx->key_length / 4;
+ struct skcipher_walk walk;
-static int __maybe_unused ctr_encrypt_sync(struct skcipher_request *req)
-{
- if (!crypto_simd_usable())
- return crypto_ctr_encrypt_walk(req, ctr_encrypt_one);
+ err = skcipher_walk_virt(&walk, req, false);
- return ctr_encrypt(req);
+ while (walk.nbytes > 0) {
+ const u8 *src = walk.src.virt.addr;
+ unsigned int nbytes = walk.nbytes;
+ u8 *dst = walk.dst.virt.addr;
+ u8 buf[AES_BLOCK_SIZE];
+
+ /*
+ * If given less than 16 bytes, we must copy the partial block
+ * into a temporary buffer of 16 bytes to avoid out of bounds
+ * reads and writes. Furthermore, this code is somewhat unusual
+ * in that it expects the end of the data to be at the end of
+ * the temporary buffer, rather than the start of the data at
+ * the start of the temporary buffer.
+ */
+ if (unlikely(nbytes < AES_BLOCK_SIZE))
+ src = dst = memcpy(buf + sizeof(buf) - nbytes,
+ src, nbytes);
+ else if (nbytes < walk.total)
+ nbytes &= ~(AES_BLOCK_SIZE - 1);
+
+ kernel_neon_begin();
+ aes_ctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes,
+ walk.iv);
+ kernel_neon_end();
+
+ if (unlikely(nbytes < AES_BLOCK_SIZE))
+ memcpy(walk.dst.virt.addr,
+ buf + sizeof(buf) - nbytes, nbytes);
+
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ }
+
+ return err;
}
static int __maybe_unused xts_encrypt(struct skcipher_request *req)
@@ -668,12 +684,11 @@ static int __maybe_unused xts_decrypt(struct skcipher_request *req)
}
static struct skcipher_alg aes_algs[] = { {
-#if defined(USE_V8_CRYPTO_EXTENSIONS) || !defined(CONFIG_CRYPTO_AES_ARM64_BS)
+#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS)
.base = {
- .cra_name = "__ecb(aes)",
- .cra_driver_name = "__ecb-aes-" MODE,
+ .cra_name = "ecb(aes)",
+ .cra_driver_name = "ecb-aes-" MODE,
.cra_priority = PRIO,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_module = THIS_MODULE,
@@ -685,10 +700,9 @@ static struct skcipher_alg aes_algs[] = { {
.decrypt = ecb_decrypt,
}, {
.base = {
- .cra_name = "__cbc(aes)",
- .cra_driver_name = "__cbc-aes-" MODE,
+ .cra_name = "cbc(aes)",
+ .cra_driver_name = "cbc-aes-" MODE,
.cra_priority = PRIO,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_module = THIS_MODULE,
@@ -701,10 +715,9 @@ static struct skcipher_alg aes_algs[] = { {
.decrypt = cbc_decrypt,
}, {
.base = {
- .cra_name = "__ctr(aes)",
- .cra_driver_name = "__ctr-aes-" MODE,
+ .cra_name = "ctr(aes)",
+ .cra_driver_name = "ctr-aes-" MODE,
.cra_priority = PRIO,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_module = THIS_MODULE,
@@ -718,9 +731,9 @@ static struct skcipher_alg aes_algs[] = { {
.decrypt = ctr_encrypt,
}, {
.base = {
- .cra_name = "ctr(aes)",
- .cra_driver_name = "ctr-aes-" MODE,
- .cra_priority = PRIO - 1,
+ .cra_name = "xctr(aes)",
+ .cra_driver_name = "xctr-aes-" MODE,
+ .cra_priority = PRIO,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_module = THIS_MODULE,
@@ -730,14 +743,13 @@ static struct skcipher_alg aes_algs[] = { {
.ivsize = AES_BLOCK_SIZE,
.chunksize = AES_BLOCK_SIZE,
.setkey = skcipher_aes_setkey,
- .encrypt = ctr_encrypt_sync,
- .decrypt = ctr_encrypt_sync,
+ .encrypt = xctr_encrypt,
+ .decrypt = xctr_encrypt,
}, {
.base = {
- .cra_name = "__xts(aes)",
- .cra_driver_name = "__xts-aes-" MODE,
+ .cra_name = "xts(aes)",
+ .cra_driver_name = "xts-aes-" MODE,
.cra_priority = PRIO,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_xts_ctx),
.cra_module = THIS_MODULE,
@@ -752,10 +764,9 @@ static struct skcipher_alg aes_algs[] = { {
}, {
#endif
.base = {
- .cra_name = "__cts(cbc(aes))",
- .cra_driver_name = "__cts-cbc-aes-" MODE,
+ .cra_name = "cts(cbc(aes))",
+ .cra_driver_name = "cts-cbc-aes-" MODE,
.cra_priority = PRIO,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_module = THIS_MODULE,
@@ -769,10 +780,9 @@ static struct skcipher_alg aes_algs[] = { {
.decrypt = cts_cbc_decrypt,
}, {
.base = {
- .cra_name = "__essiv(cbc(aes),sha256)",
- .cra_driver_name = "__essiv-cbc-aes-sha256-" MODE,
+ .cra_name = "essiv(cbc(aes),sha256)",
+ .cra_driver_name = "essiv-cbc-aes-sha256-" MODE,
.cra_priority = PRIO + 1,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_essiv_cbc_ctx),
.cra_module = THIS_MODULE,
@@ -791,13 +801,8 @@ static int cbcmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
unsigned int key_len)
{
struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
- int err;
-
- err = aes_expandkey(&ctx->key, in_key, key_len);
- if (err)
- crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return err;
+ return aes_expandkey(&ctx->key, in_key, key_len);
}
static void cmac_gf128_mul_by_x(be128 *y, const be128 *x)
@@ -875,10 +880,17 @@ static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks,
int rounds = 6 + ctx->key_length / 4;
if (crypto_simd_usable()) {
- kernel_neon_begin();
- aes_mac_update(in, ctx->key_enc, rounds, blocks, dg, enc_before,
- enc_after);
- kernel_neon_end();
+ int rem;
+
+ do {
+ kernel_neon_begin();
+ rem = aes_mac_update(in, ctx->key_enc, rounds, blocks,
+ dg, enc_before, enc_after);
+ kernel_neon_end();
+ in += (blocks - rem) * AES_BLOCK_SIZE;
+ blocks = rem;
+ enc_before = 0;
+ } while (blocks);
} else {
if (enc_before)
aes_encrypt(ctx, dg, dg);
@@ -1009,28 +1021,15 @@ static struct shash_alg mac_algs[] = { {
.descsize = sizeof(struct mac_desc_ctx),
} };
-static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
-
static void aes_exit(void)
{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
- if (aes_simd_algs[i])
- simd_skcipher_free(aes_simd_algs[i]);
-
crypto_unregister_shashes(mac_algs, ARRAY_SIZE(mac_algs));
crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
}
static int __init aes_init(void)
{
- struct simd_skcipher_alg *simd;
- const char *basename;
- const char *algname;
- const char *drvname;
int err;
- int i;
err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
if (err)
@@ -1040,26 +1039,8 @@ static int __init aes_init(void)
if (err)
goto unregister_ciphers;
- for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
- if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
- continue;
-
- algname = aes_algs[i].base.cra_name + 2;
- drvname = aes_algs[i].base.cra_driver_name + 2;
- basename = aes_algs[i].base.cra_driver_name;
- simd = simd_skcipher_create_compat(algname, drvname, basename);
- err = PTR_ERR(simd);
- if (IS_ERR(simd))
- goto unregister_simds;
-
- aes_simd_algs[i] = simd;
- }
-
return 0;
-unregister_simds:
- aes_exit();
- return err;
unregister_ciphers:
crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
return err;
@@ -1071,6 +1052,7 @@ module_cpu_feature_match(AES, aes_init);
module_init(aes_init);
EXPORT_SYMBOL(neon_aes_ecb_encrypt);
EXPORT_SYMBOL(neon_aes_cbc_encrypt);
+EXPORT_SYMBOL(neon_aes_ctr_encrypt);
EXPORT_SYMBOL(neon_aes_xts_encrypt);
EXPORT_SYMBOL(neon_aes_xts_decrypt);
#endif
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index 131618389f1f..0e834a2c062c 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -22,26 +22,26 @@
#define ST5(x...) x
#endif
-aes_encrypt_block4x:
+SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
ret
-ENDPROC(aes_encrypt_block4x)
+SYM_FUNC_END(aes_encrypt_block4x)
-aes_decrypt_block4x:
+SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
ret
-ENDPROC(aes_decrypt_block4x)
+SYM_FUNC_END(aes_decrypt_block4x)
#if MAX_STRIDE == 5
-aes_encrypt_block5x:
+SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
ret
-ENDPROC(aes_encrypt_block5x)
+SYM_FUNC_END(aes_encrypt_block5x)
-aes_decrypt_block5x:
+SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
ret
-ENDPROC(aes_decrypt_block5x)
+SYM_FUNC_END(aes_decrypt_block5x)
#endif
/*
@@ -51,9 +51,8 @@ ENDPROC(aes_decrypt_block5x)
* int blocks)
*/
-AES_ENTRY(aes_ecb_encrypt)
- stp x29, x30, [sp, #-16]!
- mov x29, sp
+AES_FUNC_START(aes_ecb_encrypt)
+ frame_push 0
enc_prepare w3, x2, x5
@@ -77,14 +76,13 @@ ST5( st1 {v4.16b}, [x0], #16 )
subs w4, w4, #1
bne .Lecbencloop
.Lecbencout:
- ldp x29, x30, [sp], #16
+ frame_pop
ret
-AES_ENDPROC(aes_ecb_encrypt)
+AES_FUNC_END(aes_ecb_encrypt)
-AES_ENTRY(aes_ecb_decrypt)
- stp x29, x30, [sp, #-16]!
- mov x29, sp
+AES_FUNC_START(aes_ecb_decrypt)
+ frame_push 0
dec_prepare w3, x2, x5
@@ -108,9 +106,9 @@ ST5( st1 {v4.16b}, [x0], #16 )
subs w4, w4, #1
bne .Lecbdecloop
.Lecbdecout:
- ldp x29, x30, [sp], #16
+ frame_pop
ret
-AES_ENDPROC(aes_ecb_decrypt)
+AES_FUNC_END(aes_ecb_decrypt)
/*
@@ -126,7 +124,7 @@ AES_ENDPROC(aes_ecb_decrypt)
* u32 const rk2[]);
*/
-AES_ENTRY(aes_essiv_cbc_encrypt)
+AES_FUNC_START(aes_essiv_cbc_encrypt)
ld1 {v4.16b}, [x5] /* get iv */
mov w8, #14 /* AES-256: 14 rounds */
@@ -135,7 +133,7 @@ AES_ENTRY(aes_essiv_cbc_encrypt)
enc_switch_key w3, x2, x6
b .Lcbcencloop4x
-AES_ENTRY(aes_cbc_encrypt)
+AES_FUNC_START(aes_cbc_encrypt)
ld1 {v4.16b}, [x5] /* get iv */
enc_prepare w3, x2, x6
@@ -167,13 +165,10 @@ AES_ENTRY(aes_cbc_encrypt)
.Lcbcencout:
st1 {v4.16b}, [x5] /* return iv */
ret
-AES_ENDPROC(aes_cbc_encrypt)
-AES_ENDPROC(aes_essiv_cbc_encrypt)
-
-AES_ENTRY(aes_essiv_cbc_decrypt)
- stp x29, x30, [sp, #-16]!
- mov x29, sp
+AES_FUNC_END(aes_cbc_encrypt)
+AES_FUNC_END(aes_essiv_cbc_encrypt)
+AES_FUNC_START(aes_essiv_cbc_decrypt)
ld1 {cbciv.16b}, [x5] /* get iv */
mov w8, #14 /* AES-256: 14 rounds */
@@ -181,12 +176,10 @@ AES_ENTRY(aes_essiv_cbc_decrypt)
encrypt_block cbciv, w8, x6, x7, w9
b .Lessivcbcdecstart
-AES_ENTRY(aes_cbc_decrypt)
- stp x29, x30, [sp, #-16]!
- mov x29, sp
-
+AES_FUNC_START(aes_cbc_decrypt)
ld1 {cbciv.16b}, [x5] /* get iv */
.Lessivcbcdecstart:
+ frame_push 0
dec_prepare w3, x2, x6
.LcbcdecloopNx:
@@ -236,10 +229,10 @@ ST5( st1 {v4.16b}, [x0], #16 )
bne .Lcbcdecloop
.Lcbcdecout:
st1 {cbciv.16b}, [x5] /* return iv */
- ldp x29, x30, [sp], #16
+ frame_pop
ret
-AES_ENDPROC(aes_cbc_decrypt)
-AES_ENDPROC(aes_essiv_cbc_decrypt)
+AES_FUNC_END(aes_cbc_decrypt)
+AES_FUNC_END(aes_essiv_cbc_decrypt)
/*
@@ -249,7 +242,7 @@ AES_ENDPROC(aes_essiv_cbc_decrypt)
* int rounds, int bytes, u8 const iv[])
*/
-AES_ENTRY(aes_cbc_cts_encrypt)
+AES_FUNC_START(aes_cbc_cts_encrypt)
adr_l x8, .Lcts_permute_table
sub x4, x4, #16
add x9, x8, #32
@@ -276,9 +269,9 @@ AES_ENTRY(aes_cbc_cts_encrypt)
st1 {v0.16b}, [x4] /* overlapping stores */
st1 {v1.16b}, [x0]
ret
-AES_ENDPROC(aes_cbc_cts_encrypt)
+AES_FUNC_END(aes_cbc_cts_encrypt)
-AES_ENTRY(aes_cbc_cts_decrypt)
+AES_FUNC_START(aes_cbc_cts_decrypt)
adr_l x8, .Lcts_permute_table
sub x4, x4, #16
add x9, x8, #32
@@ -305,7 +298,7 @@ AES_ENTRY(aes_cbc_cts_decrypt)
st1 {v2.16b}, [x4] /* overlapping stores */
st1 {v0.16b}, [x0]
ret
-AES_ENDPROC(aes_cbc_cts_decrypt)
+AES_FUNC_END(aes_cbc_cts_decrypt)
.section ".rodata", "a"
.align 6
@@ -318,98 +311,308 @@ AES_ENDPROC(aes_cbc_cts_decrypt)
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.previous
-
/*
- * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- * int blocks, u8 ctr[])
+ * This macro generates the code for CTR and XCTR mode.
*/
+.macro ctr_encrypt xctr
+ // Arguments
+ OUT .req x0
+ IN .req x1
+ KEY .req x2
+ ROUNDS_W .req w3
+ BYTES_W .req w4
+ IV .req x5
+ BYTE_CTR_W .req w6 // XCTR only
+ // Intermediate values
+ CTR_W .req w11 // XCTR only
+ CTR .req x11 // XCTR only
+ IV_PART .req x12
+ BLOCKS .req x13
+ BLOCKS_W .req w13
+
+ frame_push 0
+
+ enc_prepare ROUNDS_W, KEY, IV_PART
+ ld1 {vctr.16b}, [IV]
-AES_ENTRY(aes_ctr_encrypt)
- stp x29, x30, [sp, #-16]!
- mov x29, sp
-
- enc_prepare w3, x2, x6
- ld1 {vctr.16b}, [x5]
+ /*
+ * Keep 64 bits of the IV in a register. For CTR mode this lets us
+ * easily increment the IV. For XCTR mode this lets us efficiently XOR
+ * the 64-bit counter with the IV.
+ */
+ .if \xctr
+ umov IV_PART, vctr.d[0]
+ lsr CTR_W, BYTE_CTR_W, #4
+ .else
+ umov IV_PART, vctr.d[1]
+ rev IV_PART, IV_PART
+ .endif
+
+.LctrloopNx\xctr:
+ add BLOCKS_W, BYTES_W, #15
+ sub BYTES_W, BYTES_W, #MAX_STRIDE << 4
+ lsr BLOCKS_W, BLOCKS_W, #4
+ mov w8, #MAX_STRIDE
+ cmp BLOCKS_W, w8
+ csel BLOCKS_W, BLOCKS_W, w8, lt
- umov x6, vctr.d[1] /* keep swabbed ctr in reg */
- rev x6, x6
- cmn w6, w4 /* 32 bit overflow? */
- bcs .Lctrloop
-.LctrloopNx:
- subs w4, w4, #MAX_STRIDE
- bmi .Lctr1x
- add w7, w6, #1
+ /*
+ * Set up the counter values in v0-v{MAX_STRIDE-1}.
+ *
+ * If we are encrypting less than MAX_STRIDE blocks, the tail block
+ * handling code expects the last keystream block to be in
+ * v{MAX_STRIDE-1}. For example: if encrypting two blocks with
+ * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks.
+ */
+ .if \xctr
+ add CTR, CTR, BLOCKS
+ .else
+ adds IV_PART, IV_PART, BLOCKS
+ .endif
mov v0.16b, vctr.16b
- add w8, w6, #2
mov v1.16b, vctr.16b
- add w9, w6, #3
mov v2.16b, vctr.16b
- add w9, w6, #3
- rev w7, w7
mov v3.16b, vctr.16b
- rev w8, w8
ST5( mov v4.16b, vctr.16b )
- mov v1.s[3], w7
- rev w9, w9
-ST5( add w10, w6, #4 )
- mov v2.s[3], w8
-ST5( rev w10, w10 )
- mov v3.s[3], w9
-ST5( mov v4.s[3], w10 )
- ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
+ .if \xctr
+ sub x6, CTR, #MAX_STRIDE - 1
+ sub x7, CTR, #MAX_STRIDE - 2
+ sub x8, CTR, #MAX_STRIDE - 3
+ sub x9, CTR, #MAX_STRIDE - 4
+ST5( sub x10, CTR, #MAX_STRIDE - 5 )
+ eor x6, x6, IV_PART
+ eor x7, x7, IV_PART
+ eor x8, x8, IV_PART
+ eor x9, x9, IV_PART
+ST5( eor x10, x10, IV_PART )
+ mov v0.d[0], x6
+ mov v1.d[0], x7
+ mov v2.d[0], x8
+ mov v3.d[0], x9
+ST5( mov v4.d[0], x10 )
+ .else
+ bcs 0f
+ .subsection 1
+ /*
+ * This subsection handles carries.
+ *
+ * Conditional branching here is allowed with respect to time
+ * invariance since the branches are dependent on the IV instead
+ * of the plaintext or key. This code is rarely executed in
+ * practice anyway.
+ */
+
+ /* Apply carry to outgoing counter. */
+0: umov x8, vctr.d[0]
+ rev x8, x8
+ add x8, x8, #1
+ rev x8, x8
+ ins vctr.d[0], x8
+
+ /*
+ * Apply carry to counter blocks if needed.
+ *
+ * Since the carry flag was set, we know 0 <= IV_PART <
+ * MAX_STRIDE. Using the value of IV_PART we can determine how
+ * many counter blocks need to be updated.
+ */
+ cbz IV_PART, 2f
+ adr x16, 1f
+ sub x16, x16, IV_PART, lsl #3
+ br x16
+ bti c
+ mov v0.d[0], vctr.d[0]
+ bti c
+ mov v1.d[0], vctr.d[0]
+ bti c
+ mov v2.d[0], vctr.d[0]
+ bti c
+ mov v3.d[0], vctr.d[0]
+ST5( bti c )
+ST5( mov v4.d[0], vctr.d[0] )
+1: b 2f
+ .previous
+
+2: rev x7, IV_PART
+ ins vctr.d[1], x7
+ sub x7, IV_PART, #MAX_STRIDE - 1
+ sub x8, IV_PART, #MAX_STRIDE - 2
+ sub x9, IV_PART, #MAX_STRIDE - 3
+ rev x7, x7
+ rev x8, x8
+ mov v1.d[1], x7
+ rev x9, x9
+ST5( sub x10, IV_PART, #MAX_STRIDE - 4 )
+ mov v2.d[1], x8
+ST5( rev x10, x10 )
+ mov v3.d[1], x9
+ST5( mov v4.d[1], x10 )
+ .endif
+
+ /*
+ * If there are at least MAX_STRIDE blocks left, XOR the data with
+ * keystream and store. Otherwise jump to tail handling.
+ */
+ tbnz BYTES_W, #31, .Lctrtail\xctr
+ ld1 {v5.16b-v7.16b}, [IN], #48
ST4( bl aes_encrypt_block4x )
ST5( bl aes_encrypt_block5x )
eor v0.16b, v5.16b, v0.16b
-ST4( ld1 {v5.16b}, [x1], #16 )
+ST4( ld1 {v5.16b}, [IN], #16 )
eor v1.16b, v6.16b, v1.16b
-ST5( ld1 {v5.16b-v6.16b}, [x1], #32 )
+ST5( ld1 {v5.16b-v6.16b}, [IN], #32 )
eor v2.16b, v7.16b, v2.16b
eor v3.16b, v5.16b, v3.16b
ST5( eor v4.16b, v6.16b, v4.16b )
- st1 {v0.16b-v3.16b}, [x0], #64
-ST5( st1 {v4.16b}, [x0], #16 )
- add x6, x6, #MAX_STRIDE
- rev x7, x6
- ins vctr.d[1], x7
- cbz w4, .Lctrout
- b .LctrloopNx
-.Lctr1x:
- adds w4, w4, #MAX_STRIDE
- beq .Lctrout
-.Lctrloop:
- mov v0.16b, vctr.16b
- encrypt_block v0, w3, x2, x8, w7
+ st1 {v0.16b-v3.16b}, [OUT], #64
+ST5( st1 {v4.16b}, [OUT], #16 )
+ cbz BYTES_W, .Lctrout\xctr
+ b .LctrloopNx\xctr
+
+.Lctrout\xctr:
+ .if !\xctr
+ st1 {vctr.16b}, [IV] /* return next CTR value */
+ .endif
+ frame_pop
+ ret
- adds x6, x6, #1 /* increment BE ctr */
- rev x7, x6
- ins vctr.d[1], x7
- bcs .Lctrcarry /* overflow? */
+.Lctrtail\xctr:
+ /*
+ * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
+ *
+ * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
+ * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
+ * v4 should have the next two counter blocks.
+ *
+ * This allows us to store the ciphertext by writing to overlapping
+ * regions of memory. Any invalid ciphertext blocks get overwritten by
+ * correctly computed blocks. This approach greatly simplifies the
+ * logic for storing the ciphertext.
+ */
+ mov x16, #16
+ ands w7, BYTES_W, #0xf
+ csel x13, x7, x16, ne
+
+ST5( cmp BYTES_W, #64 - (MAX_STRIDE << 4))
+ST5( csel x14, x16, xzr, gt )
+ cmp BYTES_W, #48 - (MAX_STRIDE << 4)
+ csel x15, x16, xzr, gt
+ cmp BYTES_W, #32 - (MAX_STRIDE << 4)
+ csel x16, x16, xzr, gt
+ cmp BYTES_W, #16 - (MAX_STRIDE << 4)
+
+ adr_l x9, .Lcts_permute_table
+ add x9, x9, x13
+ ble .Lctrtail1x\xctr
+
+ST5( ld1 {v5.16b}, [IN], x14 )
+ ld1 {v6.16b}, [IN], x15
+ ld1 {v7.16b}, [IN], x16
-.Lctrcarrydone:
- subs w4, w4, #1
- bmi .Lctrtailblock /* blocks <0 means tail block */
- ld1 {v3.16b}, [x1], #16
- eor v3.16b, v0.16b, v3.16b
- st1 {v3.16b}, [x0], #16
- bne .Lctrloop
-
-.Lctrout:
- st1 {vctr.16b}, [x5] /* return next CTR value */
- ldp x29, x30, [sp], #16
- ret
+ST4( bl aes_encrypt_block4x )
+ST5( bl aes_encrypt_block5x )
-.Lctrtailblock:
- st1 {v0.16b}, [x0]
- b .Lctrout
+ ld1 {v8.16b}, [IN], x13
+ ld1 {v9.16b}, [IN]
+ ld1 {v10.16b}, [x9]
+
+ST4( eor v6.16b, v6.16b, v0.16b )
+ST4( eor v7.16b, v7.16b, v1.16b )
+ST4( tbl v3.16b, {v3.16b}, v10.16b )
+ST4( eor v8.16b, v8.16b, v2.16b )
+ST4( eor v9.16b, v9.16b, v3.16b )
+
+ST5( eor v5.16b, v5.16b, v0.16b )
+ST5( eor v6.16b, v6.16b, v1.16b )
+ST5( tbl v4.16b, {v4.16b}, v10.16b )
+ST5( eor v7.16b, v7.16b, v2.16b )
+ST5( eor v8.16b, v8.16b, v3.16b )
+ST5( eor v9.16b, v9.16b, v4.16b )
+
+ST5( st1 {v5.16b}, [OUT], x14 )
+ st1 {v6.16b}, [OUT], x15
+ st1 {v7.16b}, [OUT], x16
+ add x13, x13, OUT
+ st1 {v9.16b}, [x13] // overlapping stores
+ st1 {v8.16b}, [OUT]
+ b .Lctrout\xctr
+
+.Lctrtail1x\xctr:
+ /*
+ * Handle <= 16 bytes of plaintext
+ *
+ * This code always reads and writes 16 bytes. To avoid out of bounds
+ * accesses, XCTR and CTR modes must use a temporary buffer when
+ * encrypting/decrypting less than 16 bytes.
+ *
+ * This code is unusual in that it loads the input and stores the output
+ * relative to the end of the buffers rather than relative to the start.
+ * This causes unusual behaviour when encrypting/decrypting less than 16
+ * bytes; the end of the data is expected to be at the end of the
+ * temporary buffer rather than the start of the data being at the start
+ * of the temporary buffer.
+ */
+ sub x8, x7, #16
+ csel x7, x7, x8, eq
+ add IN, IN, x7
+ add OUT, OUT, x7
+ ld1 {v5.16b}, [IN]
+ ld1 {v6.16b}, [OUT]
+ST5( mov v3.16b, v4.16b )
+ encrypt_block v3, ROUNDS_W, KEY, x8, w7
+ ld1 {v10.16b-v11.16b}, [x9]
+ tbl v3.16b, {v3.16b}, v10.16b
+ sshr v11.16b, v11.16b, #7
+ eor v5.16b, v5.16b, v3.16b
+ bif v5.16b, v6.16b, v11.16b
+ st1 {v5.16b}, [OUT]
+ b .Lctrout\xctr
+
+ // Arguments
+ .unreq OUT
+ .unreq IN
+ .unreq KEY
+ .unreq ROUNDS_W
+ .unreq BYTES_W
+ .unreq IV
+ .unreq BYTE_CTR_W // XCTR only
+ // Intermediate values
+ .unreq CTR_W // XCTR only
+ .unreq CTR // XCTR only
+ .unreq IV_PART
+ .unreq BLOCKS
+ .unreq BLOCKS_W
+.endm
+
+ /*
+ * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int bytes, u8 ctr[])
+ *
+ * The input and output buffers must always be at least 16 bytes even if
+ * encrypting/decrypting less than 16 bytes. Otherwise out of bounds
+ * accesses will occur. The data to be encrypted/decrypted is expected
+ * to be at the end of this 16-byte temporary buffer rather than the
+ * start.
+ */
+
+AES_FUNC_START(aes_ctr_encrypt)
+ ctr_encrypt 0
+AES_FUNC_END(aes_ctr_encrypt)
+
+ /*
+ * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int bytes, u8 const iv[], int byte_ctr)
+ *
+ * The input and output buffers must always be at least 16 bytes even if
+ * encrypting/decrypting less than 16 bytes. Otherwise out of bounds
+ * accesses will occur. The data to be encrypted/decrypted is expected
+ * to be at the end of this 16-byte temporary buffer rather than the
+ * start.
+ */
-.Lctrcarry:
- umov x7, vctr.d[0] /* load upper word of ctr */
- rev x7, x7 /* ... to handle the carry */
- add x7, x7, #1
- rev x7, x7
- ins vctr.d[0], x7
- b .Lctrcarrydone
-AES_ENDPROC(aes_ctr_encrypt)
+AES_FUNC_START(aes_xctr_encrypt)
+ ctr_encrypt 1
+AES_FUNC_END(aes_xctr_encrypt)
/*
@@ -433,9 +636,8 @@ AES_ENDPROC(aes_ctr_encrypt)
uzp1 xtsmask.4s, xtsmask.4s, \tmp\().4s
.endm
-AES_ENTRY(aes_xts_encrypt)
- stp x29, x30, [sp, #-16]!
- mov x29, sp
+AES_FUNC_START(aes_xts_encrypt)
+ frame_push 0
ld1 {v4.16b}, [x6]
xts_load_mask v8
@@ -493,7 +695,7 @@ AES_ENTRY(aes_xts_encrypt)
st1 {v0.16b}, [x0]
.Lxtsencret:
st1 {v4.16b}, [x6]
- ldp x29, x30, [sp], #16
+ frame_pop
ret
.LxtsencctsNx:
@@ -518,11 +720,10 @@ AES_ENTRY(aes_xts_encrypt)
st1 {v2.16b}, [x4] /* overlapping stores */
mov w4, wzr
b .Lxtsencctsout
-AES_ENDPROC(aes_xts_encrypt)
+AES_FUNC_END(aes_xts_encrypt)
-AES_ENTRY(aes_xts_decrypt)
- stp x29, x30, [sp, #-16]!
- mov x29, sp
+AES_FUNC_START(aes_xts_decrypt)
+ frame_push 0
/* subtract 16 bytes if we are doing CTS */
sub w8, w4, #0x10
@@ -583,7 +784,7 @@ AES_ENTRY(aes_xts_decrypt)
b .Lxtsdecloop
.Lxtsdecout:
st1 {v4.16b}, [x6]
- ldp x29, x30, [sp], #16
+ frame_pop
ret
.Lxtsdeccts:
@@ -612,68 +813,54 @@ AES_ENTRY(aes_xts_decrypt)
st1 {v2.16b}, [x4] /* overlapping stores */
mov w4, wzr
b .Lxtsdecctsout
-AES_ENDPROC(aes_xts_decrypt)
+AES_FUNC_END(aes_xts_decrypt)
/*
* aes_mac_update(u8 const in[], u32 const rk[], int rounds,
* int blocks, u8 dg[], int enc_before, int enc_after)
*/
-AES_ENTRY(aes_mac_update)
- frame_push 6
-
- mov x19, x0
- mov x20, x1
- mov x21, x2
- mov x22, x3
- mov x23, x4
- mov x24, x6
-
- ld1 {v0.16b}, [x23] /* get dg */
+AES_FUNC_START(aes_mac_update)
+ ld1 {v0.16b}, [x4] /* get dg */
enc_prepare w2, x1, x7
cbz w5, .Lmacloop4x
encrypt_block v0, w2, x1, x7, w8
.Lmacloop4x:
- subs w22, w22, #4
+ subs w3, w3, #4
bmi .Lmac1x
- ld1 {v1.16b-v4.16b}, [x19], #64 /* get next pt block */
+ ld1 {v1.16b-v4.16b}, [x0], #64 /* get next pt block */
eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */
- encrypt_block v0, w21, x20, x7, w8
+ encrypt_block v0, w2, x1, x7, w8
eor v0.16b, v0.16b, v2.16b
- encrypt_block v0, w21, x20, x7, w8
+ encrypt_block v0, w2, x1, x7, w8
eor v0.16b, v0.16b, v3.16b
- encrypt_block v0, w21, x20, x7, w8
+ encrypt_block v0, w2, x1, x7, w8
eor v0.16b, v0.16b, v4.16b
- cmp w22, wzr
- csinv x5, x24, xzr, eq
+ cmp w3, wzr
+ csinv x5, x6, xzr, eq
cbz w5, .Lmacout
- encrypt_block v0, w21, x20, x7, w8
- st1 {v0.16b}, [x23] /* return dg */
- cond_yield_neon .Lmacrestart
+ encrypt_block v0, w2, x1, x7, w8
+ st1 {v0.16b}, [x4] /* return dg */
+ cond_yield .Lmacout, x7, x8
b .Lmacloop4x
.Lmac1x:
- add w22, w22, #4
+ add w3, w3, #4
.Lmacloop:
- cbz w22, .Lmacout
- ld1 {v1.16b}, [x19], #16 /* get next pt block */
+ cbz w3, .Lmacout
+ ld1 {v1.16b}, [x0], #16 /* get next pt block */
eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */
- subs w22, w22, #1
- csinv x5, x24, xzr, eq
+ subs w3, w3, #1
+ csinv x5, x6, xzr, eq
cbz w5, .Lmacout
.Lmacenc:
- encrypt_block v0, w21, x20, x7, w8
+ encrypt_block v0, w2, x1, x7, w8
b .Lmacloop
.Lmacout:
- st1 {v0.16b}, [x23] /* return dg */
- frame_pop
+ st1 {v0.16b}, [x4] /* return dg */
+ mov w0, w3
ret
-
-.Lmacrestart:
- ld1 {v0.16b}, [x23] /* get dg */
- enc_prepare w21, x20, x0
- b .Lmacloop4x
-AES_ENDPROC(aes_mac_update)
+AES_FUNC_END(aes_mac_update)
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
index 22d9b110cf78..9de7fbc797af 100644
--- a/arch/arm64/crypto/aes-neon.S
+++ b/arch/arm64/crypto/aes-neon.S
@@ -8,8 +8,8 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
-#define AES_ENTRY(func) ENTRY(neon_ ## func)
-#define AES_ENDPROC(func) ENDPROC(neon_ ## func)
+#define AES_FUNC_START(func) SYM_FUNC_START(neon_ ## func)
+#define AES_FUNC_END(func) SYM_FUNC_END(neon_ ## func)
xtsmask .req v7
cbciv .req v7
@@ -66,7 +66,7 @@
prepare crypto_aes_inv_sbox, .LReverse_ShiftRows, \temp
.endm
- /* apply SubBytes transformation using the the preloaded Sbox */
+ /* apply SubBytes transformation using the preloaded Sbox */
.macro sub_bytes, in
sub v9.16b, \in\().16b, v15.16b
tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b
diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S
index 65982039fa36..baf450717b24 100644
--- a/arch/arm64/crypto/aes-neonbs-core.S
+++ b/arch/arm64/crypto/aes-neonbs-core.S
@@ -15,6 +15,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/assembler.h>
.text
@@ -380,7 +381,7 @@ ISRM0: .octa 0x0306090c00070a0d01040b0e0205080f
/*
* void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
*/
-ENTRY(aesbs_convert_key)
+SYM_FUNC_START(aesbs_convert_key)
ld1 {v7.4s}, [x1], #16 // load round 0 key
ld1 {v17.4s}, [x1], #16 // load round 1 key
@@ -425,10 +426,10 @@ ENTRY(aesbs_convert_key)
eor v17.16b, v17.16b, v7.16b
str q17, [x0]
ret
-ENDPROC(aesbs_convert_key)
+SYM_FUNC_END(aesbs_convert_key)
.align 4
-aesbs_encrypt8:
+SYM_FUNC_START_LOCAL(aesbs_encrypt8)
ldr q9, [bskey], #16 // round 0 key
ldr q8, M0SR
ldr q24, SR
@@ -488,10 +489,10 @@ aesbs_encrypt8:
eor v2.16b, v2.16b, v12.16b
eor v5.16b, v5.16b, v12.16b
ret
-ENDPROC(aesbs_encrypt8)
+SYM_FUNC_END(aesbs_encrypt8)
.align 4
-aesbs_decrypt8:
+SYM_FUNC_START_LOCAL(aesbs_decrypt8)
lsl x9, rounds, #7
add bskey, bskey, x9
@@ -553,7 +554,7 @@ aesbs_decrypt8:
eor v3.16b, v3.16b, v12.16b
eor v5.16b, v5.16b, v12.16b
ret
-ENDPROC(aesbs_decrypt8)
+SYM_FUNC_END(aesbs_decrypt8)
/*
* aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
@@ -613,7 +614,6 @@ ENDPROC(aesbs_decrypt8)
st1 {\o7\().16b}, [x19], #16
cbz x23, 1f
- cond_yield_neon
b 99b
1: frame_pop
@@ -621,21 +621,21 @@ ENDPROC(aesbs_decrypt8)
.endm
.align 4
-ENTRY(aesbs_ecb_encrypt)
+SYM_TYPED_FUNC_START(aesbs_ecb_encrypt)
__ecb_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
-ENDPROC(aesbs_ecb_encrypt)
+SYM_FUNC_END(aesbs_ecb_encrypt)
.align 4
-ENTRY(aesbs_ecb_decrypt)
+SYM_TYPED_FUNC_START(aesbs_ecb_decrypt)
__ecb_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
-ENDPROC(aesbs_ecb_decrypt)
+SYM_FUNC_END(aesbs_ecb_decrypt)
/*
* aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[])
*/
.align 4
-ENTRY(aesbs_cbc_decrypt)
+SYM_FUNC_START(aesbs_cbc_decrypt)
frame_push 6
mov x19, x0
@@ -715,12 +715,11 @@ ENTRY(aesbs_cbc_decrypt)
1: st1 {v24.16b}, [x24] // store IV
cbz x23, 2f
- cond_yield_neon
b 99b
2: frame_pop
ret
-ENDPROC(aesbs_cbc_decrypt)
+SYM_FUNC_END(aesbs_cbc_decrypt)
.macro next_tweak, out, in, const, tmp
sshr \tmp\().2d, \in\().2d, #63
@@ -736,131 +735,78 @@ ENDPROC(aesbs_cbc_decrypt)
* aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[])
*/
-__xts_crypt8:
- mov x6, #1
- lsl x6, x6, x23
- subs w23, w23, #8
- csel x23, x23, xzr, pl
- csel x6, x6, xzr, mi
+SYM_FUNC_START_LOCAL(__xts_crypt8)
+ movi v18.2s, #0x1
+ movi v19.2s, #0x87
+ uzp1 v18.4s, v18.4s, v19.4s
+
+ ld1 {v0.16b-v3.16b}, [x1], #64
+ ld1 {v4.16b-v7.16b}, [x1], #64
+
+ next_tweak v26, v25, v18, v19
+ next_tweak v27, v26, v18, v19
+ next_tweak v28, v27, v18, v19
+ next_tweak v29, v28, v18, v19
+ next_tweak v30, v29, v18, v19
+ next_tweak v31, v30, v18, v19
+ next_tweak v16, v31, v18, v19
+ next_tweak v17, v16, v18, v19
- ld1 {v0.16b}, [x20], #16
- next_tweak v26, v25, v30, v31
eor v0.16b, v0.16b, v25.16b
- tbnz x6, #1, 0f
-
- ld1 {v1.16b}, [x20], #16
- next_tweak v27, v26, v30, v31
eor v1.16b, v1.16b, v26.16b
- tbnz x6, #2, 0f
-
- ld1 {v2.16b}, [x20], #16
- next_tweak v28, v27, v30, v31
eor v2.16b, v2.16b, v27.16b
- tbnz x6, #3, 0f
-
- ld1 {v3.16b}, [x20], #16
- next_tweak v29, v28, v30, v31
eor v3.16b, v3.16b, v28.16b
- tbnz x6, #4, 0f
-
- ld1 {v4.16b}, [x20], #16
- str q29, [sp, #.Lframe_local_offset]
eor v4.16b, v4.16b, v29.16b
- next_tweak v29, v29, v30, v31
- tbnz x6, #5, 0f
-
- ld1 {v5.16b}, [x20], #16
- str q29, [sp, #.Lframe_local_offset + 16]
- eor v5.16b, v5.16b, v29.16b
- next_tweak v29, v29, v30, v31
- tbnz x6, #6, 0f
-
- ld1 {v6.16b}, [x20], #16
- str q29, [sp, #.Lframe_local_offset + 32]
- eor v6.16b, v6.16b, v29.16b
- next_tweak v29, v29, v30, v31
- tbnz x6, #7, 0f
+ eor v5.16b, v5.16b, v30.16b
+ eor v6.16b, v6.16b, v31.16b
+ eor v7.16b, v7.16b, v16.16b
- ld1 {v7.16b}, [x20], #16
- str q29, [sp, #.Lframe_local_offset + 48]
- eor v7.16b, v7.16b, v29.16b
- next_tweak v29, v29, v30, v31
+ stp q16, q17, [x6]
-0: mov bskey, x21
- mov rounds, x22
- br x7
-ENDPROC(__xts_crypt8)
+ mov bskey, x2
+ mov rounds, x3
+ br x16
+SYM_FUNC_END(__xts_crypt8)
.macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
- frame_push 6, 64
-
- mov x19, x0
- mov x20, x1
- mov x21, x2
- mov x22, x3
- mov x23, x4
- mov x24, x5
+ frame_push 0, 32
+ add x6, sp, #.Lframe_local_offset
-0: movi v30.2s, #0x1
- movi v25.2s, #0x87
- uzp1 v30.4s, v30.4s, v25.4s
- ld1 {v25.16b}, [x24]
+ ld1 {v25.16b}, [x5]
-99: adr x7, \do8
+0: adr x16, \do8
bl __xts_crypt8
- ldp q16, q17, [sp, #.Lframe_local_offset]
- ldp q18, q19, [sp, #.Lframe_local_offset + 32]
-
- eor \o0\().16b, \o0\().16b, v25.16b
- eor \o1\().16b, \o1\().16b, v26.16b
- eor \o2\().16b, \o2\().16b, v27.16b
- eor \o3\().16b, \o3\().16b, v28.16b
-
- st1 {\o0\().16b}, [x19], #16
- mov v25.16b, v26.16b
- tbnz x6, #1, 1f
- st1 {\o1\().16b}, [x19], #16
- mov v25.16b, v27.16b
- tbnz x6, #2, 1f
- st1 {\o2\().16b}, [x19], #16
- mov v25.16b, v28.16b
- tbnz x6, #3, 1f
- st1 {\o3\().16b}, [x19], #16
- mov v25.16b, v29.16b
- tbnz x6, #4, 1f
+ eor v16.16b, \o0\().16b, v25.16b
+ eor v17.16b, \o1\().16b, v26.16b
+ eor v18.16b, \o2\().16b, v27.16b
+ eor v19.16b, \o3\().16b, v28.16b
- eor \o4\().16b, \o4\().16b, v16.16b
- eor \o5\().16b, \o5\().16b, v17.16b
- eor \o6\().16b, \o6\().16b, v18.16b
- eor \o7\().16b, \o7\().16b, v19.16b
+ ldp q24, q25, [x6]
- st1 {\o4\().16b}, [x19], #16
- tbnz x6, #5, 1f
- st1 {\o5\().16b}, [x19], #16
- tbnz x6, #6, 1f
- st1 {\o6\().16b}, [x19], #16
- tbnz x6, #7, 1f
- st1 {\o7\().16b}, [x19], #16
+ eor v20.16b, \o4\().16b, v29.16b
+ eor v21.16b, \o5\().16b, v30.16b
+ eor v22.16b, \o6\().16b, v31.16b
+ eor v23.16b, \o7\().16b, v24.16b
- cbz x23, 1f
- st1 {v25.16b}, [x24]
+ st1 {v16.16b-v19.16b}, [x0], #64
+ st1 {v20.16b-v23.16b}, [x0], #64
- cond_yield_neon 0b
- b 99b
+ subs x4, x4, #8
+ b.gt 0b
-1: st1 {v25.16b}, [x24]
+ st1 {v25.16b}, [x5]
frame_pop
ret
.endm
-ENTRY(aesbs_xts_encrypt)
+SYM_TYPED_FUNC_START(aesbs_xts_encrypt)
__xts_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
-ENDPROC(aesbs_xts_encrypt)
+SYM_FUNC_END(aesbs_xts_encrypt)
-ENTRY(aesbs_xts_decrypt)
+SYM_TYPED_FUNC_START(aesbs_xts_decrypt)
__xts_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
-ENDPROC(aesbs_xts_decrypt)
+SYM_FUNC_END(aesbs_xts_decrypt)
.macro next_ctr, v
mov \v\().d[1], x8
@@ -872,134 +818,49 @@ ENDPROC(aesbs_xts_decrypt)
/*
* aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
- * int rounds, int blocks, u8 iv[], u8 final[])
+ * int rounds, int blocks, u8 iv[])
*/
-ENTRY(aesbs_ctr_encrypt)
- frame_push 8
-
- mov x19, x0
- mov x20, x1
- mov x21, x2
- mov x22, x3
- mov x23, x4
- mov x24, x5
- mov x25, x6
-
- cmp x25, #0
- cset x26, ne
- add x23, x23, x26 // do one extra block if final
-
-98: ldp x7, x8, [x24]
- ld1 {v0.16b}, [x24]
+SYM_FUNC_START(aesbs_ctr_encrypt)
+ frame_push 0
+ ldp x7, x8, [x5]
+ ld1 {v0.16b}, [x5]
CPU_LE( rev x7, x7 )
CPU_LE( rev x8, x8 )
adds x8, x8, #1
adc x7, x7, xzr
-99: mov x9, #1
- lsl x9, x9, x23
- subs w23, w23, #8
- csel x23, x23, xzr, pl
- csel x9, x9, xzr, le
-
- tbnz x9, #1, 0f
- next_ctr v1
- tbnz x9, #2, 0f
+0: next_ctr v1
next_ctr v2
- tbnz x9, #3, 0f
next_ctr v3
- tbnz x9, #4, 0f
next_ctr v4
- tbnz x9, #5, 0f
next_ctr v5
- tbnz x9, #6, 0f
next_ctr v6
- tbnz x9, #7, 0f
next_ctr v7
-0: mov bskey, x21
- mov rounds, x22
+ mov bskey, x2
+ mov rounds, x3
bl aesbs_encrypt8
- lsr x9, x9, x26 // disregard the extra block
- tbnz x9, #0, 0f
-
- ld1 {v8.16b}, [x20], #16
- eor v0.16b, v0.16b, v8.16b
- st1 {v0.16b}, [x19], #16
- tbnz x9, #1, 1f
-
- ld1 {v9.16b}, [x20], #16
- eor v1.16b, v1.16b, v9.16b
- st1 {v1.16b}, [x19], #16
- tbnz x9, #2, 2f
-
- ld1 {v10.16b}, [x20], #16
- eor v4.16b, v4.16b, v10.16b
- st1 {v4.16b}, [x19], #16
- tbnz x9, #3, 3f
-
- ld1 {v11.16b}, [x20], #16
- eor v6.16b, v6.16b, v11.16b
- st1 {v6.16b}, [x19], #16
- tbnz x9, #4, 4f
-
- ld1 {v12.16b}, [x20], #16
- eor v3.16b, v3.16b, v12.16b
- st1 {v3.16b}, [x19], #16
- tbnz x9, #5, 5f
-
- ld1 {v13.16b}, [x20], #16
- eor v7.16b, v7.16b, v13.16b
- st1 {v7.16b}, [x19], #16
- tbnz x9, #6, 6f
+ ld1 { v8.16b-v11.16b}, [x1], #64
+ ld1 {v12.16b-v15.16b}, [x1], #64
- ld1 {v14.16b}, [x20], #16
- eor v2.16b, v2.16b, v14.16b
- st1 {v2.16b}, [x19], #16
- tbnz x9, #7, 7f
+ eor v8.16b, v0.16b, v8.16b
+ eor v9.16b, v1.16b, v9.16b
+ eor v10.16b, v4.16b, v10.16b
+ eor v11.16b, v6.16b, v11.16b
+ eor v12.16b, v3.16b, v12.16b
+ eor v13.16b, v7.16b, v13.16b
+ eor v14.16b, v2.16b, v14.16b
+ eor v15.16b, v5.16b, v15.16b
- ld1 {v15.16b}, [x20], #16
- eor v5.16b, v5.16b, v15.16b
- st1 {v5.16b}, [x19], #16
+ st1 { v8.16b-v11.16b}, [x0], #64
+ st1 {v12.16b-v15.16b}, [x0], #64
-8: next_ctr v0
- st1 {v0.16b}, [x24]
- cbz x23, .Lctr_done
+ next_ctr v0
+ subs x4, x4, #8
+ b.gt 0b
- cond_yield_neon 98b
- b 99b
-
-.Lctr_done:
+ st1 {v0.16b}, [x5]
frame_pop
ret
-
- /*
- * If we are handling the tail of the input (x6 != NULL), return the
- * final keystream block back to the caller.
- */
-0: cbz x25, 8b
- st1 {v0.16b}, [x25]
- b 8b
-1: cbz x25, 8b
- st1 {v1.16b}, [x25]
- b 8b
-2: cbz x25, 8b
- st1 {v4.16b}, [x25]
- b 8b
-3: cbz x25, 8b
- st1 {v6.16b}, [x25]
- b 8b
-4: cbz x25, 8b
- st1 {v3.16b}, [x25]
- b 8b
-5: cbz x25, 8b
- st1 {v7.16b}, [x25]
- b 8b
-6: cbz x25, 8b
- st1 {v2.16b}, [x25]
- b 8b
-7: cbz x25, 8b
- st1 {v5.16b}, [x25]
- b 8b
-ENDPROC(aesbs_ctr_encrypt)
+SYM_FUNC_END(aesbs_ctr_encrypt)
diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c
index e3e27349a9fe..bac4cabef607 100644
--- a/arch/arm64/crypto/aes-neonbs-glue.c
+++ b/arch/arm64/crypto/aes-neonbs-glue.c
@@ -34,7 +34,7 @@ asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[]);
asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
- int rounds, int blocks, u8 iv[], u8 final[]);
+ int rounds, int blocks, u8 iv[]);
asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[]);
@@ -46,6 +46,8 @@ asmlinkage void neon_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks);
asmlinkage void neon_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks, u8 iv[]);
+asmlinkage void neon_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
+ int rounds, int bytes, u8 ctr[]);
asmlinkage void neon_aes_xts_encrypt(u8 out[], u8 const in[],
u32 const rk1[], int rounds, int bytes,
u32 const rk2[], u8 iv[], int first);
@@ -58,16 +60,11 @@ struct aesbs_ctx {
int rounds;
} __aligned(AES_BLOCK_SIZE);
-struct aesbs_cbc_ctx {
+struct aesbs_cbc_ctr_ctx {
struct aesbs_ctx key;
u32 enc[AES_MAX_KEYLENGTH_U32];
};
-struct aesbs_ctr_ctx {
- struct aesbs_ctx key; /* must be first member */
- struct crypto_aes_ctx fallback;
-};
-
struct aesbs_xts_ctx {
struct aesbs_ctx key;
u32 twkey[AES_MAX_KEYLENGTH_U32];
@@ -133,10 +130,10 @@ static int ecb_decrypt(struct skcipher_request *req)
return __ecb_crypt(req, aesbs_ecb_decrypt);
}
-static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+static int aesbs_cbc_ctr_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
unsigned int key_len)
{
- struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
struct crypto_aes_ctx rk;
int err;
@@ -151,6 +148,7 @@ static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
kernel_neon_begin();
aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds);
kernel_neon_end();
+ memzero_explicit(&rk, sizeof(rk));
return 0;
}
@@ -158,7 +156,7 @@ static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
static int cbc_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
int err;
@@ -181,7 +179,7 @@ static int cbc_encrypt(struct skcipher_request *req)
static int cbc_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
int err;
@@ -206,62 +204,35 @@ static int cbc_decrypt(struct skcipher_request *req)
return err;
}
-static int aesbs_ctr_setkey_sync(struct crypto_skcipher *tfm, const u8 *in_key,
- unsigned int key_len)
-{
- struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err;
-
- err = aes_expandkey(&ctx->fallback, in_key, key_len);
- if (err)
- return err;
-
- ctx->key.rounds = 6 + key_len / 4;
-
- kernel_neon_begin();
- aesbs_convert_key(ctx->key.rk, ctx->fallback.key_enc, ctx->key.rounds);
- kernel_neon_end();
-
- return 0;
-}
-
static int ctr_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct aesbs_cbc_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
- u8 buf[AES_BLOCK_SIZE];
int err;
err = skcipher_walk_virt(&walk, req, false);
while (walk.nbytes > 0) {
- unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
- u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL;
-
- if (walk.nbytes < walk.total) {
- blocks = round_down(blocks,
- walk.stride / AES_BLOCK_SIZE);
- final = NULL;
- }
+ int blocks = (walk.nbytes / AES_BLOCK_SIZE) & ~7;
+ int nbytes = walk.nbytes % (8 * AES_BLOCK_SIZE);
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
kernel_neon_begin();
- aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
- ctx->rk, ctx->rounds, blocks, walk.iv, final);
- kernel_neon_end();
-
- if (final) {
- u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
- u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
-
- crypto_xor_cpy(dst, src, final,
- walk.total % AES_BLOCK_SIZE);
-
- err = skcipher_walk_done(&walk, 0);
- break;
+ if (blocks >= 8) {
+ aesbs_ctr_encrypt(dst, src, ctx->key.rk, ctx->key.rounds,
+ blocks, walk.iv);
+ dst += blocks * AES_BLOCK_SIZE;
+ src += blocks * AES_BLOCK_SIZE;
}
- err = skcipher_walk_done(&walk,
- walk.nbytes - blocks * AES_BLOCK_SIZE);
+ if (nbytes && walk.nbytes == walk.total) {
+ neon_aes_ctr_encrypt(dst, src, ctx->enc, ctx->key.rounds,
+ nbytes, walk.iv);
+ nbytes = 0;
+ }
+ kernel_neon_end();
+ err = skcipher_walk_done(&walk, nbytes);
}
return err;
}
@@ -291,29 +262,6 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
return aesbs_setkey(tfm, in_key, key_len);
}
-static void ctr_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
-{
- struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
- unsigned long flags;
-
- /*
- * Temporarily disable interrupts to avoid races where
- * cachelines are evicted when the CPU is interrupted
- * to do something else.
- */
- local_irq_save(flags);
- aes_encrypt(&ctx->fallback, dst, src);
- local_irq_restore(flags);
-}
-
-static int ctr_encrypt_sync(struct skcipher_request *req)
-{
- if (!crypto_simd_usable())
- return crypto_ctr_encrypt_walk(req, ctr_encrypt_one);
-
- return ctr_encrypt(req);
-}
-
static int __xts_crypt(struct skcipher_request *req, bool encrypt,
void (*fn)(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[]))
@@ -354,23 +302,18 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt,
return err;
while (walk.nbytes >= AES_BLOCK_SIZE) {
- unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
-
- if (walk.nbytes < walk.total || walk.nbytes % AES_BLOCK_SIZE)
- blocks = round_down(blocks,
- walk.stride / AES_BLOCK_SIZE);
-
+ int blocks = (walk.nbytes / AES_BLOCK_SIZE) & ~7;
out = walk.dst.virt.addr;
in = walk.src.virt.addr;
nbytes = walk.nbytes;
kernel_neon_begin();
- if (likely(blocks > 6)) { /* plain NEON is faster otherwise */
- if (first)
+ if (blocks >= 8) {
+ if (first == 1)
neon_aes_ecb_encrypt(walk.iv, walk.iv,
ctx->twkey,
ctx->key.rounds, 1);
- first = 0;
+ first = 2;
fn(out, in, ctx->key.rk, ctx->key.rounds, blocks,
walk.iv);
@@ -379,10 +322,17 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt,
in += blocks * AES_BLOCK_SIZE;
nbytes -= blocks * AES_BLOCK_SIZE;
}
-
- if (walk.nbytes == walk.total && nbytes > 0)
- goto xts_tail;
-
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ if (encrypt)
+ neon_aes_xts_encrypt(out, in, ctx->cts.key_enc,
+ ctx->key.rounds, nbytes,
+ ctx->twkey, walk.iv, first);
+ else
+ neon_aes_xts_decrypt(out, in, ctx->cts.key_dec,
+ ctx->key.rounds, nbytes,
+ ctx->twkey, walk.iv, first);
+ nbytes = first = 0;
+ }
kernel_neon_end();
err = skcipher_walk_done(&walk, nbytes);
}
@@ -407,13 +357,12 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt,
nbytes = walk.nbytes;
kernel_neon_begin();
-xts_tail:
if (encrypt)
neon_aes_xts_encrypt(out, in, ctx->cts.key_enc, ctx->key.rounds,
- nbytes, ctx->twkey, walk.iv, first ?: 2);
+ nbytes, ctx->twkey, walk.iv, first);
else
neon_aes_xts_decrypt(out, in, ctx->cts.key_dec, ctx->key.rounds,
- nbytes, ctx->twkey, walk.iv, first ?: 2);
+ nbytes, ctx->twkey, walk.iv, first);
kernel_neon_end();
return skcipher_walk_done(&walk, 0);
@@ -430,13 +379,12 @@ static int xts_decrypt(struct skcipher_request *req)
}
static struct skcipher_alg aes_algs[] = { {
- .base.cra_name = "__ecb(aes)",
- .base.cra_driver_name = "__ecb-aes-neonbs",
+ .base.cra_name = "ecb(aes)",
+ .base.cra_driver_name = "ecb-aes-neonbs",
.base.cra_priority = 250,
.base.cra_blocksize = AES_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct aesbs_ctx),
.base.cra_module = THIS_MODULE,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
@@ -445,62 +393,43 @@ static struct skcipher_alg aes_algs[] = { {
.encrypt = ecb_encrypt,
.decrypt = ecb_decrypt,
}, {
- .base.cra_name = "__cbc(aes)",
- .base.cra_driver_name = "__cbc-aes-neonbs",
+ .base.cra_name = "cbc(aes)",
+ .base.cra_driver_name = "cbc-aes-neonbs",
.base.cra_priority = 250,
.base.cra_blocksize = AES_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct aesbs_cbc_ctx),
+ .base.cra_ctxsize = sizeof(struct aesbs_cbc_ctr_ctx),
.base.cra_module = THIS_MODULE,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.walksize = 8 * AES_BLOCK_SIZE,
.ivsize = AES_BLOCK_SIZE,
- .setkey = aesbs_cbc_setkey,
+ .setkey = aesbs_cbc_ctr_setkey,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
}, {
- .base.cra_name = "__ctr(aes)",
- .base.cra_driver_name = "__ctr-aes-neonbs",
+ .base.cra_name = "ctr(aes)",
+ .base.cra_driver_name = "ctr-aes-neonbs",
.base.cra_priority = 250,
.base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct aesbs_ctx),
+ .base.cra_ctxsize = sizeof(struct aesbs_cbc_ctr_ctx),
.base.cra_module = THIS_MODULE,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.chunksize = AES_BLOCK_SIZE,
.walksize = 8 * AES_BLOCK_SIZE,
.ivsize = AES_BLOCK_SIZE,
- .setkey = aesbs_setkey,
+ .setkey = aesbs_cbc_ctr_setkey,
.encrypt = ctr_encrypt,
.decrypt = ctr_encrypt,
}, {
- .base.cra_name = "ctr(aes)",
- .base.cra_driver_name = "ctr-aes-neonbs",
- .base.cra_priority = 250 - 1,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct aesbs_ctr_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .chunksize = AES_BLOCK_SIZE,
- .walksize = 8 * AES_BLOCK_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .setkey = aesbs_ctr_setkey_sync,
- .encrypt = ctr_encrypt_sync,
- .decrypt = ctr_encrypt_sync,
-}, {
- .base.cra_name = "__xts(aes)",
- .base.cra_driver_name = "__xts-aes-neonbs",
+ .base.cra_name = "xts(aes)",
+ .base.cra_driver_name = "xts-aes-neonbs",
.base.cra_priority = 250,
.base.cra_blocksize = AES_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct aesbs_xts_ctx),
.base.cra_module = THIS_MODULE,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.min_keysize = 2 * AES_MIN_KEY_SIZE,
.max_keysize = 2 * AES_MAX_KEY_SIZE,
@@ -511,54 +440,17 @@ static struct skcipher_alg aes_algs[] = { {
.decrypt = xts_decrypt,
} };
-static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
-
static void aes_exit(void)
{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
- if (aes_simd_algs[i])
- simd_skcipher_free(aes_simd_algs[i]);
-
crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
}
static int __init aes_init(void)
{
- struct simd_skcipher_alg *simd;
- const char *basename;
- const char *algname;
- const char *drvname;
- int err;
- int i;
-
if (!cpu_have_named_feature(ASIMD))
return -ENODEV;
- err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
- if (err)
- return err;
-
- for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
- if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
- continue;
-
- algname = aes_algs[i].base.cra_name + 2;
- drvname = aes_algs[i].base.cra_driver_name + 2;
- basename = aes_algs[i].base.cra_driver_name;
- simd = simd_skcipher_create_compat(algname, drvname, basename);
- err = PTR_ERR(simd);
- if (IS_ERR(simd))
- goto unregister_simds;
-
- aes_simd_algs[i] = simd;
- }
- return 0;
-
-unregister_simds:
- aes_exit();
- return err;
+ return crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
}
module_init(aes_init);
diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S
index 706c4e10e9e2..b70ac76f2610 100644
--- a/arch/arm64/crypto/chacha-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -36,7 +36,7 @@
*
* Clobbers: w3, x10, v4, v12
*/
-chacha_permute:
+SYM_FUNC_START_LOCAL(chacha_permute)
adr_l x10, ROT8
ld1 {v12.4s}, [x10]
@@ -104,9 +104,9 @@ chacha_permute:
b.ne .Ldoubleround
ret
-ENDPROC(chacha_permute)
+SYM_FUNC_END(chacha_permute)
-ENTRY(chacha_block_xor_neon)
+SYM_FUNC_START(chacha_block_xor_neon)
// x0: Input state matrix, s
// x1: 1 data block output, o
// x2: 1 data block input, i
@@ -143,9 +143,9 @@ ENTRY(chacha_block_xor_neon)
ldp x29, x30, [sp], #16
ret
-ENDPROC(chacha_block_xor_neon)
+SYM_FUNC_END(chacha_block_xor_neon)
-ENTRY(hchacha_block_neon)
+SYM_FUNC_START(hchacha_block_neon)
// x0: Input state matrix, s
// x1: output (8 32-bit words)
// w2: nrounds
@@ -163,7 +163,7 @@ ENTRY(hchacha_block_neon)
ldp x29, x30, [sp], #16
ret
-ENDPROC(hchacha_block_neon)
+SYM_FUNC_END(hchacha_block_neon)
a0 .req w12
a1 .req w13
@@ -183,7 +183,7 @@ ENDPROC(hchacha_block_neon)
a15 .req w28
.align 6
-ENTRY(chacha_4block_xor_neon)
+SYM_FUNC_START(chacha_4block_xor_neon)
frame_push 10
// x0: Input state matrix, s
@@ -195,7 +195,6 @@ ENTRY(chacha_4block_xor_neon)
adr_l x10, .Lpermute
and x5, x4, #63
add x10, x10, x5
- add x11, x10, #64
//
// This function encrypts four consecutive ChaCha blocks by loading
@@ -645,11 +644,11 @@ CPU_BE( rev a15, a15 )
zip2 v31.4s, v14.4s, v15.4s
eor a15, a15, w9
- mov x3, #64
+ add x3, x2, x4
+ sub x3, x3, #128 // start of last block
+
subs x5, x4, #128
- add x6, x5, x2
- csel x3, x3, xzr, ge
- csel x2, x2, x6, ge
+ csel x2, x2, x3, ge
// interleave 64-bit words in state n, n+2
zip1 v0.2d, v16.2d, v18.2d
@@ -658,13 +657,10 @@ CPU_BE( rev a15, a15 )
zip1 v8.2d, v17.2d, v19.2d
zip2 v12.2d, v17.2d, v19.2d
stp a2, a3, [x1, #-56]
- ld1 {v16.16b-v19.16b}, [x2], x3
subs x6, x4, #192
- ccmp x3, xzr, #4, lt
- add x7, x6, x2
- csel x3, x3, xzr, eq
- csel x2, x2, x7, eq
+ ld1 {v16.16b-v19.16b}, [x2], #64
+ csel x2, x2, x3, ge
zip1 v1.2d, v20.2d, v22.2d
zip2 v5.2d, v20.2d, v22.2d
@@ -672,13 +668,10 @@ CPU_BE( rev a15, a15 )
zip1 v9.2d, v21.2d, v23.2d
zip2 v13.2d, v21.2d, v23.2d
stp a6, a7, [x1, #-40]
- ld1 {v20.16b-v23.16b}, [x2], x3
subs x7, x4, #256
- ccmp x3, xzr, #4, lt
- add x8, x7, x2
- csel x3, x3, xzr, eq
- csel x2, x2, x8, eq
+ ld1 {v20.16b-v23.16b}, [x2], #64
+ csel x2, x2, x3, ge
zip1 v2.2d, v24.2d, v26.2d
zip2 v6.2d, v24.2d, v26.2d
@@ -686,12 +679,10 @@ CPU_BE( rev a15, a15 )
zip1 v10.2d, v25.2d, v27.2d
zip2 v14.2d, v25.2d, v27.2d
stp a10, a11, [x1, #-24]
- ld1 {v24.16b-v27.16b}, [x2], x3
subs x8, x4, #320
- ccmp x3, xzr, #4, lt
- add x9, x8, x2
- csel x2, x2, x9, eq
+ ld1 {v24.16b-v27.16b}, [x2], #64
+ csel x2, x2, x3, ge
zip1 v3.2d, v28.2d, v30.2d
zip2 v7.2d, v28.2d, v30.2d
@@ -699,159 +690,113 @@ CPU_BE( rev a15, a15 )
zip1 v11.2d, v29.2d, v31.2d
zip2 v15.2d, v29.2d, v31.2d
stp a14, a15, [x1, #-8]
+
+ tbnz x5, #63, .Lt128
ld1 {v28.16b-v31.16b}, [x2]
// xor with corresponding input, write to output
- tbnz x5, #63, 0f
eor v16.16b, v16.16b, v0.16b
eor v17.16b, v17.16b, v1.16b
eor v18.16b, v18.16b, v2.16b
eor v19.16b, v19.16b, v3.16b
- st1 {v16.16b-v19.16b}, [x1], #64
- cbz x5, .Lout
- tbnz x6, #63, 1f
+ tbnz x6, #63, .Lt192
+
eor v20.16b, v20.16b, v4.16b
eor v21.16b, v21.16b, v5.16b
eor v22.16b, v22.16b, v6.16b
eor v23.16b, v23.16b, v7.16b
- st1 {v20.16b-v23.16b}, [x1], #64
- cbz x6, .Lout
- tbnz x7, #63, 2f
+ st1 {v16.16b-v19.16b}, [x1], #64
+ tbnz x7, #63, .Lt256
+
eor v24.16b, v24.16b, v8.16b
eor v25.16b, v25.16b, v9.16b
eor v26.16b, v26.16b, v10.16b
eor v27.16b, v27.16b, v11.16b
- st1 {v24.16b-v27.16b}, [x1], #64
- cbz x7, .Lout
- tbnz x8, #63, 3f
+ st1 {v20.16b-v23.16b}, [x1], #64
+ tbnz x8, #63, .Lt320
+
eor v28.16b, v28.16b, v12.16b
eor v29.16b, v29.16b, v13.16b
eor v30.16b, v30.16b, v14.16b
eor v31.16b, v31.16b, v15.16b
+
+ st1 {v24.16b-v27.16b}, [x1], #64
st1 {v28.16b-v31.16b}, [x1]
.Lout: frame_pop
ret
- // fewer than 128 bytes of in/output
-0: ld1 {v8.16b}, [x10]
- ld1 {v9.16b}, [x11]
- movi v10.16b, #16
- sub x2, x1, #64
- add x1, x1, x5
- ld1 {v16.16b-v19.16b}, [x2]
- tbl v4.16b, {v0.16b-v3.16b}, v8.16b
- tbx v20.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v5.16b, {v0.16b-v3.16b}, v8.16b
- tbx v21.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v6.16b, {v0.16b-v3.16b}, v8.16b
- tbx v22.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v7.16b, {v0.16b-v3.16b}, v8.16b
- tbx v23.16b, {v16.16b-v19.16b}, v9.16b
-
- eor v20.16b, v20.16b, v4.16b
- eor v21.16b, v21.16b, v5.16b
- eor v22.16b, v22.16b, v6.16b
- eor v23.16b, v23.16b, v7.16b
- st1 {v20.16b-v23.16b}, [x1]
- b .Lout
-
// fewer than 192 bytes of in/output
-1: ld1 {v8.16b}, [x10]
- ld1 {v9.16b}, [x11]
- movi v10.16b, #16
- add x1, x1, x6
- tbl v0.16b, {v4.16b-v7.16b}, v8.16b
- tbx v20.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v1.16b, {v4.16b-v7.16b}, v8.16b
- tbx v21.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v2.16b, {v4.16b-v7.16b}, v8.16b
- tbx v22.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v3.16b, {v4.16b-v7.16b}, v8.16b
- tbx v23.16b, {v16.16b-v19.16b}, v9.16b
-
- eor v20.16b, v20.16b, v0.16b
- eor v21.16b, v21.16b, v1.16b
- eor v22.16b, v22.16b, v2.16b
- eor v23.16b, v23.16b, v3.16b
- st1 {v20.16b-v23.16b}, [x1]
+.Lt192: cbz x5, 1f // exactly 128 bytes?
+ ld1 {v28.16b-v31.16b}, [x10]
+ add x5, x5, x1
+ tbl v28.16b, {v4.16b-v7.16b}, v28.16b
+ tbl v29.16b, {v4.16b-v7.16b}, v29.16b
+ tbl v30.16b, {v4.16b-v7.16b}, v30.16b
+ tbl v31.16b, {v4.16b-v7.16b}, v31.16b
+
+0: eor v20.16b, v20.16b, v28.16b
+ eor v21.16b, v21.16b, v29.16b
+ eor v22.16b, v22.16b, v30.16b
+ eor v23.16b, v23.16b, v31.16b
+ st1 {v20.16b-v23.16b}, [x5] // overlapping stores
+1: st1 {v16.16b-v19.16b}, [x1]
b .Lout
+ // fewer than 128 bytes of in/output
+.Lt128: ld1 {v28.16b-v31.16b}, [x10]
+ add x5, x5, x1
+ sub x1, x1, #64
+ tbl v28.16b, {v0.16b-v3.16b}, v28.16b
+ tbl v29.16b, {v0.16b-v3.16b}, v29.16b
+ tbl v30.16b, {v0.16b-v3.16b}, v30.16b
+ tbl v31.16b, {v0.16b-v3.16b}, v31.16b
+ ld1 {v16.16b-v19.16b}, [x1] // reload first output block
+ b 0b
+
// fewer than 256 bytes of in/output
-2: ld1 {v4.16b}, [x10]
- ld1 {v5.16b}, [x11]
- movi v6.16b, #16
- add x1, x1, x7
+.Lt256: cbz x6, 2f // exactly 192 bytes?
+ ld1 {v4.16b-v7.16b}, [x10]
+ add x6, x6, x1
tbl v0.16b, {v8.16b-v11.16b}, v4.16b
- tbx v24.16b, {v20.16b-v23.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v1.16b, {v8.16b-v11.16b}, v4.16b
- tbx v25.16b, {v20.16b-v23.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v2.16b, {v8.16b-v11.16b}, v4.16b
- tbx v26.16b, {v20.16b-v23.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v3.16b, {v8.16b-v11.16b}, v4.16b
- tbx v27.16b, {v20.16b-v23.16b}, v5.16b
-
- eor v24.16b, v24.16b, v0.16b
- eor v25.16b, v25.16b, v1.16b
- eor v26.16b, v26.16b, v2.16b
- eor v27.16b, v27.16b, v3.16b
- st1 {v24.16b-v27.16b}, [x1]
+ tbl v1.16b, {v8.16b-v11.16b}, v5.16b
+ tbl v2.16b, {v8.16b-v11.16b}, v6.16b
+ tbl v3.16b, {v8.16b-v11.16b}, v7.16b
+
+ eor v28.16b, v28.16b, v0.16b
+ eor v29.16b, v29.16b, v1.16b
+ eor v30.16b, v30.16b, v2.16b
+ eor v31.16b, v31.16b, v3.16b
+ st1 {v28.16b-v31.16b}, [x6] // overlapping stores
+2: st1 {v20.16b-v23.16b}, [x1]
b .Lout
// fewer than 320 bytes of in/output
-3: ld1 {v4.16b}, [x10]
- ld1 {v5.16b}, [x11]
- movi v6.16b, #16
- add x1, x1, x8
+.Lt320: cbz x7, 3f // exactly 256 bytes?
+ ld1 {v4.16b-v7.16b}, [x10]
+ add x7, x7, x1
tbl v0.16b, {v12.16b-v15.16b}, v4.16b
- tbx v28.16b, {v24.16b-v27.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v1.16b, {v12.16b-v15.16b}, v4.16b
- tbx v29.16b, {v24.16b-v27.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v2.16b, {v12.16b-v15.16b}, v4.16b
- tbx v30.16b, {v24.16b-v27.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v3.16b, {v12.16b-v15.16b}, v4.16b
- tbx v31.16b, {v24.16b-v27.16b}, v5.16b
+ tbl v1.16b, {v12.16b-v15.16b}, v5.16b
+ tbl v2.16b, {v12.16b-v15.16b}, v6.16b
+ tbl v3.16b, {v12.16b-v15.16b}, v7.16b
eor v28.16b, v28.16b, v0.16b
eor v29.16b, v29.16b, v1.16b
eor v30.16b, v30.16b, v2.16b
eor v31.16b, v31.16b, v3.16b
- st1 {v28.16b-v31.16b}, [x1]
+ st1 {v28.16b-v31.16b}, [x7] // overlapping stores
+3: st1 {v24.16b-v27.16b}, [x1]
b .Lout
-ENDPROC(chacha_4block_xor_neon)
+SYM_FUNC_END(chacha_4block_xor_neon)
.section ".rodata", "a", %progbits
.align L1_CACHE_SHIFT
.Lpermute:
.set .Li, 0
- .rept 192
+ .rept 128
.byte (.Li - 64)
.set .Li, .Li + 1
.endr
diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c
index c1f9660d104c..af2bbca38e70 100644
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -55,10 +55,10 @@ static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
break;
}
chacha_4block_xor_neon(state, dst, src, nrounds, l);
- bytes -= CHACHA_BLOCK_SIZE * 5;
- src += CHACHA_BLOCK_SIZE * 5;
- dst += CHACHA_BLOCK_SIZE * 5;
- state[12] += 5;
+ bytes -= l;
+ src += l;
+ dst += l;
+ state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
}
}
@@ -87,9 +87,17 @@ void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
!crypto_simd_usable())
return chacha_crypt_generic(state, dst, src, bytes, nrounds);
- kernel_neon_begin();
- chacha_doneon(state, dst, src, bytes, nrounds);
- kernel_neon_end();
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ kernel_neon_begin();
+ chacha_doneon(state, dst, src, todo, nrounds);
+ kernel_neon_end();
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S
index e545b42e6a46..5604de61d06d 100644
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -66,12 +66,12 @@
#include <asm/assembler.h>
.text
- .cpu generic+crypto
+ .arch armv8-a+crypto
- init_crc .req w19
- buf .req x20
- len .req x21
- fold_consts_ptr .req x22
+ init_crc .req w0
+ buf .req x1
+ len .req x2
+ fold_consts_ptr .req x3
fold_consts .req v10
@@ -131,7 +131,7 @@
tbl bd4.16b, {\bd\().16b}, perm4.16b
.endm
-__pmull_p8_core:
+SYM_FUNC_START_LOCAL(__pmull_p8_core)
.L__pmull_p8_core:
ext t4.8b, ad.8b, ad.8b, #1 // A1
ext t5.8b, ad.8b, ad.8b, #2 // A2
@@ -194,7 +194,7 @@ __pmull_p8_core:
eor t4.16b, t4.16b, t5.16b
eor t6.16b, t6.16b, t3.16b
ret
-ENDPROC(__pmull_p8_core)
+SYM_FUNC_END(__pmull_p8_core)
.macro __pmull_p8, rq, ad, bd, i
.ifnc \bd, fold_consts
@@ -257,12 +257,6 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
.endm
.macro crc_t10dif_pmull, p
- frame_push 4, 128
-
- mov init_crc, w0
- mov buf, x1
- mov len, x2
-
__pmull_init_\p
// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
@@ -317,26 +311,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
fold_32_bytes \p, v6, v7
subs len, len, #128
- b.lt .Lfold_128_bytes_loop_done_\@
-
- if_will_cond_yield_neon
- stp q0, q1, [sp, #.Lframe_local_offset]
- stp q2, q3, [sp, #.Lframe_local_offset + 32]
- stp q4, q5, [sp, #.Lframe_local_offset + 64]
- stp q6, q7, [sp, #.Lframe_local_offset + 96]
- do_cond_yield_neon
- ldp q0, q1, [sp, #.Lframe_local_offset]
- ldp q2, q3, [sp, #.Lframe_local_offset + 32]
- ldp q4, q5, [sp, #.Lframe_local_offset + 64]
- ldp q6, q7, [sp, #.Lframe_local_offset + 96]
- ld1 {fold_consts.2d}, [fold_consts_ptr]
- __pmull_init_\p
- __pmull_pre_\p fold_consts
- endif_yield_neon
-
- b .Lfold_128_bytes_loop_\@
-
-.Lfold_128_bytes_loop_done_\@:
+ b.ge .Lfold_128_bytes_loop_\@
// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
@@ -453,7 +428,9 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
umov w0, v0.h[0]
+ .ifc \p, p8
frame_pop
+ .endif
ret
.Lless_than_256_bytes_\@:
@@ -488,9 +465,10 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
//
// Assumes len >= 16.
//
-ENTRY(crc_t10dif_pmull_p8)
- crc_t10dif_pmull p8
-ENDPROC(crc_t10dif_pmull_p8)
+SYM_FUNC_START(crc_t10dif_pmull_p8)
+ frame_push 1
+ crc_t10dif_pmull p8
+SYM_FUNC_END(crc_t10dif_pmull_p8)
.align 5
//
@@ -498,9 +476,9 @@ ENDPROC(crc_t10dif_pmull_p8)
//
// Assumes len >= 16.
//
-ENTRY(crc_t10dif_pmull_p64)
+SYM_FUNC_START(crc_t10dif_pmull_p64)
crc_t10dif_pmull p64
-ENDPROC(crc_t10dif_pmull_p64)
+SYM_FUNC_END(crc_t10dif_pmull_p64)
.section ".rodata", "a"
.align 4
diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c
index ccc3f6067742..09eb1456aed4 100644
--- a/arch/arm64/crypto/crct10dif-ce-glue.c
+++ b/arch/arm64/crypto/crct10dif-ce-glue.c
@@ -37,9 +37,18 @@ static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
u16 *crc = shash_desc_ctx(desc);
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
- kernel_neon_begin();
- *crc = crc_t10dif_pmull_p8(*crc, data, length);
- kernel_neon_end();
+ do {
+ unsigned int chunk = length;
+
+ if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
+ chunk = SZ_4K;
+
+ kernel_neon_begin();
+ *crc = crc_t10dif_pmull_p8(*crc, data, chunk);
+ kernel_neon_end();
+ data += chunk;
+ length -= chunk;
+ } while (length);
} else {
*crc = crc_t10dif_generic(*crc, data, length);
}
@@ -53,9 +62,18 @@ static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data,
u16 *crc = shash_desc_ctx(desc);
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
- kernel_neon_begin();
- *crc = crc_t10dif_pmull_p64(*crc, data, length);
- kernel_neon_end();
+ do {
+ unsigned int chunk = length;
+
+ if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
+ chunk = SZ_4K;
+
+ kernel_neon_begin();
+ *crc = crc_t10dif_pmull_p64(*crc, data, chunk);
+ kernel_neon_end();
+ data += chunk;
+ length -= chunk;
+ } while (length);
} else {
*crc = crc_t10dif_generic(*crc, data, length);
}
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index a791c4adf8e6..23ee9a5eaf27 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -6,6 +6,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/assembler.h>
SHASH .req v0
@@ -350,13 +351,13 @@ CPU_LE( rev64 T1.16b, T1.16b )
* void pmull_ghash_update(int blocks, u64 dg[], const char *src,
* struct ghash_key const *k, const char *head)
*/
-ENTRY(pmull_ghash_update_p64)
+SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
__pmull_ghash p64
-ENDPROC(pmull_ghash_update_p64)
+SYM_FUNC_END(pmull_ghash_update_p64)
-ENTRY(pmull_ghash_update_p8)
+SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
__pmull_ghash p8
-ENDPROC(pmull_ghash_update_p8)
+SYM_FUNC_END(pmull_ghash_update_p8)
KS0 .req v8
KS1 .req v9
@@ -435,9 +436,7 @@ ENDPROC(pmull_ghash_update_p8)
.align 6
.macro pmull_gcm_do_crypt, enc
- stp x29, x30, [sp, #-32]!
- mov x29, sp
- str x19, [sp, #24]
+ frame_push 1
load_round_keys x7, x6, x8
@@ -528,7 +527,7 @@ CPU_LE( rev w8, w8 )
.endif
bne 0b
-3: ldp x19, x10, [sp, #24]
+3: ldr x10, [sp, #.Lframe_local_offset]
cbz x10, 5f // output tag?
ld1 {INP3.16b}, [x10] // load lengths[]
@@ -544,9 +543,24 @@ CPU_LE( rev w8, w8 )
ext XL.16b, XL.16b, XL.16b, #8
rev64 XL.16b, XL.16b
eor XL.16b, XL.16b, KS0.16b
+
+ .if \enc == 1
st1 {XL.16b}, [x10] // store tag
+ .else
+ ldp x11, x12, [sp, #40] // load tag pointer and authsize
+ adr_l x17, .Lpermute_table
+ ld1 {KS0.16b}, [x11] // load supplied tag
+ add x17, x17, x12
+ ld1 {KS1.16b}, [x17] // load permute vector
+
+ cmeq XL.16b, XL.16b, KS0.16b // compare tags
+ mvn XL.16b, XL.16b // -1 for fail, 0 for pass
+ tbl XL.16b, {XL.16b}, KS1.16b // keep authsize bytes only
+ sminv b0, XL.16b // signed minimum across XL
+ smov w0, v0.b[0] // return b0
+ .endif
-4: ldp x29, x30, [sp], #32
+4: frame_pop
ret
5:
@@ -587,20 +601,20 @@ CPU_LE( rev w8, w8 )
* struct ghash_key const *k, u64 dg[], u8 ctr[],
* int rounds, u8 tag)
*/
-ENTRY(pmull_gcm_encrypt)
+SYM_FUNC_START(pmull_gcm_encrypt)
pmull_gcm_do_crypt 1
-ENDPROC(pmull_gcm_encrypt)
+SYM_FUNC_END(pmull_gcm_encrypt)
/*
* void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
* struct ghash_key const *k, u64 dg[], u8 ctr[],
* int rounds, u8 tag)
*/
-ENTRY(pmull_gcm_decrypt)
+SYM_FUNC_START(pmull_gcm_decrypt)
pmull_gcm_do_crypt 0
-ENDPROC(pmull_gcm_decrypt)
+SYM_FUNC_END(pmull_gcm_decrypt)
-pmull_gcm_ghash_4x:
+SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
movi MASK.16b, #0xe1
shl MASK.2d, MASK.2d, #57
@@ -681,9 +695,9 @@ pmull_gcm_ghash_4x:
eor XL.16b, XL.16b, T2.16b
ret
-ENDPROC(pmull_gcm_ghash_4x)
+SYM_FUNC_END(pmull_gcm_ghash_4x)
-pmull_gcm_enc_4x:
+SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
ld1 {KS0.16b}, [x5] // load upper counter
sub w10, w8, #4
sub w11, w8, #3
@@ -746,7 +760,7 @@ pmull_gcm_enc_4x:
eor INP3.16b, INP3.16b, KS3.16b
ret
-ENDPROC(pmull_gcm_enc_4x)
+SYM_FUNC_END(pmull_gcm_enc_4x)
.section ".rodata", "a"
.align 6
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index 522cf004ce65..97331b454ea8 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -9,6 +9,7 @@
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/aes.h>
+#include <crypto/gcm.h>
#include <crypto/algapi.h>
#include <crypto/b128ops.h>
#include <crypto/gf128mul.h>
@@ -28,15 +29,12 @@ MODULE_ALIAS_CRYPTO("ghash");
#define GHASH_BLOCK_SIZE 16
#define GHASH_DIGEST_SIZE 16
-#define GCM_IV_SIZE 12
-struct ghash_key {
- u64 h[2];
- u64 h2[2];
- u64 h3[2];
- u64 h4[2];
+#define RFC4106_NONCE_SIZE 4
+struct ghash_key {
be128 k;
+ u64 h[][2];
};
struct ghash_desc_ctx {
@@ -47,26 +45,23 @@ struct ghash_desc_ctx {
struct gcm_aes_ctx {
struct crypto_aes_ctx aes_key;
+ u8 nonce[RFC4106_NONCE_SIZE];
struct ghash_key ghash_key;
};
asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
- struct ghash_key const *k,
- const char *head);
+ u64 const h[][2], const char *head);
asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
- struct ghash_key const *k,
- const char *head);
+ u64 const h[][2], const char *head);
asmlinkage void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[],
- struct ghash_key const *k, u64 dg[],
- u8 ctr[], u32 const rk[], int rounds,
- u8 tag[]);
-
-asmlinkage void pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
- struct ghash_key const *k, u64 dg[],
- u8 ctr[], u32 const rk[], int rounds,
- u8 tag[]);
+ u64 const h[][2], u64 dg[], u8 ctr[],
+ u32 const rk[], int rounds, u8 tag[]);
+asmlinkage int pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
+ u64 const h[][2], u64 dg[], u8 ctr[],
+ u32 const rk[], int rounds, const u8 l[],
+ const u8 tag[], u64 authsize);
static int ghash_init(struct shash_desc *desc)
{
@@ -77,48 +72,51 @@ static int ghash_init(struct shash_desc *desc)
}
static void ghash_do_update(int blocks, u64 dg[], const char *src,
- struct ghash_key *key, const char *head,
- void (*simd_update)(int blocks, u64 dg[],
- const char *src,
- struct ghash_key const *k,
- const char *head))
+ struct ghash_key *key, const char *head)
+{
+ be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) };
+
+ do {
+ const u8 *in = src;
+
+ if (head) {
+ in = head;
+ blocks++;
+ head = NULL;
+ } else {
+ src += GHASH_BLOCK_SIZE;
+ }
+
+ crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE);
+ gf128mul_lle(&dst, &key->k);
+ } while (--blocks);
+
+ dg[0] = be64_to_cpu(dst.b);
+ dg[1] = be64_to_cpu(dst.a);
+}
+
+static __always_inline
+void ghash_do_simd_update(int blocks, u64 dg[], const char *src,
+ struct ghash_key *key, const char *head,
+ void (*simd_update)(int blocks, u64 dg[],
+ const char *src,
+ u64 const h[][2],
+ const char *head))
{
- if (likely(crypto_simd_usable() && simd_update)) {
+ if (likely(crypto_simd_usable())) {
kernel_neon_begin();
- simd_update(blocks, dg, src, key, head);
+ simd_update(blocks, dg, src, key->h, head);
kernel_neon_end();
} else {
- be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) };
-
- do {
- const u8 *in = src;
-
- if (head) {
- in = head;
- blocks++;
- head = NULL;
- } else {
- src += GHASH_BLOCK_SIZE;
- }
-
- crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE);
- gf128mul_lle(&dst, &key->k);
- } while (--blocks);
-
- dg[0] = be64_to_cpu(dst.b);
- dg[1] = be64_to_cpu(dst.a);
+ ghash_do_update(blocks, dg, src, key, head);
}
}
/* avoid hogging the CPU for too long */
#define MAX_BLOCKS (SZ_64K / GHASH_BLOCK_SIZE)
-static int __ghash_update(struct shash_desc *desc, const u8 *src,
- unsigned int len,
- void (*simd_update)(int blocks, u64 dg[],
- const char *src,
- struct ghash_key const *k,
- const char *head))
+static int ghash_update(struct shash_desc *desc, const u8 *src,
+ unsigned int len)
{
struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
@@ -143,9 +141,9 @@ static int __ghash_update(struct shash_desc *desc, const u8 *src,
do {
int chunk = min(blocks, MAX_BLOCKS);
- ghash_do_update(chunk, ctx->digest, src, key,
- partial ? ctx->buf : NULL,
- simd_update);
+ ghash_do_simd_update(chunk, ctx->digest, src, key,
+ partial ? ctx->buf : NULL,
+ pmull_ghash_update_p8);
blocks -= chunk;
src += chunk * GHASH_BLOCK_SIZE;
@@ -157,19 +155,7 @@ static int __ghash_update(struct shash_desc *desc, const u8 *src,
return 0;
}
-static int ghash_update_p8(struct shash_desc *desc, const u8 *src,
- unsigned int len)
-{
- return __ghash_update(desc, src, len, pmull_ghash_update_p8);
-}
-
-static int ghash_update_p64(struct shash_desc *desc, const u8 *src,
- unsigned int len)
-{
- return __ghash_update(desc, src, len, pmull_ghash_update_p64);
-}
-
-static int ghash_final_p8(struct shash_desc *desc, u8 *dst)
+static int ghash_final(struct shash_desc *desc, u8 *dst)
{
struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
@@ -179,33 +165,13 @@ static int ghash_final_p8(struct shash_desc *desc, u8 *dst)
memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
- ghash_do_update(1, ctx->digest, ctx->buf, key, NULL,
- pmull_ghash_update_p8);
+ ghash_do_simd_update(1, ctx->digest, ctx->buf, key, NULL,
+ pmull_ghash_update_p8);
}
put_unaligned_be64(ctx->digest[1], dst);
put_unaligned_be64(ctx->digest[0], dst + 8);
- *ctx = (struct ghash_desc_ctx){};
- return 0;
-}
-
-static int ghash_final_p64(struct shash_desc *desc, u8 *dst)
-{
- struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
- unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
-
- if (partial) {
- struct ghash_key *key = crypto_shash_ctx(desc->tfm);
-
- memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
-
- ghash_do_update(1, ctx->digest, ctx->buf, key, NULL,
- pmull_ghash_update_p64);
- }
- put_unaligned_be64(ctx->digest[1], dst);
- put_unaligned_be64(ctx->digest[0], dst + 8);
-
- *ctx = (struct ghash_desc_ctx){};
+ memzero_explicit(ctx, sizeof(*ctx));
return 0;
}
@@ -220,71 +186,36 @@ static void ghash_reflect(u64 h[], const be128 *k)
h[1] ^= 0xc200000000000000UL;
}
-static int __ghash_setkey(struct ghash_key *key,
- const u8 *inkey, unsigned int keylen)
-{
- be128 h;
-
- /* needed for the fallback */
- memcpy(&key->k, inkey, GHASH_BLOCK_SIZE);
-
- ghash_reflect(key->h, &key->k);
-
- h = key->k;
- gf128mul_lle(&h, &key->k);
- ghash_reflect(key->h2, &h);
-
- gf128mul_lle(&h, &key->k);
- ghash_reflect(key->h3, &h);
-
- gf128mul_lle(&h, &key->k);
- ghash_reflect(key->h4, &h);
-
- return 0;
-}
-
static int ghash_setkey(struct crypto_shash *tfm,
const u8 *inkey, unsigned int keylen)
{
struct ghash_key *key = crypto_shash_ctx(tfm);
- if (keylen != GHASH_BLOCK_SIZE) {
- crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ if (keylen != GHASH_BLOCK_SIZE)
return -EINVAL;
- }
- return __ghash_setkey(key, inkey, keylen);
+ /* needed for the fallback */
+ memcpy(&key->k, inkey, GHASH_BLOCK_SIZE);
+
+ ghash_reflect(key->h[0], &key->k);
+ return 0;
}
-static struct shash_alg ghash_alg[] = {{
+static struct shash_alg ghash_alg = {
.base.cra_name = "ghash",
.base.cra_driver_name = "ghash-neon",
- .base.cra_priority = 100,
- .base.cra_blocksize = GHASH_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct ghash_key),
- .base.cra_module = THIS_MODULE,
-
- .digestsize = GHASH_DIGEST_SIZE,
- .init = ghash_init,
- .update = ghash_update_p8,
- .final = ghash_final_p8,
- .setkey = ghash_setkey,
- .descsize = sizeof(struct ghash_desc_ctx),
-}, {
- .base.cra_name = "ghash",
- .base.cra_driver_name = "ghash-ce",
- .base.cra_priority = 200,
+ .base.cra_priority = 150,
.base.cra_blocksize = GHASH_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct ghash_key),
+ .base.cra_ctxsize = sizeof(struct ghash_key) + sizeof(u64[2]),
.base.cra_module = THIS_MODULE,
.digestsize = GHASH_DIGEST_SIZE,
.init = ghash_init,
- .update = ghash_update_p64,
- .final = ghash_final_p64,
+ .update = ghash_update,
+ .final = ghash_final,
.setkey = ghash_setkey,
.descsize = sizeof(struct ghash_desc_ctx),
-}};
+};
static int num_rounds(struct crypto_aes_ctx *ctx)
{
@@ -298,35 +229,41 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
return 6 + ctx->key_length / 4;
}
-static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey,
- unsigned int keylen)
+static int gcm_aes_setkey(struct crypto_aead *tfm, const u8 *inkey,
+ unsigned int keylen)
{
struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm);
u8 key[GHASH_BLOCK_SIZE];
+ be128 h;
int ret;
ret = aes_expandkey(&ctx->aes_key, inkey, keylen);
- if (ret) {
- tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ if (ret)
return -EINVAL;
- }
aes_encrypt(&ctx->aes_key, key, (u8[AES_BLOCK_SIZE]){});
- return __ghash_setkey(&ctx->ghash_key, key, sizeof(be128));
+ /* needed for the fallback */
+ memcpy(&ctx->ghash_key.k, key, GHASH_BLOCK_SIZE);
+
+ ghash_reflect(ctx->ghash_key.h[0], &ctx->ghash_key.k);
+
+ h = ctx->ghash_key.k;
+ gf128mul_lle(&h, &ctx->ghash_key.k);
+ ghash_reflect(ctx->ghash_key.h[1], &h);
+
+ gf128mul_lle(&h, &ctx->ghash_key.k);
+ ghash_reflect(ctx->ghash_key.h[2], &h);
+
+ gf128mul_lle(&h, &ctx->ghash_key.k);
+ ghash_reflect(ctx->ghash_key.h[3], &h);
+
+ return 0;
}
-static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
+static int gcm_aes_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
{
- switch (authsize) {
- case 4:
- case 8:
- case 12 ... 16:
- break;
- default:
- return -EINVAL;
- }
- return 0;
+ return crypto_gcm_check_authsize(authsize);
}
static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[],
@@ -345,9 +282,9 @@ static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[],
if (count >= GHASH_BLOCK_SIZE || *buf_count == GHASH_BLOCK_SIZE) {
int blocks = count / GHASH_BLOCK_SIZE;
- ghash_do_update(blocks, dg, src, &ctx->ghash_key,
- *buf_count ? buf : NULL,
- pmull_ghash_update_p64);
+ ghash_do_simd_update(blocks, dg, src, &ctx->ghash_key,
+ *buf_count ? buf : NULL,
+ pmull_ghash_update_p64);
src += blocks * GHASH_BLOCK_SIZE;
count %= GHASH_BLOCK_SIZE;
@@ -360,13 +297,12 @@ static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[],
}
}
-static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[])
+static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[], u32 len)
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
u8 buf[GHASH_BLOCK_SIZE];
struct scatter_walk walk;
- u32 len = req->assoclen;
int buf_count = 0;
scatterwalk_start(&walk, req->src);
@@ -391,113 +327,63 @@ static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[])
if (buf_count) {
memset(&buf[buf_count], 0, GHASH_BLOCK_SIZE - buf_count);
- ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL,
- pmull_ghash_update_p64);
+ ghash_do_simd_update(1, dg, buf, &ctx->ghash_key, NULL,
+ pmull_ghash_update_p64);
}
}
-static int gcm_encrypt(struct aead_request *req)
+static int gcm_encrypt(struct aead_request *req, char *iv, int assoclen)
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
int nrounds = num_rounds(&ctx->aes_key);
struct skcipher_walk walk;
u8 buf[AES_BLOCK_SIZE];
- u8 iv[AES_BLOCK_SIZE];
u64 dg[2] = {};
- u128 lengths;
+ be128 lengths;
u8 *tag;
int err;
- lengths.a = cpu_to_be64(req->assoclen * 8);
+ lengths.a = cpu_to_be64(assoclen * 8);
lengths.b = cpu_to_be64(req->cryptlen * 8);
- if (req->assoclen)
- gcm_calculate_auth_mac(req, dg);
+ if (assoclen)
+ gcm_calculate_auth_mac(req, dg, assoclen);
- memcpy(iv, req->iv, GCM_IV_SIZE);
- put_unaligned_be32(2, iv + GCM_IV_SIZE);
+ put_unaligned_be32(2, iv + GCM_AES_IV_SIZE);
err = skcipher_walk_aead_encrypt(&walk, req, false);
- if (likely(crypto_simd_usable())) {
- do {
- const u8 *src = walk.src.virt.addr;
- u8 *dst = walk.dst.virt.addr;
- int nbytes = walk.nbytes;
-
- tag = (u8 *)&lengths;
-
- if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) {
- src = dst = memcpy(buf + sizeof(buf) - nbytes,
- src, nbytes);
- } else if (nbytes < walk.total) {
- nbytes &= ~(AES_BLOCK_SIZE - 1);
- tag = NULL;
- }
-
- kernel_neon_begin();
- pmull_gcm_encrypt(nbytes, dst, src, &ctx->ghash_key, dg,
- iv, ctx->aes_key.key_enc, nrounds,
- tag);
- kernel_neon_end();
-
- if (unlikely(!nbytes))
- break;
-
- if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
- memcpy(walk.dst.virt.addr,
- buf + sizeof(buf) - nbytes, nbytes);
-
- err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
- } while (walk.nbytes);
- } else {
- while (walk.nbytes >= AES_BLOCK_SIZE) {
- int blocks = walk.nbytes / AES_BLOCK_SIZE;
- const u8 *src = walk.src.virt.addr;
- u8 *dst = walk.dst.virt.addr;
- int remaining = blocks;
-
- do {
- aes_encrypt(&ctx->aes_key, buf, iv);
- crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE);
- crypto_inc(iv, AES_BLOCK_SIZE);
-
- dst += AES_BLOCK_SIZE;
- src += AES_BLOCK_SIZE;
- } while (--remaining > 0);
-
- ghash_do_update(blocks, dg, walk.dst.virt.addr,
- &ctx->ghash_key, NULL, NULL);
-
- err = skcipher_walk_done(&walk,
- walk.nbytes % AES_BLOCK_SIZE);
- }
-
- /* handle the tail */
- if (walk.nbytes) {
- aes_encrypt(&ctx->aes_key, buf, iv);
+ do {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ int nbytes = walk.nbytes;
- crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr,
- buf, walk.nbytes);
+ tag = (u8 *)&lengths;
- memcpy(buf, walk.dst.virt.addr, walk.nbytes);
- memset(buf + walk.nbytes, 0, sizeof(buf) - walk.nbytes);
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) {
+ src = dst = memcpy(buf + sizeof(buf) - nbytes,
+ src, nbytes);
+ } else if (nbytes < walk.total) {
+ nbytes &= ~(AES_BLOCK_SIZE - 1);
+ tag = NULL;
}
- tag = (u8 *)&lengths;
- ghash_do_update(1, dg, tag, &ctx->ghash_key,
- walk.nbytes ? buf : NULL, NULL);
+ kernel_neon_begin();
+ pmull_gcm_encrypt(nbytes, dst, src, ctx->ghash_key.h,
+ dg, iv, ctx->aes_key.key_enc, nrounds,
+ tag);
+ kernel_neon_end();
- if (walk.nbytes)
- err = skcipher_walk_done(&walk, 0);
+ if (unlikely(!nbytes))
+ break;
- put_unaligned_be64(dg[1], tag);
- put_unaligned_be64(dg[0], tag + 8);
- put_unaligned_be32(1, iv + GCM_IV_SIZE);
- aes_encrypt(&ctx->aes_key, iv, iv);
- crypto_xor(tag, iv, AES_BLOCK_SIZE);
- }
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
+ memcpy(walk.dst.virt.addr,
+ buf + sizeof(buf) - nbytes, nbytes);
+
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ } while (walk.nbytes);
if (err)
return err;
@@ -509,176 +395,189 @@ static int gcm_encrypt(struct aead_request *req)
return 0;
}
-static int gcm_decrypt(struct aead_request *req)
+static int gcm_decrypt(struct aead_request *req, char *iv, int assoclen)
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
unsigned int authsize = crypto_aead_authsize(aead);
int nrounds = num_rounds(&ctx->aes_key);
struct skcipher_walk walk;
+ u8 otag[AES_BLOCK_SIZE];
u8 buf[AES_BLOCK_SIZE];
- u8 iv[AES_BLOCK_SIZE];
u64 dg[2] = {};
- u128 lengths;
+ be128 lengths;
u8 *tag;
+ int ret;
int err;
- lengths.a = cpu_to_be64(req->assoclen * 8);
+ lengths.a = cpu_to_be64(assoclen * 8);
lengths.b = cpu_to_be64((req->cryptlen - authsize) * 8);
- if (req->assoclen)
- gcm_calculate_auth_mac(req, dg);
+ if (assoclen)
+ gcm_calculate_auth_mac(req, dg, assoclen);
- memcpy(iv, req->iv, GCM_IV_SIZE);
- put_unaligned_be32(2, iv + GCM_IV_SIZE);
+ put_unaligned_be32(2, iv + GCM_AES_IV_SIZE);
+
+ scatterwalk_map_and_copy(otag, req->src,
+ req->assoclen + req->cryptlen - authsize,
+ authsize, 0);
err = skcipher_walk_aead_decrypt(&walk, req, false);
- if (likely(crypto_simd_usable())) {
- do {
- const u8 *src = walk.src.virt.addr;
- u8 *dst = walk.dst.virt.addr;
- int nbytes = walk.nbytes;
-
- tag = (u8 *)&lengths;
-
- if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) {
- src = dst = memcpy(buf + sizeof(buf) - nbytes,
- src, nbytes);
- } else if (nbytes < walk.total) {
- nbytes &= ~(AES_BLOCK_SIZE - 1);
- tag = NULL;
- }
-
- kernel_neon_begin();
- pmull_gcm_decrypt(nbytes, dst, src, &ctx->ghash_key, dg,
- iv, ctx->aes_key.key_enc, nrounds,
- tag);
- kernel_neon_end();
-
- if (unlikely(!nbytes))
- break;
-
- if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
- memcpy(walk.dst.virt.addr,
- buf + sizeof(buf) - nbytes, nbytes);
-
- err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
- } while (walk.nbytes);
- } else {
- while (walk.nbytes >= AES_BLOCK_SIZE) {
- int blocks = walk.nbytes / AES_BLOCK_SIZE;
- const u8 *src = walk.src.virt.addr;
- u8 *dst = walk.dst.virt.addr;
+ do {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ int nbytes = walk.nbytes;
- ghash_do_update(blocks, dg, walk.src.virt.addr,
- &ctx->ghash_key, NULL, NULL);
+ tag = (u8 *)&lengths;
- do {
- aes_encrypt(&ctx->aes_key, buf, iv);
- crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE);
- crypto_inc(iv, AES_BLOCK_SIZE);
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) {
+ src = dst = memcpy(buf + sizeof(buf) - nbytes,
+ src, nbytes);
+ } else if (nbytes < walk.total) {
+ nbytes &= ~(AES_BLOCK_SIZE - 1);
+ tag = NULL;
+ }
- dst += AES_BLOCK_SIZE;
- src += AES_BLOCK_SIZE;
- } while (--blocks > 0);
+ kernel_neon_begin();
+ ret = pmull_gcm_decrypt(nbytes, dst, src, ctx->ghash_key.h,
+ dg, iv, ctx->aes_key.key_enc,
+ nrounds, tag, otag, authsize);
+ kernel_neon_end();
- err = skcipher_walk_done(&walk,
- walk.nbytes % AES_BLOCK_SIZE);
- }
+ if (unlikely(!nbytes))
+ break;
- /* handle the tail */
- if (walk.nbytes) {
- memcpy(buf, walk.src.virt.addr, walk.nbytes);
- memset(buf + walk.nbytes, 0, sizeof(buf) - walk.nbytes);
- }
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
+ memcpy(walk.dst.virt.addr,
+ buf + sizeof(buf) - nbytes, nbytes);
- tag = (u8 *)&lengths;
- ghash_do_update(1, dg, tag, &ctx->ghash_key,
- walk.nbytes ? buf : NULL, NULL);
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ } while (walk.nbytes);
- if (walk.nbytes) {
- aes_encrypt(&ctx->aes_key, buf, iv);
+ if (err)
+ return err;
- crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr,
- buf, walk.nbytes);
+ return ret ? -EBADMSG : 0;
+}
- err = skcipher_walk_done(&walk, 0);
- }
+static int gcm_aes_encrypt(struct aead_request *req)
+{
+ u8 iv[AES_BLOCK_SIZE];
- put_unaligned_be64(dg[1], tag);
- put_unaligned_be64(dg[0], tag + 8);
- put_unaligned_be32(1, iv + GCM_IV_SIZE);
- aes_encrypt(&ctx->aes_key, iv, iv);
- crypto_xor(tag, iv, AES_BLOCK_SIZE);
- }
+ memcpy(iv, req->iv, GCM_AES_IV_SIZE);
+ return gcm_encrypt(req, iv, req->assoclen);
+}
+
+static int gcm_aes_decrypt(struct aead_request *req)
+{
+ u8 iv[AES_BLOCK_SIZE];
+
+ memcpy(iv, req->iv, GCM_AES_IV_SIZE);
+ return gcm_decrypt(req, iv, req->assoclen);
+}
+
+static int rfc4106_setkey(struct crypto_aead *tfm, const u8 *inkey,
+ unsigned int keylen)
+{
+ struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm);
+ int err;
+ keylen -= RFC4106_NONCE_SIZE;
+ err = gcm_aes_setkey(tfm, inkey, keylen);
if (err)
return err;
- /* compare calculated auth tag with the stored one */
- scatterwalk_map_and_copy(buf, req->src,
- req->assoclen + req->cryptlen - authsize,
- authsize, 0);
-
- if (crypto_memneq(tag, buf, authsize))
- return -EBADMSG;
+ memcpy(ctx->nonce, inkey + keylen, RFC4106_NONCE_SIZE);
return 0;
}
-static struct aead_alg gcm_aes_alg = {
- .ivsize = GCM_IV_SIZE,
+static int rfc4106_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
+{
+ return crypto_rfc4106_check_authsize(authsize);
+}
+
+static int rfc4106_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
+ u8 iv[AES_BLOCK_SIZE];
+
+ memcpy(iv, ctx->nonce, RFC4106_NONCE_SIZE);
+ memcpy(iv + RFC4106_NONCE_SIZE, req->iv, GCM_RFC4106_IV_SIZE);
+
+ return crypto_ipsec_check_assoclen(req->assoclen) ?:
+ gcm_encrypt(req, iv, req->assoclen - GCM_RFC4106_IV_SIZE);
+}
+
+static int rfc4106_decrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
+ u8 iv[AES_BLOCK_SIZE];
+
+ memcpy(iv, ctx->nonce, RFC4106_NONCE_SIZE);
+ memcpy(iv + RFC4106_NONCE_SIZE, req->iv, GCM_RFC4106_IV_SIZE);
+
+ return crypto_ipsec_check_assoclen(req->assoclen) ?:
+ gcm_decrypt(req, iv, req->assoclen - GCM_RFC4106_IV_SIZE);
+}
+
+static struct aead_alg gcm_aes_algs[] = {{
+ .ivsize = GCM_AES_IV_SIZE,
.chunksize = AES_BLOCK_SIZE,
.maxauthsize = AES_BLOCK_SIZE,
- .setkey = gcm_setkey,
- .setauthsize = gcm_setauthsize,
- .encrypt = gcm_encrypt,
- .decrypt = gcm_decrypt,
+ .setkey = gcm_aes_setkey,
+ .setauthsize = gcm_aes_setauthsize,
+ .encrypt = gcm_aes_encrypt,
+ .decrypt = gcm_aes_decrypt,
.base.cra_name = "gcm(aes)",
.base.cra_driver_name = "gcm-aes-ce",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct gcm_aes_ctx),
+ .base.cra_ctxsize = sizeof(struct gcm_aes_ctx) +
+ 4 * sizeof(u64[2]),
.base.cra_module = THIS_MODULE,
-};
+}, {
+ .ivsize = GCM_RFC4106_IV_SIZE,
+ .chunksize = AES_BLOCK_SIZE,
+ .maxauthsize = AES_BLOCK_SIZE,
+ .setkey = rfc4106_setkey,
+ .setauthsize = rfc4106_setauthsize,
+ .encrypt = rfc4106_encrypt,
+ .decrypt = rfc4106_decrypt,
+
+ .base.cra_name = "rfc4106(gcm(aes))",
+ .base.cra_driver_name = "rfc4106-gcm-aes-ce",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct gcm_aes_ctx) +
+ 4 * sizeof(u64[2]),
+ .base.cra_module = THIS_MODULE,
+}};
static int __init ghash_ce_mod_init(void)
{
- int ret;
-
if (!cpu_have_named_feature(ASIMD))
return -ENODEV;
if (cpu_have_named_feature(PMULL))
- ret = crypto_register_shashes(ghash_alg,
- ARRAY_SIZE(ghash_alg));
- else
- /* only register the first array element */
- ret = crypto_register_shash(ghash_alg);
+ return crypto_register_aeads(gcm_aes_algs,
+ ARRAY_SIZE(gcm_aes_algs));
- if (ret)
- return ret;
-
- if (cpu_have_named_feature(PMULL)) {
- ret = crypto_register_aead(&gcm_aes_alg);
- if (ret)
- crypto_unregister_shashes(ghash_alg,
- ARRAY_SIZE(ghash_alg));
- }
- return ret;
+ return crypto_register_shash(&ghash_alg);
}
static void __exit ghash_ce_mod_exit(void)
{
if (cpu_have_named_feature(PMULL))
- crypto_unregister_shashes(ghash_alg, ARRAY_SIZE(ghash_alg));
+ crypto_unregister_aeads(gcm_aes_algs, ARRAY_SIZE(gcm_aes_algs));
else
- crypto_unregister_shash(ghash_alg);
- crypto_unregister_aead(&gcm_aes_alg);
+ crypto_unregister_shash(&ghash_alg);
}
-static const struct cpu_feature ghash_cpu_feature[] = {
+static const struct cpu_feature __maybe_unused ghash_cpu_feature[] = {
{ cpu_feature(PMULL) }, { }
};
MODULE_DEVICE_TABLE(cpu, ghash_cpu_feature);
diff --git a/arch/arm64/crypto/nh-neon-core.S b/arch/arm64/crypto/nh-neon-core.S
index e05570c38de7..13eda08fda1e 100644
--- a/arch/arm64/crypto/nh-neon-core.S
+++ b/arch/arm64/crypto/nh-neon-core.S
@@ -8,6 +8,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
KEY .req x0
MESSAGE .req x1
@@ -58,11 +59,11 @@
/*
* void nh_neon(const u32 *key, const u8 *message, size_t message_len,
- * u8 hash[NH_HASH_BYTES])
+ * __le64 hash[NH_NUM_PASSES])
*
* It's guaranteed that message_len % 16 == 0.
*/
-ENTRY(nh_neon)
+SYM_TYPED_FUNC_START(nh_neon)
ld1 {K0.4s,K1.4s}, [KEY], #32
movi PASS0_SUMS.2d, #0
@@ -100,4 +101,4 @@ ENTRY(nh_neon)
addp T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
st1 {T0.16b,T1.16b}, [HASH]
ret
-ENDPROC(nh_neon)
+SYM_FUNC_END(nh_neon)
diff --git a/arch/arm64/crypto/nhpoly1305-neon-glue.c b/arch/arm64/crypto/nhpoly1305-neon-glue.c
index 895d3727c1fb..e4a0b463f080 100644
--- a/arch/arm64/crypto/nhpoly1305-neon-glue.c
+++ b/arch/arm64/crypto/nhpoly1305-neon-glue.c
@@ -14,14 +14,7 @@
#include <linux/module.h>
asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len,
- u8 hash[NH_HASH_BYTES]);
-
-/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
-static void _nh_neon(const u32 *key, const u8 *message, size_t message_len,
- __le64 hash[NH_NUM_PASSES])
-{
- nh_neon(key, message, message_len, (u8 *)hash);
-}
+ __le64 hash[NH_NUM_PASSES]);
static int nhpoly1305_neon_update(struct shash_desc *desc,
const u8 *src, unsigned int srclen)
@@ -30,10 +23,10 @@ static int nhpoly1305_neon_update(struct shash_desc *desc,
return crypto_nhpoly1305_update(desc, src, srclen);
do {
- unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+ unsigned int n = min_t(unsigned int, srclen, SZ_4K);
kernel_neon_begin();
- crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon);
+ crypto_nhpoly1305_update_helper(desc, src, n, nh_neon);
kernel_neon_end();
src += n;
srclen -= n;
@@ -41,6 +34,14 @@ static int nhpoly1305_neon_update(struct shash_desc *desc,
return 0;
}
+static int nhpoly1305_neon_digest(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen, u8 *out)
+{
+ return crypto_nhpoly1305_init(desc) ?:
+ nhpoly1305_neon_update(desc, src, srclen) ?:
+ crypto_nhpoly1305_final(desc, out);
+}
+
static struct shash_alg nhpoly1305_alg = {
.base.cra_name = "nhpoly1305",
.base.cra_driver_name = "nhpoly1305-neon",
@@ -51,6 +52,7 @@ static struct shash_alg nhpoly1305_alg = {
.init = crypto_nhpoly1305_init,
.update = nhpoly1305_neon_update,
.final = crypto_nhpoly1305_final,
+ .digest = nhpoly1305_neon_digest,
.setkey = crypto_nhpoly1305_setkey,
.descsize = sizeof(struct nhpoly1305_state),
};
diff --git a/arch/arm64/crypto/poly1305-armv8.pl b/arch/arm64/crypto/poly1305-armv8.pl
index 6e5576d19af8..cbc980fb02e3 100644
--- a/arch/arm64/crypto/poly1305-armv8.pl
+++ b/arch/arm64/crypto/poly1305-armv8.pl
@@ -840,7 +840,6 @@ poly1305_blocks_neon:
ldp d14,d15,[sp,#64]
addp $ACC2,$ACC2,$ACC2
ldr x30,[sp,#8]
- .inst 0xd50323bf // autiasp
////////////////////////////////////////////////////////////////
// lazy reduction, but without narrowing
@@ -882,6 +881,7 @@ poly1305_blocks_neon:
str x4,[$ctx,#8] // set is_base2_26
ldr x29,[sp],#80
+ .inst 0xd50323bf // autiasp
ret
.size poly1305_blocks_neon,.-poly1305_blocks_neon
diff --git a/arch/arm64/crypto/poly1305-core.S_shipped b/arch/arm64/crypto/poly1305-core.S_shipped
deleted file mode 100644
index 8d1c4e420ccd..000000000000
--- a/arch/arm64/crypto/poly1305-core.S_shipped
+++ /dev/null
@@ -1,835 +0,0 @@
-#ifndef __KERNEL__
-# include "arm_arch.h"
-.extern OPENSSL_armcap_P
-#endif
-
-.text
-
-// forward "declarations" are required for Apple
-.globl poly1305_blocks
-.globl poly1305_emit
-
-.globl poly1305_init
-.type poly1305_init,%function
-.align 5
-poly1305_init:
- cmp x1,xzr
- stp xzr,xzr,[x0] // zero hash value
- stp xzr,xzr,[x0,#16] // [along with is_base2_26]
-
- csel x0,xzr,x0,eq
- b.eq .Lno_key
-
-#ifndef __KERNEL__
- adrp x17,OPENSSL_armcap_P
- ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
-#endif
-
- ldp x7,x8,[x1] // load key
- mov x9,#0xfffffffc0fffffff
- movk x9,#0x0fff,lsl#48
-#ifdef __AARCH64EB__
- rev x7,x7 // flip bytes
- rev x8,x8
-#endif
- and x7,x7,x9 // &=0ffffffc0fffffff
- and x9,x9,#-4
- and x8,x8,x9 // &=0ffffffc0ffffffc
- mov w9,#-1
- stp x7,x8,[x0,#32] // save key value
- str w9,[x0,#48] // impossible key power value
-
-#ifndef __KERNEL__
- tst w17,#ARMV7_NEON
-
- adr x12,.Lpoly1305_blocks
- adr x7,.Lpoly1305_blocks_neon
- adr x13,.Lpoly1305_emit
-
- csel x12,x12,x7,eq
-
-# ifdef __ILP32__
- stp w12,w13,[x2]
-# else
- stp x12,x13,[x2]
-# endif
-#endif
- mov x0,#1
-.Lno_key:
- ret
-.size poly1305_init,.-poly1305_init
-
-.type poly1305_blocks,%function
-.align 5
-poly1305_blocks:
-.Lpoly1305_blocks:
- ands x2,x2,#-16
- b.eq .Lno_data
-
- ldp x4,x5,[x0] // load hash value
- ldp x6,x17,[x0,#16] // [along with is_base2_26]
- ldp x7,x8,[x0,#32] // load key value
-
-#ifdef __AARCH64EB__
- lsr x12,x4,#32
- mov w13,w4
- lsr x14,x5,#32
- mov w15,w5
- lsr x16,x6,#32
-#else
- mov w12,w4
- lsr x13,x4,#32
- mov w14,w5
- lsr x15,x5,#32
- mov w16,w6
-#endif
-
- add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
- lsr x13,x14,#12
- adds x12,x12,x14,lsl#52
- add x13,x13,x15,lsl#14
- adc x13,x13,xzr
- lsr x14,x16,#24
- adds x13,x13,x16,lsl#40
- adc x14,x14,xzr
-
- cmp x17,#0 // is_base2_26?
- add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
- csel x4,x4,x12,eq // choose between radixes
- csel x5,x5,x13,eq
- csel x6,x6,x14,eq
-
-.Loop:
- ldp x10,x11,[x1],#16 // load input
- sub x2,x2,#16
-#ifdef __AARCH64EB__
- rev x10,x10
- rev x11,x11
-#endif
- adds x4,x4,x10 // accumulate input
- adcs x5,x5,x11
-
- mul x12,x4,x7 // h0*r0
- adc x6,x6,x3
- umulh x13,x4,x7
-
- mul x10,x5,x9 // h1*5*r1
- umulh x11,x5,x9
-
- adds x12,x12,x10
- mul x10,x4,x8 // h0*r1
- adc x13,x13,x11
- umulh x14,x4,x8
-
- adds x13,x13,x10
- mul x10,x5,x7 // h1*r0
- adc x14,x14,xzr
- umulh x11,x5,x7
-
- adds x13,x13,x10
- mul x10,x6,x9 // h2*5*r1
- adc x14,x14,x11
- mul x11,x6,x7 // h2*r0
-
- adds x13,x13,x10
- adc x14,x14,x11
-
- and x10,x14,#-4 // final reduction
- and x6,x14,#3
- add x10,x10,x14,lsr#2
- adds x4,x12,x10
- adcs x5,x13,xzr
- adc x6,x6,xzr
-
- cbnz x2,.Loop
-
- stp x4,x5,[x0] // store hash value
- stp x6,xzr,[x0,#16] // [and clear is_base2_26]
-
-.Lno_data:
- ret
-.size poly1305_blocks,.-poly1305_blocks
-
-.type poly1305_emit,%function
-.align 5
-poly1305_emit:
-.Lpoly1305_emit:
- ldp x4,x5,[x0] // load hash base 2^64
- ldp x6,x7,[x0,#16] // [along with is_base2_26]
- ldp x10,x11,[x2] // load nonce
-
-#ifdef __AARCH64EB__
- lsr x12,x4,#32
- mov w13,w4
- lsr x14,x5,#32
- mov w15,w5
- lsr x16,x6,#32
-#else
- mov w12,w4
- lsr x13,x4,#32
- mov w14,w5
- lsr x15,x5,#32
- mov w16,w6
-#endif
-
- add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
- lsr x13,x14,#12
- adds x12,x12,x14,lsl#52
- add x13,x13,x15,lsl#14
- adc x13,x13,xzr
- lsr x14,x16,#24
- adds x13,x13,x16,lsl#40
- adc x14,x14,xzr
-
- cmp x7,#0 // is_base2_26?
- csel x4,x4,x12,eq // choose between radixes
- csel x5,x5,x13,eq
- csel x6,x6,x14,eq
-
- adds x12,x4,#5 // compare to modulus
- adcs x13,x5,xzr
- adc x14,x6,xzr
-
- tst x14,#-4 // see if it's carried/borrowed
-
- csel x4,x4,x12,eq
- csel x5,x5,x13,eq
-
-#ifdef __AARCH64EB__
- ror x10,x10,#32 // flip nonce words
- ror x11,x11,#32
-#endif
- adds x4,x4,x10 // accumulate nonce
- adc x5,x5,x11
-#ifdef __AARCH64EB__
- rev x4,x4 // flip output bytes
- rev x5,x5
-#endif
- stp x4,x5,[x1] // write result
-
- ret
-.size poly1305_emit,.-poly1305_emit
-.type poly1305_mult,%function
-.align 5
-poly1305_mult:
- mul x12,x4,x7 // h0*r0
- umulh x13,x4,x7
-
- mul x10,x5,x9 // h1*5*r1
- umulh x11,x5,x9
-
- adds x12,x12,x10
- mul x10,x4,x8 // h0*r1
- adc x13,x13,x11
- umulh x14,x4,x8
-
- adds x13,x13,x10
- mul x10,x5,x7 // h1*r0
- adc x14,x14,xzr
- umulh x11,x5,x7
-
- adds x13,x13,x10
- mul x10,x6,x9 // h2*5*r1
- adc x14,x14,x11
- mul x11,x6,x7 // h2*r0
-
- adds x13,x13,x10
- adc x14,x14,x11
-
- and x10,x14,#-4 // final reduction
- and x6,x14,#3
- add x10,x10,x14,lsr#2
- adds x4,x12,x10
- adcs x5,x13,xzr
- adc x6,x6,xzr
-
- ret
-.size poly1305_mult,.-poly1305_mult
-
-.type poly1305_splat,%function
-.align 4
-poly1305_splat:
- and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x13,x4,#26,#26
- extr x14,x5,x4,#52
- and x14,x14,#0x03ffffff
- ubfx x15,x5,#14,#26
- extr x16,x6,x5,#40
-
- str w12,[x0,#16*0] // r0
- add w12,w13,w13,lsl#2 // r1*5
- str w13,[x0,#16*1] // r1
- add w13,w14,w14,lsl#2 // r2*5
- str w12,[x0,#16*2] // s1
- str w14,[x0,#16*3] // r2
- add w14,w15,w15,lsl#2 // r3*5
- str w13,[x0,#16*4] // s2
- str w15,[x0,#16*5] // r3
- add w15,w16,w16,lsl#2 // r4*5
- str w14,[x0,#16*6] // s3
- str w16,[x0,#16*7] // r4
- str w15,[x0,#16*8] // s4
-
- ret
-.size poly1305_splat,.-poly1305_splat
-
-#ifdef __KERNEL__
-.globl poly1305_blocks_neon
-#endif
-.type poly1305_blocks_neon,%function
-.align 5
-poly1305_blocks_neon:
-.Lpoly1305_blocks_neon:
- ldr x17,[x0,#24]
- cmp x2,#128
- b.lo .Lpoly1305_blocks
-
- .inst 0xd503233f // paciasp
- stp x29,x30,[sp,#-80]!
- add x29,sp,#0
-
- stp d8,d9,[sp,#16] // meet ABI requirements
- stp d10,d11,[sp,#32]
- stp d12,d13,[sp,#48]
- stp d14,d15,[sp,#64]
-
- cbz x17,.Lbase2_64_neon
-
- ldp w10,w11,[x0] // load hash value base 2^26
- ldp w12,w13,[x0,#8]
- ldr w14,[x0,#16]
-
- tst x2,#31
- b.eq .Leven_neon
-
- ldp x7,x8,[x0,#32] // load key value
-
- add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
- lsr x5,x12,#12
- adds x4,x4,x12,lsl#52
- add x5,x5,x13,lsl#14
- adc x5,x5,xzr
- lsr x6,x14,#24
- adds x5,x5,x14,lsl#40
- adc x14,x6,xzr // can be partially reduced...
-
- ldp x12,x13,[x1],#16 // load input
- sub x2,x2,#16
- add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
-
-#ifdef __AARCH64EB__
- rev x12,x12
- rev x13,x13
-#endif
- adds x4,x4,x12 // accumulate input
- adcs x5,x5,x13
- adc x6,x6,x3
-
- bl poly1305_mult
-
- and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x11,x4,#26,#26
- extr x12,x5,x4,#52
- and x12,x12,#0x03ffffff
- ubfx x13,x5,#14,#26
- extr x14,x6,x5,#40
-
- b .Leven_neon
-
-.align 4
-.Lbase2_64_neon:
- ldp x7,x8,[x0,#32] // load key value
-
- ldp x4,x5,[x0] // load hash value base 2^64
- ldr x6,[x0,#16]
-
- tst x2,#31
- b.eq .Linit_neon
-
- ldp x12,x13,[x1],#16 // load input
- sub x2,x2,#16
- add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
-#ifdef __AARCH64EB__
- rev x12,x12
- rev x13,x13
-#endif
- adds x4,x4,x12 // accumulate input
- adcs x5,x5,x13
- adc x6,x6,x3
-
- bl poly1305_mult
-
-.Linit_neon:
- ldr w17,[x0,#48] // first table element
- and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x11,x4,#26,#26
- extr x12,x5,x4,#52
- and x12,x12,#0x03ffffff
- ubfx x13,x5,#14,#26
- extr x14,x6,x5,#40
-
- cmp w17,#-1 // is value impossible?
- b.ne .Leven_neon
-
- fmov d24,x10
- fmov d25,x11
- fmov d26,x12
- fmov d27,x13
- fmov d28,x14
-
- ////////////////////////////////// initialize r^n table
- mov x4,x7 // r^1
- add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
- mov x5,x8
- mov x6,xzr
- add x0,x0,#48+12
- bl poly1305_splat
-
- bl poly1305_mult // r^2
- sub x0,x0,#4
- bl poly1305_splat
-
- bl poly1305_mult // r^3
- sub x0,x0,#4
- bl poly1305_splat
-
- bl poly1305_mult // r^4
- sub x0,x0,#4
- bl poly1305_splat
- sub x0,x0,#48 // restore original x0
- b .Ldo_neon
-
-.align 4
-.Leven_neon:
- fmov d24,x10
- fmov d25,x11
- fmov d26,x12
- fmov d27,x13
- fmov d28,x14
-
-.Ldo_neon:
- ldp x8,x12,[x1,#32] // inp[2:3]
- subs x2,x2,#64
- ldp x9,x13,[x1,#48]
- add x16,x1,#96
- adr x17,.Lzeros
-
- lsl x3,x3,#24
- add x15,x0,#48
-
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- and x5,x9,#0x03ffffff
- ubfx x6,x8,#26,#26
- ubfx x7,x9,#26,#26
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- extr x8,x12,x8,#52
- extr x9,x13,x9,#52
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- fmov d14,x4
- and x8,x8,#0x03ffffff
- and x9,x9,#0x03ffffff
- ubfx x10,x12,#14,#26
- ubfx x11,x13,#14,#26
- add x12,x3,x12,lsr#40
- add x13,x3,x13,lsr#40
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- fmov d15,x6
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- fmov d16,x8
- fmov d17,x10
- fmov d18,x12
-
- ldp x8,x12,[x1],#16 // inp[0:1]
- ldp x9,x13,[x1],#48
-
- ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
- ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
- ld1 {v8.4s},[x15]
-
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- and x5,x9,#0x03ffffff
- ubfx x6,x8,#26,#26
- ubfx x7,x9,#26,#26
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- extr x8,x12,x8,#52
- extr x9,x13,x9,#52
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- fmov d9,x4
- and x8,x8,#0x03ffffff
- and x9,x9,#0x03ffffff
- ubfx x10,x12,#14,#26
- ubfx x11,x13,#14,#26
- add x12,x3,x12,lsr#40
- add x13,x3,x13,lsr#40
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- fmov d10,x6
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- movi v31.2d,#-1
- fmov d11,x8
- fmov d12,x10
- fmov d13,x12
- ushr v31.2d,v31.2d,#38
-
- b.ls .Lskip_loop
-
-.align 4
-.Loop_neon:
- ////////////////////////////////////////////////////////////////
- // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- // ___________________/
- // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- // ___________________/ ____________________/
- //
- // Note that we start with inp[2:3]*r^2. This is because it
- // doesn't depend on reduction in previous iteration.
- ////////////////////////////////////////////////////////////////
- // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
- // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
- // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
- // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
- // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
-
- subs x2,x2,#64
- umull v23.2d,v14.2s,v7.s[2]
- csel x16,x17,x16,lo
- umull v22.2d,v14.2s,v5.s[2]
- umull v21.2d,v14.2s,v3.s[2]
- ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
- umull v20.2d,v14.2s,v1.s[2]
- ldp x9,x13,[x16],#48
- umull v19.2d,v14.2s,v0.s[2]
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
-
- umlal v23.2d,v15.2s,v5.s[2]
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- umlal v22.2d,v15.2s,v3.s[2]
- and x5,x9,#0x03ffffff
- umlal v21.2d,v15.2s,v1.s[2]
- ubfx x6,x8,#26,#26
- umlal v20.2d,v15.2s,v0.s[2]
- ubfx x7,x9,#26,#26
- umlal v19.2d,v15.2s,v8.s[2]
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
-
- umlal v23.2d,v16.2s,v3.s[2]
- extr x8,x12,x8,#52
- umlal v22.2d,v16.2s,v1.s[2]
- extr x9,x13,x9,#52
- umlal v21.2d,v16.2s,v0.s[2]
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- umlal v20.2d,v16.2s,v8.s[2]
- fmov d14,x4
- umlal v19.2d,v16.2s,v6.s[2]
- and x8,x8,#0x03ffffff
-
- umlal v23.2d,v17.2s,v1.s[2]
- and x9,x9,#0x03ffffff
- umlal v22.2d,v17.2s,v0.s[2]
- ubfx x10,x12,#14,#26
- umlal v21.2d,v17.2s,v8.s[2]
- ubfx x11,x13,#14,#26
- umlal v20.2d,v17.2s,v6.s[2]
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- umlal v19.2d,v17.2s,v4.s[2]
- fmov d15,x6
-
- add v11.2s,v11.2s,v26.2s
- add x12,x3,x12,lsr#40
- umlal v23.2d,v18.2s,v0.s[2]
- add x13,x3,x13,lsr#40
- umlal v22.2d,v18.2s,v8.s[2]
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- umlal v21.2d,v18.2s,v6.s[2]
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- umlal v20.2d,v18.2s,v4.s[2]
- fmov d16,x8
- umlal v19.2d,v18.2s,v2.s[2]
- fmov d17,x10
-
- ////////////////////////////////////////////////////////////////
- // (hash+inp[0:1])*r^4 and accumulate
-
- add v9.2s,v9.2s,v24.2s
- fmov d18,x12
- umlal v22.2d,v11.2s,v1.s[0]
- ldp x8,x12,[x1],#16 // inp[0:1]
- umlal v19.2d,v11.2s,v6.s[0]
- ldp x9,x13,[x1],#48
- umlal v23.2d,v11.2s,v3.s[0]
- umlal v20.2d,v11.2s,v8.s[0]
- umlal v21.2d,v11.2s,v0.s[0]
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
-
- add v10.2s,v10.2s,v25.2s
- umlal v22.2d,v9.2s,v5.s[0]
- umlal v23.2d,v9.2s,v7.s[0]
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- umlal v21.2d,v9.2s,v3.s[0]
- and x5,x9,#0x03ffffff
- umlal v19.2d,v9.2s,v0.s[0]
- ubfx x6,x8,#26,#26
- umlal v20.2d,v9.2s,v1.s[0]
- ubfx x7,x9,#26,#26
-
- add v12.2s,v12.2s,v27.2s
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- umlal v22.2d,v10.2s,v3.s[0]
- extr x8,x12,x8,#52
- umlal v23.2d,v10.2s,v5.s[0]
- extr x9,x13,x9,#52
- umlal v19.2d,v10.2s,v8.s[0]
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- umlal v21.2d,v10.2s,v1.s[0]
- fmov d9,x4
- umlal v20.2d,v10.2s,v0.s[0]
- and x8,x8,#0x03ffffff
-
- add v13.2s,v13.2s,v28.2s
- and x9,x9,#0x03ffffff
- umlal v22.2d,v12.2s,v0.s[0]
- ubfx x10,x12,#14,#26
- umlal v19.2d,v12.2s,v4.s[0]
- ubfx x11,x13,#14,#26
- umlal v23.2d,v12.2s,v1.s[0]
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- umlal v20.2d,v12.2s,v6.s[0]
- fmov d10,x6
- umlal v21.2d,v12.2s,v8.s[0]
- add x12,x3,x12,lsr#40
-
- umlal v22.2d,v13.2s,v8.s[0]
- add x13,x3,x13,lsr#40
- umlal v19.2d,v13.2s,v2.s[0]
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- umlal v23.2d,v13.2s,v0.s[0]
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- umlal v20.2d,v13.2s,v4.s[0]
- fmov d11,x8
- umlal v21.2d,v13.2s,v6.s[0]
- fmov d12,x10
- fmov d13,x12
-
- /////////////////////////////////////////////////////////////////
- // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- // and P. Schwabe
- //
- // [see discussion in poly1305-armv4 module]
-
- ushr v29.2d,v22.2d,#26
- xtn v27.2s,v22.2d
- ushr v30.2d,v19.2d,#26
- and v19.16b,v19.16b,v31.16b
- add v23.2d,v23.2d,v29.2d // h3 -> h4
- bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
- add v20.2d,v20.2d,v30.2d // h0 -> h1
-
- ushr v29.2d,v23.2d,#26
- xtn v28.2s,v23.2d
- ushr v30.2d,v20.2d,#26
- xtn v25.2s,v20.2d
- bic v28.2s,#0xfc,lsl#24
- add v21.2d,v21.2d,v30.2d // h1 -> h2
-
- add v19.2d,v19.2d,v29.2d
- shl v29.2d,v29.2d,#2
- shrn v30.2s,v21.2d,#26
- xtn v26.2s,v21.2d
- add v19.2d,v19.2d,v29.2d // h4 -> h0
- bic v25.2s,#0xfc,lsl#24
- add v27.2s,v27.2s,v30.2s // h2 -> h3
- bic v26.2s,#0xfc,lsl#24
-
- shrn v29.2s,v19.2d,#26
- xtn v24.2s,v19.2d
- ushr v30.2s,v27.2s,#26
- bic v27.2s,#0xfc,lsl#24
- bic v24.2s,#0xfc,lsl#24
- add v25.2s,v25.2s,v29.2s // h0 -> h1
- add v28.2s,v28.2s,v30.2s // h3 -> h4
-
- b.hi .Loop_neon
-
-.Lskip_loop:
- dup v16.2d,v16.d[0]
- add v11.2s,v11.2s,v26.2s
-
- ////////////////////////////////////////////////////////////////
- // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- adds x2,x2,#32
- b.ne .Long_tail
-
- dup v16.2d,v11.d[0]
- add v14.2s,v9.2s,v24.2s
- add v17.2s,v12.2s,v27.2s
- add v15.2s,v10.2s,v25.2s
- add v18.2s,v13.2s,v28.2s
-
-.Long_tail:
- dup v14.2d,v14.d[0]
- umull2 v19.2d,v16.4s,v6.4s
- umull2 v22.2d,v16.4s,v1.4s
- umull2 v23.2d,v16.4s,v3.4s
- umull2 v21.2d,v16.4s,v0.4s
- umull2 v20.2d,v16.4s,v8.4s
-
- dup v15.2d,v15.d[0]
- umlal2 v19.2d,v14.4s,v0.4s
- umlal2 v21.2d,v14.4s,v3.4s
- umlal2 v22.2d,v14.4s,v5.4s
- umlal2 v23.2d,v14.4s,v7.4s
- umlal2 v20.2d,v14.4s,v1.4s
-
- dup v17.2d,v17.d[0]
- umlal2 v19.2d,v15.4s,v8.4s
- umlal2 v22.2d,v15.4s,v3.4s
- umlal2 v21.2d,v15.4s,v1.4s
- umlal2 v23.2d,v15.4s,v5.4s
- umlal2 v20.2d,v15.4s,v0.4s
-
- dup v18.2d,v18.d[0]
- umlal2 v22.2d,v17.4s,v0.4s
- umlal2 v23.2d,v17.4s,v1.4s
- umlal2 v19.2d,v17.4s,v4.4s
- umlal2 v20.2d,v17.4s,v6.4s
- umlal2 v21.2d,v17.4s,v8.4s
-
- umlal2 v22.2d,v18.4s,v8.4s
- umlal2 v19.2d,v18.4s,v2.4s
- umlal2 v23.2d,v18.4s,v0.4s
- umlal2 v20.2d,v18.4s,v4.4s
- umlal2 v21.2d,v18.4s,v6.4s
-
- b.eq .Lshort_tail
-
- ////////////////////////////////////////////////////////////////
- // (hash+inp[0:1])*r^4:r^3 and accumulate
-
- add v9.2s,v9.2s,v24.2s
- umlal v22.2d,v11.2s,v1.2s
- umlal v19.2d,v11.2s,v6.2s
- umlal v23.2d,v11.2s,v3.2s
- umlal v20.2d,v11.2s,v8.2s
- umlal v21.2d,v11.2s,v0.2s
-
- add v10.2s,v10.2s,v25.2s
- umlal v22.2d,v9.2s,v5.2s
- umlal v19.2d,v9.2s,v0.2s
- umlal v23.2d,v9.2s,v7.2s
- umlal v20.2d,v9.2s,v1.2s
- umlal v21.2d,v9.2s,v3.2s
-
- add v12.2s,v12.2s,v27.2s
- umlal v22.2d,v10.2s,v3.2s
- umlal v19.2d,v10.2s,v8.2s
- umlal v23.2d,v10.2s,v5.2s
- umlal v20.2d,v10.2s,v0.2s
- umlal v21.2d,v10.2s,v1.2s
-
- add v13.2s,v13.2s,v28.2s
- umlal v22.2d,v12.2s,v0.2s
- umlal v19.2d,v12.2s,v4.2s
- umlal v23.2d,v12.2s,v1.2s
- umlal v20.2d,v12.2s,v6.2s
- umlal v21.2d,v12.2s,v8.2s
-
- umlal v22.2d,v13.2s,v8.2s
- umlal v19.2d,v13.2s,v2.2s
- umlal v23.2d,v13.2s,v0.2s
- umlal v20.2d,v13.2s,v4.2s
- umlal v21.2d,v13.2s,v6.2s
-
-.Lshort_tail:
- ////////////////////////////////////////////////////////////////
- // horizontal add
-
- addp v22.2d,v22.2d,v22.2d
- ldp d8,d9,[sp,#16] // meet ABI requirements
- addp v19.2d,v19.2d,v19.2d
- ldp d10,d11,[sp,#32]
- addp v23.2d,v23.2d,v23.2d
- ldp d12,d13,[sp,#48]
- addp v20.2d,v20.2d,v20.2d
- ldp d14,d15,[sp,#64]
- addp v21.2d,v21.2d,v21.2d
- ldr x30,[sp,#8]
- .inst 0xd50323bf // autiasp
-
- ////////////////////////////////////////////////////////////////
- // lazy reduction, but without narrowing
-
- ushr v29.2d,v22.2d,#26
- and v22.16b,v22.16b,v31.16b
- ushr v30.2d,v19.2d,#26
- and v19.16b,v19.16b,v31.16b
-
- add v23.2d,v23.2d,v29.2d // h3 -> h4
- add v20.2d,v20.2d,v30.2d // h0 -> h1
-
- ushr v29.2d,v23.2d,#26
- and v23.16b,v23.16b,v31.16b
- ushr v30.2d,v20.2d,#26
- and v20.16b,v20.16b,v31.16b
- add v21.2d,v21.2d,v30.2d // h1 -> h2
-
- add v19.2d,v19.2d,v29.2d
- shl v29.2d,v29.2d,#2
- ushr v30.2d,v21.2d,#26
- and v21.16b,v21.16b,v31.16b
- add v19.2d,v19.2d,v29.2d // h4 -> h0
- add v22.2d,v22.2d,v30.2d // h2 -> h3
-
- ushr v29.2d,v19.2d,#26
- and v19.16b,v19.16b,v31.16b
- ushr v30.2d,v22.2d,#26
- and v22.16b,v22.16b,v31.16b
- add v20.2d,v20.2d,v29.2d // h0 -> h1
- add v23.2d,v23.2d,v30.2d // h3 -> h4
-
- ////////////////////////////////////////////////////////////////
- // write the result, can be partially reduced
-
- st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
- mov x4,#1
- st1 {v23.s}[0],[x0]
- str x4,[x0,#8] // set is_base2_26
-
- ldr x29,[sp],#80
- ret
-.size poly1305_blocks_neon,.-poly1305_blocks_neon
-
-.align 5
-.Lzeros:
-.long 0,0,0,0,0,0,0,0
-.asciz "Poly1305 for ARMv8, CRYPTOGAMS by @dot-asm"
-.align 2
-#if !defined(__KERNEL__) && !defined(_WIN64)
-.comm OPENSSL_armcap_P,4,4
-.hidden OPENSSL_armcap_P
-#endif
diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c
index 83a2338a8826..1fae18ba11ed 100644
--- a/arch/arm64/crypto/poly1305-glue.c
+++ b/arch/arm64/crypto/poly1305-glue.c
@@ -21,11 +21,11 @@
asmlinkage void poly1305_init_arm64(void *state, const u8 *key);
asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit);
asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_emit(void *state, __le32 *digest, const u32 *nonce);
+asmlinkage void poly1305_emit(void *state, u8 *digest, const u32 *nonce);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
+void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
{
poly1305_init_arm64(&dctx->h, key);
dctx->s[0] = get_unaligned_le32(key + 16);
@@ -52,7 +52,7 @@ static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
{
if (unlikely(!dctx->sset)) {
if (!dctx->rset) {
- poly1305_init_arch(dctx, src);
+ poly1305_init_arm64(&dctx->h, src);
src += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
dctx->rset = 1;
@@ -143,13 +143,20 @@ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
- kernel_neon_begin();
- poly1305_blocks_neon(&dctx->h, src, len, 1);
- kernel_neon_end();
+ do {
+ unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+ kernel_neon_begin();
+ poly1305_blocks_neon(&dctx->h, src, todo, 1);
+ kernel_neon_end();
+
+ len -= todo;
+ src += todo;
+ } while (len);
} else {
poly1305_blocks(&dctx->h, src, len, 1);
+ src += len;
}
- src += len;
nbytes %= POLY1305_BLOCK_SIZE;
}
@@ -162,9 +169,6 @@ EXPORT_SYMBOL(poly1305_update_arch);
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
- __le32 digest[4];
- u64 f = 0;
-
if (unlikely(dctx->buflen)) {
dctx->buf[dctx->buflen++] = 1;
memset(dctx->buf + dctx->buflen, 0,
@@ -172,19 +176,8 @@ void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
}
- poly1305_emit(&dctx->h, digest, dctx->s);
-
- /* mac = (h + s) % (2^128) */
- f = (f >> 32) + le32_to_cpu(digest[0]);
- put_unaligned_le32(f, dst);
- f = (f >> 32) + le32_to_cpu(digest[1]);
- put_unaligned_le32(f, dst + 4);
- f = (f >> 32) + le32_to_cpu(digest[2]);
- put_unaligned_le32(f, dst + 8);
- f = (f >> 32) + le32_to_cpu(digest[3]);
- put_unaligned_le32(f, dst + 12);
-
- *dctx = (struct poly1305_desc_ctx){};
+ poly1305_emit(&dctx->h, dst, dctx->s);
+ memzero_explicit(dctx, sizeof(*dctx));
}
EXPORT_SYMBOL(poly1305_final_arch);
diff --git a/arch/arm64/crypto/polyval-ce-core.S b/arch/arm64/crypto/polyval-ce-core.S
new file mode 100644
index 000000000000..b5326540d2e3
--- /dev/null
+++ b/arch/arm64/crypto/polyval-ce-core.S
@@ -0,0 +1,361 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Implementation of POLYVAL using ARMv8 Crypto Extensions.
+ *
+ * Copyright 2021 Google LLC
+ */
+/*
+ * This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions
+ * It works on 8 blocks at a time, by precomputing the first 8 keys powers h^8,
+ * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split
+ * finite field multiplication into two steps.
+ *
+ * In the first step, we consider h^i, m_i as normal polynomials of degree less
+ * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
+ * is simply polynomial multiplication.
+ *
+ * In the second step, we compute the reduction of p(x) modulo the finite field
+ * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
+ * multiplication is finite field multiplication. The advantage is that the
+ * two-step process only requires 1 finite field reduction for every 8
+ * polynomial multiplications. Further parallelism is gained by interleaving the
+ * multiplications and polynomial reductions.
+ */
+
+#include <linux/linkage.h>
+#define STRIDE_BLOCKS 8
+
+KEY_POWERS .req x0
+MSG .req x1
+BLOCKS_LEFT .req x2
+ACCUMULATOR .req x3
+KEY_START .req x10
+EXTRA_BYTES .req x11
+TMP .req x13
+
+M0 .req v0
+M1 .req v1
+M2 .req v2
+M3 .req v3
+M4 .req v4
+M5 .req v5
+M6 .req v6
+M7 .req v7
+KEY8 .req v8
+KEY7 .req v9
+KEY6 .req v10
+KEY5 .req v11
+KEY4 .req v12
+KEY3 .req v13
+KEY2 .req v14
+KEY1 .req v15
+PL .req v16
+PH .req v17
+TMP_V .req v18
+LO .req v20
+MI .req v21
+HI .req v22
+SUM .req v23
+GSTAR .req v24
+
+ .text
+
+ .arch armv8-a+crypto
+ .align 4
+
+.Lgstar:
+ .quad 0xc200000000000000, 0xc200000000000000
+
+/*
+ * Computes the product of two 128-bit polynomials in X and Y and XORs the
+ * components of the 256-bit product into LO, MI, HI.
+ *
+ * Given:
+ * X = [X_1 : X_0]
+ * Y = [Y_1 : Y_0]
+ *
+ * We compute:
+ * LO += X_0 * Y_0
+ * MI += (X_0 + X_1) * (Y_0 + Y_1)
+ * HI += X_1 * Y_1
+ *
+ * Later, the 256-bit result can be extracted as:
+ * [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0]
+ * This step is done when computing the polynomial reduction for efficiency
+ * reasons.
+ *
+ * Karatsuba multiplication is used instead of Schoolbook multiplication because
+ * it was found to be slightly faster on ARM64 CPUs.
+ *
+ */
+.macro karatsuba1 X Y
+ X .req \X
+ Y .req \Y
+ ext v25.16b, X.16b, X.16b, #8
+ ext v26.16b, Y.16b, Y.16b, #8
+ eor v25.16b, v25.16b, X.16b
+ eor v26.16b, v26.16b, Y.16b
+ pmull2 v28.1q, X.2d, Y.2d
+ pmull v29.1q, X.1d, Y.1d
+ pmull v27.1q, v25.1d, v26.1d
+ eor HI.16b, HI.16b, v28.16b
+ eor LO.16b, LO.16b, v29.16b
+ eor MI.16b, MI.16b, v27.16b
+ .unreq X
+ .unreq Y
+.endm
+
+/*
+ * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into
+ * them.
+ */
+.macro karatsuba1_store X Y
+ X .req \X
+ Y .req \Y
+ ext v25.16b, X.16b, X.16b, #8
+ ext v26.16b, Y.16b, Y.16b, #8
+ eor v25.16b, v25.16b, X.16b
+ eor v26.16b, v26.16b, Y.16b
+ pmull2 HI.1q, X.2d, Y.2d
+ pmull LO.1q, X.1d, Y.1d
+ pmull MI.1q, v25.1d, v26.1d
+ .unreq X
+ .unreq Y
+.endm
+
+/*
+ * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
+ * the result in PL, PH.
+ * [PH : PL] =
+ * [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
+ */
+.macro karatsuba2
+ // v4 = [HI_1 + MI_1 : HI_0 + MI_0]
+ eor v4.16b, HI.16b, MI.16b
+ // v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
+ eor v4.16b, v4.16b, LO.16b
+ // v5 = [HI_0 : LO_1]
+ ext v5.16b, LO.16b, HI.16b, #8
+ // v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
+ eor v4.16b, v4.16b, v5.16b
+ // HI = [HI_0 : HI_1]
+ ext HI.16b, HI.16b, HI.16b, #8
+ // LO = [LO_0 : LO_1]
+ ext LO.16b, LO.16b, LO.16b, #8
+ // PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
+ ext PH.16b, v4.16b, HI.16b, #8
+ // PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
+ ext PL.16b, LO.16b, v4.16b, #8
+.endm
+
+/*
+ * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
+ *
+ * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
+ * x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
+ * product of two 128-bit polynomials in Montgomery form. We need to reduce it
+ * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
+ * of x^128, this product has two extra factors of x^128. To get it back into
+ * Montgomery form, we need to remove one of these factors by dividing by x^128.
+ *
+ * To accomplish both of these goals, we add multiples of g(x) that cancel out
+ * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
+ * bits are zero, the polynomial division by x^128 can be done by right
+ * shifting.
+ *
+ * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
+ * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
+ * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
+ * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
+ * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
+ * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
+ *
+ * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
+ * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
+ * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
+ * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
+ * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
+ *
+ * So our final computation is:
+ * T = T_1 : T_0 = g*(x) * P_0
+ * V = V_1 : V_0 = g*(x) * (P_1 + T_0)
+ * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
+ *
+ * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
+ * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
+ * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
+ */
+.macro montgomery_reduction dest
+ DEST .req \dest
+ // TMP_V = T_1 : T_0 = P_0 * g*(x)
+ pmull TMP_V.1q, PL.1d, GSTAR.1d
+ // TMP_V = T_0 : T_1
+ ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
+ // TMP_V = P_1 + T_0 : P_0 + T_1
+ eor TMP_V.16b, PL.16b, TMP_V.16b
+ // PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
+ eor PH.16b, PH.16b, TMP_V.16b
+ // TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
+ pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
+ eor DEST.16b, PH.16b, TMP_V.16b
+ .unreq DEST
+.endm
+
+/*
+ * Compute Polyval on 8 blocks.
+ *
+ * If reduce is set, also computes the montgomery reduction of the
+ * previous full_stride call and XORs with the first message block.
+ * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
+ * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
+ *
+ * Sets PL, PH.
+ */
+.macro full_stride reduce
+ eor LO.16b, LO.16b, LO.16b
+ eor MI.16b, MI.16b, MI.16b
+ eor HI.16b, HI.16b, HI.16b
+
+ ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
+ ld1 {M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64
+
+ karatsuba1 M7 KEY1
+ .if \reduce
+ pmull TMP_V.1q, PL.1d, GSTAR.1d
+ .endif
+
+ karatsuba1 M6 KEY2
+ .if \reduce
+ ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
+ .endif
+
+ karatsuba1 M5 KEY3
+ .if \reduce
+ eor TMP_V.16b, PL.16b, TMP_V.16b
+ .endif
+
+ karatsuba1 M4 KEY4
+ .if \reduce
+ eor PH.16b, PH.16b, TMP_V.16b
+ .endif
+
+ karatsuba1 M3 KEY5
+ .if \reduce
+ pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
+ .endif
+
+ karatsuba1 M2 KEY6
+ .if \reduce
+ eor SUM.16b, PH.16b, TMP_V.16b
+ .endif
+
+ karatsuba1 M1 KEY7
+ eor M0.16b, M0.16b, SUM.16b
+
+ karatsuba1 M0 KEY8
+ karatsuba2
+.endm
+
+/*
+ * Handle any extra blocks after full_stride loop.
+ */
+.macro partial_stride
+ add KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
+ sub KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4
+ ld1 {KEY1.16b}, [KEY_POWERS], #16
+
+ ld1 {TMP_V.16b}, [MSG], #16
+ eor SUM.16b, SUM.16b, TMP_V.16b
+ karatsuba1_store KEY1 SUM
+ sub BLOCKS_LEFT, BLOCKS_LEFT, #1
+
+ tst BLOCKS_LEFT, #4
+ beq .Lpartial4BlocksDone
+ ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
+ ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
+ karatsuba1 M0 KEY8
+ karatsuba1 M1 KEY7
+ karatsuba1 M2 KEY6
+ karatsuba1 M3 KEY5
+.Lpartial4BlocksDone:
+ tst BLOCKS_LEFT, #2
+ beq .Lpartial2BlocksDone
+ ld1 {M0.16b, M1.16b}, [MSG], #32
+ ld1 {KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
+ karatsuba1 M0 KEY8
+ karatsuba1 M1 KEY7
+.Lpartial2BlocksDone:
+ tst BLOCKS_LEFT, #1
+ beq .LpartialDone
+ ld1 {M0.16b}, [MSG], #16
+ ld1 {KEY8.16b}, [KEY_POWERS], #16
+ karatsuba1 M0 KEY8
+.LpartialDone:
+ karatsuba2
+ montgomery_reduction SUM
+.endm
+
+/*
+ * Perform montgomery multiplication in GF(2^128) and store result in op1.
+ *
+ * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
+ * If op1, op2 are in montgomery form, this computes the montgomery
+ * form of op1*op2.
+ *
+ * void pmull_polyval_mul(u8 *op1, const u8 *op2);
+ */
+SYM_FUNC_START(pmull_polyval_mul)
+ adr TMP, .Lgstar
+ ld1 {GSTAR.2d}, [TMP]
+ ld1 {v0.16b}, [x0]
+ ld1 {v1.16b}, [x1]
+ karatsuba1_store v0 v1
+ karatsuba2
+ montgomery_reduction SUM
+ st1 {SUM.16b}, [x0]
+ ret
+SYM_FUNC_END(pmull_polyval_mul)
+
+/*
+ * Perform polynomial evaluation as specified by POLYVAL. This computes:
+ * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
+ * where n=nblocks, h is the hash key, and m_i are the message blocks.
+ *
+ * x0 - pointer to precomputed key powers h^8 ... h^1
+ * x1 - pointer to message blocks
+ * x2 - number of blocks to hash
+ * x3 - pointer to accumulator
+ *
+ * void pmull_polyval_update(const struct polyval_ctx *ctx, const u8 *in,
+ * size_t nblocks, u8 *accumulator);
+ */
+SYM_FUNC_START(pmull_polyval_update)
+ adr TMP, .Lgstar
+ mov KEY_START, KEY_POWERS
+ ld1 {GSTAR.2d}, [TMP]
+ ld1 {SUM.16b}, [ACCUMULATOR]
+ subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ blt .LstrideLoopExit
+ ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
+ ld1 {KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
+ full_stride 0
+ subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ blt .LstrideLoopExitReduce
+.LstrideLoop:
+ full_stride 1
+ subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ bge .LstrideLoop
+.LstrideLoopExitReduce:
+ montgomery_reduction SUM
+.LstrideLoopExit:
+ adds BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ beq .LskipPartial
+ partial_stride
+.LskipPartial:
+ st1 {SUM.16b}, [ACCUMULATOR]
+ ret
+SYM_FUNC_END(pmull_polyval_update)
diff --git a/arch/arm64/crypto/polyval-ce-glue.c b/arch/arm64/crypto/polyval-ce-glue.c
new file mode 100644
index 000000000000..0a3b5718df85
--- /dev/null
+++ b/arch/arm64/crypto/polyval-ce-glue.c
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Glue code for POLYVAL using ARMv8 Crypto Extensions
+ *
+ * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi>
+ * Copyright (c) 2009 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ * Copyright 2021 Google LLC
+ */
+
+/*
+ * Glue code based on ghash-clmulni-intel_glue.c.
+ *
+ * This implementation of POLYVAL uses montgomery multiplication accelerated by
+ * ARMv8 Crypto Extensions instructions to implement the finite field operations.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/polyval.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/cpufeature.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+#define NUM_KEY_POWERS 8
+
+struct polyval_tfm_ctx {
+ /*
+ * These powers must be in the order h^8, ..., h^1.
+ */
+ u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE];
+};
+
+struct polyval_desc_ctx {
+ u8 buffer[POLYVAL_BLOCK_SIZE];
+ u32 bytes;
+};
+
+asmlinkage void pmull_polyval_update(const struct polyval_tfm_ctx *keys,
+ const u8 *in, size_t nblocks, u8 *accumulator);
+asmlinkage void pmull_polyval_mul(u8 *op1, const u8 *op2);
+
+static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
+ const u8 *in, size_t nblocks, u8 *accumulator)
+{
+ if (likely(crypto_simd_usable())) {
+ kernel_neon_begin();
+ pmull_polyval_update(keys, in, nblocks, accumulator);
+ kernel_neon_end();
+ } else {
+ polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in,
+ nblocks, accumulator);
+ }
+}
+
+static void internal_polyval_mul(u8 *op1, const u8 *op2)
+{
+ if (likely(crypto_simd_usable())) {
+ kernel_neon_begin();
+ pmull_polyval_mul(op1, op2);
+ kernel_neon_end();
+ } else {
+ polyval_mul_non4k(op1, op2);
+ }
+}
+
+static int polyval_arm64_setkey(struct crypto_shash *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ struct polyval_tfm_ctx *tctx = crypto_shash_ctx(tfm);
+ int i;
+
+ if (keylen != POLYVAL_BLOCK_SIZE)
+ return -EINVAL;
+
+ memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE);
+
+ for (i = NUM_KEY_POWERS-2; i >= 0; i--) {
+ memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE);
+ internal_polyval_mul(tctx->key_powers[i],
+ tctx->key_powers[i+1]);
+ }
+
+ return 0;
+}
+
+static int polyval_arm64_init(struct shash_desc *desc)
+{
+ struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ memset(dctx, 0, sizeof(*dctx));
+
+ return 0;
+}
+
+static int polyval_arm64_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
+ const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+ u8 *pos;
+ unsigned int nblocks;
+ unsigned int n;
+
+ if (dctx->bytes) {
+ n = min(srclen, dctx->bytes);
+ pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes;
+
+ dctx->bytes -= n;
+ srclen -= n;
+
+ while (n--)
+ *pos++ ^= *src++;
+
+ if (!dctx->bytes)
+ internal_polyval_mul(dctx->buffer,
+ tctx->key_powers[NUM_KEY_POWERS-1]);
+ }
+
+ while (srclen >= POLYVAL_BLOCK_SIZE) {
+ /* allow rescheduling every 4K bytes */
+ nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
+ internal_polyval_update(tctx, src, nblocks, dctx->buffer);
+ srclen -= nblocks * POLYVAL_BLOCK_SIZE;
+ src += nblocks * POLYVAL_BLOCK_SIZE;
+ }
+
+ if (srclen) {
+ dctx->bytes = POLYVAL_BLOCK_SIZE - srclen;
+ pos = dctx->buffer;
+ while (srclen--)
+ *pos++ ^= *src++;
+ }
+
+ return 0;
+}
+
+static int polyval_arm64_final(struct shash_desc *desc, u8 *dst)
+{
+ struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
+ const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+
+ if (dctx->bytes) {
+ internal_polyval_mul(dctx->buffer,
+ tctx->key_powers[NUM_KEY_POWERS-1]);
+ }
+
+ memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE);
+
+ return 0;
+}
+
+static struct shash_alg polyval_alg = {
+ .digestsize = POLYVAL_DIGEST_SIZE,
+ .init = polyval_arm64_init,
+ .update = polyval_arm64_update,
+ .final = polyval_arm64_final,
+ .setkey = polyval_arm64_setkey,
+ .descsize = sizeof(struct polyval_desc_ctx),
+ .base = {
+ .cra_name = "polyval",
+ .cra_driver_name = "polyval-ce",
+ .cra_priority = 200,
+ .cra_blocksize = POLYVAL_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct polyval_tfm_ctx),
+ .cra_module = THIS_MODULE,
+ },
+};
+
+static int __init polyval_ce_mod_init(void)
+{
+ return crypto_register_shash(&polyval_alg);
+}
+
+static void __exit polyval_ce_mod_exit(void)
+{
+ crypto_unregister_shash(&polyval_alg);
+}
+
+module_cpu_feature_match(PMULL, polyval_ce_mod_init)
+module_exit(polyval_ce_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("POLYVAL hash function accelerated by ARMv8 Crypto Extensions");
+MODULE_ALIAS_CRYPTO("polyval");
+MODULE_ALIAS_CRYPTO("polyval-ce");
diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S
index c2ce1f820706..9b1f2d82a6fe 100644
--- a/arch/arm64/crypto/sha1-ce-core.S
+++ b/arch/arm64/crypto/sha1-ce-core.S
@@ -62,40 +62,34 @@
.endm
/*
- * void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
- * int blocks)
+ * int __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
+ * int blocks)
*/
-ENTRY(sha1_ce_transform)
- frame_push 3
-
- mov x19, x0
- mov x20, x1
- mov x21, x2
-
+SYM_FUNC_START(__sha1_ce_transform)
/* load round constants */
-0: loadrc k0.4s, 0x5a827999, w6
+ loadrc k0.4s, 0x5a827999, w6
loadrc k1.4s, 0x6ed9eba1, w6
loadrc k2.4s, 0x8f1bbcdc, w6
loadrc k3.4s, 0xca62c1d6, w6
/* load state */
- ld1 {dgav.4s}, [x19]
- ldr dgb, [x19, #16]
+ ld1 {dgav.4s}, [x0]
+ ldr dgb, [x0, #16]
/* load sha1_ce_state::finalize */
ldr_l w4, sha1_ce_offsetof_finalize, x4
- ldr w4, [x19, x4]
+ ldr w4, [x0, x4]
/* load input */
-1: ld1 {v8.4s-v11.4s}, [x20], #64
- sub w21, w21, #1
+0: ld1 {v8.4s-v11.4s}, [x1], #64
+ sub w2, w2, #1
CPU_LE( rev32 v8.16b, v8.16b )
CPU_LE( rev32 v9.16b, v9.16b )
CPU_LE( rev32 v10.16b, v10.16b )
CPU_LE( rev32 v11.16b, v11.16b )
-2: add t0.4s, v8.4s, k0.4s
+1: add t0.4s, v8.4s, k0.4s
mov dg0v.16b, dgav.16b
add_update c, ev, k0, 8, 9, 10, 11, dgb
@@ -126,25 +120,18 @@ CPU_LE( rev32 v11.16b, v11.16b )
add dgbv.2s, dgbv.2s, dg1v.2s
add dgav.4s, dgav.4s, dg0v.4s
- cbz w21, 3f
-
- if_will_cond_yield_neon
- st1 {dgav.4s}, [x19]
- str dgb, [x19, #16]
- do_cond_yield_neon
+ cbz w2, 2f
+ cond_yield 3f, x5, x6
b 0b
- endif_yield_neon
-
- b 1b
/*
* Final block: add padding and total bit count.
* Skip if the input size was not a round multiple of the block size,
* the padding is handled by the C code in that case.
*/
-3: cbz x4, 4f
+2: cbz x4, 3f
ldr_l w4, sha1_ce_offsetof_count, x4
- ldr x4, [x19, x4]
+ ldr x4, [x0, x4]
movi v9.2d, #0
mov x8, #0x80000000
movi v10.2d, #0
@@ -153,11 +140,11 @@ CPU_LE( rev32 v11.16b, v11.16b )
mov x4, #0
mov v11.d[0], xzr
mov v11.d[1], x7
- b 2b
+ b 1b
/* store new state */
-4: st1 {dgav.4s}, [x19]
- str dgb, [x19, #16]
- frame_pop
+3: st1 {dgav.4s}, [x0]
+ str dgb, [x0, #16]
+ mov w0, w2
ret
-ENDPROC(sha1_ce_transform)
+SYM_FUNC_END(__sha1_ce_transform)
diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
index bdc1b6d7aff7..1dd93e1fcb39 100644
--- a/arch/arm64/crypto/sha1-ce-glue.c
+++ b/arch/arm64/crypto/sha1-ce-glue.c
@@ -10,7 +10,7 @@
#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
-#include <crypto/sha.h>
+#include <crypto/sha1.h>
#include <crypto/sha1_base.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
@@ -19,14 +19,34 @@
MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("sha1");
struct sha1_ce_state {
struct sha1_state sst;
u32 finalize;
};
-asmlinkage void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
- int blocks);
+extern const u32 sha1_ce_offsetof_count;
+extern const u32 sha1_ce_offsetof_finalize;
+
+asmlinkage int __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
+ int blocks);
+
+static void sha1_ce_transform(struct sha1_state *sst, u8 const *src,
+ int blocks)
+{
+ while (blocks) {
+ int rem;
+
+ kernel_neon_begin();
+ rem = __sha1_ce_transform(container_of(sst,
+ struct sha1_ce_state,
+ sst), src, blocks);
+ kernel_neon_end();
+ src += (blocks - rem) * SHA1_BLOCK_SIZE;
+ blocks = rem;
+ }
+}
const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count);
const u32 sha1_ce_offsetof_finalize = offsetof(struct sha1_ce_state, finalize);
@@ -40,10 +60,7 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data,
return crypto_sha1_update(desc, data, len);
sctx->finalize = 0;
- kernel_neon_begin();
- sha1_base_do_update(desc, data, len,
- (sha1_block_fn *)sha1_ce_transform);
- kernel_neon_end();
+ sha1_base_do_update(desc, data, len, sha1_ce_transform);
return 0;
}
@@ -63,12 +80,9 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data,
*/
sctx->finalize = finalize;
- kernel_neon_begin();
- sha1_base_do_update(desc, data, len,
- (sha1_block_fn *)sha1_ce_transform);
+ sha1_base_do_update(desc, data, len, sha1_ce_transform);
if (!finalize)
- sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_ce_transform);
- kernel_neon_end();
+ sha1_base_do_finalize(desc, sha1_ce_transform);
return sha1_base_finish(desc, out);
}
@@ -80,18 +94,36 @@ static int sha1_ce_final(struct shash_desc *desc, u8 *out)
return crypto_sha1_finup(desc, NULL, 0, out);
sctx->finalize = 0;
- kernel_neon_begin();
- sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_ce_transform);
- kernel_neon_end();
+ sha1_base_do_finalize(desc, sha1_ce_transform);
return sha1_base_finish(desc, out);
}
+static int sha1_ce_export(struct shash_desc *desc, void *out)
+{
+ struct sha1_ce_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(out, &sctx->sst, sizeof(struct sha1_state));
+ return 0;
+}
+
+static int sha1_ce_import(struct shash_desc *desc, const void *in)
+{
+ struct sha1_ce_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(&sctx->sst, in, sizeof(struct sha1_state));
+ sctx->finalize = 0;
+ return 0;
+}
+
static struct shash_alg alg = {
.init = sha1_base_init,
.update = sha1_ce_update,
.final = sha1_ce_final,
.finup = sha1_ce_finup,
+ .import = sha1_ce_import,
+ .export = sha1_ce_export,
.descsize = sizeof(struct sha1_ce_state),
+ .statesize = sizeof(struct sha1_state),
.digestsize = SHA1_DIGEST_SIZE,
.base = {
.cra_name = "sha1",
diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S
index 6f728a419009..fce84d88ddb2 100644
--- a/arch/arm64/crypto/sha2-ce-core.S
+++ b/arch/arm64/crypto/sha2-ce-core.S
@@ -71,41 +71,35 @@
.word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
/*
- * void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
- * int blocks)
+ * int __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
+ * int blocks)
*/
.text
-ENTRY(sha2_ce_transform)
- frame_push 3
-
- mov x19, x0
- mov x20, x1
- mov x21, x2
-
+SYM_FUNC_START(__sha256_ce_transform)
/* load round constants */
-0: adr_l x8, .Lsha2_rcon
+ adr_l x8, .Lsha2_rcon
ld1 { v0.4s- v3.4s}, [x8], #64
ld1 { v4.4s- v7.4s}, [x8], #64
ld1 { v8.4s-v11.4s}, [x8], #64
ld1 {v12.4s-v15.4s}, [x8]
/* load state */
- ld1 {dgav.4s, dgbv.4s}, [x19]
+ ld1 {dgav.4s, dgbv.4s}, [x0]
/* load sha256_ce_state::finalize */
ldr_l w4, sha256_ce_offsetof_finalize, x4
- ldr w4, [x19, x4]
+ ldr w4, [x0, x4]
/* load input */
-1: ld1 {v16.4s-v19.4s}, [x20], #64
- sub w21, w21, #1
+0: ld1 {v16.4s-v19.4s}, [x1], #64
+ sub w2, w2, #1
CPU_LE( rev32 v16.16b, v16.16b )
CPU_LE( rev32 v17.16b, v17.16b )
CPU_LE( rev32 v18.16b, v18.16b )
CPU_LE( rev32 v19.16b, v19.16b )
-2: add t0.4s, v16.4s, v0.4s
+1: add t0.4s, v16.4s, v0.4s
mov dg0v.16b, dgav.16b
mov dg1v.16b, dgbv.16b
@@ -134,24 +128,18 @@ CPU_LE( rev32 v19.16b, v19.16b )
add dgbv.4s, dgbv.4s, dg1v.4s
/* handled all input blocks? */
- cbz w21, 3f
-
- if_will_cond_yield_neon
- st1 {dgav.4s, dgbv.4s}, [x19]
- do_cond_yield_neon
+ cbz w2, 2f
+ cond_yield 3f, x5, x6
b 0b
- endif_yield_neon
-
- b 1b
/*
* Final block: add padding and total bit count.
* Skip if the input size was not a round multiple of the block size,
* the padding is handled by the C code in that case.
*/
-3: cbz x4, 4f
+2: cbz x4, 3f
ldr_l w4, sha256_ce_offsetof_count, x4
- ldr x4, [x19, x4]
+ ldr x4, [x0, x4]
movi v17.2d, #0
mov x8, #0x80000000
movi v18.2d, #0
@@ -160,10 +148,10 @@ CPU_LE( rev32 v19.16b, v19.16b )
mov x4, #0
mov v19.d[0], xzr
mov v19.d[1], x7
- b 2b
+ b 1b
/* store new state */
-4: st1 {dgav.4s, dgbv.4s}, [x19]
- frame_pop
+3: st1 {dgav.4s, dgbv.4s}, [x0]
+ mov w0, w2
ret
-ENDPROC(sha2_ce_transform)
+SYM_FUNC_END(__sha256_ce_transform)
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
index 604a01a4ede6..0a44d2e7ee1f 100644
--- a/arch/arm64/crypto/sha2-ce-glue.c
+++ b/arch/arm64/crypto/sha2-ce-glue.c
@@ -10,7 +10,7 @@
#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <crypto/sha256_base.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
@@ -19,14 +19,35 @@
MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("sha224");
+MODULE_ALIAS_CRYPTO("sha256");
struct sha256_ce_state {
struct sha256_state sst;
u32 finalize;
};
-asmlinkage void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
- int blocks);
+extern const u32 sha256_ce_offsetof_count;
+extern const u32 sha256_ce_offsetof_finalize;
+
+asmlinkage int __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
+ int blocks);
+
+static void sha256_ce_transform(struct sha256_state *sst, u8 const *src,
+ int blocks)
+{
+ while (blocks) {
+ int rem;
+
+ kernel_neon_begin();
+ rem = __sha256_ce_transform(container_of(sst,
+ struct sha256_ce_state,
+ sst), src, blocks);
+ kernel_neon_end();
+ src += (blocks - rem) * SHA256_BLOCK_SIZE;
+ blocks = rem;
+ }
+}
const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state,
sst.count);
@@ -35,6 +56,12 @@ const u32 sha256_ce_offsetof_finalize = offsetof(struct sha256_ce_state,
asmlinkage void sha256_block_data_order(u32 *digest, u8 const *src, int blocks);
+static void sha256_arm64_transform(struct sha256_state *sst, u8 const *src,
+ int blocks)
+{
+ sha256_block_data_order(sst->state, src, blocks);
+}
+
static int sha256_ce_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
@@ -42,13 +69,10 @@ static int sha256_ce_update(struct shash_desc *desc, const u8 *data,
if (!crypto_simd_usable())
return sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_block_data_order);
+ sha256_arm64_transform);
sctx->finalize = 0;
- kernel_neon_begin();
- sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha2_ce_transform);
- kernel_neon_end();
+ sha256_base_do_update(desc, data, len, sha256_ce_transform);
return 0;
}
@@ -62,9 +86,8 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data,
if (!crypto_simd_usable()) {
if (len)
sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_block_data_order);
- sha256_base_do_finalize(desc,
- (sha256_block_fn *)sha256_block_data_order);
+ sha256_arm64_transform);
+ sha256_base_do_finalize(desc, sha256_arm64_transform);
return sha256_base_finish(desc, out);
}
@@ -74,13 +97,9 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data,
*/
sctx->finalize = finalize;
- kernel_neon_begin();
- sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha2_ce_transform);
+ sha256_base_do_update(desc, data, len, sha256_ce_transform);
if (!finalize)
- sha256_base_do_finalize(desc,
- (sha256_block_fn *)sha2_ce_transform);
- kernel_neon_end();
+ sha256_base_do_finalize(desc, sha256_ce_transform);
return sha256_base_finish(desc, out);
}
@@ -89,24 +108,48 @@ static int sha256_ce_final(struct shash_desc *desc, u8 *out)
struct sha256_ce_state *sctx = shash_desc_ctx(desc);
if (!crypto_simd_usable()) {
- sha256_base_do_finalize(desc,
- (sha256_block_fn *)sha256_block_data_order);
+ sha256_base_do_finalize(desc, sha256_arm64_transform);
return sha256_base_finish(desc, out);
}
sctx->finalize = 0;
- kernel_neon_begin();
- sha256_base_do_finalize(desc, (sha256_block_fn *)sha2_ce_transform);
- kernel_neon_end();
+ sha256_base_do_finalize(desc, sha256_ce_transform);
return sha256_base_finish(desc, out);
}
+static int sha256_ce_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ sha256_base_init(desc);
+ return sha256_ce_finup(desc, data, len, out);
+}
+
+static int sha256_ce_export(struct shash_desc *desc, void *out)
+{
+ struct sha256_ce_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(out, &sctx->sst, sizeof(struct sha256_state));
+ return 0;
+}
+
+static int sha256_ce_import(struct shash_desc *desc, const void *in)
+{
+ struct sha256_ce_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(&sctx->sst, in, sizeof(struct sha256_state));
+ sctx->finalize = 0;
+ return 0;
+}
+
static struct shash_alg algs[] = { {
.init = sha224_base_init,
.update = sha256_ce_update,
.final = sha256_ce_final,
.finup = sha256_ce_finup,
+ .export = sha256_ce_export,
+ .import = sha256_ce_import,
.descsize = sizeof(struct sha256_ce_state),
+ .statesize = sizeof(struct sha256_state),
.digestsize = SHA224_DIGEST_SIZE,
.base = {
.cra_name = "sha224",
@@ -120,7 +163,11 @@ static struct shash_alg algs[] = { {
.update = sha256_ce_update,
.final = sha256_ce_final,
.finup = sha256_ce_finup,
+ .digest = sha256_ce_digest,
+ .export = sha256_ce_export,
+ .import = sha256_ce_import,
.descsize = sizeof(struct sha256_ce_state),
+ .statesize = sizeof(struct sha256_state),
.digestsize = SHA256_DIGEST_SIZE,
.base = {
.cra_name = "sha256",
diff --git a/arch/arm64/crypto/sha256-core.S_shipped b/arch/arm64/crypto/sha256-core.S_shipped
deleted file mode 100644
index 7c7ce2e3bad6..000000000000
--- a/arch/arm64/crypto/sha256-core.S_shipped
+++ /dev/null
@@ -1,2069 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-// This code is taken from the OpenSSL project but the author (Andy Polyakov)
-// has relicensed it under the GPLv2. Therefore this program is free software;
-// you can redistribute it and/or modify it under the terms of the GNU General
-// Public License version 2 as published by the Free Software Foundation.
-//
-// The original headers, including the original license headers, are
-// included below for completeness.
-
-// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
-//
-// Licensed under the OpenSSL license (the "License"). You may not use
-// this file except in compliance with the License. You can obtain a copy
-// in the file LICENSE in the source distribution or at
-// https://www.openssl.org/source/license.html
-
-// ====================================================================
-// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-// project. The module is, however, dual licensed under OpenSSL and
-// CRYPTOGAMS licenses depending on where you obtain it. For further
-// details see http://www.openssl.org/~appro/cryptogams/.
-// ====================================================================
-//
-// SHA256/512 for ARMv8.
-//
-// Performance in cycles per processed byte and improvement coefficient
-// over code generated with "default" compiler:
-//
-// SHA256-hw SHA256(*) SHA512
-// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
-// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
-// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
-// Denver 2.01 10.5 (+26%) 6.70 (+8%)
-// X-Gene 20.0 (+100%) 12.8 (+300%(***))
-// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
-//
-// (*) Software SHA256 results are of lesser relevance, presented
-// mostly for informational purposes.
-// (**) The result is a trade-off: it's possible to improve it by
-// 10% (or by 1 cycle per round), but at the cost of 20% loss
-// on Cortex-A53 (or by 4 cycles per round).
-// (***) Super-impressive coefficients over gcc-generated code are
-// indication of some compiler "pathology", most notably code
-// generated with -mgeneral-regs-only is significanty faster
-// and the gap is only 40-90%.
-//
-// October 2016.
-//
-// Originally it was reckoned that it makes no sense to implement NEON
-// version of SHA256 for 64-bit processors. This is because performance
-// improvement on most wide-spread Cortex-A5x processors was observed
-// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
-// observed that 32-bit NEON SHA256 performs significantly better than
-// 64-bit scalar version on *some* of the more recent processors. As
-// result 64-bit NEON version of SHA256 was added to provide best
-// all-round performance. For example it executes ~30% faster on X-Gene
-// and Mongoose. [For reference, NEON version of SHA512 is bound to
-// deliver much less improvement, likely *negative* on Cortex-A5x.
-// Which is why NEON support is limited to SHA256.]
-
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#endif
-
-.text
-
-.extern OPENSSL_armcap_P
-.globl sha256_block_data_order
-.type sha256_block_data_order,%function
-.align 6
-sha256_block_data_order:
-#ifndef __KERNEL__
-# ifdef __ILP32__
- ldrsw x16,.LOPENSSL_armcap_P
-# else
- ldr x16,.LOPENSSL_armcap_P
-# endif
- adr x17,.LOPENSSL_armcap_P
- add x16,x16,x17
- ldr w16,[x16]
- tst w16,#ARMV8_SHA256
- b.ne .Lv8_entry
- tst w16,#ARMV7_NEON
- b.ne .Lneon_entry
-#endif
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
-
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#4*4
-
- ldp w20,w21,[x0] // load context
- ldp w22,w23,[x0,#2*4]
- ldp w24,w25,[x0,#4*4]
- add x2,x1,x2,lsl#6 // end of input
- ldp w26,w27,[x0,#6*4]
- adr x30,.LK256
- stp x0,x2,[x29,#96]
-
-.Loop:
- ldp w3,w4,[x1],#2*4
- ldr w19,[x30],#4 // *K++
- eor w28,w21,w22 // magic seed
- str x1,[x29,#112]
-#ifndef __AARCH64EB__
- rev w3,w3 // 0
-#endif
- ror w16,w24,#6
- add w27,w27,w19 // h+=K[i]
- eor w6,w24,w24,ror#14
- and w17,w25,w24
- bic w19,w26,w24
- add w27,w27,w3 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w20,w21 // a^b, b^c in next round
- eor w16,w16,w6,ror#11 // Sigma1(e)
- ror w6,w20,#2
- add w27,w27,w17 // h+=Ch(e,f,g)
- eor w17,w20,w20,ror#9
- add w27,w27,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w23,w23,w27 // d+=h
- eor w28,w28,w21 // Maj(a,b,c)
- eor w17,w6,w17,ror#13 // Sigma0(a)
- add w27,w27,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w27,w27,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w4,w4 // 1
-#endif
- ldp w5,w6,[x1],#2*4
- add w27,w27,w17 // h+=Sigma0(a)
- ror w16,w23,#6
- add w26,w26,w28 // h+=K[i]
- eor w7,w23,w23,ror#14
- and w17,w24,w23
- bic w28,w25,w23
- add w26,w26,w4 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w27,w20 // a^b, b^c in next round
- eor w16,w16,w7,ror#11 // Sigma1(e)
- ror w7,w27,#2
- add w26,w26,w17 // h+=Ch(e,f,g)
- eor w17,w27,w27,ror#9
- add w26,w26,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w22,w22,w26 // d+=h
- eor w19,w19,w20 // Maj(a,b,c)
- eor w17,w7,w17,ror#13 // Sigma0(a)
- add w26,w26,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w26,w26,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w5,w5 // 2
-#endif
- add w26,w26,w17 // h+=Sigma0(a)
- ror w16,w22,#6
- add w25,w25,w19 // h+=K[i]
- eor w8,w22,w22,ror#14
- and w17,w23,w22
- bic w19,w24,w22
- add w25,w25,w5 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w26,w27 // a^b, b^c in next round
- eor w16,w16,w8,ror#11 // Sigma1(e)
- ror w8,w26,#2
- add w25,w25,w17 // h+=Ch(e,f,g)
- eor w17,w26,w26,ror#9
- add w25,w25,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w21,w21,w25 // d+=h
- eor w28,w28,w27 // Maj(a,b,c)
- eor w17,w8,w17,ror#13 // Sigma0(a)
- add w25,w25,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w25,w25,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w6,w6 // 3
-#endif
- ldp w7,w8,[x1],#2*4
- add w25,w25,w17 // h+=Sigma0(a)
- ror w16,w21,#6
- add w24,w24,w28 // h+=K[i]
- eor w9,w21,w21,ror#14
- and w17,w22,w21
- bic w28,w23,w21
- add w24,w24,w6 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w25,w26 // a^b, b^c in next round
- eor w16,w16,w9,ror#11 // Sigma1(e)
- ror w9,w25,#2
- add w24,w24,w17 // h+=Ch(e,f,g)
- eor w17,w25,w25,ror#9
- add w24,w24,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w20,w20,w24 // d+=h
- eor w19,w19,w26 // Maj(a,b,c)
- eor w17,w9,w17,ror#13 // Sigma0(a)
- add w24,w24,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w24,w24,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w7,w7 // 4
-#endif
- add w24,w24,w17 // h+=Sigma0(a)
- ror w16,w20,#6
- add w23,w23,w19 // h+=K[i]
- eor w10,w20,w20,ror#14
- and w17,w21,w20
- bic w19,w22,w20
- add w23,w23,w7 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w24,w25 // a^b, b^c in next round
- eor w16,w16,w10,ror#11 // Sigma1(e)
- ror w10,w24,#2
- add w23,w23,w17 // h+=Ch(e,f,g)
- eor w17,w24,w24,ror#9
- add w23,w23,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w27,w27,w23 // d+=h
- eor w28,w28,w25 // Maj(a,b,c)
- eor w17,w10,w17,ror#13 // Sigma0(a)
- add w23,w23,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w23,w23,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w8,w8 // 5
-#endif
- ldp w9,w10,[x1],#2*4
- add w23,w23,w17 // h+=Sigma0(a)
- ror w16,w27,#6
- add w22,w22,w28 // h+=K[i]
- eor w11,w27,w27,ror#14
- and w17,w20,w27
- bic w28,w21,w27
- add w22,w22,w8 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w23,w24 // a^b, b^c in next round
- eor w16,w16,w11,ror#11 // Sigma1(e)
- ror w11,w23,#2
- add w22,w22,w17 // h+=Ch(e,f,g)
- eor w17,w23,w23,ror#9
- add w22,w22,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w26,w26,w22 // d+=h
- eor w19,w19,w24 // Maj(a,b,c)
- eor w17,w11,w17,ror#13 // Sigma0(a)
- add w22,w22,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w22,w22,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w9,w9 // 6
-#endif
- add w22,w22,w17 // h+=Sigma0(a)
- ror w16,w26,#6
- add w21,w21,w19 // h+=K[i]
- eor w12,w26,w26,ror#14
- and w17,w27,w26
- bic w19,w20,w26
- add w21,w21,w9 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w22,w23 // a^b, b^c in next round
- eor w16,w16,w12,ror#11 // Sigma1(e)
- ror w12,w22,#2
- add w21,w21,w17 // h+=Ch(e,f,g)
- eor w17,w22,w22,ror#9
- add w21,w21,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w25,w25,w21 // d+=h
- eor w28,w28,w23 // Maj(a,b,c)
- eor w17,w12,w17,ror#13 // Sigma0(a)
- add w21,w21,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w21,w21,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w10,w10 // 7
-#endif
- ldp w11,w12,[x1],#2*4
- add w21,w21,w17 // h+=Sigma0(a)
- ror w16,w25,#6
- add w20,w20,w28 // h+=K[i]
- eor w13,w25,w25,ror#14
- and w17,w26,w25
- bic w28,w27,w25
- add w20,w20,w10 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w21,w22 // a^b, b^c in next round
- eor w16,w16,w13,ror#11 // Sigma1(e)
- ror w13,w21,#2
- add w20,w20,w17 // h+=Ch(e,f,g)
- eor w17,w21,w21,ror#9
- add w20,w20,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w24,w24,w20 // d+=h
- eor w19,w19,w22 // Maj(a,b,c)
- eor w17,w13,w17,ror#13 // Sigma0(a)
- add w20,w20,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w20,w20,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w11,w11 // 8
-#endif
- add w20,w20,w17 // h+=Sigma0(a)
- ror w16,w24,#6
- add w27,w27,w19 // h+=K[i]
- eor w14,w24,w24,ror#14
- and w17,w25,w24
- bic w19,w26,w24
- add w27,w27,w11 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w20,w21 // a^b, b^c in next round
- eor w16,w16,w14,ror#11 // Sigma1(e)
- ror w14,w20,#2
- add w27,w27,w17 // h+=Ch(e,f,g)
- eor w17,w20,w20,ror#9
- add w27,w27,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w23,w23,w27 // d+=h
- eor w28,w28,w21 // Maj(a,b,c)
- eor w17,w14,w17,ror#13 // Sigma0(a)
- add w27,w27,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w27,w27,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w12,w12 // 9
-#endif
- ldp w13,w14,[x1],#2*4
- add w27,w27,w17 // h+=Sigma0(a)
- ror w16,w23,#6
- add w26,w26,w28 // h+=K[i]
- eor w15,w23,w23,ror#14
- and w17,w24,w23
- bic w28,w25,w23
- add w26,w26,w12 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w27,w20 // a^b, b^c in next round
- eor w16,w16,w15,ror#11 // Sigma1(e)
- ror w15,w27,#2
- add w26,w26,w17 // h+=Ch(e,f,g)
- eor w17,w27,w27,ror#9
- add w26,w26,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w22,w22,w26 // d+=h
- eor w19,w19,w20 // Maj(a,b,c)
- eor w17,w15,w17,ror#13 // Sigma0(a)
- add w26,w26,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w26,w26,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w13,w13 // 10
-#endif
- add w26,w26,w17 // h+=Sigma0(a)
- ror w16,w22,#6
- add w25,w25,w19 // h+=K[i]
- eor w0,w22,w22,ror#14
- and w17,w23,w22
- bic w19,w24,w22
- add w25,w25,w13 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w26,w27 // a^b, b^c in next round
- eor w16,w16,w0,ror#11 // Sigma1(e)
- ror w0,w26,#2
- add w25,w25,w17 // h+=Ch(e,f,g)
- eor w17,w26,w26,ror#9
- add w25,w25,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w21,w21,w25 // d+=h
- eor w28,w28,w27 // Maj(a,b,c)
- eor w17,w0,w17,ror#13 // Sigma0(a)
- add w25,w25,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w25,w25,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w14,w14 // 11
-#endif
- ldp w15,w0,[x1],#2*4
- add w25,w25,w17 // h+=Sigma0(a)
- str w6,[sp,#12]
- ror w16,w21,#6
- add w24,w24,w28 // h+=K[i]
- eor w6,w21,w21,ror#14
- and w17,w22,w21
- bic w28,w23,w21
- add w24,w24,w14 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w25,w26 // a^b, b^c in next round
- eor w16,w16,w6,ror#11 // Sigma1(e)
- ror w6,w25,#2
- add w24,w24,w17 // h+=Ch(e,f,g)
- eor w17,w25,w25,ror#9
- add w24,w24,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w20,w20,w24 // d+=h
- eor w19,w19,w26 // Maj(a,b,c)
- eor w17,w6,w17,ror#13 // Sigma0(a)
- add w24,w24,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w24,w24,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w15,w15 // 12
-#endif
- add w24,w24,w17 // h+=Sigma0(a)
- str w7,[sp,#0]
- ror w16,w20,#6
- add w23,w23,w19 // h+=K[i]
- eor w7,w20,w20,ror#14
- and w17,w21,w20
- bic w19,w22,w20
- add w23,w23,w15 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w24,w25 // a^b, b^c in next round
- eor w16,w16,w7,ror#11 // Sigma1(e)
- ror w7,w24,#2
- add w23,w23,w17 // h+=Ch(e,f,g)
- eor w17,w24,w24,ror#9
- add w23,w23,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w27,w27,w23 // d+=h
- eor w28,w28,w25 // Maj(a,b,c)
- eor w17,w7,w17,ror#13 // Sigma0(a)
- add w23,w23,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w23,w23,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w0,w0 // 13
-#endif
- ldp w1,w2,[x1]
- add w23,w23,w17 // h+=Sigma0(a)
- str w8,[sp,#4]
- ror w16,w27,#6
- add w22,w22,w28 // h+=K[i]
- eor w8,w27,w27,ror#14
- and w17,w20,w27
- bic w28,w21,w27
- add w22,w22,w0 // h+=X[i]
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w23,w24 // a^b, b^c in next round
- eor w16,w16,w8,ror#11 // Sigma1(e)
- ror w8,w23,#2
- add w22,w22,w17 // h+=Ch(e,f,g)
- eor w17,w23,w23,ror#9
- add w22,w22,w16 // h+=Sigma1(e)
- and w19,w19,w28 // (b^c)&=(a^b)
- add w26,w26,w22 // d+=h
- eor w19,w19,w24 // Maj(a,b,c)
- eor w17,w8,w17,ror#13 // Sigma0(a)
- add w22,w22,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- //add w22,w22,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w1,w1 // 14
-#endif
- ldr w6,[sp,#12]
- add w22,w22,w17 // h+=Sigma0(a)
- str w9,[sp,#8]
- ror w16,w26,#6
- add w21,w21,w19 // h+=K[i]
- eor w9,w26,w26,ror#14
- and w17,w27,w26
- bic w19,w20,w26
- add w21,w21,w1 // h+=X[i]
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w22,w23 // a^b, b^c in next round
- eor w16,w16,w9,ror#11 // Sigma1(e)
- ror w9,w22,#2
- add w21,w21,w17 // h+=Ch(e,f,g)
- eor w17,w22,w22,ror#9
- add w21,w21,w16 // h+=Sigma1(e)
- and w28,w28,w19 // (b^c)&=(a^b)
- add w25,w25,w21 // d+=h
- eor w28,w28,w23 // Maj(a,b,c)
- eor w17,w9,w17,ror#13 // Sigma0(a)
- add w21,w21,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- //add w21,w21,w17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev w2,w2 // 15
-#endif
- ldr w7,[sp,#0]
- add w21,w21,w17 // h+=Sigma0(a)
- str w10,[sp,#12]
- ror w16,w25,#6
- add w20,w20,w28 // h+=K[i]
- ror w9,w4,#7
- and w17,w26,w25
- ror w8,w1,#17
- bic w28,w27,w25
- ror w10,w21,#2
- add w20,w20,w2 // h+=X[i]
- eor w16,w16,w25,ror#11
- eor w9,w9,w4,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w21,w22 // a^b, b^c in next round
- eor w16,w16,w25,ror#25 // Sigma1(e)
- eor w10,w10,w21,ror#13
- add w20,w20,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w8,w8,w1,ror#19
- eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
- add w20,w20,w16 // h+=Sigma1(e)
- eor w19,w19,w22 // Maj(a,b,c)
- eor w17,w10,w21,ror#22 // Sigma0(a)
- eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
- add w3,w3,w12
- add w24,w24,w20 // d+=h
- add w20,w20,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w3,w3,w9
- add w20,w20,w17 // h+=Sigma0(a)
- add w3,w3,w8
-.Loop_16_xx:
- ldr w8,[sp,#4]
- str w11,[sp,#0]
- ror w16,w24,#6
- add w27,w27,w19 // h+=K[i]
- ror w10,w5,#7
- and w17,w25,w24
- ror w9,w2,#17
- bic w19,w26,w24
- ror w11,w20,#2
- add w27,w27,w3 // h+=X[i]
- eor w16,w16,w24,ror#11
- eor w10,w10,w5,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w20,w21 // a^b, b^c in next round
- eor w16,w16,w24,ror#25 // Sigma1(e)
- eor w11,w11,w20,ror#13
- add w27,w27,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w9,w9,w2,ror#19
- eor w10,w10,w5,lsr#3 // sigma0(X[i+1])
- add w27,w27,w16 // h+=Sigma1(e)
- eor w28,w28,w21 // Maj(a,b,c)
- eor w17,w11,w20,ror#22 // Sigma0(a)
- eor w9,w9,w2,lsr#10 // sigma1(X[i+14])
- add w4,w4,w13
- add w23,w23,w27 // d+=h
- add w27,w27,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w4,w4,w10
- add w27,w27,w17 // h+=Sigma0(a)
- add w4,w4,w9
- ldr w9,[sp,#8]
- str w12,[sp,#4]
- ror w16,w23,#6
- add w26,w26,w28 // h+=K[i]
- ror w11,w6,#7
- and w17,w24,w23
- ror w10,w3,#17
- bic w28,w25,w23
- ror w12,w27,#2
- add w26,w26,w4 // h+=X[i]
- eor w16,w16,w23,ror#11
- eor w11,w11,w6,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w27,w20 // a^b, b^c in next round
- eor w16,w16,w23,ror#25 // Sigma1(e)
- eor w12,w12,w27,ror#13
- add w26,w26,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w10,w10,w3,ror#19
- eor w11,w11,w6,lsr#3 // sigma0(X[i+1])
- add w26,w26,w16 // h+=Sigma1(e)
- eor w19,w19,w20 // Maj(a,b,c)
- eor w17,w12,w27,ror#22 // Sigma0(a)
- eor w10,w10,w3,lsr#10 // sigma1(X[i+14])
- add w5,w5,w14
- add w22,w22,w26 // d+=h
- add w26,w26,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w5,w5,w11
- add w26,w26,w17 // h+=Sigma0(a)
- add w5,w5,w10
- ldr w10,[sp,#12]
- str w13,[sp,#8]
- ror w16,w22,#6
- add w25,w25,w19 // h+=K[i]
- ror w12,w7,#7
- and w17,w23,w22
- ror w11,w4,#17
- bic w19,w24,w22
- ror w13,w26,#2
- add w25,w25,w5 // h+=X[i]
- eor w16,w16,w22,ror#11
- eor w12,w12,w7,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w26,w27 // a^b, b^c in next round
- eor w16,w16,w22,ror#25 // Sigma1(e)
- eor w13,w13,w26,ror#13
- add w25,w25,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w11,w11,w4,ror#19
- eor w12,w12,w7,lsr#3 // sigma0(X[i+1])
- add w25,w25,w16 // h+=Sigma1(e)
- eor w28,w28,w27 // Maj(a,b,c)
- eor w17,w13,w26,ror#22 // Sigma0(a)
- eor w11,w11,w4,lsr#10 // sigma1(X[i+14])
- add w6,w6,w15
- add w21,w21,w25 // d+=h
- add w25,w25,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w6,w6,w12
- add w25,w25,w17 // h+=Sigma0(a)
- add w6,w6,w11
- ldr w11,[sp,#0]
- str w14,[sp,#12]
- ror w16,w21,#6
- add w24,w24,w28 // h+=K[i]
- ror w13,w8,#7
- and w17,w22,w21
- ror w12,w5,#17
- bic w28,w23,w21
- ror w14,w25,#2
- add w24,w24,w6 // h+=X[i]
- eor w16,w16,w21,ror#11
- eor w13,w13,w8,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w25,w26 // a^b, b^c in next round
- eor w16,w16,w21,ror#25 // Sigma1(e)
- eor w14,w14,w25,ror#13
- add w24,w24,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w12,w12,w5,ror#19
- eor w13,w13,w8,lsr#3 // sigma0(X[i+1])
- add w24,w24,w16 // h+=Sigma1(e)
- eor w19,w19,w26 // Maj(a,b,c)
- eor w17,w14,w25,ror#22 // Sigma0(a)
- eor w12,w12,w5,lsr#10 // sigma1(X[i+14])
- add w7,w7,w0
- add w20,w20,w24 // d+=h
- add w24,w24,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w7,w7,w13
- add w24,w24,w17 // h+=Sigma0(a)
- add w7,w7,w12
- ldr w12,[sp,#4]
- str w15,[sp,#0]
- ror w16,w20,#6
- add w23,w23,w19 // h+=K[i]
- ror w14,w9,#7
- and w17,w21,w20
- ror w13,w6,#17
- bic w19,w22,w20
- ror w15,w24,#2
- add w23,w23,w7 // h+=X[i]
- eor w16,w16,w20,ror#11
- eor w14,w14,w9,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w24,w25 // a^b, b^c in next round
- eor w16,w16,w20,ror#25 // Sigma1(e)
- eor w15,w15,w24,ror#13
- add w23,w23,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w13,w13,w6,ror#19
- eor w14,w14,w9,lsr#3 // sigma0(X[i+1])
- add w23,w23,w16 // h+=Sigma1(e)
- eor w28,w28,w25 // Maj(a,b,c)
- eor w17,w15,w24,ror#22 // Sigma0(a)
- eor w13,w13,w6,lsr#10 // sigma1(X[i+14])
- add w8,w8,w1
- add w27,w27,w23 // d+=h
- add w23,w23,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w8,w8,w14
- add w23,w23,w17 // h+=Sigma0(a)
- add w8,w8,w13
- ldr w13,[sp,#8]
- str w0,[sp,#4]
- ror w16,w27,#6
- add w22,w22,w28 // h+=K[i]
- ror w15,w10,#7
- and w17,w20,w27
- ror w14,w7,#17
- bic w28,w21,w27
- ror w0,w23,#2
- add w22,w22,w8 // h+=X[i]
- eor w16,w16,w27,ror#11
- eor w15,w15,w10,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w23,w24 // a^b, b^c in next round
- eor w16,w16,w27,ror#25 // Sigma1(e)
- eor w0,w0,w23,ror#13
- add w22,w22,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w14,w14,w7,ror#19
- eor w15,w15,w10,lsr#3 // sigma0(X[i+1])
- add w22,w22,w16 // h+=Sigma1(e)
- eor w19,w19,w24 // Maj(a,b,c)
- eor w17,w0,w23,ror#22 // Sigma0(a)
- eor w14,w14,w7,lsr#10 // sigma1(X[i+14])
- add w9,w9,w2
- add w26,w26,w22 // d+=h
- add w22,w22,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w9,w9,w15
- add w22,w22,w17 // h+=Sigma0(a)
- add w9,w9,w14
- ldr w14,[sp,#12]
- str w1,[sp,#8]
- ror w16,w26,#6
- add w21,w21,w19 // h+=K[i]
- ror w0,w11,#7
- and w17,w27,w26
- ror w15,w8,#17
- bic w19,w20,w26
- ror w1,w22,#2
- add w21,w21,w9 // h+=X[i]
- eor w16,w16,w26,ror#11
- eor w0,w0,w11,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w22,w23 // a^b, b^c in next round
- eor w16,w16,w26,ror#25 // Sigma1(e)
- eor w1,w1,w22,ror#13
- add w21,w21,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w15,w15,w8,ror#19
- eor w0,w0,w11,lsr#3 // sigma0(X[i+1])
- add w21,w21,w16 // h+=Sigma1(e)
- eor w28,w28,w23 // Maj(a,b,c)
- eor w17,w1,w22,ror#22 // Sigma0(a)
- eor w15,w15,w8,lsr#10 // sigma1(X[i+14])
- add w10,w10,w3
- add w25,w25,w21 // d+=h
- add w21,w21,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w10,w10,w0
- add w21,w21,w17 // h+=Sigma0(a)
- add w10,w10,w15
- ldr w15,[sp,#0]
- str w2,[sp,#12]
- ror w16,w25,#6
- add w20,w20,w28 // h+=K[i]
- ror w1,w12,#7
- and w17,w26,w25
- ror w0,w9,#17
- bic w28,w27,w25
- ror w2,w21,#2
- add w20,w20,w10 // h+=X[i]
- eor w16,w16,w25,ror#11
- eor w1,w1,w12,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w21,w22 // a^b, b^c in next round
- eor w16,w16,w25,ror#25 // Sigma1(e)
- eor w2,w2,w21,ror#13
- add w20,w20,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w0,w0,w9,ror#19
- eor w1,w1,w12,lsr#3 // sigma0(X[i+1])
- add w20,w20,w16 // h+=Sigma1(e)
- eor w19,w19,w22 // Maj(a,b,c)
- eor w17,w2,w21,ror#22 // Sigma0(a)
- eor w0,w0,w9,lsr#10 // sigma1(X[i+14])
- add w11,w11,w4
- add w24,w24,w20 // d+=h
- add w20,w20,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w11,w11,w1
- add w20,w20,w17 // h+=Sigma0(a)
- add w11,w11,w0
- ldr w0,[sp,#4]
- str w3,[sp,#0]
- ror w16,w24,#6
- add w27,w27,w19 // h+=K[i]
- ror w2,w13,#7
- and w17,w25,w24
- ror w1,w10,#17
- bic w19,w26,w24
- ror w3,w20,#2
- add w27,w27,w11 // h+=X[i]
- eor w16,w16,w24,ror#11
- eor w2,w2,w13,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w20,w21 // a^b, b^c in next round
- eor w16,w16,w24,ror#25 // Sigma1(e)
- eor w3,w3,w20,ror#13
- add w27,w27,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w1,w1,w10,ror#19
- eor w2,w2,w13,lsr#3 // sigma0(X[i+1])
- add w27,w27,w16 // h+=Sigma1(e)
- eor w28,w28,w21 // Maj(a,b,c)
- eor w17,w3,w20,ror#22 // Sigma0(a)
- eor w1,w1,w10,lsr#10 // sigma1(X[i+14])
- add w12,w12,w5
- add w23,w23,w27 // d+=h
- add w27,w27,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w12,w12,w2
- add w27,w27,w17 // h+=Sigma0(a)
- add w12,w12,w1
- ldr w1,[sp,#8]
- str w4,[sp,#4]
- ror w16,w23,#6
- add w26,w26,w28 // h+=K[i]
- ror w3,w14,#7
- and w17,w24,w23
- ror w2,w11,#17
- bic w28,w25,w23
- ror w4,w27,#2
- add w26,w26,w12 // h+=X[i]
- eor w16,w16,w23,ror#11
- eor w3,w3,w14,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w27,w20 // a^b, b^c in next round
- eor w16,w16,w23,ror#25 // Sigma1(e)
- eor w4,w4,w27,ror#13
- add w26,w26,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w2,w2,w11,ror#19
- eor w3,w3,w14,lsr#3 // sigma0(X[i+1])
- add w26,w26,w16 // h+=Sigma1(e)
- eor w19,w19,w20 // Maj(a,b,c)
- eor w17,w4,w27,ror#22 // Sigma0(a)
- eor w2,w2,w11,lsr#10 // sigma1(X[i+14])
- add w13,w13,w6
- add w22,w22,w26 // d+=h
- add w26,w26,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w13,w13,w3
- add w26,w26,w17 // h+=Sigma0(a)
- add w13,w13,w2
- ldr w2,[sp,#12]
- str w5,[sp,#8]
- ror w16,w22,#6
- add w25,w25,w19 // h+=K[i]
- ror w4,w15,#7
- and w17,w23,w22
- ror w3,w12,#17
- bic w19,w24,w22
- ror w5,w26,#2
- add w25,w25,w13 // h+=X[i]
- eor w16,w16,w22,ror#11
- eor w4,w4,w15,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w26,w27 // a^b, b^c in next round
- eor w16,w16,w22,ror#25 // Sigma1(e)
- eor w5,w5,w26,ror#13
- add w25,w25,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w3,w3,w12,ror#19
- eor w4,w4,w15,lsr#3 // sigma0(X[i+1])
- add w25,w25,w16 // h+=Sigma1(e)
- eor w28,w28,w27 // Maj(a,b,c)
- eor w17,w5,w26,ror#22 // Sigma0(a)
- eor w3,w3,w12,lsr#10 // sigma1(X[i+14])
- add w14,w14,w7
- add w21,w21,w25 // d+=h
- add w25,w25,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w14,w14,w4
- add w25,w25,w17 // h+=Sigma0(a)
- add w14,w14,w3
- ldr w3,[sp,#0]
- str w6,[sp,#12]
- ror w16,w21,#6
- add w24,w24,w28 // h+=K[i]
- ror w5,w0,#7
- and w17,w22,w21
- ror w4,w13,#17
- bic w28,w23,w21
- ror w6,w25,#2
- add w24,w24,w14 // h+=X[i]
- eor w16,w16,w21,ror#11
- eor w5,w5,w0,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w25,w26 // a^b, b^c in next round
- eor w16,w16,w21,ror#25 // Sigma1(e)
- eor w6,w6,w25,ror#13
- add w24,w24,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w4,w4,w13,ror#19
- eor w5,w5,w0,lsr#3 // sigma0(X[i+1])
- add w24,w24,w16 // h+=Sigma1(e)
- eor w19,w19,w26 // Maj(a,b,c)
- eor w17,w6,w25,ror#22 // Sigma0(a)
- eor w4,w4,w13,lsr#10 // sigma1(X[i+14])
- add w15,w15,w8
- add w20,w20,w24 // d+=h
- add w24,w24,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w15,w15,w5
- add w24,w24,w17 // h+=Sigma0(a)
- add w15,w15,w4
- ldr w4,[sp,#4]
- str w7,[sp,#0]
- ror w16,w20,#6
- add w23,w23,w19 // h+=K[i]
- ror w6,w1,#7
- and w17,w21,w20
- ror w5,w14,#17
- bic w19,w22,w20
- ror w7,w24,#2
- add w23,w23,w15 // h+=X[i]
- eor w16,w16,w20,ror#11
- eor w6,w6,w1,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w24,w25 // a^b, b^c in next round
- eor w16,w16,w20,ror#25 // Sigma1(e)
- eor w7,w7,w24,ror#13
- add w23,w23,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w5,w5,w14,ror#19
- eor w6,w6,w1,lsr#3 // sigma0(X[i+1])
- add w23,w23,w16 // h+=Sigma1(e)
- eor w28,w28,w25 // Maj(a,b,c)
- eor w17,w7,w24,ror#22 // Sigma0(a)
- eor w5,w5,w14,lsr#10 // sigma1(X[i+14])
- add w0,w0,w9
- add w27,w27,w23 // d+=h
- add w23,w23,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w0,w0,w6
- add w23,w23,w17 // h+=Sigma0(a)
- add w0,w0,w5
- ldr w5,[sp,#8]
- str w8,[sp,#4]
- ror w16,w27,#6
- add w22,w22,w28 // h+=K[i]
- ror w7,w2,#7
- and w17,w20,w27
- ror w6,w15,#17
- bic w28,w21,w27
- ror w8,w23,#2
- add w22,w22,w0 // h+=X[i]
- eor w16,w16,w27,ror#11
- eor w7,w7,w2,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w23,w24 // a^b, b^c in next round
- eor w16,w16,w27,ror#25 // Sigma1(e)
- eor w8,w8,w23,ror#13
- add w22,w22,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w6,w6,w15,ror#19
- eor w7,w7,w2,lsr#3 // sigma0(X[i+1])
- add w22,w22,w16 // h+=Sigma1(e)
- eor w19,w19,w24 // Maj(a,b,c)
- eor w17,w8,w23,ror#22 // Sigma0(a)
- eor w6,w6,w15,lsr#10 // sigma1(X[i+14])
- add w1,w1,w10
- add w26,w26,w22 // d+=h
- add w22,w22,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w1,w1,w7
- add w22,w22,w17 // h+=Sigma0(a)
- add w1,w1,w6
- ldr w6,[sp,#12]
- str w9,[sp,#8]
- ror w16,w26,#6
- add w21,w21,w19 // h+=K[i]
- ror w8,w3,#7
- and w17,w27,w26
- ror w7,w0,#17
- bic w19,w20,w26
- ror w9,w22,#2
- add w21,w21,w1 // h+=X[i]
- eor w16,w16,w26,ror#11
- eor w8,w8,w3,ror#18
- orr w17,w17,w19 // Ch(e,f,g)
- eor w19,w22,w23 // a^b, b^c in next round
- eor w16,w16,w26,ror#25 // Sigma1(e)
- eor w9,w9,w22,ror#13
- add w21,w21,w17 // h+=Ch(e,f,g)
- and w28,w28,w19 // (b^c)&=(a^b)
- eor w7,w7,w0,ror#19
- eor w8,w8,w3,lsr#3 // sigma0(X[i+1])
- add w21,w21,w16 // h+=Sigma1(e)
- eor w28,w28,w23 // Maj(a,b,c)
- eor w17,w9,w22,ror#22 // Sigma0(a)
- eor w7,w7,w0,lsr#10 // sigma1(X[i+14])
- add w2,w2,w11
- add w25,w25,w21 // d+=h
- add w21,w21,w28 // h+=Maj(a,b,c)
- ldr w28,[x30],#4 // *K++, w19 in next round
- add w2,w2,w8
- add w21,w21,w17 // h+=Sigma0(a)
- add w2,w2,w7
- ldr w7,[sp,#0]
- str w10,[sp,#12]
- ror w16,w25,#6
- add w20,w20,w28 // h+=K[i]
- ror w9,w4,#7
- and w17,w26,w25
- ror w8,w1,#17
- bic w28,w27,w25
- ror w10,w21,#2
- add w20,w20,w2 // h+=X[i]
- eor w16,w16,w25,ror#11
- eor w9,w9,w4,ror#18
- orr w17,w17,w28 // Ch(e,f,g)
- eor w28,w21,w22 // a^b, b^c in next round
- eor w16,w16,w25,ror#25 // Sigma1(e)
- eor w10,w10,w21,ror#13
- add w20,w20,w17 // h+=Ch(e,f,g)
- and w19,w19,w28 // (b^c)&=(a^b)
- eor w8,w8,w1,ror#19
- eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
- add w20,w20,w16 // h+=Sigma1(e)
- eor w19,w19,w22 // Maj(a,b,c)
- eor w17,w10,w21,ror#22 // Sigma0(a)
- eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
- add w3,w3,w12
- add w24,w24,w20 // d+=h
- add w20,w20,w19 // h+=Maj(a,b,c)
- ldr w19,[x30],#4 // *K++, w28 in next round
- add w3,w3,w9
- add w20,w20,w17 // h+=Sigma0(a)
- add w3,w3,w8
- cbnz w19,.Loop_16_xx
-
- ldp x0,x2,[x29,#96]
- ldr x1,[x29,#112]
- sub x30,x30,#260 // rewind
-
- ldp w3,w4,[x0]
- ldp w5,w6,[x0,#2*4]
- add x1,x1,#14*4 // advance input pointer
- ldp w7,w8,[x0,#4*4]
- add w20,w20,w3
- ldp w9,w10,[x0,#6*4]
- add w21,w21,w4
- add w22,w22,w5
- add w23,w23,w6
- stp w20,w21,[x0]
- add w24,w24,w7
- add w25,w25,w8
- stp w22,w23,[x0,#2*4]
- add w26,w26,w9
- add w27,w27,w10
- cmp x1,x2
- stp w24,w25,[x0,#4*4]
- stp w26,w27,[x0,#6*4]
- b.ne .Loop
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#4*4
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#128
- ret
-.size sha256_block_data_order,.-sha256_block_data_order
-
-.align 6
-.type .LK256,%object
-.LK256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
- .long 0 //terminator
-.size .LK256,.-.LK256
-#ifndef __KERNEL__
-.align 3
-.LOPENSSL_armcap_P:
-# ifdef __ILP32__
- .long OPENSSL_armcap_P-.
-# else
- .quad OPENSSL_armcap_P-.
-# endif
-#endif
-.asciz "SHA256 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
-.align 2
-#ifndef __KERNEL__
-.type sha256_block_armv8,%function
-.align 6
-sha256_block_armv8:
-.Lv8_entry:
- stp x29,x30,[sp,#-16]!
- add x29,sp,#0
-
- ld1 {v0.4s,v1.4s},[x0]
- adr x3,.LK256
-
-.Loop_hw:
- ld1 {v4.16b-v7.16b},[x1],#64
- sub x2,x2,#1
- ld1 {v16.4s},[x3],#16
- rev32 v4.16b,v4.16b
- rev32 v5.16b,v5.16b
- rev32 v6.16b,v6.16b
- rev32 v7.16b,v7.16b
- orr v18.16b,v0.16b,v0.16b // offload
- orr v19.16b,v1.16b,v1.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v4.4s
- .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
- .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
- .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v5.4s
- .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
- .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
- .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v6.4s
- .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
- .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
- .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v7.4s
- .inst 0x5e282887 //sha256su0 v7.16b,v4.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
- .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
- .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v4.4s
- .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
- .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
- .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v5.4s
- .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
- .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
- .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v6.4s
- .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
- .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
- .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v7.4s
- .inst 0x5e282887 //sha256su0 v7.16b,v4.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
- .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
- .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v4.4s
- .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
- .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
- .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v5.4s
- .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
- .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
- .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v6.4s
- .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
- .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
- .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v7.4s
- .inst 0x5e282887 //sha256su0 v7.16b,v4.16b
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
- .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
- .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
- ld1 {v17.4s},[x3],#16
- add v16.4s,v16.4s,v4.4s
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
- .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
-
- ld1 {v16.4s},[x3],#16
- add v17.4s,v17.4s,v5.4s
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
- .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
-
- ld1 {v17.4s},[x3]
- add v16.4s,v16.4s,v6.4s
- sub x3,x3,#64*4-16 // rewind
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
- .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
-
- add v17.4s,v17.4s,v7.4s
- orr v2.16b,v0.16b,v0.16b
- .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
- .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
-
- add v0.4s,v0.4s,v18.4s
- add v1.4s,v1.4s,v19.4s
-
- cbnz x2,.Loop_hw
-
- st1 {v0.4s,v1.4s},[x0]
-
- ldr x29,[sp],#16
- ret
-.size sha256_block_armv8,.-sha256_block_armv8
-#endif
-#ifdef __KERNEL__
-.globl sha256_block_neon
-#endif
-.type sha256_block_neon,%function
-.align 4
-sha256_block_neon:
-.Lneon_entry:
- stp x29, x30, [sp, #-16]!
- mov x29, sp
- sub sp,sp,#16*4
-
- adr x16,.LK256
- add x2,x1,x2,lsl#6 // len to point at the end of inp
-
- ld1 {v0.16b},[x1], #16
- ld1 {v1.16b},[x1], #16
- ld1 {v2.16b},[x1], #16
- ld1 {v3.16b},[x1], #16
- ld1 {v4.4s},[x16], #16
- ld1 {v5.4s},[x16], #16
- ld1 {v6.4s},[x16], #16
- ld1 {v7.4s},[x16], #16
- rev32 v0.16b,v0.16b // yes, even on
- rev32 v1.16b,v1.16b // big-endian
- rev32 v2.16b,v2.16b
- rev32 v3.16b,v3.16b
- mov x17,sp
- add v4.4s,v4.4s,v0.4s
- add v5.4s,v5.4s,v1.4s
- add v6.4s,v6.4s,v2.4s
- st1 {v4.4s-v5.4s},[x17], #32
- add v7.4s,v7.4s,v3.4s
- st1 {v6.4s-v7.4s},[x17]
- sub x17,x17,#32
-
- ldp w3,w4,[x0]
- ldp w5,w6,[x0,#8]
- ldp w7,w8,[x0,#16]
- ldp w9,w10,[x0,#24]
- ldr w12,[sp,#0]
- mov w13,wzr
- eor w14,w4,w5
- mov w15,wzr
- b .L_00_48
-
-.align 4
-.L_00_48:
- ext v4.16b,v0.16b,v1.16b,#4
- add w10,w10,w12
- add w3,w3,w15
- and w12,w8,w7
- bic w15,w9,w7
- ext v7.16b,v2.16b,v3.16b,#4
- eor w11,w7,w7,ror#5
- add w3,w3,w13
- mov d19,v3.d[1]
- orr w12,w12,w15
- eor w11,w11,w7,ror#19
- ushr v6.4s,v4.4s,#7
- eor w15,w3,w3,ror#11
- ushr v5.4s,v4.4s,#3
- add w10,w10,w12
- add v0.4s,v0.4s,v7.4s
- ror w11,w11,#6
- sli v6.4s,v4.4s,#25
- eor w13,w3,w4
- eor w15,w15,w3,ror#20
- ushr v7.4s,v4.4s,#18
- add w10,w10,w11
- ldr w12,[sp,#4]
- and w14,w14,w13
- eor v5.16b,v5.16b,v6.16b
- ror w15,w15,#2
- add w6,w6,w10
- sli v7.4s,v4.4s,#14
- eor w14,w14,w4
- ushr v16.4s,v19.4s,#17
- add w9,w9,w12
- add w10,w10,w15
- and w12,w7,w6
- eor v5.16b,v5.16b,v7.16b
- bic w15,w8,w6
- eor w11,w6,w6,ror#5
- sli v16.4s,v19.4s,#15
- add w10,w10,w14
- orr w12,w12,w15
- ushr v17.4s,v19.4s,#10
- eor w11,w11,w6,ror#19
- eor w15,w10,w10,ror#11
- ushr v7.4s,v19.4s,#19
- add w9,w9,w12
- ror w11,w11,#6
- add v0.4s,v0.4s,v5.4s
- eor w14,w10,w3
- eor w15,w15,w10,ror#20
- sli v7.4s,v19.4s,#13
- add w9,w9,w11
- ldr w12,[sp,#8]
- and w13,w13,w14
- eor v17.16b,v17.16b,v16.16b
- ror w15,w15,#2
- add w5,w5,w9
- eor w13,w13,w3
- eor v17.16b,v17.16b,v7.16b
- add w8,w8,w12
- add w9,w9,w15
- and w12,w6,w5
- add v0.4s,v0.4s,v17.4s
- bic w15,w7,w5
- eor w11,w5,w5,ror#5
- add w9,w9,w13
- ushr v18.4s,v0.4s,#17
- orr w12,w12,w15
- ushr v19.4s,v0.4s,#10
- eor w11,w11,w5,ror#19
- eor w15,w9,w9,ror#11
- sli v18.4s,v0.4s,#15
- add w8,w8,w12
- ushr v17.4s,v0.4s,#19
- ror w11,w11,#6
- eor w13,w9,w10
- eor v19.16b,v19.16b,v18.16b
- eor w15,w15,w9,ror#20
- add w8,w8,w11
- sli v17.4s,v0.4s,#13
- ldr w12,[sp,#12]
- and w14,w14,w13
- ror w15,w15,#2
- ld1 {v4.4s},[x16], #16
- add w4,w4,w8
- eor v19.16b,v19.16b,v17.16b
- eor w14,w14,w10
- eor v17.16b,v17.16b,v17.16b
- add w7,w7,w12
- add w8,w8,w15
- and w12,w5,w4
- mov v17.d[1],v19.d[0]
- bic w15,w6,w4
- eor w11,w4,w4,ror#5
- add w8,w8,w14
- add v0.4s,v0.4s,v17.4s
- orr w12,w12,w15
- eor w11,w11,w4,ror#19
- eor w15,w8,w8,ror#11
- add v4.4s,v4.4s,v0.4s
- add w7,w7,w12
- ror w11,w11,#6
- eor w14,w8,w9
- eor w15,w15,w8,ror#20
- add w7,w7,w11
- ldr w12,[sp,#16]
- and w13,w13,w14
- ror w15,w15,#2
- add w3,w3,w7
- eor w13,w13,w9
- st1 {v4.4s},[x17], #16
- ext v4.16b,v1.16b,v2.16b,#4
- add w6,w6,w12
- add w7,w7,w15
- and w12,w4,w3
- bic w15,w5,w3
- ext v7.16b,v3.16b,v0.16b,#4
- eor w11,w3,w3,ror#5
- add w7,w7,w13
- mov d19,v0.d[1]
- orr w12,w12,w15
- eor w11,w11,w3,ror#19
- ushr v6.4s,v4.4s,#7
- eor w15,w7,w7,ror#11
- ushr v5.4s,v4.4s,#3
- add w6,w6,w12
- add v1.4s,v1.4s,v7.4s
- ror w11,w11,#6
- sli v6.4s,v4.4s,#25
- eor w13,w7,w8
- eor w15,w15,w7,ror#20
- ushr v7.4s,v4.4s,#18
- add w6,w6,w11
- ldr w12,[sp,#20]
- and w14,w14,w13
- eor v5.16b,v5.16b,v6.16b
- ror w15,w15,#2
- add w10,w10,w6
- sli v7.4s,v4.4s,#14
- eor w14,w14,w8
- ushr v16.4s,v19.4s,#17
- add w5,w5,w12
- add w6,w6,w15
- and w12,w3,w10
- eor v5.16b,v5.16b,v7.16b
- bic w15,w4,w10
- eor w11,w10,w10,ror#5
- sli v16.4s,v19.4s,#15
- add w6,w6,w14
- orr w12,w12,w15
- ushr v17.4s,v19.4s,#10
- eor w11,w11,w10,ror#19
- eor w15,w6,w6,ror#11
- ushr v7.4s,v19.4s,#19
- add w5,w5,w12
- ror w11,w11,#6
- add v1.4s,v1.4s,v5.4s
- eor w14,w6,w7
- eor w15,w15,w6,ror#20
- sli v7.4s,v19.4s,#13
- add w5,w5,w11
- ldr w12,[sp,#24]
- and w13,w13,w14
- eor v17.16b,v17.16b,v16.16b
- ror w15,w15,#2
- add w9,w9,w5
- eor w13,w13,w7
- eor v17.16b,v17.16b,v7.16b
- add w4,w4,w12
- add w5,w5,w15
- and w12,w10,w9
- add v1.4s,v1.4s,v17.4s
- bic w15,w3,w9
- eor w11,w9,w9,ror#5
- add w5,w5,w13
- ushr v18.4s,v1.4s,#17
- orr w12,w12,w15
- ushr v19.4s,v1.4s,#10
- eor w11,w11,w9,ror#19
- eor w15,w5,w5,ror#11
- sli v18.4s,v1.4s,#15
- add w4,w4,w12
- ushr v17.4s,v1.4s,#19
- ror w11,w11,#6
- eor w13,w5,w6
- eor v19.16b,v19.16b,v18.16b
- eor w15,w15,w5,ror#20
- add w4,w4,w11
- sli v17.4s,v1.4s,#13
- ldr w12,[sp,#28]
- and w14,w14,w13
- ror w15,w15,#2
- ld1 {v4.4s},[x16], #16
- add w8,w8,w4
- eor v19.16b,v19.16b,v17.16b
- eor w14,w14,w6
- eor v17.16b,v17.16b,v17.16b
- add w3,w3,w12
- add w4,w4,w15
- and w12,w9,w8
- mov v17.d[1],v19.d[0]
- bic w15,w10,w8
- eor w11,w8,w8,ror#5
- add w4,w4,w14
- add v1.4s,v1.4s,v17.4s
- orr w12,w12,w15
- eor w11,w11,w8,ror#19
- eor w15,w4,w4,ror#11
- add v4.4s,v4.4s,v1.4s
- add w3,w3,w12
- ror w11,w11,#6
- eor w14,w4,w5
- eor w15,w15,w4,ror#20
- add w3,w3,w11
- ldr w12,[sp,#32]
- and w13,w13,w14
- ror w15,w15,#2
- add w7,w7,w3
- eor w13,w13,w5
- st1 {v4.4s},[x17], #16
- ext v4.16b,v2.16b,v3.16b,#4
- add w10,w10,w12
- add w3,w3,w15
- and w12,w8,w7
- bic w15,w9,w7
- ext v7.16b,v0.16b,v1.16b,#4
- eor w11,w7,w7,ror#5
- add w3,w3,w13
- mov d19,v1.d[1]
- orr w12,w12,w15
- eor w11,w11,w7,ror#19
- ushr v6.4s,v4.4s,#7
- eor w15,w3,w3,ror#11
- ushr v5.4s,v4.4s,#3
- add w10,w10,w12
- add v2.4s,v2.4s,v7.4s
- ror w11,w11,#6
- sli v6.4s,v4.4s,#25
- eor w13,w3,w4
- eor w15,w15,w3,ror#20
- ushr v7.4s,v4.4s,#18
- add w10,w10,w11
- ldr w12,[sp,#36]
- and w14,w14,w13
- eor v5.16b,v5.16b,v6.16b
- ror w15,w15,#2
- add w6,w6,w10
- sli v7.4s,v4.4s,#14
- eor w14,w14,w4
- ushr v16.4s,v19.4s,#17
- add w9,w9,w12
- add w10,w10,w15
- and w12,w7,w6
- eor v5.16b,v5.16b,v7.16b
- bic w15,w8,w6
- eor w11,w6,w6,ror#5
- sli v16.4s,v19.4s,#15
- add w10,w10,w14
- orr w12,w12,w15
- ushr v17.4s,v19.4s,#10
- eor w11,w11,w6,ror#19
- eor w15,w10,w10,ror#11
- ushr v7.4s,v19.4s,#19
- add w9,w9,w12
- ror w11,w11,#6
- add v2.4s,v2.4s,v5.4s
- eor w14,w10,w3
- eor w15,w15,w10,ror#20
- sli v7.4s,v19.4s,#13
- add w9,w9,w11
- ldr w12,[sp,#40]
- and w13,w13,w14
- eor v17.16b,v17.16b,v16.16b
- ror w15,w15,#2
- add w5,w5,w9
- eor w13,w13,w3
- eor v17.16b,v17.16b,v7.16b
- add w8,w8,w12
- add w9,w9,w15
- and w12,w6,w5
- add v2.4s,v2.4s,v17.4s
- bic w15,w7,w5
- eor w11,w5,w5,ror#5
- add w9,w9,w13
- ushr v18.4s,v2.4s,#17
- orr w12,w12,w15
- ushr v19.4s,v2.4s,#10
- eor w11,w11,w5,ror#19
- eor w15,w9,w9,ror#11
- sli v18.4s,v2.4s,#15
- add w8,w8,w12
- ushr v17.4s,v2.4s,#19
- ror w11,w11,#6
- eor w13,w9,w10
- eor v19.16b,v19.16b,v18.16b
- eor w15,w15,w9,ror#20
- add w8,w8,w11
- sli v17.4s,v2.4s,#13
- ldr w12,[sp,#44]
- and w14,w14,w13
- ror w15,w15,#2
- ld1 {v4.4s},[x16], #16
- add w4,w4,w8
- eor v19.16b,v19.16b,v17.16b
- eor w14,w14,w10
- eor v17.16b,v17.16b,v17.16b
- add w7,w7,w12
- add w8,w8,w15
- and w12,w5,w4
- mov v17.d[1],v19.d[0]
- bic w15,w6,w4
- eor w11,w4,w4,ror#5
- add w8,w8,w14
- add v2.4s,v2.4s,v17.4s
- orr w12,w12,w15
- eor w11,w11,w4,ror#19
- eor w15,w8,w8,ror#11
- add v4.4s,v4.4s,v2.4s
- add w7,w7,w12
- ror w11,w11,#6
- eor w14,w8,w9
- eor w15,w15,w8,ror#20
- add w7,w7,w11
- ldr w12,[sp,#48]
- and w13,w13,w14
- ror w15,w15,#2
- add w3,w3,w7
- eor w13,w13,w9
- st1 {v4.4s},[x17], #16
- ext v4.16b,v3.16b,v0.16b,#4
- add w6,w6,w12
- add w7,w7,w15
- and w12,w4,w3
- bic w15,w5,w3
- ext v7.16b,v1.16b,v2.16b,#4
- eor w11,w3,w3,ror#5
- add w7,w7,w13
- mov d19,v2.d[1]
- orr w12,w12,w15
- eor w11,w11,w3,ror#19
- ushr v6.4s,v4.4s,#7
- eor w15,w7,w7,ror#11
- ushr v5.4s,v4.4s,#3
- add w6,w6,w12
- add v3.4s,v3.4s,v7.4s
- ror w11,w11,#6
- sli v6.4s,v4.4s,#25
- eor w13,w7,w8
- eor w15,w15,w7,ror#20
- ushr v7.4s,v4.4s,#18
- add w6,w6,w11
- ldr w12,[sp,#52]
- and w14,w14,w13
- eor v5.16b,v5.16b,v6.16b
- ror w15,w15,#2
- add w10,w10,w6
- sli v7.4s,v4.4s,#14
- eor w14,w14,w8
- ushr v16.4s,v19.4s,#17
- add w5,w5,w12
- add w6,w6,w15
- and w12,w3,w10
- eor v5.16b,v5.16b,v7.16b
- bic w15,w4,w10
- eor w11,w10,w10,ror#5
- sli v16.4s,v19.4s,#15
- add w6,w6,w14
- orr w12,w12,w15
- ushr v17.4s,v19.4s,#10
- eor w11,w11,w10,ror#19
- eor w15,w6,w6,ror#11
- ushr v7.4s,v19.4s,#19
- add w5,w5,w12
- ror w11,w11,#6
- add v3.4s,v3.4s,v5.4s
- eor w14,w6,w7
- eor w15,w15,w6,ror#20
- sli v7.4s,v19.4s,#13
- add w5,w5,w11
- ldr w12,[sp,#56]
- and w13,w13,w14
- eor v17.16b,v17.16b,v16.16b
- ror w15,w15,#2
- add w9,w9,w5
- eor w13,w13,w7
- eor v17.16b,v17.16b,v7.16b
- add w4,w4,w12
- add w5,w5,w15
- and w12,w10,w9
- add v3.4s,v3.4s,v17.4s
- bic w15,w3,w9
- eor w11,w9,w9,ror#5
- add w5,w5,w13
- ushr v18.4s,v3.4s,#17
- orr w12,w12,w15
- ushr v19.4s,v3.4s,#10
- eor w11,w11,w9,ror#19
- eor w15,w5,w5,ror#11
- sli v18.4s,v3.4s,#15
- add w4,w4,w12
- ushr v17.4s,v3.4s,#19
- ror w11,w11,#6
- eor w13,w5,w6
- eor v19.16b,v19.16b,v18.16b
- eor w15,w15,w5,ror#20
- add w4,w4,w11
- sli v17.4s,v3.4s,#13
- ldr w12,[sp,#60]
- and w14,w14,w13
- ror w15,w15,#2
- ld1 {v4.4s},[x16], #16
- add w8,w8,w4
- eor v19.16b,v19.16b,v17.16b
- eor w14,w14,w6
- eor v17.16b,v17.16b,v17.16b
- add w3,w3,w12
- add w4,w4,w15
- and w12,w9,w8
- mov v17.d[1],v19.d[0]
- bic w15,w10,w8
- eor w11,w8,w8,ror#5
- add w4,w4,w14
- add v3.4s,v3.4s,v17.4s
- orr w12,w12,w15
- eor w11,w11,w8,ror#19
- eor w15,w4,w4,ror#11
- add v4.4s,v4.4s,v3.4s
- add w3,w3,w12
- ror w11,w11,#6
- eor w14,w4,w5
- eor w15,w15,w4,ror#20
- add w3,w3,w11
- ldr w12,[x16]
- and w13,w13,w14
- ror w15,w15,#2
- add w7,w7,w3
- eor w13,w13,w5
- st1 {v4.4s},[x17], #16
- cmp w12,#0 // check for K256 terminator
- ldr w12,[sp,#0]
- sub x17,x17,#64
- bne .L_00_48
-
- sub x16,x16,#256 // rewind x16
- cmp x1,x2
- mov x17, #64
- csel x17, x17, xzr, eq
- sub x1,x1,x17 // avoid SEGV
- mov x17,sp
- add w10,w10,w12
- add w3,w3,w15
- and w12,w8,w7
- ld1 {v0.16b},[x1],#16
- bic w15,w9,w7
- eor w11,w7,w7,ror#5
- ld1 {v4.4s},[x16],#16
- add w3,w3,w13
- orr w12,w12,w15
- eor w11,w11,w7,ror#19
- eor w15,w3,w3,ror#11
- rev32 v0.16b,v0.16b
- add w10,w10,w12
- ror w11,w11,#6
- eor w13,w3,w4
- eor w15,w15,w3,ror#20
- add v4.4s,v4.4s,v0.4s
- add w10,w10,w11
- ldr w12,[sp,#4]
- and w14,w14,w13
- ror w15,w15,#2
- add w6,w6,w10
- eor w14,w14,w4
- add w9,w9,w12
- add w10,w10,w15
- and w12,w7,w6
- bic w15,w8,w6
- eor w11,w6,w6,ror#5
- add w10,w10,w14
- orr w12,w12,w15
- eor w11,w11,w6,ror#19
- eor w15,w10,w10,ror#11
- add w9,w9,w12
- ror w11,w11,#6
- eor w14,w10,w3
- eor w15,w15,w10,ror#20
- add w9,w9,w11
- ldr w12,[sp,#8]
- and w13,w13,w14
- ror w15,w15,#2
- add w5,w5,w9
- eor w13,w13,w3
- add w8,w8,w12
- add w9,w9,w15
- and w12,w6,w5
- bic w15,w7,w5
- eor w11,w5,w5,ror#5
- add w9,w9,w13
- orr w12,w12,w15
- eor w11,w11,w5,ror#19
- eor w15,w9,w9,ror#11
- add w8,w8,w12
- ror w11,w11,#6
- eor w13,w9,w10
- eor w15,w15,w9,ror#20
- add w8,w8,w11
- ldr w12,[sp,#12]
- and w14,w14,w13
- ror w15,w15,#2
- add w4,w4,w8
- eor w14,w14,w10
- add w7,w7,w12
- add w8,w8,w15
- and w12,w5,w4
- bic w15,w6,w4
- eor w11,w4,w4,ror#5
- add w8,w8,w14
- orr w12,w12,w15
- eor w11,w11,w4,ror#19
- eor w15,w8,w8,ror#11
- add w7,w7,w12
- ror w11,w11,#6
- eor w14,w8,w9
- eor w15,w15,w8,ror#20
- add w7,w7,w11
- ldr w12,[sp,#16]
- and w13,w13,w14
- ror w15,w15,#2
- add w3,w3,w7
- eor w13,w13,w9
- st1 {v4.4s},[x17], #16
- add w6,w6,w12
- add w7,w7,w15
- and w12,w4,w3
- ld1 {v1.16b},[x1],#16
- bic w15,w5,w3
- eor w11,w3,w3,ror#5
- ld1 {v4.4s},[x16],#16
- add w7,w7,w13
- orr w12,w12,w15
- eor w11,w11,w3,ror#19
- eor w15,w7,w7,ror#11
- rev32 v1.16b,v1.16b
- add w6,w6,w12
- ror w11,w11,#6
- eor w13,w7,w8
- eor w15,w15,w7,ror#20
- add v4.4s,v4.4s,v1.4s
- add w6,w6,w11
- ldr w12,[sp,#20]
- and w14,w14,w13
- ror w15,w15,#2
- add w10,w10,w6
- eor w14,w14,w8
- add w5,w5,w12
- add w6,w6,w15
- and w12,w3,w10
- bic w15,w4,w10
- eor w11,w10,w10,ror#5
- add w6,w6,w14
- orr w12,w12,w15
- eor w11,w11,w10,ror#19
- eor w15,w6,w6,ror#11
- add w5,w5,w12
- ror w11,w11,#6
- eor w14,w6,w7
- eor w15,w15,w6,ror#20
- add w5,w5,w11
- ldr w12,[sp,#24]
- and w13,w13,w14
- ror w15,w15,#2
- add w9,w9,w5
- eor w13,w13,w7
- add w4,w4,w12
- add w5,w5,w15
- and w12,w10,w9
- bic w15,w3,w9
- eor w11,w9,w9,ror#5
- add w5,w5,w13
- orr w12,w12,w15
- eor w11,w11,w9,ror#19
- eor w15,w5,w5,ror#11
- add w4,w4,w12
- ror w11,w11,#6
- eor w13,w5,w6
- eor w15,w15,w5,ror#20
- add w4,w4,w11
- ldr w12,[sp,#28]
- and w14,w14,w13
- ror w15,w15,#2
- add w8,w8,w4
- eor w14,w14,w6
- add w3,w3,w12
- add w4,w4,w15
- and w12,w9,w8
- bic w15,w10,w8
- eor w11,w8,w8,ror#5
- add w4,w4,w14
- orr w12,w12,w15
- eor w11,w11,w8,ror#19
- eor w15,w4,w4,ror#11
- add w3,w3,w12
- ror w11,w11,#6
- eor w14,w4,w5
- eor w15,w15,w4,ror#20
- add w3,w3,w11
- ldr w12,[sp,#32]
- and w13,w13,w14
- ror w15,w15,#2
- add w7,w7,w3
- eor w13,w13,w5
- st1 {v4.4s},[x17], #16
- add w10,w10,w12
- add w3,w3,w15
- and w12,w8,w7
- ld1 {v2.16b},[x1],#16
- bic w15,w9,w7
- eor w11,w7,w7,ror#5
- ld1 {v4.4s},[x16],#16
- add w3,w3,w13
- orr w12,w12,w15
- eor w11,w11,w7,ror#19
- eor w15,w3,w3,ror#11
- rev32 v2.16b,v2.16b
- add w10,w10,w12
- ror w11,w11,#6
- eor w13,w3,w4
- eor w15,w15,w3,ror#20
- add v4.4s,v4.4s,v2.4s
- add w10,w10,w11
- ldr w12,[sp,#36]
- and w14,w14,w13
- ror w15,w15,#2
- add w6,w6,w10
- eor w14,w14,w4
- add w9,w9,w12
- add w10,w10,w15
- and w12,w7,w6
- bic w15,w8,w6
- eor w11,w6,w6,ror#5
- add w10,w10,w14
- orr w12,w12,w15
- eor w11,w11,w6,ror#19
- eor w15,w10,w10,ror#11
- add w9,w9,w12
- ror w11,w11,#6
- eor w14,w10,w3
- eor w15,w15,w10,ror#20
- add w9,w9,w11
- ldr w12,[sp,#40]
- and w13,w13,w14
- ror w15,w15,#2
- add w5,w5,w9
- eor w13,w13,w3
- add w8,w8,w12
- add w9,w9,w15
- and w12,w6,w5
- bic w15,w7,w5
- eor w11,w5,w5,ror#5
- add w9,w9,w13
- orr w12,w12,w15
- eor w11,w11,w5,ror#19
- eor w15,w9,w9,ror#11
- add w8,w8,w12
- ror w11,w11,#6
- eor w13,w9,w10
- eor w15,w15,w9,ror#20
- add w8,w8,w11
- ldr w12,[sp,#44]
- and w14,w14,w13
- ror w15,w15,#2
- add w4,w4,w8
- eor w14,w14,w10
- add w7,w7,w12
- add w8,w8,w15
- and w12,w5,w4
- bic w15,w6,w4
- eor w11,w4,w4,ror#5
- add w8,w8,w14
- orr w12,w12,w15
- eor w11,w11,w4,ror#19
- eor w15,w8,w8,ror#11
- add w7,w7,w12
- ror w11,w11,#6
- eor w14,w8,w9
- eor w15,w15,w8,ror#20
- add w7,w7,w11
- ldr w12,[sp,#48]
- and w13,w13,w14
- ror w15,w15,#2
- add w3,w3,w7
- eor w13,w13,w9
- st1 {v4.4s},[x17], #16
- add w6,w6,w12
- add w7,w7,w15
- and w12,w4,w3
- ld1 {v3.16b},[x1],#16
- bic w15,w5,w3
- eor w11,w3,w3,ror#5
- ld1 {v4.4s},[x16],#16
- add w7,w7,w13
- orr w12,w12,w15
- eor w11,w11,w3,ror#19
- eor w15,w7,w7,ror#11
- rev32 v3.16b,v3.16b
- add w6,w6,w12
- ror w11,w11,#6
- eor w13,w7,w8
- eor w15,w15,w7,ror#20
- add v4.4s,v4.4s,v3.4s
- add w6,w6,w11
- ldr w12,[sp,#52]
- and w14,w14,w13
- ror w15,w15,#2
- add w10,w10,w6
- eor w14,w14,w8
- add w5,w5,w12
- add w6,w6,w15
- and w12,w3,w10
- bic w15,w4,w10
- eor w11,w10,w10,ror#5
- add w6,w6,w14
- orr w12,w12,w15
- eor w11,w11,w10,ror#19
- eor w15,w6,w6,ror#11
- add w5,w5,w12
- ror w11,w11,#6
- eor w14,w6,w7
- eor w15,w15,w6,ror#20
- add w5,w5,w11
- ldr w12,[sp,#56]
- and w13,w13,w14
- ror w15,w15,#2
- add w9,w9,w5
- eor w13,w13,w7
- add w4,w4,w12
- add w5,w5,w15
- and w12,w10,w9
- bic w15,w3,w9
- eor w11,w9,w9,ror#5
- add w5,w5,w13
- orr w12,w12,w15
- eor w11,w11,w9,ror#19
- eor w15,w5,w5,ror#11
- add w4,w4,w12
- ror w11,w11,#6
- eor w13,w5,w6
- eor w15,w15,w5,ror#20
- add w4,w4,w11
- ldr w12,[sp,#60]
- and w14,w14,w13
- ror w15,w15,#2
- add w8,w8,w4
- eor w14,w14,w6
- add w3,w3,w12
- add w4,w4,w15
- and w12,w9,w8
- bic w15,w10,w8
- eor w11,w8,w8,ror#5
- add w4,w4,w14
- orr w12,w12,w15
- eor w11,w11,w8,ror#19
- eor w15,w4,w4,ror#11
- add w3,w3,w12
- ror w11,w11,#6
- eor w14,w4,w5
- eor w15,w15,w4,ror#20
- add w3,w3,w11
- and w13,w13,w14
- ror w15,w15,#2
- add w7,w7,w3
- eor w13,w13,w5
- st1 {v4.4s},[x17], #16
- add w3,w3,w15 // h+=Sigma0(a) from the past
- ldp w11,w12,[x0,#0]
- add w3,w3,w13 // h+=Maj(a,b,c) from the past
- ldp w13,w14,[x0,#8]
- add w3,w3,w11 // accumulate
- add w4,w4,w12
- ldp w11,w12,[x0,#16]
- add w5,w5,w13
- add w6,w6,w14
- ldp w13,w14,[x0,#24]
- add w7,w7,w11
- add w8,w8,w12
- ldr w12,[sp,#0]
- stp w3,w4,[x0,#0]
- add w9,w9,w13
- mov w13,wzr
- stp w5,w6,[x0,#8]
- add w10,w10,w14
- stp w7,w8,[x0,#16]
- eor w14,w4,w5
- stp w9,w10,[x0,#24]
- mov w15,wzr
- mov x17,sp
- b.ne .L_00_48
-
- ldr x29,[x29]
- add sp,sp,#16*4+16
- ret
-.size sha256_block_neon,.-sha256_block_neon
-#ifndef __KERNEL__
-.comm OPENSSL_armcap_P,4,4
-#endif
diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c
index e273faca924f..35356987cc1e 100644
--- a/arch/arm64/crypto/sha256-glue.c
+++ b/arch/arm64/crypto/sha256-glue.c
@@ -10,11 +10,11 @@
#include <asm/simd.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <crypto/sha256_base.h>
-#include <linux/cryptohash.h>
-#include <linux/types.h>
+#include <linux/module.h>
#include <linux/string.h>
+#include <linux/types.h>
MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash for arm64");
MODULE_AUTHOR("Andy Polyakov <appro@openssl.org>");
@@ -27,24 +27,33 @@ asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
unsigned int num_blks);
EXPORT_SYMBOL(sha256_block_data_order);
+static void sha256_arm64_transform(struct sha256_state *sst, u8 const *src,
+ int blocks)
+{
+ sha256_block_data_order(sst->state, src, blocks);
+}
+
asmlinkage void sha256_block_neon(u32 *digest, const void *data,
unsigned int num_blks);
+static void sha256_neon_transform(struct sha256_state *sst, u8 const *src,
+ int blocks)
+{
+ sha256_block_neon(sst->state, src, blocks);
+}
+
static int crypto_sha256_arm64_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
- return sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_block_data_order);
+ return sha256_base_do_update(desc, data, len, sha256_arm64_transform);
}
static int crypto_sha256_arm64_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
if (len)
- sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_block_data_order);
- sha256_base_do_finalize(desc,
- (sha256_block_fn *)sha256_block_data_order);
+ sha256_base_do_update(desc, data, len, sha256_arm64_transform);
+ sha256_base_do_finalize(desc, sha256_arm64_transform);
return sha256_base_finish(desc, out);
}
@@ -87,7 +96,7 @@ static int sha256_update_neon(struct shash_desc *desc, const u8 *data,
if (!crypto_simd_usable())
return sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_block_data_order);
+ sha256_arm64_transform);
while (len > 0) {
unsigned int chunk = len;
@@ -97,14 +106,13 @@ static int sha256_update_neon(struct shash_desc *desc, const u8 *data,
* input when running on a preemptible kernel, but process the
* data block by block instead.
*/
- if (IS_ENABLED(CONFIG_PREEMPT) &&
+ if (IS_ENABLED(CONFIG_PREEMPTION) &&
chunk + sctx->count % SHA256_BLOCK_SIZE > SHA256_BLOCK_SIZE)
chunk = SHA256_BLOCK_SIZE -
sctx->count % SHA256_BLOCK_SIZE;
kernel_neon_begin();
- sha256_base_do_update(desc, data, chunk,
- (sha256_block_fn *)sha256_block_neon);
+ sha256_base_do_update(desc, data, chunk, sha256_neon_transform);
kernel_neon_end();
data += chunk;
len -= chunk;
@@ -118,15 +126,13 @@ static int sha256_finup_neon(struct shash_desc *desc, const u8 *data,
if (!crypto_simd_usable()) {
if (len)
sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_block_data_order);
- sha256_base_do_finalize(desc,
- (sha256_block_fn *)sha256_block_data_order);
+ sha256_arm64_transform);
+ sha256_base_do_finalize(desc, sha256_arm64_transform);
} else {
if (len)
sha256_update_neon(desc, data, len);
kernel_neon_begin();
- sha256_base_do_finalize(desc,
- (sha256_block_fn *)sha256_block_neon);
+ sha256_base_do_finalize(desc, sha256_neon_transform);
kernel_neon_end();
}
return sha256_base_finish(desc, out);
diff --git a/arch/arm64/crypto/sha3-ce-core.S b/arch/arm64/crypto/sha3-ce-core.S
index a7d587fa54f6..9c77313f5a60 100644
--- a/arch/arm64/crypto/sha3-ce-core.S
+++ b/arch/arm64/crypto/sha3-ce-core.S
@@ -37,20 +37,13 @@
.endm
/*
- * sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
+ * int sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
*/
.text
-ENTRY(sha3_ce_transform)
- frame_push 4
-
- mov x19, x0
- mov x20, x1
- mov x21, x2
- mov x22, x3
-
-0: /* load state */
- add x8, x19, #32
- ld1 { v0.1d- v3.1d}, [x19]
+SYM_FUNC_START(sha3_ce_transform)
+ /* load state */
+ add x8, x0, #32
+ ld1 { v0.1d- v3.1d}, [x0]
ld1 { v4.1d- v7.1d}, [x8], #32
ld1 { v8.1d-v11.1d}, [x8], #32
ld1 {v12.1d-v15.1d}, [x8], #32
@@ -58,13 +51,13 @@ ENTRY(sha3_ce_transform)
ld1 {v20.1d-v23.1d}, [x8], #32
ld1 {v24.1d}, [x8]
-1: sub w21, w21, #1
+0: sub w2, w2, #1
mov w8, #24
adr_l x9, .Lsha3_rcon
/* load input */
- ld1 {v25.8b-v28.8b}, [x20], #32
- ld1 {v29.8b-v31.8b}, [x20], #24
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ ld1 {v29.8b-v31.8b}, [x1], #24
eor v0.8b, v0.8b, v25.8b
eor v1.8b, v1.8b, v26.8b
eor v2.8b, v2.8b, v27.8b
@@ -73,10 +66,10 @@ ENTRY(sha3_ce_transform)
eor v5.8b, v5.8b, v30.8b
eor v6.8b, v6.8b, v31.8b
- tbnz x22, #6, 3f // SHA3-512
+ tbnz x3, #6, 2f // SHA3-512
- ld1 {v25.8b-v28.8b}, [x20], #32
- ld1 {v29.8b-v30.8b}, [x20], #16
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ ld1 {v29.8b-v30.8b}, [x1], #16
eor v7.8b, v7.8b, v25.8b
eor v8.8b, v8.8b, v26.8b
eor v9.8b, v9.8b, v27.8b
@@ -84,34 +77,34 @@ ENTRY(sha3_ce_transform)
eor v11.8b, v11.8b, v29.8b
eor v12.8b, v12.8b, v30.8b
- tbnz x22, #4, 2f // SHA3-384 or SHA3-224
+ tbnz x3, #4, 1f // SHA3-384 or SHA3-224
// SHA3-256
- ld1 {v25.8b-v28.8b}, [x20], #32
+ ld1 {v25.8b-v28.8b}, [x1], #32
eor v13.8b, v13.8b, v25.8b
eor v14.8b, v14.8b, v26.8b
eor v15.8b, v15.8b, v27.8b
eor v16.8b, v16.8b, v28.8b
- b 4f
+ b 3f
-2: tbz x22, #2, 4f // bit 2 cleared? SHA-384
+1: tbz x3, #2, 3f // bit 2 cleared? SHA-384
// SHA3-224
- ld1 {v25.8b-v28.8b}, [x20], #32
- ld1 {v29.8b}, [x20], #8
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ ld1 {v29.8b}, [x1], #8
eor v13.8b, v13.8b, v25.8b
eor v14.8b, v14.8b, v26.8b
eor v15.8b, v15.8b, v27.8b
eor v16.8b, v16.8b, v28.8b
eor v17.8b, v17.8b, v29.8b
- b 4f
+ b 3f
// SHA3-512
-3: ld1 {v25.8b-v26.8b}, [x20], #16
+2: ld1 {v25.8b-v26.8b}, [x1], #16
eor v7.8b, v7.8b, v25.8b
eor v8.8b, v8.8b, v26.8b
-4: sub w8, w8, #1
+3: sub w8, w8, #1
eor3 v29.16b, v4.16b, v9.16b, v14.16b
eor3 v26.16b, v1.16b, v6.16b, v11.16b
@@ -190,35 +183,21 @@ ENTRY(sha3_ce_transform)
eor v0.16b, v0.16b, v31.16b
- cbnz w8, 4b
- cbz w21, 5f
-
- if_will_cond_yield_neon
- add x8, x19, #32
- st1 { v0.1d- v3.1d}, [x19]
- st1 { v4.1d- v7.1d}, [x8], #32
- st1 { v8.1d-v11.1d}, [x8], #32
- st1 {v12.1d-v15.1d}, [x8], #32
- st1 {v16.1d-v19.1d}, [x8], #32
- st1 {v20.1d-v23.1d}, [x8], #32
- st1 {v24.1d}, [x8]
- do_cond_yield_neon
- b 0b
- endif_yield_neon
-
- b 1b
+ cbnz w8, 3b
+ cond_yield 4f, x8, x9
+ cbnz w2, 0b
/* save state */
-5: st1 { v0.1d- v3.1d}, [x19], #32
- st1 { v4.1d- v7.1d}, [x19], #32
- st1 { v8.1d-v11.1d}, [x19], #32
- st1 {v12.1d-v15.1d}, [x19], #32
- st1 {v16.1d-v19.1d}, [x19], #32
- st1 {v20.1d-v23.1d}, [x19], #32
- st1 {v24.1d}, [x19]
- frame_pop
+4: st1 { v0.1d- v3.1d}, [x0], #32
+ st1 { v4.1d- v7.1d}, [x0], #32
+ st1 { v8.1d-v11.1d}, [x0], #32
+ st1 {v12.1d-v15.1d}, [x0], #32
+ st1 {v16.1d-v19.1d}, [x0], #32
+ st1 {v20.1d-v23.1d}, [x0], #32
+ st1 {v24.1d}, [x0]
+ mov w0, w2
ret
-ENDPROC(sha3_ce_transform)
+SYM_FUNC_END(sha3_ce_transform)
.section ".rodata", "a"
.align 8
diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c
index 9a4bbfc45f40..250e1377c481 100644
--- a/arch/arm64/crypto/sha3-ce-glue.c
+++ b/arch/arm64/crypto/sha3-ce-glue.c
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
/*
* sha3-ce-glue.c - core SHA-3 transform using v8.2 Crypto Extensions
*
@@ -23,9 +23,13 @@
MODULE_DESCRIPTION("SHA3 secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("sha3-224");
+MODULE_ALIAS_CRYPTO("sha3-256");
+MODULE_ALIAS_CRYPTO("sha3-384");
+MODULE_ALIAS_CRYPTO("sha3-512");
-asmlinkage void sha3_ce_transform(u64 *st, const u8 *data, int blocks,
- int md_len);
+asmlinkage int sha3_ce_transform(u64 *st, const u8 *data, int blocks,
+ int md_len);
static int sha3_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
@@ -55,11 +59,15 @@ static int sha3_update(struct shash_desc *desc, const u8 *data,
blocks = len / sctx->rsiz;
len %= sctx->rsiz;
- if (blocks) {
+ while (blocks) {
+ int rem;
+
kernel_neon_begin();
- sha3_ce_transform(sctx->st, data, blocks, digest_size);
+ rem = sha3_ce_transform(sctx->st, data, blocks,
+ digest_size);
kernel_neon_end();
- data += blocks * sctx->rsiz;
+ data += (blocks - rem) * sctx->rsiz;
+ blocks = rem;
}
}
@@ -94,7 +102,7 @@ static int sha3_final(struct shash_desc *desc, u8 *out)
if (digest_size & 4)
put_unaligned_le32(sctx->st[i], (__le32 *)digest);
- *sctx = (struct sha3_state){};
+ memzero_explicit(sctx, sizeof(*sctx));
return 0;
}
diff --git a/arch/arm64/crypto/sha512-armv8.pl b/arch/arm64/crypto/sha512-armv8.pl
index 2d8655d5b1af..35ec9ae99fe1 100644
--- a/arch/arm64/crypto/sha512-armv8.pl
+++ b/arch/arm64/crypto/sha512-armv8.pl
@@ -43,7 +43,7 @@
# on Cortex-A53 (or by 4 cycles per round).
# (***) Super-impressive coefficients over gcc-generated code are
# indication of some compiler "pathology", most notably code
-# generated with -mgeneral-regs-only is significanty faster
+# generated with -mgeneral-regs-only is significantly faster
# and the gap is only 40-90%.
#
# October 2016.
diff --git a/arch/arm64/crypto/sha512-ce-core.S b/arch/arm64/crypto/sha512-ce-core.S
index ce65e3abe4f2..91ef68b15fcc 100644
--- a/arch/arm64/crypto/sha512-ce-core.S
+++ b/arch/arm64/crypto/sha512-ce-core.S
@@ -102,28 +102,22 @@
.endm
/*
- * void sha512_ce_transform(struct sha512_state *sst, u8 const *src,
- * int blocks)
+ * int __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
+ * int blocks)
*/
.text
-ENTRY(sha512_ce_transform)
- frame_push 3
-
- mov x19, x0
- mov x20, x1
- mov x21, x2
-
+SYM_FUNC_START(__sha512_ce_transform)
/* load state */
-0: ld1 {v8.2d-v11.2d}, [x19]
+ ld1 {v8.2d-v11.2d}, [x0]
/* load first 4 round constants */
adr_l x3, .Lsha512_rcon
ld1 {v20.2d-v23.2d}, [x3], #64
/* load input */
-1: ld1 {v12.2d-v15.2d}, [x20], #64
- ld1 {v16.2d-v19.2d}, [x20], #64
- sub w21, w21, #1
+0: ld1 {v12.2d-v15.2d}, [x1], #64
+ ld1 {v16.2d-v19.2d}, [x1], #64
+ sub w2, w2, #1
CPU_LE( rev64 v12.16b, v12.16b )
CPU_LE( rev64 v13.16b, v13.16b )
@@ -201,19 +195,12 @@ CPU_LE( rev64 v19.16b, v19.16b )
add v10.2d, v10.2d, v2.2d
add v11.2d, v11.2d, v3.2d
+ cond_yield 3f, x4, x5
/* handled all input blocks? */
- cbz w21, 3f
-
- if_will_cond_yield_neon
- st1 {v8.2d-v11.2d}, [x19]
- do_cond_yield_neon
- b 0b
- endif_yield_neon
-
- b 1b
+ cbnz w2, 0b
/* store new state */
-3: st1 {v8.2d-v11.2d}, [x19]
- frame_pop
+3: st1 {v8.2d-v11.2d}, [x0]
+ mov w0, w2
ret
-ENDPROC(sha512_ce_transform)
+SYM_FUNC_END(__sha512_ce_transform)
diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c
index 2369540040aa..f3431fc62315 100644
--- a/arch/arm64/crypto/sha512-ce-glue.c
+++ b/arch/arm64/crypto/sha512-ce-glue.c
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
/*
* sha512-ce-glue.c - SHA-384/SHA-512 using ARMv8 Crypto Extensions
*
@@ -14,7 +14,7 @@
#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <crypto/sha512_base.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
@@ -23,58 +23,61 @@
MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("sha384");
+MODULE_ALIAS_CRYPTO("sha512");
-asmlinkage void sha512_ce_transform(struct sha512_state *sst, u8 const *src,
- int blocks);
+asmlinkage int __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
+ int blocks);
asmlinkage void sha512_block_data_order(u64 *digest, u8 const *src, int blocks);
+static void sha512_ce_transform(struct sha512_state *sst, u8 const *src,
+ int blocks)
+{
+ while (blocks) {
+ int rem;
+
+ kernel_neon_begin();
+ rem = __sha512_ce_transform(sst, src, blocks);
+ kernel_neon_end();
+ src += (blocks - rem) * SHA512_BLOCK_SIZE;
+ blocks = rem;
+ }
+}
+
+static void sha512_arm64_transform(struct sha512_state *sst, u8 const *src,
+ int blocks)
+{
+ sha512_block_data_order(sst->state, src, blocks);
+}
+
static int sha512_ce_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
- if (!crypto_simd_usable())
- return sha512_base_do_update(desc, data, len,
- (sha512_block_fn *)sha512_block_data_order);
-
- kernel_neon_begin();
- sha512_base_do_update(desc, data, len,
- (sha512_block_fn *)sha512_ce_transform);
- kernel_neon_end();
+ sha512_block_fn *fn = crypto_simd_usable() ? sha512_ce_transform
+ : sha512_arm64_transform;
+ sha512_base_do_update(desc, data, len, fn);
return 0;
}
static int sha512_ce_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
- if (!crypto_simd_usable()) {
- if (len)
- sha512_base_do_update(desc, data, len,
- (sha512_block_fn *)sha512_block_data_order);
- sha512_base_do_finalize(desc,
- (sha512_block_fn *)sha512_block_data_order);
- return sha512_base_finish(desc, out);
- }
+ sha512_block_fn *fn = crypto_simd_usable() ? sha512_ce_transform
+ : sha512_arm64_transform;
- kernel_neon_begin();
- sha512_base_do_update(desc, data, len,
- (sha512_block_fn *)sha512_ce_transform);
- sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_ce_transform);
- kernel_neon_end();
+ sha512_base_do_update(desc, data, len, fn);
+ sha512_base_do_finalize(desc, fn);
return sha512_base_finish(desc, out);
}
static int sha512_ce_final(struct shash_desc *desc, u8 *out)
{
- if (!crypto_simd_usable()) {
- sha512_base_do_finalize(desc,
- (sha512_block_fn *)sha512_block_data_order);
- return sha512_base_finish(desc, out);
- }
+ sha512_block_fn *fn = crypto_simd_usable() ? sha512_ce_transform
+ : sha512_arm64_transform;
- kernel_neon_begin();
- sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_ce_transform);
- kernel_neon_end();
+ sha512_base_do_finalize(desc, fn);
return sha512_base_finish(desc, out);
}
diff --git a/arch/arm64/crypto/sha512-core.S_shipped b/arch/arm64/crypto/sha512-core.S_shipped
deleted file mode 100644
index e063a6106720..000000000000
--- a/arch/arm64/crypto/sha512-core.S_shipped
+++ /dev/null
@@ -1,1093 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-// This code is taken from the OpenSSL project but the author (Andy Polyakov)
-// has relicensed it under the GPLv2. Therefore this program is free software;
-// you can redistribute it and/or modify it under the terms of the GNU General
-// Public License version 2 as published by the Free Software Foundation.
-//
-// The original headers, including the original license headers, are
-// included below for completeness.
-
-// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
-//
-// Licensed under the OpenSSL license (the "License"). You may not use
-// this file except in compliance with the License. You can obtain a copy
-// in the file LICENSE in the source distribution or at
-// https://www.openssl.org/source/license.html
-
-// ====================================================================
-// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-// project. The module is, however, dual licensed under OpenSSL and
-// CRYPTOGAMS licenses depending on where you obtain it. For further
-// details see http://www.openssl.org/~appro/cryptogams/.
-// ====================================================================
-//
-// SHA256/512 for ARMv8.
-//
-// Performance in cycles per processed byte and improvement coefficient
-// over code generated with "default" compiler:
-//
-// SHA256-hw SHA256(*) SHA512
-// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
-// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
-// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
-// Denver 2.01 10.5 (+26%) 6.70 (+8%)
-// X-Gene 20.0 (+100%) 12.8 (+300%(***))
-// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
-//
-// (*) Software SHA256 results are of lesser relevance, presented
-// mostly for informational purposes.
-// (**) The result is a trade-off: it's possible to improve it by
-// 10% (or by 1 cycle per round), but at the cost of 20% loss
-// on Cortex-A53 (or by 4 cycles per round).
-// (***) Super-impressive coefficients over gcc-generated code are
-// indication of some compiler "pathology", most notably code
-// generated with -mgeneral-regs-only is significanty faster
-// and the gap is only 40-90%.
-//
-// October 2016.
-//
-// Originally it was reckoned that it makes no sense to implement NEON
-// version of SHA256 for 64-bit processors. This is because performance
-// improvement on most wide-spread Cortex-A5x processors was observed
-// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
-// observed that 32-bit NEON SHA256 performs significantly better than
-// 64-bit scalar version on *some* of the more recent processors. As
-// result 64-bit NEON version of SHA256 was added to provide best
-// all-round performance. For example it executes ~30% faster on X-Gene
-// and Mongoose. [For reference, NEON version of SHA512 is bound to
-// deliver much less improvement, likely *negative* on Cortex-A5x.
-// Which is why NEON support is limited to SHA256.]
-
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#endif
-
-.text
-
-.extern OPENSSL_armcap_P
-.globl sha512_block_data_order
-.type sha512_block_data_order,%function
-.align 6
-sha512_block_data_order:
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
-
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#4*8
-
- ldp x20,x21,[x0] // load context
- ldp x22,x23,[x0,#2*8]
- ldp x24,x25,[x0,#4*8]
- add x2,x1,x2,lsl#7 // end of input
- ldp x26,x27,[x0,#6*8]
- adr x30,.LK512
- stp x0,x2,[x29,#96]
-
-.Loop:
- ldp x3,x4,[x1],#2*8
- ldr x19,[x30],#8 // *K++
- eor x28,x21,x22 // magic seed
- str x1,[x29,#112]
-#ifndef __AARCH64EB__
- rev x3,x3 // 0
-#endif
- ror x16,x24,#14
- add x27,x27,x19 // h+=K[i]
- eor x6,x24,x24,ror#23
- and x17,x25,x24
- bic x19,x26,x24
- add x27,x27,x3 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x20,x21 // a^b, b^c in next round
- eor x16,x16,x6,ror#18 // Sigma1(e)
- ror x6,x20,#28
- add x27,x27,x17 // h+=Ch(e,f,g)
- eor x17,x20,x20,ror#5
- add x27,x27,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x23,x23,x27 // d+=h
- eor x28,x28,x21 // Maj(a,b,c)
- eor x17,x6,x17,ror#34 // Sigma0(a)
- add x27,x27,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x27,x27,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x4,x4 // 1
-#endif
- ldp x5,x6,[x1],#2*8
- add x27,x27,x17 // h+=Sigma0(a)
- ror x16,x23,#14
- add x26,x26,x28 // h+=K[i]
- eor x7,x23,x23,ror#23
- and x17,x24,x23
- bic x28,x25,x23
- add x26,x26,x4 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x27,x20 // a^b, b^c in next round
- eor x16,x16,x7,ror#18 // Sigma1(e)
- ror x7,x27,#28
- add x26,x26,x17 // h+=Ch(e,f,g)
- eor x17,x27,x27,ror#5
- add x26,x26,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x22,x22,x26 // d+=h
- eor x19,x19,x20 // Maj(a,b,c)
- eor x17,x7,x17,ror#34 // Sigma0(a)
- add x26,x26,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x26,x26,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x5,x5 // 2
-#endif
- add x26,x26,x17 // h+=Sigma0(a)
- ror x16,x22,#14
- add x25,x25,x19 // h+=K[i]
- eor x8,x22,x22,ror#23
- and x17,x23,x22
- bic x19,x24,x22
- add x25,x25,x5 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x26,x27 // a^b, b^c in next round
- eor x16,x16,x8,ror#18 // Sigma1(e)
- ror x8,x26,#28
- add x25,x25,x17 // h+=Ch(e,f,g)
- eor x17,x26,x26,ror#5
- add x25,x25,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x21,x21,x25 // d+=h
- eor x28,x28,x27 // Maj(a,b,c)
- eor x17,x8,x17,ror#34 // Sigma0(a)
- add x25,x25,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x25,x25,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x6,x6 // 3
-#endif
- ldp x7,x8,[x1],#2*8
- add x25,x25,x17 // h+=Sigma0(a)
- ror x16,x21,#14
- add x24,x24,x28 // h+=K[i]
- eor x9,x21,x21,ror#23
- and x17,x22,x21
- bic x28,x23,x21
- add x24,x24,x6 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x25,x26 // a^b, b^c in next round
- eor x16,x16,x9,ror#18 // Sigma1(e)
- ror x9,x25,#28
- add x24,x24,x17 // h+=Ch(e,f,g)
- eor x17,x25,x25,ror#5
- add x24,x24,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x20,x20,x24 // d+=h
- eor x19,x19,x26 // Maj(a,b,c)
- eor x17,x9,x17,ror#34 // Sigma0(a)
- add x24,x24,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x24,x24,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x7,x7 // 4
-#endif
- add x24,x24,x17 // h+=Sigma0(a)
- ror x16,x20,#14
- add x23,x23,x19 // h+=K[i]
- eor x10,x20,x20,ror#23
- and x17,x21,x20
- bic x19,x22,x20
- add x23,x23,x7 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x24,x25 // a^b, b^c in next round
- eor x16,x16,x10,ror#18 // Sigma1(e)
- ror x10,x24,#28
- add x23,x23,x17 // h+=Ch(e,f,g)
- eor x17,x24,x24,ror#5
- add x23,x23,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x27,x27,x23 // d+=h
- eor x28,x28,x25 // Maj(a,b,c)
- eor x17,x10,x17,ror#34 // Sigma0(a)
- add x23,x23,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x23,x23,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x8,x8 // 5
-#endif
- ldp x9,x10,[x1],#2*8
- add x23,x23,x17 // h+=Sigma0(a)
- ror x16,x27,#14
- add x22,x22,x28 // h+=K[i]
- eor x11,x27,x27,ror#23
- and x17,x20,x27
- bic x28,x21,x27
- add x22,x22,x8 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x23,x24 // a^b, b^c in next round
- eor x16,x16,x11,ror#18 // Sigma1(e)
- ror x11,x23,#28
- add x22,x22,x17 // h+=Ch(e,f,g)
- eor x17,x23,x23,ror#5
- add x22,x22,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x26,x26,x22 // d+=h
- eor x19,x19,x24 // Maj(a,b,c)
- eor x17,x11,x17,ror#34 // Sigma0(a)
- add x22,x22,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x22,x22,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x9,x9 // 6
-#endif
- add x22,x22,x17 // h+=Sigma0(a)
- ror x16,x26,#14
- add x21,x21,x19 // h+=K[i]
- eor x12,x26,x26,ror#23
- and x17,x27,x26
- bic x19,x20,x26
- add x21,x21,x9 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x22,x23 // a^b, b^c in next round
- eor x16,x16,x12,ror#18 // Sigma1(e)
- ror x12,x22,#28
- add x21,x21,x17 // h+=Ch(e,f,g)
- eor x17,x22,x22,ror#5
- add x21,x21,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x25,x25,x21 // d+=h
- eor x28,x28,x23 // Maj(a,b,c)
- eor x17,x12,x17,ror#34 // Sigma0(a)
- add x21,x21,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x21,x21,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x10,x10 // 7
-#endif
- ldp x11,x12,[x1],#2*8
- add x21,x21,x17 // h+=Sigma0(a)
- ror x16,x25,#14
- add x20,x20,x28 // h+=K[i]
- eor x13,x25,x25,ror#23
- and x17,x26,x25
- bic x28,x27,x25
- add x20,x20,x10 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x21,x22 // a^b, b^c in next round
- eor x16,x16,x13,ror#18 // Sigma1(e)
- ror x13,x21,#28
- add x20,x20,x17 // h+=Ch(e,f,g)
- eor x17,x21,x21,ror#5
- add x20,x20,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x24,x24,x20 // d+=h
- eor x19,x19,x22 // Maj(a,b,c)
- eor x17,x13,x17,ror#34 // Sigma0(a)
- add x20,x20,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x20,x20,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x11,x11 // 8
-#endif
- add x20,x20,x17 // h+=Sigma0(a)
- ror x16,x24,#14
- add x27,x27,x19 // h+=K[i]
- eor x14,x24,x24,ror#23
- and x17,x25,x24
- bic x19,x26,x24
- add x27,x27,x11 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x20,x21 // a^b, b^c in next round
- eor x16,x16,x14,ror#18 // Sigma1(e)
- ror x14,x20,#28
- add x27,x27,x17 // h+=Ch(e,f,g)
- eor x17,x20,x20,ror#5
- add x27,x27,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x23,x23,x27 // d+=h
- eor x28,x28,x21 // Maj(a,b,c)
- eor x17,x14,x17,ror#34 // Sigma0(a)
- add x27,x27,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x27,x27,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x12,x12 // 9
-#endif
- ldp x13,x14,[x1],#2*8
- add x27,x27,x17 // h+=Sigma0(a)
- ror x16,x23,#14
- add x26,x26,x28 // h+=K[i]
- eor x15,x23,x23,ror#23
- and x17,x24,x23
- bic x28,x25,x23
- add x26,x26,x12 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x27,x20 // a^b, b^c in next round
- eor x16,x16,x15,ror#18 // Sigma1(e)
- ror x15,x27,#28
- add x26,x26,x17 // h+=Ch(e,f,g)
- eor x17,x27,x27,ror#5
- add x26,x26,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x22,x22,x26 // d+=h
- eor x19,x19,x20 // Maj(a,b,c)
- eor x17,x15,x17,ror#34 // Sigma0(a)
- add x26,x26,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x26,x26,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x13,x13 // 10
-#endif
- add x26,x26,x17 // h+=Sigma0(a)
- ror x16,x22,#14
- add x25,x25,x19 // h+=K[i]
- eor x0,x22,x22,ror#23
- and x17,x23,x22
- bic x19,x24,x22
- add x25,x25,x13 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x26,x27 // a^b, b^c in next round
- eor x16,x16,x0,ror#18 // Sigma1(e)
- ror x0,x26,#28
- add x25,x25,x17 // h+=Ch(e,f,g)
- eor x17,x26,x26,ror#5
- add x25,x25,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x21,x21,x25 // d+=h
- eor x28,x28,x27 // Maj(a,b,c)
- eor x17,x0,x17,ror#34 // Sigma0(a)
- add x25,x25,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x25,x25,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x14,x14 // 11
-#endif
- ldp x15,x0,[x1],#2*8
- add x25,x25,x17 // h+=Sigma0(a)
- str x6,[sp,#24]
- ror x16,x21,#14
- add x24,x24,x28 // h+=K[i]
- eor x6,x21,x21,ror#23
- and x17,x22,x21
- bic x28,x23,x21
- add x24,x24,x14 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x25,x26 // a^b, b^c in next round
- eor x16,x16,x6,ror#18 // Sigma1(e)
- ror x6,x25,#28
- add x24,x24,x17 // h+=Ch(e,f,g)
- eor x17,x25,x25,ror#5
- add x24,x24,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x20,x20,x24 // d+=h
- eor x19,x19,x26 // Maj(a,b,c)
- eor x17,x6,x17,ror#34 // Sigma0(a)
- add x24,x24,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x24,x24,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x15,x15 // 12
-#endif
- add x24,x24,x17 // h+=Sigma0(a)
- str x7,[sp,#0]
- ror x16,x20,#14
- add x23,x23,x19 // h+=K[i]
- eor x7,x20,x20,ror#23
- and x17,x21,x20
- bic x19,x22,x20
- add x23,x23,x15 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x24,x25 // a^b, b^c in next round
- eor x16,x16,x7,ror#18 // Sigma1(e)
- ror x7,x24,#28
- add x23,x23,x17 // h+=Ch(e,f,g)
- eor x17,x24,x24,ror#5
- add x23,x23,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x27,x27,x23 // d+=h
- eor x28,x28,x25 // Maj(a,b,c)
- eor x17,x7,x17,ror#34 // Sigma0(a)
- add x23,x23,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x23,x23,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x0,x0 // 13
-#endif
- ldp x1,x2,[x1]
- add x23,x23,x17 // h+=Sigma0(a)
- str x8,[sp,#8]
- ror x16,x27,#14
- add x22,x22,x28 // h+=K[i]
- eor x8,x27,x27,ror#23
- and x17,x20,x27
- bic x28,x21,x27
- add x22,x22,x0 // h+=X[i]
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x23,x24 // a^b, b^c in next round
- eor x16,x16,x8,ror#18 // Sigma1(e)
- ror x8,x23,#28
- add x22,x22,x17 // h+=Ch(e,f,g)
- eor x17,x23,x23,ror#5
- add x22,x22,x16 // h+=Sigma1(e)
- and x19,x19,x28 // (b^c)&=(a^b)
- add x26,x26,x22 // d+=h
- eor x19,x19,x24 // Maj(a,b,c)
- eor x17,x8,x17,ror#34 // Sigma0(a)
- add x22,x22,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- //add x22,x22,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x1,x1 // 14
-#endif
- ldr x6,[sp,#24]
- add x22,x22,x17 // h+=Sigma0(a)
- str x9,[sp,#16]
- ror x16,x26,#14
- add x21,x21,x19 // h+=K[i]
- eor x9,x26,x26,ror#23
- and x17,x27,x26
- bic x19,x20,x26
- add x21,x21,x1 // h+=X[i]
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x22,x23 // a^b, b^c in next round
- eor x16,x16,x9,ror#18 // Sigma1(e)
- ror x9,x22,#28
- add x21,x21,x17 // h+=Ch(e,f,g)
- eor x17,x22,x22,ror#5
- add x21,x21,x16 // h+=Sigma1(e)
- and x28,x28,x19 // (b^c)&=(a^b)
- add x25,x25,x21 // d+=h
- eor x28,x28,x23 // Maj(a,b,c)
- eor x17,x9,x17,ror#34 // Sigma0(a)
- add x21,x21,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- //add x21,x21,x17 // h+=Sigma0(a)
-#ifndef __AARCH64EB__
- rev x2,x2 // 15
-#endif
- ldr x7,[sp,#0]
- add x21,x21,x17 // h+=Sigma0(a)
- str x10,[sp,#24]
- ror x16,x25,#14
- add x20,x20,x28 // h+=K[i]
- ror x9,x4,#1
- and x17,x26,x25
- ror x8,x1,#19
- bic x28,x27,x25
- ror x10,x21,#28
- add x20,x20,x2 // h+=X[i]
- eor x16,x16,x25,ror#18
- eor x9,x9,x4,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x21,x22 // a^b, b^c in next round
- eor x16,x16,x25,ror#41 // Sigma1(e)
- eor x10,x10,x21,ror#34
- add x20,x20,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x8,x8,x1,ror#61
- eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
- add x20,x20,x16 // h+=Sigma1(e)
- eor x19,x19,x22 // Maj(a,b,c)
- eor x17,x10,x21,ror#39 // Sigma0(a)
- eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
- add x3,x3,x12
- add x24,x24,x20 // d+=h
- add x20,x20,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x3,x3,x9
- add x20,x20,x17 // h+=Sigma0(a)
- add x3,x3,x8
-.Loop_16_xx:
- ldr x8,[sp,#8]
- str x11,[sp,#0]
- ror x16,x24,#14
- add x27,x27,x19 // h+=K[i]
- ror x10,x5,#1
- and x17,x25,x24
- ror x9,x2,#19
- bic x19,x26,x24
- ror x11,x20,#28
- add x27,x27,x3 // h+=X[i]
- eor x16,x16,x24,ror#18
- eor x10,x10,x5,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x20,x21 // a^b, b^c in next round
- eor x16,x16,x24,ror#41 // Sigma1(e)
- eor x11,x11,x20,ror#34
- add x27,x27,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x9,x9,x2,ror#61
- eor x10,x10,x5,lsr#7 // sigma0(X[i+1])
- add x27,x27,x16 // h+=Sigma1(e)
- eor x28,x28,x21 // Maj(a,b,c)
- eor x17,x11,x20,ror#39 // Sigma0(a)
- eor x9,x9,x2,lsr#6 // sigma1(X[i+14])
- add x4,x4,x13
- add x23,x23,x27 // d+=h
- add x27,x27,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x4,x4,x10
- add x27,x27,x17 // h+=Sigma0(a)
- add x4,x4,x9
- ldr x9,[sp,#16]
- str x12,[sp,#8]
- ror x16,x23,#14
- add x26,x26,x28 // h+=K[i]
- ror x11,x6,#1
- and x17,x24,x23
- ror x10,x3,#19
- bic x28,x25,x23
- ror x12,x27,#28
- add x26,x26,x4 // h+=X[i]
- eor x16,x16,x23,ror#18
- eor x11,x11,x6,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x27,x20 // a^b, b^c in next round
- eor x16,x16,x23,ror#41 // Sigma1(e)
- eor x12,x12,x27,ror#34
- add x26,x26,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x10,x10,x3,ror#61
- eor x11,x11,x6,lsr#7 // sigma0(X[i+1])
- add x26,x26,x16 // h+=Sigma1(e)
- eor x19,x19,x20 // Maj(a,b,c)
- eor x17,x12,x27,ror#39 // Sigma0(a)
- eor x10,x10,x3,lsr#6 // sigma1(X[i+14])
- add x5,x5,x14
- add x22,x22,x26 // d+=h
- add x26,x26,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x5,x5,x11
- add x26,x26,x17 // h+=Sigma0(a)
- add x5,x5,x10
- ldr x10,[sp,#24]
- str x13,[sp,#16]
- ror x16,x22,#14
- add x25,x25,x19 // h+=K[i]
- ror x12,x7,#1
- and x17,x23,x22
- ror x11,x4,#19
- bic x19,x24,x22
- ror x13,x26,#28
- add x25,x25,x5 // h+=X[i]
- eor x16,x16,x22,ror#18
- eor x12,x12,x7,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x26,x27 // a^b, b^c in next round
- eor x16,x16,x22,ror#41 // Sigma1(e)
- eor x13,x13,x26,ror#34
- add x25,x25,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x11,x11,x4,ror#61
- eor x12,x12,x7,lsr#7 // sigma0(X[i+1])
- add x25,x25,x16 // h+=Sigma1(e)
- eor x28,x28,x27 // Maj(a,b,c)
- eor x17,x13,x26,ror#39 // Sigma0(a)
- eor x11,x11,x4,lsr#6 // sigma1(X[i+14])
- add x6,x6,x15
- add x21,x21,x25 // d+=h
- add x25,x25,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x6,x6,x12
- add x25,x25,x17 // h+=Sigma0(a)
- add x6,x6,x11
- ldr x11,[sp,#0]
- str x14,[sp,#24]
- ror x16,x21,#14
- add x24,x24,x28 // h+=K[i]
- ror x13,x8,#1
- and x17,x22,x21
- ror x12,x5,#19
- bic x28,x23,x21
- ror x14,x25,#28
- add x24,x24,x6 // h+=X[i]
- eor x16,x16,x21,ror#18
- eor x13,x13,x8,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x25,x26 // a^b, b^c in next round
- eor x16,x16,x21,ror#41 // Sigma1(e)
- eor x14,x14,x25,ror#34
- add x24,x24,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x12,x12,x5,ror#61
- eor x13,x13,x8,lsr#7 // sigma0(X[i+1])
- add x24,x24,x16 // h+=Sigma1(e)
- eor x19,x19,x26 // Maj(a,b,c)
- eor x17,x14,x25,ror#39 // Sigma0(a)
- eor x12,x12,x5,lsr#6 // sigma1(X[i+14])
- add x7,x7,x0
- add x20,x20,x24 // d+=h
- add x24,x24,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x7,x7,x13
- add x24,x24,x17 // h+=Sigma0(a)
- add x7,x7,x12
- ldr x12,[sp,#8]
- str x15,[sp,#0]
- ror x16,x20,#14
- add x23,x23,x19 // h+=K[i]
- ror x14,x9,#1
- and x17,x21,x20
- ror x13,x6,#19
- bic x19,x22,x20
- ror x15,x24,#28
- add x23,x23,x7 // h+=X[i]
- eor x16,x16,x20,ror#18
- eor x14,x14,x9,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x24,x25 // a^b, b^c in next round
- eor x16,x16,x20,ror#41 // Sigma1(e)
- eor x15,x15,x24,ror#34
- add x23,x23,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x13,x13,x6,ror#61
- eor x14,x14,x9,lsr#7 // sigma0(X[i+1])
- add x23,x23,x16 // h+=Sigma1(e)
- eor x28,x28,x25 // Maj(a,b,c)
- eor x17,x15,x24,ror#39 // Sigma0(a)
- eor x13,x13,x6,lsr#6 // sigma1(X[i+14])
- add x8,x8,x1
- add x27,x27,x23 // d+=h
- add x23,x23,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x8,x8,x14
- add x23,x23,x17 // h+=Sigma0(a)
- add x8,x8,x13
- ldr x13,[sp,#16]
- str x0,[sp,#8]
- ror x16,x27,#14
- add x22,x22,x28 // h+=K[i]
- ror x15,x10,#1
- and x17,x20,x27
- ror x14,x7,#19
- bic x28,x21,x27
- ror x0,x23,#28
- add x22,x22,x8 // h+=X[i]
- eor x16,x16,x27,ror#18
- eor x15,x15,x10,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x23,x24 // a^b, b^c in next round
- eor x16,x16,x27,ror#41 // Sigma1(e)
- eor x0,x0,x23,ror#34
- add x22,x22,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x14,x14,x7,ror#61
- eor x15,x15,x10,lsr#7 // sigma0(X[i+1])
- add x22,x22,x16 // h+=Sigma1(e)
- eor x19,x19,x24 // Maj(a,b,c)
- eor x17,x0,x23,ror#39 // Sigma0(a)
- eor x14,x14,x7,lsr#6 // sigma1(X[i+14])
- add x9,x9,x2
- add x26,x26,x22 // d+=h
- add x22,x22,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x9,x9,x15
- add x22,x22,x17 // h+=Sigma0(a)
- add x9,x9,x14
- ldr x14,[sp,#24]
- str x1,[sp,#16]
- ror x16,x26,#14
- add x21,x21,x19 // h+=K[i]
- ror x0,x11,#1
- and x17,x27,x26
- ror x15,x8,#19
- bic x19,x20,x26
- ror x1,x22,#28
- add x21,x21,x9 // h+=X[i]
- eor x16,x16,x26,ror#18
- eor x0,x0,x11,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x22,x23 // a^b, b^c in next round
- eor x16,x16,x26,ror#41 // Sigma1(e)
- eor x1,x1,x22,ror#34
- add x21,x21,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x15,x15,x8,ror#61
- eor x0,x0,x11,lsr#7 // sigma0(X[i+1])
- add x21,x21,x16 // h+=Sigma1(e)
- eor x28,x28,x23 // Maj(a,b,c)
- eor x17,x1,x22,ror#39 // Sigma0(a)
- eor x15,x15,x8,lsr#6 // sigma1(X[i+14])
- add x10,x10,x3
- add x25,x25,x21 // d+=h
- add x21,x21,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x10,x10,x0
- add x21,x21,x17 // h+=Sigma0(a)
- add x10,x10,x15
- ldr x15,[sp,#0]
- str x2,[sp,#24]
- ror x16,x25,#14
- add x20,x20,x28 // h+=K[i]
- ror x1,x12,#1
- and x17,x26,x25
- ror x0,x9,#19
- bic x28,x27,x25
- ror x2,x21,#28
- add x20,x20,x10 // h+=X[i]
- eor x16,x16,x25,ror#18
- eor x1,x1,x12,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x21,x22 // a^b, b^c in next round
- eor x16,x16,x25,ror#41 // Sigma1(e)
- eor x2,x2,x21,ror#34
- add x20,x20,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x0,x0,x9,ror#61
- eor x1,x1,x12,lsr#7 // sigma0(X[i+1])
- add x20,x20,x16 // h+=Sigma1(e)
- eor x19,x19,x22 // Maj(a,b,c)
- eor x17,x2,x21,ror#39 // Sigma0(a)
- eor x0,x0,x9,lsr#6 // sigma1(X[i+14])
- add x11,x11,x4
- add x24,x24,x20 // d+=h
- add x20,x20,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x11,x11,x1
- add x20,x20,x17 // h+=Sigma0(a)
- add x11,x11,x0
- ldr x0,[sp,#8]
- str x3,[sp,#0]
- ror x16,x24,#14
- add x27,x27,x19 // h+=K[i]
- ror x2,x13,#1
- and x17,x25,x24
- ror x1,x10,#19
- bic x19,x26,x24
- ror x3,x20,#28
- add x27,x27,x11 // h+=X[i]
- eor x16,x16,x24,ror#18
- eor x2,x2,x13,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x20,x21 // a^b, b^c in next round
- eor x16,x16,x24,ror#41 // Sigma1(e)
- eor x3,x3,x20,ror#34
- add x27,x27,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x1,x1,x10,ror#61
- eor x2,x2,x13,lsr#7 // sigma0(X[i+1])
- add x27,x27,x16 // h+=Sigma1(e)
- eor x28,x28,x21 // Maj(a,b,c)
- eor x17,x3,x20,ror#39 // Sigma0(a)
- eor x1,x1,x10,lsr#6 // sigma1(X[i+14])
- add x12,x12,x5
- add x23,x23,x27 // d+=h
- add x27,x27,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x12,x12,x2
- add x27,x27,x17 // h+=Sigma0(a)
- add x12,x12,x1
- ldr x1,[sp,#16]
- str x4,[sp,#8]
- ror x16,x23,#14
- add x26,x26,x28 // h+=K[i]
- ror x3,x14,#1
- and x17,x24,x23
- ror x2,x11,#19
- bic x28,x25,x23
- ror x4,x27,#28
- add x26,x26,x12 // h+=X[i]
- eor x16,x16,x23,ror#18
- eor x3,x3,x14,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x27,x20 // a^b, b^c in next round
- eor x16,x16,x23,ror#41 // Sigma1(e)
- eor x4,x4,x27,ror#34
- add x26,x26,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x2,x2,x11,ror#61
- eor x3,x3,x14,lsr#7 // sigma0(X[i+1])
- add x26,x26,x16 // h+=Sigma1(e)
- eor x19,x19,x20 // Maj(a,b,c)
- eor x17,x4,x27,ror#39 // Sigma0(a)
- eor x2,x2,x11,lsr#6 // sigma1(X[i+14])
- add x13,x13,x6
- add x22,x22,x26 // d+=h
- add x26,x26,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x13,x13,x3
- add x26,x26,x17 // h+=Sigma0(a)
- add x13,x13,x2
- ldr x2,[sp,#24]
- str x5,[sp,#16]
- ror x16,x22,#14
- add x25,x25,x19 // h+=K[i]
- ror x4,x15,#1
- and x17,x23,x22
- ror x3,x12,#19
- bic x19,x24,x22
- ror x5,x26,#28
- add x25,x25,x13 // h+=X[i]
- eor x16,x16,x22,ror#18
- eor x4,x4,x15,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x26,x27 // a^b, b^c in next round
- eor x16,x16,x22,ror#41 // Sigma1(e)
- eor x5,x5,x26,ror#34
- add x25,x25,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x3,x3,x12,ror#61
- eor x4,x4,x15,lsr#7 // sigma0(X[i+1])
- add x25,x25,x16 // h+=Sigma1(e)
- eor x28,x28,x27 // Maj(a,b,c)
- eor x17,x5,x26,ror#39 // Sigma0(a)
- eor x3,x3,x12,lsr#6 // sigma1(X[i+14])
- add x14,x14,x7
- add x21,x21,x25 // d+=h
- add x25,x25,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x14,x14,x4
- add x25,x25,x17 // h+=Sigma0(a)
- add x14,x14,x3
- ldr x3,[sp,#0]
- str x6,[sp,#24]
- ror x16,x21,#14
- add x24,x24,x28 // h+=K[i]
- ror x5,x0,#1
- and x17,x22,x21
- ror x4,x13,#19
- bic x28,x23,x21
- ror x6,x25,#28
- add x24,x24,x14 // h+=X[i]
- eor x16,x16,x21,ror#18
- eor x5,x5,x0,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x25,x26 // a^b, b^c in next round
- eor x16,x16,x21,ror#41 // Sigma1(e)
- eor x6,x6,x25,ror#34
- add x24,x24,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x4,x4,x13,ror#61
- eor x5,x5,x0,lsr#7 // sigma0(X[i+1])
- add x24,x24,x16 // h+=Sigma1(e)
- eor x19,x19,x26 // Maj(a,b,c)
- eor x17,x6,x25,ror#39 // Sigma0(a)
- eor x4,x4,x13,lsr#6 // sigma1(X[i+14])
- add x15,x15,x8
- add x20,x20,x24 // d+=h
- add x24,x24,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x15,x15,x5
- add x24,x24,x17 // h+=Sigma0(a)
- add x15,x15,x4
- ldr x4,[sp,#8]
- str x7,[sp,#0]
- ror x16,x20,#14
- add x23,x23,x19 // h+=K[i]
- ror x6,x1,#1
- and x17,x21,x20
- ror x5,x14,#19
- bic x19,x22,x20
- ror x7,x24,#28
- add x23,x23,x15 // h+=X[i]
- eor x16,x16,x20,ror#18
- eor x6,x6,x1,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x24,x25 // a^b, b^c in next round
- eor x16,x16,x20,ror#41 // Sigma1(e)
- eor x7,x7,x24,ror#34
- add x23,x23,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x5,x5,x14,ror#61
- eor x6,x6,x1,lsr#7 // sigma0(X[i+1])
- add x23,x23,x16 // h+=Sigma1(e)
- eor x28,x28,x25 // Maj(a,b,c)
- eor x17,x7,x24,ror#39 // Sigma0(a)
- eor x5,x5,x14,lsr#6 // sigma1(X[i+14])
- add x0,x0,x9
- add x27,x27,x23 // d+=h
- add x23,x23,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x0,x0,x6
- add x23,x23,x17 // h+=Sigma0(a)
- add x0,x0,x5
- ldr x5,[sp,#16]
- str x8,[sp,#8]
- ror x16,x27,#14
- add x22,x22,x28 // h+=K[i]
- ror x7,x2,#1
- and x17,x20,x27
- ror x6,x15,#19
- bic x28,x21,x27
- ror x8,x23,#28
- add x22,x22,x0 // h+=X[i]
- eor x16,x16,x27,ror#18
- eor x7,x7,x2,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x23,x24 // a^b, b^c in next round
- eor x16,x16,x27,ror#41 // Sigma1(e)
- eor x8,x8,x23,ror#34
- add x22,x22,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x6,x6,x15,ror#61
- eor x7,x7,x2,lsr#7 // sigma0(X[i+1])
- add x22,x22,x16 // h+=Sigma1(e)
- eor x19,x19,x24 // Maj(a,b,c)
- eor x17,x8,x23,ror#39 // Sigma0(a)
- eor x6,x6,x15,lsr#6 // sigma1(X[i+14])
- add x1,x1,x10
- add x26,x26,x22 // d+=h
- add x22,x22,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x1,x1,x7
- add x22,x22,x17 // h+=Sigma0(a)
- add x1,x1,x6
- ldr x6,[sp,#24]
- str x9,[sp,#16]
- ror x16,x26,#14
- add x21,x21,x19 // h+=K[i]
- ror x8,x3,#1
- and x17,x27,x26
- ror x7,x0,#19
- bic x19,x20,x26
- ror x9,x22,#28
- add x21,x21,x1 // h+=X[i]
- eor x16,x16,x26,ror#18
- eor x8,x8,x3,ror#8
- orr x17,x17,x19 // Ch(e,f,g)
- eor x19,x22,x23 // a^b, b^c in next round
- eor x16,x16,x26,ror#41 // Sigma1(e)
- eor x9,x9,x22,ror#34
- add x21,x21,x17 // h+=Ch(e,f,g)
- and x28,x28,x19 // (b^c)&=(a^b)
- eor x7,x7,x0,ror#61
- eor x8,x8,x3,lsr#7 // sigma0(X[i+1])
- add x21,x21,x16 // h+=Sigma1(e)
- eor x28,x28,x23 // Maj(a,b,c)
- eor x17,x9,x22,ror#39 // Sigma0(a)
- eor x7,x7,x0,lsr#6 // sigma1(X[i+14])
- add x2,x2,x11
- add x25,x25,x21 // d+=h
- add x21,x21,x28 // h+=Maj(a,b,c)
- ldr x28,[x30],#8 // *K++, x19 in next round
- add x2,x2,x8
- add x21,x21,x17 // h+=Sigma0(a)
- add x2,x2,x7
- ldr x7,[sp,#0]
- str x10,[sp,#24]
- ror x16,x25,#14
- add x20,x20,x28 // h+=K[i]
- ror x9,x4,#1
- and x17,x26,x25
- ror x8,x1,#19
- bic x28,x27,x25
- ror x10,x21,#28
- add x20,x20,x2 // h+=X[i]
- eor x16,x16,x25,ror#18
- eor x9,x9,x4,ror#8
- orr x17,x17,x28 // Ch(e,f,g)
- eor x28,x21,x22 // a^b, b^c in next round
- eor x16,x16,x25,ror#41 // Sigma1(e)
- eor x10,x10,x21,ror#34
- add x20,x20,x17 // h+=Ch(e,f,g)
- and x19,x19,x28 // (b^c)&=(a^b)
- eor x8,x8,x1,ror#61
- eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
- add x20,x20,x16 // h+=Sigma1(e)
- eor x19,x19,x22 // Maj(a,b,c)
- eor x17,x10,x21,ror#39 // Sigma0(a)
- eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
- add x3,x3,x12
- add x24,x24,x20 // d+=h
- add x20,x20,x19 // h+=Maj(a,b,c)
- ldr x19,[x30],#8 // *K++, x28 in next round
- add x3,x3,x9
- add x20,x20,x17 // h+=Sigma0(a)
- add x3,x3,x8
- cbnz x19,.Loop_16_xx
-
- ldp x0,x2,[x29,#96]
- ldr x1,[x29,#112]
- sub x30,x30,#648 // rewind
-
- ldp x3,x4,[x0]
- ldp x5,x6,[x0,#2*8]
- add x1,x1,#14*8 // advance input pointer
- ldp x7,x8,[x0,#4*8]
- add x20,x20,x3
- ldp x9,x10,[x0,#6*8]
- add x21,x21,x4
- add x22,x22,x5
- add x23,x23,x6
- stp x20,x21,[x0]
- add x24,x24,x7
- add x25,x25,x8
- stp x22,x23,[x0,#2*8]
- add x26,x26,x9
- add x27,x27,x10
- cmp x1,x2
- stp x24,x25,[x0,#4*8]
- stp x26,x27,[x0,#6*8]
- b.ne .Loop
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#4*8
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#128
- ret
-.size sha512_block_data_order,.-sha512_block_data_order
-
-.align 6
-.type .LK512,%object
-.LK512:
- .quad 0x428a2f98d728ae22,0x7137449123ef65cd
- .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
- .quad 0x3956c25bf348b538,0x59f111f1b605d019
- .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
- .quad 0xd807aa98a3030242,0x12835b0145706fbe
- .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
- .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
- .quad 0x9bdc06a725c71235,0xc19bf174cf692694
- .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
- .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
- .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
- .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
- .quad 0x983e5152ee66dfab,0xa831c66d2db43210
- .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
- .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
- .quad 0x06ca6351e003826f,0x142929670a0e6e70
- .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
- .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
- .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
- .quad 0x81c2c92e47edaee6,0x92722c851482353b
- .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
- .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
- .quad 0xd192e819d6ef5218,0xd69906245565a910
- .quad 0xf40e35855771202a,0x106aa07032bbd1b8
- .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
- .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
- .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
- .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
- .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
- .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
- .quad 0x90befffa23631e28,0xa4506cebde82bde9
- .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
- .quad 0xca273eceea26619c,0xd186b8c721c0c207
- .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
- .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
- .quad 0x113f9804bef90dae,0x1b710b35131c471b
- .quad 0x28db77f523047d84,0x32caab7b40c72493
- .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
- .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
- .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
- .quad 0 // terminator
-.size .LK512,.-.LK512
-#ifndef __KERNEL__
-.align 3
-.LOPENSSL_armcap_P:
-# ifdef __ILP32__
- .long OPENSSL_armcap_P-.
-# else
- .quad OPENSSL_armcap_P-.
-# endif
-#endif
-.asciz "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
-.align 2
-#ifndef __KERNEL__
-.comm OPENSSL_armcap_P,4,4
-#endif
diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c
index d915c656e5fe..62f129dea83d 100644
--- a/arch/arm64/crypto/sha512-glue.c
+++ b/arch/arm64/crypto/sha512-glue.c
@@ -6,10 +6,9 @@
*/
#include <crypto/internal/hash.h>
-#include <linux/cryptohash.h>
#include <linux/types.h>
#include <linux/string.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <crypto/sha512_base.h>
#include <asm/neon.h>
@@ -20,25 +19,28 @@ MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("sha384");
MODULE_ALIAS_CRYPTO("sha512");
-asmlinkage void sha512_block_data_order(u32 *digest, const void *data,
+asmlinkage void sha512_block_data_order(u64 *digest, const void *data,
unsigned int num_blks);
EXPORT_SYMBOL(sha512_block_data_order);
+static void sha512_arm64_transform(struct sha512_state *sst, u8 const *src,
+ int blocks)
+{
+ sha512_block_data_order(sst->state, src, blocks);
+}
+
static int sha512_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
- return sha512_base_do_update(desc, data, len,
- (sha512_block_fn *)sha512_block_data_order);
+ return sha512_base_do_update(desc, data, len, sha512_arm64_transform);
}
static int sha512_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
if (len)
- sha512_base_do_update(desc, data, len,
- (sha512_block_fn *)sha512_block_data_order);
- sha512_base_do_finalize(desc,
- (sha512_block_fn *)sha512_block_data_order);
+ sha512_base_do_update(desc, data, len, sha512_arm64_transform);
+ sha512_base_do_finalize(desc, sha512_arm64_transform);
return sha512_base_finish(desc, out);
}
diff --git a/arch/arm64/crypto/sm3-ce-core.S b/arch/arm64/crypto/sm3-ce-core.S
index d50d187906cb..ca70cfacd0d0 100644
--- a/arch/arm64/crypto/sm3-ce-core.S
+++ b/arch/arm64/crypto/sm3-ce-core.S
@@ -6,6 +6,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/assembler.h>
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
@@ -73,7 +74,7 @@
* int blocks)
*/
.text
-ENTRY(sm3_ce_transform)
+SYM_TYPED_FUNC_START(sm3_ce_transform)
/* load state */
ld1 {v8.4s-v9.4s}, [x0]
rev64 v8.4s, v8.4s
@@ -131,7 +132,7 @@ CPU_LE( rev32 v3.16b, v3.16b )
ext v9.16b, v9.16b, v9.16b, #8
st1 {v8.4s-v9.4s}, [x0]
ret
-ENDPROC(sm3_ce_transform)
+SYM_FUNC_END(sm3_ce_transform)
.section ".rodata", "a"
.align 3
diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c
index d71faca322f2..54bf6ebcfffb 100644
--- a/arch/arm64/crypto/sm3-ce-glue.c
+++ b/arch/arm64/crypto/sm3-ce-glue.c
@@ -26,8 +26,10 @@ asmlinkage void sm3_ce_transform(struct sm3_state *sst, u8 const *src,
static int sm3_ce_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
- if (!crypto_simd_usable())
- return crypto_sm3_update(desc, data, len);
+ if (!crypto_simd_usable()) {
+ sm3_update(shash_desc_ctx(desc), data, len);
+ return 0;
+ }
kernel_neon_begin();
sm3_base_do_update(desc, data, len, sm3_ce_transform);
@@ -38,8 +40,10 @@ static int sm3_ce_update(struct shash_desc *desc, const u8 *data,
static int sm3_ce_final(struct shash_desc *desc, u8 *out)
{
- if (!crypto_simd_usable())
- return crypto_sm3_finup(desc, NULL, 0, out);
+ if (!crypto_simd_usable()) {
+ sm3_final(shash_desc_ctx(desc), out);
+ return 0;
+ }
kernel_neon_begin();
sm3_base_do_finalize(desc, sm3_ce_transform);
@@ -51,14 +55,22 @@ static int sm3_ce_final(struct shash_desc *desc, u8 *out)
static int sm3_ce_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
- if (!crypto_simd_usable())
- return crypto_sm3_finup(desc, data, len, out);
+ if (!crypto_simd_usable()) {
+ struct sm3_state *sctx = shash_desc_ctx(desc);
+
+ if (len)
+ sm3_update(sctx, data, len);
+ sm3_final(sctx, out);
+ return 0;
+ }
kernel_neon_begin();
- sm3_base_do_update(desc, data, len, sm3_ce_transform);
+ if (len)
+ sm3_base_do_update(desc, data, len, sm3_ce_transform);
+ sm3_base_do_finalize(desc, sm3_ce_transform);
kernel_neon_end();
- return sm3_ce_final(desc, out);
+ return sm3_base_finish(desc, out);
}
static struct shash_alg sm3_alg = {
@@ -72,7 +84,7 @@ static struct shash_alg sm3_alg = {
.base.cra_driver_name = "sm3-ce",
.base.cra_blocksize = SM3_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
- .base.cra_priority = 200,
+ .base.cra_priority = 400,
};
static int __init sm3_ce_mod_init(void)
diff --git a/arch/arm64/crypto/sm3-neon-core.S b/arch/arm64/crypto/sm3-neon-core.S
new file mode 100644
index 000000000000..4357e0e51be3
--- /dev/null
+++ b/arch/arm64/crypto/sm3-neon-core.S
@@ -0,0 +1,601 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * sm3-neon-core.S - SM3 secure hash using NEON instructions
+ *
+ * Linux/arm64 port of the libgcrypt SM3 implementation for AArch64
+ *
+ * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (c) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+#include <asm/assembler.h>
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+#define state_h5 20
+#define state_h6 24
+#define state_h7 28
+
+/* Stack structure */
+
+#define STACK_W_SIZE (32 * 2 * 3)
+
+#define STACK_W (0)
+#define STACK_SIZE (STACK_W + STACK_W_SIZE)
+
+/* Register macros */
+
+#define RSTATE x0
+#define RDATA x1
+#define RNBLKS x2
+#define RKPTR x28
+#define RFRAME x29
+
+#define ra w3
+#define rb w4
+#define rc w5
+#define rd w6
+#define re w7
+#define rf w8
+#define rg w9
+#define rh w10
+
+#define t0 w11
+#define t1 w12
+#define t2 w13
+#define t3 w14
+#define t4 w15
+#define t5 w16
+#define t6 w17
+
+#define k_even w19
+#define k_odd w20
+
+#define addr0 x21
+#define addr1 x22
+
+#define s0 w23
+#define s1 w24
+#define s2 w25
+#define s3 w26
+
+#define W0 v0
+#define W1 v1
+#define W2 v2
+#define W3 v3
+#define W4 v4
+#define W5 v5
+
+#define XTMP0 v6
+#define XTMP1 v7
+#define XTMP2 v16
+#define XTMP3 v17
+#define XTMP4 v18
+#define XTMP5 v19
+#define XTMP6 v20
+
+/* Helper macros. */
+
+#define _(...) /*_*/
+
+#define clear_vec(x) \
+ movi x.8h, #0;
+
+#define rolw(o, a, n) \
+ ror o, a, #(32 - n);
+
+/* Round function macros. */
+
+#define GG1_1(x, y, z, o, t) \
+ eor o, x, y;
+#define GG1_2(x, y, z, o, t) \
+ eor o, o, z;
+#define GG1_3(x, y, z, o, t)
+
+#define FF1_1(x, y, z, o, t) GG1_1(x, y, z, o, t)
+#define FF1_2(x, y, z, o, t)
+#define FF1_3(x, y, z, o, t) GG1_2(x, y, z, o, t)
+
+#define GG2_1(x, y, z, o, t) \
+ bic o, z, x;
+#define GG2_2(x, y, z, o, t) \
+ and t, y, x;
+#define GG2_3(x, y, z, o, t) \
+ eor o, o, t;
+
+#define FF2_1(x, y, z, o, t) \
+ eor o, x, y;
+#define FF2_2(x, y, z, o, t) \
+ and t, x, y; \
+ and o, o, z;
+#define FF2_3(x, y, z, o, t) \
+ eor o, o, t;
+
+#define R(i, a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
+ K_LOAD(round); \
+ ldr t5, [sp, #(wtype##_W1_ADDR(round, widx))]; \
+ rolw(t0, a, 12); /* rol(a, 12) => t0 */ \
+ IOP(1, iop_param); \
+ FF##i##_1(a, b, c, t1, t2); \
+ ldr t6, [sp, #(wtype##_W1W2_ADDR(round, widx))]; \
+ add k, k, e; \
+ IOP(2, iop_param); \
+ GG##i##_1(e, f, g, t3, t4); \
+ FF##i##_2(a, b, c, t1, t2); \
+ IOP(3, iop_param); \
+ add k, k, t0; \
+ add h, h, t5; \
+ add d, d, t6; /* w1w2 + d => d */ \
+ IOP(4, iop_param); \
+ rolw(k, k, 7); /* rol (t0 + e + t), 7) => k */ \
+ GG##i##_2(e, f, g, t3, t4); \
+ add h, h, k; /* h + w1 + k => h */ \
+ IOP(5, iop_param); \
+ FF##i##_3(a, b, c, t1, t2); \
+ eor t0, t0, k; /* k ^ t0 => t0 */ \
+ GG##i##_3(e, f, g, t3, t4); \
+ add d, d, t1; /* FF(a,b,c) + d => d */ \
+ IOP(6, iop_param); \
+ add t3, t3, h; /* GG(e,f,g) + h => t3 */ \
+ rolw(b, b, 9); /* rol(b, 9) => b */ \
+ eor h, t3, t3, ror #(32-9); \
+ IOP(7, iop_param); \
+ add d, d, t0; /* t0 + d => d */ \
+ rolw(f, f, 19); /* rol(f, 19) => f */ \
+ IOP(8, iop_param); \
+ eor h, h, t3, ror #(32-17); /* P0(t3) => h */
+
+#define R1(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
+ R(1, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)
+
+#define R2(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
+ R(2, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)
+
+#define KL(round) \
+ ldp k_even, k_odd, [RKPTR, #(4*(round))];
+
+/* Input expansion macros. */
+
+/* Byte-swapped input address. */
+#define IW_W_ADDR(round, widx, offs) \
+ (STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))
+
+/* Expanded input address. */
+#define XW_W_ADDR(round, widx, offs) \
+ (STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))
+
+/* Rounds 1-12, byte-swapped input block addresses. */
+#define IW_W1_ADDR(round, widx) IW_W_ADDR(round, widx, 32)
+#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 48)
+
+/* Rounds 1-12, expanded input block addresses. */
+#define XW_W1_ADDR(round, widx) XW_W_ADDR(round, widx, 0)
+#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 16)
+
+/* Input block loading.
+ * Interleaving within round function needed for in-order CPUs. */
+#define LOAD_W_VEC_1_1() \
+ add addr0, sp, #IW_W1_ADDR(0, 0);
+#define LOAD_W_VEC_1_2() \
+ add addr1, sp, #IW_W1_ADDR(4, 0);
+#define LOAD_W_VEC_1_3() \
+ ld1 {W0.16b}, [RDATA], #16;
+#define LOAD_W_VEC_1_4() \
+ ld1 {W1.16b}, [RDATA], #16;
+#define LOAD_W_VEC_1_5() \
+ ld1 {W2.16b}, [RDATA], #16;
+#define LOAD_W_VEC_1_6() \
+ ld1 {W3.16b}, [RDATA], #16;
+#define LOAD_W_VEC_1_7() \
+ rev32 XTMP0.16b, W0.16b;
+#define LOAD_W_VEC_1_8() \
+ rev32 XTMP1.16b, W1.16b;
+#define LOAD_W_VEC_2_1() \
+ rev32 XTMP2.16b, W2.16b;
+#define LOAD_W_VEC_2_2() \
+ rev32 XTMP3.16b, W3.16b;
+#define LOAD_W_VEC_2_3() \
+ eor XTMP4.16b, XTMP1.16b, XTMP0.16b;
+#define LOAD_W_VEC_2_4() \
+ eor XTMP5.16b, XTMP2.16b, XTMP1.16b;
+#define LOAD_W_VEC_2_5() \
+ st1 {XTMP0.16b}, [addr0], #16;
+#define LOAD_W_VEC_2_6() \
+ st1 {XTMP4.16b}, [addr0]; \
+ add addr0, sp, #IW_W1_ADDR(8, 0);
+#define LOAD_W_VEC_2_7() \
+ eor XTMP6.16b, XTMP3.16b, XTMP2.16b;
+#define LOAD_W_VEC_2_8() \
+ ext W0.16b, XTMP0.16b, XTMP0.16b, #8; /* W0: xx, w0, xx, xx */
+#define LOAD_W_VEC_3_1() \
+ mov W2.16b, XTMP1.16b; /* W2: xx, w6, w5, w4 */
+#define LOAD_W_VEC_3_2() \
+ st1 {XTMP1.16b}, [addr1], #16;
+#define LOAD_W_VEC_3_3() \
+ st1 {XTMP5.16b}, [addr1]; \
+ ext W1.16b, XTMP0.16b, XTMP0.16b, #4; /* W1: xx, w3, w2, w1 */
+#define LOAD_W_VEC_3_4() \
+ ext W3.16b, XTMP1.16b, XTMP2.16b, #12; /* W3: xx, w9, w8, w7 */
+#define LOAD_W_VEC_3_5() \
+ ext W4.16b, XTMP2.16b, XTMP3.16b, #8; /* W4: xx, w12, w11, w10 */
+#define LOAD_W_VEC_3_6() \
+ st1 {XTMP2.16b}, [addr0], #16;
+#define LOAD_W_VEC_3_7() \
+ st1 {XTMP6.16b}, [addr0];
+#define LOAD_W_VEC_3_8() \
+ ext W5.16b, XTMP3.16b, XTMP3.16b, #4; /* W5: xx, w15, w14, w13 */
+
+#define LOAD_W_VEC_1(iop_num, ...) \
+ LOAD_W_VEC_1_##iop_num()
+#define LOAD_W_VEC_2(iop_num, ...) \
+ LOAD_W_VEC_2_##iop_num()
+#define LOAD_W_VEC_3(iop_num, ...) \
+ LOAD_W_VEC_3_##iop_num()
+
+/* Message scheduling. Note: 3 words per vector register.
+ * Interleaving within round function needed for in-order CPUs. */
+#define SCHED_W_1_1(round, w0, w1, w2, w3, w4, w5) \
+ /* Load (w[i - 16]) => XTMP0 */ \
+ /* Load (w[i - 13]) => XTMP5 */ \
+ ext XTMP0.16b, w0.16b, w0.16b, #12; /* XTMP0: w0, xx, xx, xx */
+#define SCHED_W_1_2(round, w0, w1, w2, w3, w4, w5) \
+ ext XTMP5.16b, w1.16b, w1.16b, #12;
+#define SCHED_W_1_3(round, w0, w1, w2, w3, w4, w5) \
+ ext XTMP0.16b, XTMP0.16b, w1.16b, #12; /* XTMP0: xx, w2, w1, w0 */
+#define SCHED_W_1_4(round, w0, w1, w2, w3, w4, w5) \
+ ext XTMP5.16b, XTMP5.16b, w2.16b, #12;
+#define SCHED_W_1_5(round, w0, w1, w2, w3, w4, w5) \
+ /* w[i - 9] == w3 */ \
+ /* W3 ^ XTMP0 => XTMP0 */ \
+ eor XTMP0.16b, XTMP0.16b, w3.16b;
+#define SCHED_W_1_6(round, w0, w1, w2, w3, w4, w5) \
+ /* w[i - 3] == w5 */ \
+ /* rol(XMM5, 15) ^ XTMP0 => XTMP0 */ \
+ /* rol(XTMP5, 7) => XTMP1 */ \
+ add addr0, sp, #XW_W1_ADDR((round), 0); \
+ shl XTMP2.4s, w5.4s, #15;
+#define SCHED_W_1_7(round, w0, w1, w2, w3, w4, w5) \
+ shl XTMP1.4s, XTMP5.4s, #7;
+#define SCHED_W_1_8(round, w0, w1, w2, w3, w4, w5) \
+ sri XTMP2.4s, w5.4s, #(32-15);
+#define SCHED_W_2_1(round, w0, w1, w2, w3, w4, w5) \
+ sri XTMP1.4s, XTMP5.4s, #(32-7);
+#define SCHED_W_2_2(round, w0, w1, w2, w3, w4, w5) \
+ eor XTMP0.16b, XTMP0.16b, XTMP2.16b;
+#define SCHED_W_2_3(round, w0, w1, w2, w3, w4, w5) \
+ /* w[i - 6] == W4 */ \
+ /* W4 ^ XTMP1 => XTMP1 */ \
+ eor XTMP1.16b, XTMP1.16b, w4.16b;
+#define SCHED_W_2_4(round, w0, w1, w2, w3, w4, w5) \
+ /* P1(XTMP0) ^ XTMP1 => W0 */ \
+ shl XTMP3.4s, XTMP0.4s, #15;
+#define SCHED_W_2_5(round, w0, w1, w2, w3, w4, w5) \
+ shl XTMP4.4s, XTMP0.4s, #23;
+#define SCHED_W_2_6(round, w0, w1, w2, w3, w4, w5) \
+ eor w0.16b, XTMP1.16b, XTMP0.16b;
+#define SCHED_W_2_7(round, w0, w1, w2, w3, w4, w5) \
+ sri XTMP3.4s, XTMP0.4s, #(32-15);
+#define SCHED_W_2_8(round, w0, w1, w2, w3, w4, w5) \
+ sri XTMP4.4s, XTMP0.4s, #(32-23);
+#define SCHED_W_3_1(round, w0, w1, w2, w3, w4, w5) \
+ eor w0.16b, w0.16b, XTMP3.16b;
+#define SCHED_W_3_2(round, w0, w1, w2, w3, w4, w5) \
+ /* Load (w[i - 3]) => XTMP2 */ \
+ ext XTMP2.16b, w4.16b, w4.16b, #12;
+#define SCHED_W_3_3(round, w0, w1, w2, w3, w4, w5) \
+ eor w0.16b, w0.16b, XTMP4.16b;
+#define SCHED_W_3_4(round, w0, w1, w2, w3, w4, w5) \
+ ext XTMP2.16b, XTMP2.16b, w5.16b, #12;
+#define SCHED_W_3_5(round, w0, w1, w2, w3, w4, w5) \
+ /* W1 ^ W2 => XTMP3 */ \
+ eor XTMP3.16b, XTMP2.16b, w0.16b;
+#define SCHED_W_3_6(round, w0, w1, w2, w3, w4, w5)
+#define SCHED_W_3_7(round, w0, w1, w2, w3, w4, w5) \
+ st1 {XTMP2.16b-XTMP3.16b}, [addr0];
+#define SCHED_W_3_8(round, w0, w1, w2, w3, w4, w5)
+
+#define SCHED_W_W0W1W2W3W4W5_1(iop_num, round) \
+ SCHED_W_1_##iop_num(round, W0, W1, W2, W3, W4, W5)
+#define SCHED_W_W0W1W2W3W4W5_2(iop_num, round) \
+ SCHED_W_2_##iop_num(round, W0, W1, W2, W3, W4, W5)
+#define SCHED_W_W0W1W2W3W4W5_3(iop_num, round) \
+ SCHED_W_3_##iop_num(round, W0, W1, W2, W3, W4, W5)
+
+#define SCHED_W_W1W2W3W4W5W0_1(iop_num, round) \
+ SCHED_W_1_##iop_num(round, W1, W2, W3, W4, W5, W0)
+#define SCHED_W_W1W2W3W4W5W0_2(iop_num, round) \
+ SCHED_W_2_##iop_num(round, W1, W2, W3, W4, W5, W0)
+#define SCHED_W_W1W2W3W4W5W0_3(iop_num, round) \
+ SCHED_W_3_##iop_num(round, W1, W2, W3, W4, W5, W0)
+
+#define SCHED_W_W2W3W4W5W0W1_1(iop_num, round) \
+ SCHED_W_1_##iop_num(round, W2, W3, W4, W5, W0, W1)
+#define SCHED_W_W2W3W4W5W0W1_2(iop_num, round) \
+ SCHED_W_2_##iop_num(round, W2, W3, W4, W5, W0, W1)
+#define SCHED_W_W2W3W4W5W0W1_3(iop_num, round) \
+ SCHED_W_3_##iop_num(round, W2, W3, W4, W5, W0, W1)
+
+#define SCHED_W_W3W4W5W0W1W2_1(iop_num, round) \
+ SCHED_W_1_##iop_num(round, W3, W4, W5, W0, W1, W2)
+#define SCHED_W_W3W4W5W0W1W2_2(iop_num, round) \
+ SCHED_W_2_##iop_num(round, W3, W4, W5, W0, W1, W2)
+#define SCHED_W_W3W4W5W0W1W2_3(iop_num, round) \
+ SCHED_W_3_##iop_num(round, W3, W4, W5, W0, W1, W2)
+
+#define SCHED_W_W4W5W0W1W2W3_1(iop_num, round) \
+ SCHED_W_1_##iop_num(round, W4, W5, W0, W1, W2, W3)
+#define SCHED_W_W4W5W0W1W2W3_2(iop_num, round) \
+ SCHED_W_2_##iop_num(round, W4, W5, W0, W1, W2, W3)
+#define SCHED_W_W4W5W0W1W2W3_3(iop_num, round) \
+ SCHED_W_3_##iop_num(round, W4, W5, W0, W1, W2, W3)
+
+#define SCHED_W_W5W0W1W2W3W4_1(iop_num, round) \
+ SCHED_W_1_##iop_num(round, W5, W0, W1, W2, W3, W4)
+#define SCHED_W_W5W0W1W2W3W4_2(iop_num, round) \
+ SCHED_W_2_##iop_num(round, W5, W0, W1, W2, W3, W4)
+#define SCHED_W_W5W0W1W2W3W4_3(iop_num, round) \
+ SCHED_W_3_##iop_num(round, W5, W0, W1, W2, W3, W4)
+
+
+ /*
+ * Transform blocks*64 bytes (blocks*16 32-bit words) at 'src'.
+ *
+ * void sm3_neon_transform(struct sm3_state *sst, u8 const *src,
+ * int blocks)
+ */
+ .text
+.align 3
+SYM_TYPED_FUNC_START(sm3_neon_transform)
+ ldp ra, rb, [RSTATE, #0]
+ ldp rc, rd, [RSTATE, #8]
+ ldp re, rf, [RSTATE, #16]
+ ldp rg, rh, [RSTATE, #24]
+
+ stp x28, x29, [sp, #-16]!
+ stp x19, x20, [sp, #-16]!
+ stp x21, x22, [sp, #-16]!
+ stp x23, x24, [sp, #-16]!
+ stp x25, x26, [sp, #-16]!
+ mov RFRAME, sp
+
+ sub addr0, sp, #STACK_SIZE
+ adr_l RKPTR, .LKtable
+ and sp, addr0, #(~63)
+
+ /* Preload first block. */
+ LOAD_W_VEC_1(1, 0)
+ LOAD_W_VEC_1(2, 0)
+ LOAD_W_VEC_1(3, 0)
+ LOAD_W_VEC_1(4, 0)
+ LOAD_W_VEC_1(5, 0)
+ LOAD_W_VEC_1(6, 0)
+ LOAD_W_VEC_1(7, 0)
+ LOAD_W_VEC_1(8, 0)
+ LOAD_W_VEC_2(1, 0)
+ LOAD_W_VEC_2(2, 0)
+ LOAD_W_VEC_2(3, 0)
+ LOAD_W_VEC_2(4, 0)
+ LOAD_W_VEC_2(5, 0)
+ LOAD_W_VEC_2(6, 0)
+ LOAD_W_VEC_2(7, 0)
+ LOAD_W_VEC_2(8, 0)
+ LOAD_W_VEC_3(1, 0)
+ LOAD_W_VEC_3(2, 0)
+ LOAD_W_VEC_3(3, 0)
+ LOAD_W_VEC_3(4, 0)
+ LOAD_W_VEC_3(5, 0)
+ LOAD_W_VEC_3(6, 0)
+ LOAD_W_VEC_3(7, 0)
+ LOAD_W_VEC_3(8, 0)
+
+.balign 16
+.Loop:
+ /* Transform 0-3 */
+ R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 0, 0, IW, _, 0)
+ R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 1, 1, IW, _, 0)
+ R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 2, 2, IW, _, 0)
+ R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 3, 3, IW, _, 0)
+
+ /* Transform 4-7 + Precalc 12-14 */
+ R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 4, 0, IW, _, 0)
+ R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 5, 1, IW, _, 0)
+ R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 6, 2, IW, SCHED_W_W0W1W2W3W4W5_1, 12)
+ R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 7, 3, IW, SCHED_W_W0W1W2W3W4W5_2, 12)
+
+ /* Transform 8-11 + Precalc 12-17 */
+ R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 8, 0, IW, SCHED_W_W0W1W2W3W4W5_3, 12)
+ R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 9, 1, IW, SCHED_W_W1W2W3W4W5W0_1, 15)
+ R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 10, 2, IW, SCHED_W_W1W2W3W4W5W0_2, 15)
+ R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 11, 3, IW, SCHED_W_W1W2W3W4W5W0_3, 15)
+
+ /* Transform 12-14 + Precalc 18-20 */
+ R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 12, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 18)
+ R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 13, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 18)
+ R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 14, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 18)
+
+ /* Transform 15-17 + Precalc 21-23 */
+ R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 15, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 21)
+ R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 16, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 21)
+ R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 17, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 21)
+
+ /* Transform 18-20 + Precalc 24-26 */
+ R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 18, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 24)
+ R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 19, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 24)
+ R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 20, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 24)
+
+ /* Transform 21-23 + Precalc 27-29 */
+ R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 21, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 27)
+ R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 22, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 27)
+ R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 23, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 27)
+
+ /* Transform 24-26 + Precalc 30-32 */
+ R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 24, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 30)
+ R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 25, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 30)
+ R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 26, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 30)
+
+ /* Transform 27-29 + Precalc 33-35 */
+ R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 27, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 33)
+ R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 28, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 33)
+ R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 29, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 33)
+
+ /* Transform 30-32 + Precalc 36-38 */
+ R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 30, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 36)
+ R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 31, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 36)
+ R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 32, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 36)
+
+ /* Transform 33-35 + Precalc 39-41 */
+ R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 33, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 39)
+ R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 34, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 39)
+ R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 35, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 39)
+
+ /* Transform 36-38 + Precalc 42-44 */
+ R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 36, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 42)
+ R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 37, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 42)
+ R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 38, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 42)
+
+ /* Transform 39-41 + Precalc 45-47 */
+ R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 39, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 45)
+ R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 40, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 45)
+ R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 41, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 45)
+
+ /* Transform 42-44 + Precalc 48-50 */
+ R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 42, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 48)
+ R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 43, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 48)
+ R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 44, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 48)
+
+ /* Transform 45-47 + Precalc 51-53 */
+ R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 45, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 51)
+ R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 46, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 51)
+ R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 47, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 51)
+
+ /* Transform 48-50 + Precalc 54-56 */
+ R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 48, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 54)
+ R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 49, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 54)
+ R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 50, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 54)
+
+ /* Transform 51-53 + Precalc 57-59 */
+ R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 51, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 57)
+ R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 52, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 57)
+ R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 53, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 57)
+
+ /* Transform 54-56 + Precalc 60-62 */
+ R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 54, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 60)
+ R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 55, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 60)
+ R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 56, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 60)
+
+ /* Transform 57-59 + Precalc 63 */
+ R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 57, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 63)
+ R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 58, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 63)
+ R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 59, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 63)
+
+ /* Transform 60 */
+ R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 60, 0, XW, _, _)
+ subs RNBLKS, RNBLKS, #1
+ b.eq .Lend
+
+ /* Transform 61-63 + Preload next block */
+ R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 61, 1, XW, LOAD_W_VEC_1, _)
+ ldp s0, s1, [RSTATE, #0]
+ R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, LOAD_W_VEC_2, _)
+ ldp s2, s3, [RSTATE, #8]
+ R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 63, 0, XW, LOAD_W_VEC_3, _)
+
+ /* Update the chaining variables. */
+ eor ra, ra, s0
+ eor rb, rb, s1
+ ldp s0, s1, [RSTATE, #16]
+ eor rc, rc, s2
+ ldp k_even, k_odd, [RSTATE, #24]
+ eor rd, rd, s3
+ eor re, re, s0
+ stp ra, rb, [RSTATE, #0]
+ eor rf, rf, s1
+ stp rc, rd, [RSTATE, #8]
+ eor rg, rg, k_even
+ stp re, rf, [RSTATE, #16]
+ eor rh, rh, k_odd
+ stp rg, rh, [RSTATE, #24]
+ b .Loop
+
+.Lend:
+ /* Transform 61-63 */
+ R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 61, 1, XW, _, _)
+ ldp s0, s1, [RSTATE, #0]
+ R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, _, _)
+ ldp s2, s3, [RSTATE, #8]
+ R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 63, 0, XW, _, _)
+
+ /* Update the chaining variables. */
+ eor ra, ra, s0
+ clear_vec(W0)
+ eor rb, rb, s1
+ clear_vec(W1)
+ ldp s0, s1, [RSTATE, #16]
+ clear_vec(W2)
+ eor rc, rc, s2
+ clear_vec(W3)
+ ldp k_even, k_odd, [RSTATE, #24]
+ clear_vec(W4)
+ eor rd, rd, s3
+ clear_vec(W5)
+ eor re, re, s0
+ clear_vec(XTMP0)
+ stp ra, rb, [RSTATE, #0]
+ clear_vec(XTMP1)
+ eor rf, rf, s1
+ clear_vec(XTMP2)
+ stp rc, rd, [RSTATE, #8]
+ clear_vec(XTMP3)
+ eor rg, rg, k_even
+ clear_vec(XTMP4)
+ stp re, rf, [RSTATE, #16]
+ clear_vec(XTMP5)
+ eor rh, rh, k_odd
+ clear_vec(XTMP6)
+ stp rg, rh, [RSTATE, #24]
+
+ /* Clear message expansion area */
+ add addr0, sp, #STACK_W
+ st1 {W0.16b-W3.16b}, [addr0], #64
+ st1 {W0.16b-W3.16b}, [addr0], #64
+ st1 {W0.16b-W3.16b}, [addr0]
+
+ mov sp, RFRAME
+
+ ldp x25, x26, [sp], #16
+ ldp x23, x24, [sp], #16
+ ldp x21, x22, [sp], #16
+ ldp x19, x20, [sp], #16
+ ldp x28, x29, [sp], #16
+
+ ret
+SYM_FUNC_END(sm3_neon_transform)
+
+
+ .section ".rodata", "a"
+
+ .align 4
+.LKtable:
+ .long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb
+ .long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc
+ .long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce
+ .long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6
+ .long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
+ .long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
+ .long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
+ .long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
+ .long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53
+ .long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d
+ .long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4
+ .long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43
+ .long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
+ .long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
+ .long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
+ .long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
diff --git a/arch/arm64/crypto/sm3-neon-glue.c b/arch/arm64/crypto/sm3-neon-glue.c
new file mode 100644
index 000000000000..7182ee683f14
--- /dev/null
+++ b/arch/arm64/crypto/sm3-neon-glue.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * sm3-neon-glue.c - SM3 secure hash using NEON instructions
+ *
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/unaligned.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/sm3.h>
+#include <crypto/sm3_base.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+
+asmlinkage void sm3_neon_transform(struct sm3_state *sst, u8 const *src,
+ int blocks);
+
+static int sm3_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ if (!crypto_simd_usable()) {
+ sm3_update(shash_desc_ctx(desc), data, len);
+ return 0;
+ }
+
+ kernel_neon_begin();
+ sm3_base_do_update(desc, data, len, sm3_neon_transform);
+ kernel_neon_end();
+
+ return 0;
+}
+
+static int sm3_neon_final(struct shash_desc *desc, u8 *out)
+{
+ if (!crypto_simd_usable()) {
+ sm3_final(shash_desc_ctx(desc), out);
+ return 0;
+ }
+
+ kernel_neon_begin();
+ sm3_base_do_finalize(desc, sm3_neon_transform);
+ kernel_neon_end();
+
+ return sm3_base_finish(desc, out);
+}
+
+static int sm3_neon_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ if (!crypto_simd_usable()) {
+ struct sm3_state *sctx = shash_desc_ctx(desc);
+
+ if (len)
+ sm3_update(sctx, data, len);
+ sm3_final(sctx, out);
+ return 0;
+ }
+
+ kernel_neon_begin();
+ if (len)
+ sm3_base_do_update(desc, data, len, sm3_neon_transform);
+ sm3_base_do_finalize(desc, sm3_neon_transform);
+ kernel_neon_end();
+
+ return sm3_base_finish(desc, out);
+}
+
+static struct shash_alg sm3_alg = {
+ .digestsize = SM3_DIGEST_SIZE,
+ .init = sm3_base_init,
+ .update = sm3_neon_update,
+ .final = sm3_neon_final,
+ .finup = sm3_neon_finup,
+ .descsize = sizeof(struct sm3_state),
+ .base.cra_name = "sm3",
+ .base.cra_driver_name = "sm3-neon",
+ .base.cra_blocksize = SM3_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+ .base.cra_priority = 200,
+};
+
+static int __init sm3_neon_init(void)
+{
+ return crypto_register_shash(&sm3_alg);
+}
+
+static void __exit sm3_neon_fini(void)
+{
+ crypto_unregister_shash(&sm3_alg);
+}
+
+module_init(sm3_neon_init);
+module_exit(sm3_neon_fini);
+
+MODULE_DESCRIPTION("SM3 secure hash using NEON instructions");
+MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/crypto/sm4-ce-asm.h b/arch/arm64/crypto/sm4-ce-asm.h
new file mode 100644
index 000000000000..7ea98e42e779
--- /dev/null
+++ b/arch/arm64/crypto/sm4-ce-asm.h
@@ -0,0 +1,209 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 helper macros for Crypto Extensions
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#define SM4_PREPARE(ptr) \
+ ld1 {v24.16b-v27.16b}, [ptr], #64; \
+ ld1 {v28.16b-v31.16b}, [ptr];
+
+#define SM4_CRYPT_BLK_BE(b0) \
+ sm4e b0.4s, v24.4s; \
+ sm4e b0.4s, v25.4s; \
+ sm4e b0.4s, v26.4s; \
+ sm4e b0.4s, v27.4s; \
+ sm4e b0.4s, v28.4s; \
+ sm4e b0.4s, v29.4s; \
+ sm4e b0.4s, v30.4s; \
+ sm4e b0.4s, v31.4s; \
+ rev64 b0.4s, b0.4s; \
+ ext b0.16b, b0.16b, b0.16b, #8; \
+ rev32 b0.16b, b0.16b;
+
+#define SM4_CRYPT_BLK(b0) \
+ rev32 b0.16b, b0.16b; \
+ SM4_CRYPT_BLK_BE(b0);
+
+#define SM4_CRYPT_BLK2_BE(b0, b1) \
+ sm4e b0.4s, v24.4s; \
+ sm4e b1.4s, v24.4s; \
+ sm4e b0.4s, v25.4s; \
+ sm4e b1.4s, v25.4s; \
+ sm4e b0.4s, v26.4s; \
+ sm4e b1.4s, v26.4s; \
+ sm4e b0.4s, v27.4s; \
+ sm4e b1.4s, v27.4s; \
+ sm4e b0.4s, v28.4s; \
+ sm4e b1.4s, v28.4s; \
+ sm4e b0.4s, v29.4s; \
+ sm4e b1.4s, v29.4s; \
+ sm4e b0.4s, v30.4s; \
+ sm4e b1.4s, v30.4s; \
+ sm4e b0.4s, v31.4s; \
+ sm4e b1.4s, v31.4s; \
+ rev64 b0.4s, b0.4s; \
+ rev64 b1.4s, b1.4s; \
+ ext b0.16b, b0.16b, b0.16b, #8; \
+ ext b1.16b, b1.16b, b1.16b, #8; \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+
+#define SM4_CRYPT_BLK2(b0, b1) \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ SM4_CRYPT_BLK2_BE(b0, b1);
+
+#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3) \
+ sm4e b0.4s, v24.4s; \
+ sm4e b1.4s, v24.4s; \
+ sm4e b2.4s, v24.4s; \
+ sm4e b3.4s, v24.4s; \
+ sm4e b0.4s, v25.4s; \
+ sm4e b1.4s, v25.4s; \
+ sm4e b2.4s, v25.4s; \
+ sm4e b3.4s, v25.4s; \
+ sm4e b0.4s, v26.4s; \
+ sm4e b1.4s, v26.4s; \
+ sm4e b2.4s, v26.4s; \
+ sm4e b3.4s, v26.4s; \
+ sm4e b0.4s, v27.4s; \
+ sm4e b1.4s, v27.4s; \
+ sm4e b2.4s, v27.4s; \
+ sm4e b3.4s, v27.4s; \
+ sm4e b0.4s, v28.4s; \
+ sm4e b1.4s, v28.4s; \
+ sm4e b2.4s, v28.4s; \
+ sm4e b3.4s, v28.4s; \
+ sm4e b0.4s, v29.4s; \
+ sm4e b1.4s, v29.4s; \
+ sm4e b2.4s, v29.4s; \
+ sm4e b3.4s, v29.4s; \
+ sm4e b0.4s, v30.4s; \
+ sm4e b1.4s, v30.4s; \
+ sm4e b2.4s, v30.4s; \
+ sm4e b3.4s, v30.4s; \
+ sm4e b0.4s, v31.4s; \
+ sm4e b1.4s, v31.4s; \
+ sm4e b2.4s, v31.4s; \
+ sm4e b3.4s, v31.4s; \
+ rev64 b0.4s, b0.4s; \
+ rev64 b1.4s, b1.4s; \
+ rev64 b2.4s, b2.4s; \
+ rev64 b3.4s, b3.4s; \
+ ext b0.16b, b0.16b, b0.16b, #8; \
+ ext b1.16b, b1.16b, b1.16b, #8; \
+ ext b2.16b, b2.16b, b2.16b, #8; \
+ ext b3.16b, b3.16b, b3.16b, #8; \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b;
+
+#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);
+
+#define SM4_CRYPT_BLK8_BE(b0, b1, b2, b3, b4, b5, b6, b7) \
+ sm4e b0.4s, v24.4s; \
+ sm4e b1.4s, v24.4s; \
+ sm4e b2.4s, v24.4s; \
+ sm4e b3.4s, v24.4s; \
+ sm4e b4.4s, v24.4s; \
+ sm4e b5.4s, v24.4s; \
+ sm4e b6.4s, v24.4s; \
+ sm4e b7.4s, v24.4s; \
+ sm4e b0.4s, v25.4s; \
+ sm4e b1.4s, v25.4s; \
+ sm4e b2.4s, v25.4s; \
+ sm4e b3.4s, v25.4s; \
+ sm4e b4.4s, v25.4s; \
+ sm4e b5.4s, v25.4s; \
+ sm4e b6.4s, v25.4s; \
+ sm4e b7.4s, v25.4s; \
+ sm4e b0.4s, v26.4s; \
+ sm4e b1.4s, v26.4s; \
+ sm4e b2.4s, v26.4s; \
+ sm4e b3.4s, v26.4s; \
+ sm4e b4.4s, v26.4s; \
+ sm4e b5.4s, v26.4s; \
+ sm4e b6.4s, v26.4s; \
+ sm4e b7.4s, v26.4s; \
+ sm4e b0.4s, v27.4s; \
+ sm4e b1.4s, v27.4s; \
+ sm4e b2.4s, v27.4s; \
+ sm4e b3.4s, v27.4s; \
+ sm4e b4.4s, v27.4s; \
+ sm4e b5.4s, v27.4s; \
+ sm4e b6.4s, v27.4s; \
+ sm4e b7.4s, v27.4s; \
+ sm4e b0.4s, v28.4s; \
+ sm4e b1.4s, v28.4s; \
+ sm4e b2.4s, v28.4s; \
+ sm4e b3.4s, v28.4s; \
+ sm4e b4.4s, v28.4s; \
+ sm4e b5.4s, v28.4s; \
+ sm4e b6.4s, v28.4s; \
+ sm4e b7.4s, v28.4s; \
+ sm4e b0.4s, v29.4s; \
+ sm4e b1.4s, v29.4s; \
+ sm4e b2.4s, v29.4s; \
+ sm4e b3.4s, v29.4s; \
+ sm4e b4.4s, v29.4s; \
+ sm4e b5.4s, v29.4s; \
+ sm4e b6.4s, v29.4s; \
+ sm4e b7.4s, v29.4s; \
+ sm4e b0.4s, v30.4s; \
+ sm4e b1.4s, v30.4s; \
+ sm4e b2.4s, v30.4s; \
+ sm4e b3.4s, v30.4s; \
+ sm4e b4.4s, v30.4s; \
+ sm4e b5.4s, v30.4s; \
+ sm4e b6.4s, v30.4s; \
+ sm4e b7.4s, v30.4s; \
+ sm4e b0.4s, v31.4s; \
+ sm4e b1.4s, v31.4s; \
+ sm4e b2.4s, v31.4s; \
+ sm4e b3.4s, v31.4s; \
+ sm4e b4.4s, v31.4s; \
+ sm4e b5.4s, v31.4s; \
+ sm4e b6.4s, v31.4s; \
+ sm4e b7.4s, v31.4s; \
+ rev64 b0.4s, b0.4s; \
+ rev64 b1.4s, b1.4s; \
+ rev64 b2.4s, b2.4s; \
+ rev64 b3.4s, b3.4s; \
+ rev64 b4.4s, b4.4s; \
+ rev64 b5.4s, b5.4s; \
+ rev64 b6.4s, b6.4s; \
+ rev64 b7.4s, b7.4s; \
+ ext b0.16b, b0.16b, b0.16b, #8; \
+ ext b1.16b, b1.16b, b1.16b, #8; \
+ ext b2.16b, b2.16b, b2.16b, #8; \
+ ext b3.16b, b3.16b, b3.16b, #8; \
+ ext b4.16b, b4.16b, b4.16b, #8; \
+ ext b5.16b, b5.16b, b5.16b, #8; \
+ ext b6.16b, b6.16b, b6.16b, #8; \
+ ext b7.16b, b7.16b, b7.16b, #8; \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ rev32 b4.16b, b4.16b; \
+ rev32 b5.16b, b5.16b; \
+ rev32 b6.16b, b6.16b; \
+ rev32 b7.16b, b7.16b;
+
+#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ rev32 b4.16b, b4.16b; \
+ rev32 b5.16b, b5.16b; \
+ rev32 b6.16b, b6.16b; \
+ rev32 b7.16b, b7.16b; \
+ SM4_CRYPT_BLK8_BE(b0, b1, b2, b3, b4, b5, b6, b7);
diff --git a/arch/arm64/crypto/sm4-ce-ccm-core.S b/arch/arm64/crypto/sm4-ce-ccm-core.S
new file mode 100644
index 000000000000..fa85856f33ce
--- /dev/null
+++ b/arch/arm64/crypto/sm4-ce-ccm-core.S
@@ -0,0 +1,329 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4-CCM AEAD Algorithm using ARMv8 Crypto Extensions
+ * as specified in rfc8998
+ * https://datatracker.ietf.org/doc/html/rfc8998
+ *
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+#include <asm/assembler.h>
+#include "sm4-ce-asm.h"
+
+.arch armv8-a+crypto
+
+.irp b, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15, 16, 24, 25, 26, 27, 28, 29, 30, 31
+ .set .Lv\b\().4s, \b
+.endr
+
+.macro sm4e, vd, vn
+ .inst 0xcec08400 | (.L\vn << 5) | .L\vd
+.endm
+
+/* Register macros */
+
+#define RMAC v16
+
+/* Helper macros. */
+
+#define inc_le128(vctr) \
+ mov vctr.d[1], x8; \
+ mov vctr.d[0], x7; \
+ adds x8, x8, #1; \
+ rev64 vctr.16b, vctr.16b; \
+ adc x7, x7, xzr;
+
+
+.align 3
+SYM_FUNC_START(sm4_ce_cbcmac_update)
+ /* input:
+ * x0: round key array, CTX
+ * x1: mac
+ * x2: src
+ * w3: nblocks
+ */
+ SM4_PREPARE(x0)
+
+ ld1 {RMAC.16b}, [x1]
+
+.Lcbcmac_loop_4x:
+ cmp w3, #4
+ blt .Lcbcmac_loop_1x
+
+ sub w3, w3, #4
+
+ ld1 {v0.16b-v3.16b}, [x2], #64
+
+ SM4_CRYPT_BLK(RMAC)
+ eor RMAC.16b, RMAC.16b, v0.16b
+ SM4_CRYPT_BLK(RMAC)
+ eor RMAC.16b, RMAC.16b, v1.16b
+ SM4_CRYPT_BLK(RMAC)
+ eor RMAC.16b, RMAC.16b, v2.16b
+ SM4_CRYPT_BLK(RMAC)
+ eor RMAC.16b, RMAC.16b, v3.16b
+
+ cbz w3, .Lcbcmac_end
+ b .Lcbcmac_loop_4x
+
+.Lcbcmac_loop_1x:
+ sub w3, w3, #1
+
+ ld1 {v0.16b}, [x2], #16
+
+ SM4_CRYPT_BLK(RMAC)
+ eor RMAC.16b, RMAC.16b, v0.16b
+
+ cbnz w3, .Lcbcmac_loop_1x
+
+.Lcbcmac_end:
+ st1 {RMAC.16b}, [x1]
+ ret
+SYM_FUNC_END(sm4_ce_cbcmac_update)
+
+.align 3
+SYM_FUNC_START(sm4_ce_ccm_final)
+ /* input:
+ * x0: round key array, CTX
+ * x1: ctr0 (big endian, 128 bit)
+ * x2: mac
+ */
+ SM4_PREPARE(x0)
+
+ ld1 {RMAC.16b}, [x2]
+ ld1 {v0.16b}, [x1]
+
+ SM4_CRYPT_BLK2(RMAC, v0)
+
+ /* en-/decrypt the mac with ctr0 */
+ eor RMAC.16b, RMAC.16b, v0.16b
+ st1 {RMAC.16b}, [x2]
+
+ ret
+SYM_FUNC_END(sm4_ce_ccm_final)
+
+.align 3
+SYM_TYPED_FUNC_START(sm4_ce_ccm_enc)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: ctr (big endian, 128 bit)
+ * w4: nbytes
+ * x5: mac
+ */
+ SM4_PREPARE(x0)
+
+ ldp x7, x8, [x3]
+ rev x7, x7
+ rev x8, x8
+
+ ld1 {RMAC.16b}, [x5]
+
+.Lccm_enc_loop_4x:
+ cmp w4, #(4 * 16)
+ blt .Lccm_enc_loop_1x
+
+ sub w4, w4, #(4 * 16)
+
+ /* construct CTRs */
+ inc_le128(v8) /* +0 */
+ inc_le128(v9) /* +1 */
+ inc_le128(v10) /* +2 */
+ inc_le128(v11) /* +3 */
+
+ ld1 {v0.16b-v3.16b}, [x2], #64
+
+ SM4_CRYPT_BLK2(v8, RMAC)
+ eor v8.16b, v8.16b, v0.16b
+ eor RMAC.16b, RMAC.16b, v0.16b
+ SM4_CRYPT_BLK2(v9, RMAC)
+ eor v9.16b, v9.16b, v1.16b
+ eor RMAC.16b, RMAC.16b, v1.16b
+ SM4_CRYPT_BLK2(v10, RMAC)
+ eor v10.16b, v10.16b, v2.16b
+ eor RMAC.16b, RMAC.16b, v2.16b
+ SM4_CRYPT_BLK2(v11, RMAC)
+ eor v11.16b, v11.16b, v3.16b
+ eor RMAC.16b, RMAC.16b, v3.16b
+
+ st1 {v8.16b-v11.16b}, [x1], #64
+
+ cbz w4, .Lccm_enc_end
+ b .Lccm_enc_loop_4x
+
+.Lccm_enc_loop_1x:
+ cmp w4, #16
+ blt .Lccm_enc_tail
+
+ sub w4, w4, #16
+
+ /* construct CTRs */
+ inc_le128(v8)
+
+ ld1 {v0.16b}, [x2], #16
+
+ SM4_CRYPT_BLK2(v8, RMAC)
+ eor v8.16b, v8.16b, v0.16b
+ eor RMAC.16b, RMAC.16b, v0.16b
+
+ st1 {v8.16b}, [x1], #16
+
+ cbz w4, .Lccm_enc_end
+ b .Lccm_enc_loop_1x
+
+.Lccm_enc_tail:
+ /* construct CTRs */
+ inc_le128(v8)
+
+ SM4_CRYPT_BLK2(RMAC, v8)
+
+ /* store new MAC */
+ st1 {RMAC.16b}, [x5]
+
+.Lccm_enc_tail_loop:
+ ldrb w0, [x2], #1 /* get 1 byte from input */
+ umov w9, v8.b[0] /* get top crypted CTR byte */
+ umov w6, RMAC.b[0] /* get top MAC byte */
+
+ eor w9, w9, w0 /* w9 = CTR ^ input */
+ eor w6, w6, w0 /* w6 = MAC ^ input */
+
+ strb w9, [x1], #1 /* store out byte */
+ strb w6, [x5], #1 /* store MAC byte */
+
+ subs w4, w4, #1
+ beq .Lccm_enc_ret
+
+ /* shift out one byte */
+ ext RMAC.16b, RMAC.16b, RMAC.16b, #1
+ ext v8.16b, v8.16b, v8.16b, #1
+
+ b .Lccm_enc_tail_loop
+
+.Lccm_enc_end:
+ /* store new MAC */
+ st1 {RMAC.16b}, [x5]
+
+ /* store new CTR */
+ rev x7, x7
+ rev x8, x8
+ stp x7, x8, [x3]
+
+.Lccm_enc_ret:
+ ret
+SYM_FUNC_END(sm4_ce_ccm_enc)
+
+.align 3
+SYM_TYPED_FUNC_START(sm4_ce_ccm_dec)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: ctr (big endian, 128 bit)
+ * w4: nbytes
+ * x5: mac
+ */
+ SM4_PREPARE(x0)
+
+ ldp x7, x8, [x3]
+ rev x7, x7
+ rev x8, x8
+
+ ld1 {RMAC.16b}, [x5]
+
+.Lccm_dec_loop_4x:
+ cmp w4, #(4 * 16)
+ blt .Lccm_dec_loop_1x
+
+ sub w4, w4, #(4 * 16)
+
+ /* construct CTRs */
+ inc_le128(v8) /* +0 */
+ inc_le128(v9) /* +1 */
+ inc_le128(v10) /* +2 */
+ inc_le128(v11) /* +3 */
+
+ ld1 {v0.16b-v3.16b}, [x2], #64
+
+ SM4_CRYPT_BLK2(v8, RMAC)
+ eor v8.16b, v8.16b, v0.16b
+ eor RMAC.16b, RMAC.16b, v8.16b
+ SM4_CRYPT_BLK2(v9, RMAC)
+ eor v9.16b, v9.16b, v1.16b
+ eor RMAC.16b, RMAC.16b, v9.16b
+ SM4_CRYPT_BLK2(v10, RMAC)
+ eor v10.16b, v10.16b, v2.16b
+ eor RMAC.16b, RMAC.16b, v10.16b
+ SM4_CRYPT_BLK2(v11, RMAC)
+ eor v11.16b, v11.16b, v3.16b
+ eor RMAC.16b, RMAC.16b, v11.16b
+
+ st1 {v8.16b-v11.16b}, [x1], #64
+
+ cbz w4, .Lccm_dec_end
+ b .Lccm_dec_loop_4x
+
+.Lccm_dec_loop_1x:
+ cmp w4, #16
+ blt .Lccm_dec_tail
+
+ sub w4, w4, #16
+
+ /* construct CTRs */
+ inc_le128(v8)
+
+ ld1 {v0.16b}, [x2], #16
+
+ SM4_CRYPT_BLK2(v8, RMAC)
+ eor v8.16b, v8.16b, v0.16b
+ eor RMAC.16b, RMAC.16b, v8.16b
+
+ st1 {v8.16b}, [x1], #16
+
+ cbz w4, .Lccm_dec_end
+ b .Lccm_dec_loop_1x
+
+.Lccm_dec_tail:
+ /* construct CTRs */
+ inc_le128(v8)
+
+ SM4_CRYPT_BLK2(RMAC, v8)
+
+ /* store new MAC */
+ st1 {RMAC.16b}, [x5]
+
+.Lccm_dec_tail_loop:
+ ldrb w0, [x2], #1 /* get 1 byte from input */
+ umov w9, v8.b[0] /* get top crypted CTR byte */
+ umov w6, RMAC.b[0] /* get top MAC byte */
+
+ eor w9, w9, w0 /* w9 = CTR ^ input */
+ eor w6, w6, w9 /* w6 = MAC ^ output */
+
+ strb w9, [x1], #1 /* store out byte */
+ strb w6, [x5], #1 /* store MAC byte */
+
+ subs w4, w4, #1
+ beq .Lccm_dec_ret
+
+ /* shift out one byte */
+ ext RMAC.16b, RMAC.16b, RMAC.16b, #1
+ ext v8.16b, v8.16b, v8.16b, #1
+
+ b .Lccm_dec_tail_loop
+
+.Lccm_dec_end:
+ /* store new MAC */
+ st1 {RMAC.16b}, [x5]
+
+ /* store new CTR */
+ rev x7, x7
+ rev x8, x8
+ stp x7, x8, [x3]
+
+.Lccm_dec_ret:
+ ret
+SYM_FUNC_END(sm4_ce_ccm_dec)
diff --git a/arch/arm64/crypto/sm4-ce-ccm-glue.c b/arch/arm64/crypto/sm4-ce-ccm-glue.c
new file mode 100644
index 000000000000..5e7e17bbec81
--- /dev/null
+++ b/arch/arm64/crypto/sm4-ce-ccm-glue.c
@@ -0,0 +1,307 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4-CCM AEAD Algorithm using ARMv8 Crypto Extensions
+ * as specified in rfc8998
+ * https://datatracker.ietf.org/doc/html/rfc8998
+ *
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
+#include <asm/neon.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/sm4.h>
+#include "sm4-ce.h"
+
+asmlinkage void sm4_ce_cbcmac_update(const u32 *rkey_enc, u8 *mac,
+ const u8 *src, unsigned int nblocks);
+asmlinkage void sm4_ce_ccm_enc(const u32 *rkey_enc, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nbytes, u8 *mac);
+asmlinkage void sm4_ce_ccm_dec(const u32 *rkey_enc, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nbytes, u8 *mac);
+asmlinkage void sm4_ce_ccm_final(const u32 *rkey_enc, u8 *iv, u8 *mac);
+
+
+static int ccm_setkey(struct crypto_aead *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_ctx *ctx = crypto_aead_ctx(tfm);
+
+ if (key_len != SM4_KEY_SIZE)
+ return -EINVAL;
+
+ kernel_neon_begin();
+ sm4_ce_expand_key(key, ctx->rkey_enc, ctx->rkey_dec,
+ crypto_sm4_fk, crypto_sm4_ck);
+ kernel_neon_end();
+
+ return 0;
+}
+
+static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
+{
+ if ((authsize & 1) || authsize < 4)
+ return -EINVAL;
+ return 0;
+}
+
+static int ccm_format_input(u8 info[], struct aead_request *req,
+ unsigned int msglen)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ unsigned int l = req->iv[0] + 1;
+ unsigned int m;
+ __be32 len;
+
+ /* verify that CCM dimension 'L': 2 <= L <= 8 */
+ if (l < 2 || l > 8)
+ return -EINVAL;
+ if (l < 4 && msglen >> (8 * l))
+ return -EOVERFLOW;
+
+ memset(&req->iv[SM4_BLOCK_SIZE - l], 0, l);
+
+ memcpy(info, req->iv, SM4_BLOCK_SIZE);
+
+ m = crypto_aead_authsize(aead);
+
+ /* format flags field per RFC 3610/NIST 800-38C */
+ *info |= ((m - 2) / 2) << 3;
+ if (req->assoclen)
+ *info |= (1 << 6);
+
+ /*
+ * format message length field,
+ * Linux uses a u32 type to represent msglen
+ */
+ if (l >= 4)
+ l = 4;
+
+ len = cpu_to_be32(msglen);
+ memcpy(&info[SM4_BLOCK_SIZE - l], (u8 *)&len + 4 - l, l);
+
+ return 0;
+}
+
+static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_aead_ctx(aead);
+ struct __packed { __be16 l; __be32 h; } aadlen;
+ u32 assoclen = req->assoclen;
+ struct scatter_walk walk;
+ unsigned int len;
+
+ if (assoclen < 0xff00) {
+ aadlen.l = cpu_to_be16(assoclen);
+ len = 2;
+ } else {
+ aadlen.l = cpu_to_be16(0xfffe);
+ put_unaligned_be32(assoclen, &aadlen.h);
+ len = 6;
+ }
+
+ sm4_ce_crypt_block(ctx->rkey_enc, mac, mac);
+ crypto_xor(mac, (const u8 *)&aadlen, len);
+
+ scatterwalk_start(&walk, req->src);
+
+ do {
+ u32 n = scatterwalk_clamp(&walk, assoclen);
+ u8 *p, *ptr;
+
+ if (!n) {
+ scatterwalk_start(&walk, sg_next(walk.sg));
+ n = scatterwalk_clamp(&walk, assoclen);
+ }
+
+ p = ptr = scatterwalk_map(&walk);
+ assoclen -= n;
+ scatterwalk_advance(&walk, n);
+
+ while (n > 0) {
+ unsigned int l, nblocks;
+
+ if (len == SM4_BLOCK_SIZE) {
+ if (n < SM4_BLOCK_SIZE) {
+ sm4_ce_crypt_block(ctx->rkey_enc,
+ mac, mac);
+
+ len = 0;
+ } else {
+ nblocks = n / SM4_BLOCK_SIZE;
+ sm4_ce_cbcmac_update(ctx->rkey_enc,
+ mac, ptr, nblocks);
+
+ ptr += nblocks * SM4_BLOCK_SIZE;
+ n %= SM4_BLOCK_SIZE;
+
+ continue;
+ }
+ }
+
+ l = min(n, SM4_BLOCK_SIZE - len);
+ if (l) {
+ crypto_xor(mac + len, ptr, l);
+ len += l;
+ ptr += l;
+ n -= l;
+ }
+ }
+
+ scatterwalk_unmap(p);
+ scatterwalk_done(&walk, 0, assoclen);
+ } while (assoclen);
+}
+
+static int ccm_crypt(struct aead_request *req, struct skcipher_walk *walk,
+ u32 *rkey_enc, u8 mac[],
+ void (*sm4_ce_ccm_crypt)(const u32 *rkey_enc, u8 *dst,
+ const u8 *src, u8 *iv,
+ unsigned int nbytes, u8 *mac))
+{
+ u8 __aligned(8) ctr0[SM4_BLOCK_SIZE];
+ int err = 0;
+
+ /* preserve the initial ctr0 for the TAG */
+ memcpy(ctr0, walk->iv, SM4_BLOCK_SIZE);
+ crypto_inc(walk->iv, SM4_BLOCK_SIZE);
+
+ kernel_neon_begin();
+
+ if (req->assoclen)
+ ccm_calculate_auth_mac(req, mac);
+
+ while (walk->nbytes && walk->nbytes != walk->total) {
+ unsigned int tail = walk->nbytes % SM4_BLOCK_SIZE;
+
+ sm4_ce_ccm_crypt(rkey_enc, walk->dst.virt.addr,
+ walk->src.virt.addr, walk->iv,
+ walk->nbytes - tail, mac);
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(walk, tail);
+
+ kernel_neon_begin();
+ }
+
+ if (walk->nbytes) {
+ sm4_ce_ccm_crypt(rkey_enc, walk->dst.virt.addr,
+ walk->src.virt.addr, walk->iv,
+ walk->nbytes, mac);
+
+ sm4_ce_ccm_final(rkey_enc, ctr0, mac);
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(walk, 0);
+ } else {
+ sm4_ce_ccm_final(rkey_enc, ctr0, mac);
+
+ kernel_neon_end();
+ }
+
+ return err;
+}
+
+static int ccm_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_aead_ctx(aead);
+ u8 __aligned(8) mac[SM4_BLOCK_SIZE];
+ struct skcipher_walk walk;
+ int err;
+
+ err = ccm_format_input(mac, req, req->cryptlen);
+ if (err)
+ return err;
+
+ err = skcipher_walk_aead_encrypt(&walk, req, false);
+ if (err)
+ return err;
+
+ err = ccm_crypt(req, &walk, ctx->rkey_enc, mac, sm4_ce_ccm_enc);
+ if (err)
+ return err;
+
+ /* copy authtag to end of dst */
+ scatterwalk_map_and_copy(mac, req->dst, req->assoclen + req->cryptlen,
+ crypto_aead_authsize(aead), 1);
+
+ return 0;
+}
+
+static int ccm_decrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ unsigned int authsize = crypto_aead_authsize(aead);
+ struct sm4_ctx *ctx = crypto_aead_ctx(aead);
+ u8 __aligned(8) mac[SM4_BLOCK_SIZE];
+ u8 authtag[SM4_BLOCK_SIZE];
+ struct skcipher_walk walk;
+ int err;
+
+ err = ccm_format_input(mac, req, req->cryptlen - authsize);
+ if (err)
+ return err;
+
+ err = skcipher_walk_aead_decrypt(&walk, req, false);
+ if (err)
+ return err;
+
+ err = ccm_crypt(req, &walk, ctx->rkey_enc, mac, sm4_ce_ccm_dec);
+ if (err)
+ return err;
+
+ /* compare calculated auth tag with the stored one */
+ scatterwalk_map_and_copy(authtag, req->src,
+ req->assoclen + req->cryptlen - authsize,
+ authsize, 0);
+
+ if (crypto_memneq(authtag, mac, authsize))
+ return -EBADMSG;
+
+ return 0;
+}
+
+static struct aead_alg sm4_ccm_alg = {
+ .base = {
+ .cra_name = "ccm(sm4)",
+ .cra_driver_name = "ccm-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .maxauthsize = SM4_BLOCK_SIZE,
+ .setkey = ccm_setkey,
+ .setauthsize = ccm_setauthsize,
+ .encrypt = ccm_encrypt,
+ .decrypt = ccm_decrypt,
+};
+
+static int __init sm4_ce_ccm_init(void)
+{
+ return crypto_register_aead(&sm4_ccm_alg);
+}
+
+static void __exit sm4_ce_ccm_exit(void)
+{
+ crypto_unregister_aead(&sm4_ccm_alg);
+}
+
+module_cpu_feature_match(SM4, sm4_ce_ccm_init);
+module_exit(sm4_ce_ccm_exit);
+
+MODULE_DESCRIPTION("Synchronous SM4 in CCM mode using ARMv8 Crypto Extensions");
+MODULE_ALIAS_CRYPTO("ccm(sm4)");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/crypto/sm4-ce-cipher-core.S b/arch/arm64/crypto/sm4-ce-cipher-core.S
new file mode 100644
index 000000000000..4ac6cfbc5797
--- /dev/null
+++ b/arch/arm64/crypto/sm4-ce-cipher-core.S
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8
+ .set .Lv\b\().4s, \b
+ .endr
+
+ .macro sm4e, rd, rn
+ .inst 0xcec08400 | .L\rd | (.L\rn << 5)
+ .endm
+
+ /*
+ * void sm4_ce_do_crypt(const u32 *rk, u32 *out, const u32 *in);
+ */
+ .text
+SYM_FUNC_START(sm4_ce_do_crypt)
+ ld1 {v8.4s}, [x2]
+ ld1 {v0.4s-v3.4s}, [x0], #64
+CPU_LE( rev32 v8.16b, v8.16b )
+ ld1 {v4.4s-v7.4s}, [x0]
+ sm4e v8.4s, v0.4s
+ sm4e v8.4s, v1.4s
+ sm4e v8.4s, v2.4s
+ sm4e v8.4s, v3.4s
+ sm4e v8.4s, v4.4s
+ sm4e v8.4s, v5.4s
+ sm4e v8.4s, v6.4s
+ sm4e v8.4s, v7.4s
+ rev64 v8.4s, v8.4s
+ ext v8.16b, v8.16b, v8.16b, #8
+CPU_LE( rev32 v8.16b, v8.16b )
+ st1 {v8.4s}, [x1]
+ ret
+SYM_FUNC_END(sm4_ce_do_crypt)
diff --git a/arch/arm64/crypto/sm4-ce-cipher-glue.c b/arch/arm64/crypto/sm4-ce-cipher-glue.c
new file mode 100644
index 000000000000..c31d76fb5a17
--- /dev/null
+++ b/arch/arm64/crypto/sm4-ce-cipher-glue.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/algapi.h>
+#include <crypto/sm4.h>
+#include <crypto/internal/simd.h>
+#include <linux/module.h>
+#include <linux/cpufeature.h>
+#include <linux/types.h>
+
+MODULE_ALIAS_CRYPTO("sm4");
+MODULE_ALIAS_CRYPTO("sm4-ce");
+MODULE_DESCRIPTION("SM4 symmetric cipher using ARMv8 Crypto Extensions");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+asmlinkage void sm4_ce_do_crypt(const u32 *rk, void *out, const void *in);
+
+static int sm4_ce_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ return sm4_expandkey(ctx, key, key_len);
+}
+
+static void sm4_ce_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+ const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ if (!crypto_simd_usable()) {
+ sm4_crypt_block(ctx->rkey_enc, out, in);
+ } else {
+ kernel_neon_begin();
+ sm4_ce_do_crypt(ctx->rkey_enc, out, in);
+ kernel_neon_end();
+ }
+}
+
+static void sm4_ce_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+ const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ if (!crypto_simd_usable()) {
+ sm4_crypt_block(ctx->rkey_dec, out, in);
+ } else {
+ kernel_neon_begin();
+ sm4_ce_do_crypt(ctx->rkey_dec, out, in);
+ kernel_neon_end();
+ }
+}
+
+static struct crypto_alg sm4_ce_alg = {
+ .cra_name = "sm4",
+ .cra_driver_name = "sm4-ce",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_u.cipher = {
+ .cia_min_keysize = SM4_KEY_SIZE,
+ .cia_max_keysize = SM4_KEY_SIZE,
+ .cia_setkey = sm4_ce_setkey,
+ .cia_encrypt = sm4_ce_encrypt,
+ .cia_decrypt = sm4_ce_decrypt
+ }
+};
+
+static int __init sm4_ce_mod_init(void)
+{
+ return crypto_register_alg(&sm4_ce_alg);
+}
+
+static void __exit sm4_ce_mod_fini(void)
+{
+ crypto_unregister_alg(&sm4_ce_alg);
+}
+
+module_cpu_feature_match(SM4, sm4_ce_mod_init);
+module_exit(sm4_ce_mod_fini);
diff --git a/arch/arm64/crypto/sm4-ce-core.S b/arch/arm64/crypto/sm4-ce-core.S
index af3bfbc3f4d4..1f3625c2c67e 100644
--- a/arch/arm64/crypto/sm4-ce-core.S
+++ b/arch/arm64/crypto/sm4-ce-core.S
@@ -1,36 +1,935 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2022, Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
#include <linux/linkage.h>
#include <asm/assembler.h>
+#include "sm4-ce-asm.h"
- .irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8
- .set .Lv\b\().4s, \b
- .endr
-
- .macro sm4e, rd, rn
- .inst 0xcec08400 | .L\rd | (.L\rn << 5)
- .endm
-
- /*
- * void sm4_ce_do_crypt(const u32 *rk, u32 *out, const u32 *in);
- */
- .text
-ENTRY(sm4_ce_do_crypt)
- ld1 {v8.4s}, [x2]
- ld1 {v0.4s-v3.4s}, [x0], #64
-CPU_LE( rev32 v8.16b, v8.16b )
- ld1 {v4.4s-v7.4s}, [x0]
- sm4e v8.4s, v0.4s
- sm4e v8.4s, v1.4s
- sm4e v8.4s, v2.4s
- sm4e v8.4s, v3.4s
- sm4e v8.4s, v4.4s
- sm4e v8.4s, v5.4s
- sm4e v8.4s, v6.4s
- sm4e v8.4s, v7.4s
- rev64 v8.4s, v8.4s
- ext v8.16b, v8.16b, v8.16b, #8
-CPU_LE( rev32 v8.16b, v8.16b )
- st1 {v8.4s}, [x1]
- ret
-ENDPROC(sm4_ce_do_crypt)
+.arch armv8-a+crypto
+
+.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
+ 20, 24, 25, 26, 27, 28, 29, 30, 31
+ .set .Lv\b\().4s, \b
+.endr
+
+.macro sm4e, vd, vn
+ .inst 0xcec08400 | (.L\vn << 5) | .L\vd
+.endm
+
+.macro sm4ekey, vd, vn, vm
+ .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
+.endm
+
+/* Register macros */
+
+#define RTMP0 v16
+#define RTMP1 v17
+#define RTMP2 v18
+#define RTMP3 v19
+
+#define RIV v20
+#define RMAC v20
+#define RMASK v21
+
+
+.align 3
+SYM_FUNC_START(sm4_ce_expand_key)
+ /* input:
+ * x0: 128-bit key
+ * x1: rkey_enc
+ * x2: rkey_dec
+ * x3: fk array
+ * x4: ck array
+ */
+ ld1 {v0.16b}, [x0];
+ rev32 v0.16b, v0.16b;
+ ld1 {v1.16b}, [x3];
+ /* load ck */
+ ld1 {v24.16b-v27.16b}, [x4], #64;
+ ld1 {v28.16b-v31.16b}, [x4];
+
+ /* input ^ fk */
+ eor v0.16b, v0.16b, v1.16b;
+
+ sm4ekey v0.4s, v0.4s, v24.4s;
+ sm4ekey v1.4s, v0.4s, v25.4s;
+ sm4ekey v2.4s, v1.4s, v26.4s;
+ sm4ekey v3.4s, v2.4s, v27.4s;
+ sm4ekey v4.4s, v3.4s, v28.4s;
+ sm4ekey v5.4s, v4.4s, v29.4s;
+ sm4ekey v6.4s, v5.4s, v30.4s;
+ sm4ekey v7.4s, v6.4s, v31.4s;
+
+ adr_l x5, .Lbswap128_mask
+ ld1 {v24.16b}, [x5]
+
+ st1 {v0.16b-v3.16b}, [x1], #64;
+ st1 {v4.16b-v7.16b}, [x1];
+
+ tbl v16.16b, {v7.16b}, v24.16b
+ tbl v17.16b, {v6.16b}, v24.16b
+ tbl v18.16b, {v5.16b}, v24.16b
+ tbl v19.16b, {v4.16b}, v24.16b
+ tbl v20.16b, {v3.16b}, v24.16b
+ tbl v21.16b, {v2.16b}, v24.16b
+ tbl v22.16b, {v1.16b}, v24.16b
+ tbl v23.16b, {v0.16b}, v24.16b
+
+ st1 {v16.16b-v19.16b}, [x2], #64
+ st1 {v20.16b-v23.16b}, [x2]
+
+ ret;
+SYM_FUNC_END(sm4_ce_expand_key)
+
+.align 3
+SYM_FUNC_START(sm4_ce_crypt_block)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ */
+ SM4_PREPARE(x0)
+
+ ld1 {v0.16b}, [x2];
+ SM4_CRYPT_BLK(v0);
+ st1 {v0.16b}, [x1];
+
+ ret;
+SYM_FUNC_END(sm4_ce_crypt_block)
+
+.align 3
+SYM_FUNC_START(sm4_ce_crypt)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * w3: nblocks
+ */
+ SM4_PREPARE(x0)
+
+.Lcrypt_loop_blk:
+ sub w3, w3, #8;
+ tbnz w3, #31, .Lcrypt_tail8;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ ld1 {v4.16b-v7.16b}, [x2], #64;
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+ st1 {v0.16b-v3.16b}, [x1], #64;
+ st1 {v4.16b-v7.16b}, [x1], #64;
+
+ cbz w3, .Lcrypt_end;
+ b .Lcrypt_loop_blk;
+
+.Lcrypt_tail8:
+ add w3, w3, #8;
+ cmp w3, #4;
+ blt .Lcrypt_tail4;
+
+ sub w3, w3, #4;
+
+ ld1 {v0.16b-v3.16b}, [x2], #64;
+ SM4_CRYPT_BLK4(v0, v1, v2, v3);
+ st1 {v0.16b-v3.16b}, [x1], #64;
+
+ cbz w3, .Lcrypt_end;
+
+.Lcrypt_tail4:
+ sub w3, w3, #1;
+
+ ld1 {v0.16b}, [x2], #16;
+ SM4_CRYPT_BLK(v0);
+ st1 {v0.16b}, [x1], #16;
+
+ cbnz w3, .Lcrypt_tail4;
+
+.Lcrypt_end:
+ ret;
+SYM_FUNC_END(sm4_ce_crypt)
+
+.align 3
+SYM_FUNC_START(sm4_ce_cbc_enc)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nblocks
+ */
+ SM4_PREPARE(x0)
+
+ ld1 {RIV.16b}, [x3]
+
+.Lcbc_enc_loop_4x:
+ cmp w4, #4
+ blt .Lcbc_enc_loop_1x
+
+ sub w4, w4, #4
+
+ ld1 {v0.16b-v3.16b}, [x2], #64
+
+ eor v0.16b, v0.16b, RIV.16b
+ SM4_CRYPT_BLK(v0)
+ eor v1.16b, v1.16b, v0.16b
+ SM4_CRYPT_BLK(v1)
+ eor v2.16b, v2.16b, v1.16b
+ SM4_CRYPT_BLK(v2)
+ eor v3.16b, v3.16b, v2.16b
+ SM4_CRYPT_BLK(v3)
+
+ st1 {v0.16b-v3.16b}, [x1], #64
+ mov RIV.16b, v3.16b
+
+ cbz w4, .Lcbc_enc_end
+ b .Lcbc_enc_loop_4x
+
+.Lcbc_enc_loop_1x:
+ sub w4, w4, #1
+
+ ld1 {v0.16b}, [x2], #16
+
+ eor RIV.16b, RIV.16b, v0.16b
+ SM4_CRYPT_BLK(RIV)
+
+ st1 {RIV.16b}, [x1], #16
+
+ cbnz w4, .Lcbc_enc_loop_1x
+
+.Lcbc_enc_end:
+ /* store new IV */
+ st1 {RIV.16b}, [x3]
+
+ ret
+SYM_FUNC_END(sm4_ce_cbc_enc)
+
+.align 3
+SYM_FUNC_START(sm4_ce_cbc_dec)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nblocks
+ */
+ SM4_PREPARE(x0)
+
+ ld1 {RIV.16b}, [x3]
+
+.Lcbc_dec_loop_8x:
+ sub w4, w4, #8
+ tbnz w4, #31, .Lcbc_dec_4x
+
+ ld1 {v0.16b-v3.16b}, [x2], #64
+ ld1 {v4.16b-v7.16b}, [x2], #64
+
+ rev32 v8.16b, v0.16b
+ rev32 v9.16b, v1.16b
+ rev32 v10.16b, v2.16b
+ rev32 v11.16b, v3.16b
+ rev32 v12.16b, v4.16b
+ rev32 v13.16b, v5.16b
+ rev32 v14.16b, v6.16b
+ rev32 v15.16b, v7.16b
+
+ SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)
+
+ eor v8.16b, v8.16b, RIV.16b
+ eor v9.16b, v9.16b, v0.16b
+ eor v10.16b, v10.16b, v1.16b
+ eor v11.16b, v11.16b, v2.16b
+ eor v12.16b, v12.16b, v3.16b
+ eor v13.16b, v13.16b, v4.16b
+ eor v14.16b, v14.16b, v5.16b
+ eor v15.16b, v15.16b, v6.16b
+
+ st1 {v8.16b-v11.16b}, [x1], #64
+ st1 {v12.16b-v15.16b}, [x1], #64
+
+ mov RIV.16b, v7.16b
+
+ cbz w4, .Lcbc_dec_end
+ b .Lcbc_dec_loop_8x
+
+.Lcbc_dec_4x:
+ add w4, w4, #8
+ cmp w4, #4
+ blt .Lcbc_dec_loop_1x
+
+ sub w4, w4, #4
+
+ ld1 {v0.16b-v3.16b}, [x2], #64
+
+ rev32 v8.16b, v0.16b
+ rev32 v9.16b, v1.16b
+ rev32 v10.16b, v2.16b
+ rev32 v11.16b, v3.16b
+
+ SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)
+
+ eor v8.16b, v8.16b, RIV.16b
+ eor v9.16b, v9.16b, v0.16b
+ eor v10.16b, v10.16b, v1.16b
+ eor v11.16b, v11.16b, v2.16b
+
+ st1 {v8.16b-v11.16b}, [x1], #64
+
+ mov RIV.16b, v3.16b
+
+ cbz w4, .Lcbc_dec_end
+
+.Lcbc_dec_loop_1x:
+ sub w4, w4, #1
+
+ ld1 {v0.16b}, [x2], #16
+
+ rev32 v8.16b, v0.16b
+
+ SM4_CRYPT_BLK_BE(v8)
+
+ eor v8.16b, v8.16b, RIV.16b
+ st1 {v8.16b}, [x1], #16
+
+ mov RIV.16b, v0.16b
+
+ cbnz w4, .Lcbc_dec_loop_1x
+
+.Lcbc_dec_end:
+ /* store new IV */
+ st1 {RIV.16b}, [x3]
+
+ ret
+SYM_FUNC_END(sm4_ce_cbc_dec)
+
+.align 3
+SYM_FUNC_START(sm4_ce_cbc_cts_enc)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nbytes
+ */
+ SM4_PREPARE(x0)
+
+ sub w5, w4, #16
+ uxtw x5, w5
+
+ ld1 {RIV.16b}, [x3]
+
+ ld1 {v0.16b}, [x2]
+ eor RIV.16b, RIV.16b, v0.16b
+ SM4_CRYPT_BLK(RIV)
+
+ /* load permute table */
+ adr_l x6, .Lcts_permute_table
+ add x7, x6, #32
+ add x6, x6, x5
+ sub x7, x7, x5
+ ld1 {v3.16b}, [x6]
+ ld1 {v4.16b}, [x7]
+
+ /* overlapping loads */
+ add x2, x2, x5
+ ld1 {v1.16b}, [x2]
+
+ /* create Cn from En-1 */
+ tbl v0.16b, {RIV.16b}, v3.16b
+ /* padding Pn with zeros */
+ tbl v1.16b, {v1.16b}, v4.16b
+
+ eor v1.16b, v1.16b, RIV.16b
+ SM4_CRYPT_BLK(v1)
+
+ /* overlapping stores */
+ add x5, x1, x5
+ st1 {v0.16b}, [x5]
+ st1 {v1.16b}, [x1]
+
+ ret
+SYM_FUNC_END(sm4_ce_cbc_cts_enc)
+
+.align 3
+SYM_FUNC_START(sm4_ce_cbc_cts_dec)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nbytes
+ */
+ SM4_PREPARE(x0)
+
+ sub w5, w4, #16
+ uxtw x5, w5
+
+ ld1 {RIV.16b}, [x3]
+
+ /* load permute table */
+ adr_l x6, .Lcts_permute_table
+ add x7, x6, #32
+ add x6, x6, x5
+ sub x7, x7, x5
+ ld1 {v3.16b}, [x6]
+ ld1 {v4.16b}, [x7]
+
+ /* overlapping loads */
+ ld1 {v0.16b}, [x2], x5
+ ld1 {v1.16b}, [x2]
+
+ SM4_CRYPT_BLK(v0)
+ /* select the first Ln bytes of Xn to create Pn */
+ tbl v2.16b, {v0.16b}, v3.16b
+ eor v2.16b, v2.16b, v1.16b
+
+ /* overwrite the first Ln bytes with Cn to create En-1 */
+ tbx v0.16b, {v1.16b}, v4.16b
+ SM4_CRYPT_BLK(v0)
+ eor v0.16b, v0.16b, RIV.16b
+
+ /* overlapping stores */
+ add x5, x1, x5
+ st1 {v2.16b}, [x5]
+ st1 {v0.16b}, [x1]
+
+ ret
+SYM_FUNC_END(sm4_ce_cbc_cts_dec)
+
+.align 3
+SYM_FUNC_START(sm4_ce_ctr_enc)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: ctr (big endian, 128 bit)
+ * w4: nblocks
+ */
+ SM4_PREPARE(x0)
+
+ ldp x7, x8, [x3]
+ rev x7, x7
+ rev x8, x8
+
+.Lctr_loop_8x:
+ sub w4, w4, #8
+ tbnz w4, #31, .Lctr_4x
+
+#define inc_le128(vctr) \
+ mov vctr.d[1], x8; \
+ mov vctr.d[0], x7; \
+ adds x8, x8, #1; \
+ rev64 vctr.16b, vctr.16b; \
+ adc x7, x7, xzr;
+
+ /* construct CTRs */
+ inc_le128(v0) /* +0 */
+ inc_le128(v1) /* +1 */
+ inc_le128(v2) /* +2 */
+ inc_le128(v3) /* +3 */
+ inc_le128(v4) /* +4 */
+ inc_le128(v5) /* +5 */
+ inc_le128(v6) /* +6 */
+ inc_le128(v7) /* +7 */
+
+ ld1 {v8.16b-v11.16b}, [x2], #64
+ ld1 {v12.16b-v15.16b}, [x2], #64
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)
+
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+ eor v4.16b, v4.16b, v12.16b
+ eor v5.16b, v5.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ eor v7.16b, v7.16b, v15.16b
+
+ st1 {v0.16b-v3.16b}, [x1], #64
+ st1 {v4.16b-v7.16b}, [x1], #64
+
+ cbz w4, .Lctr_end
+ b .Lctr_loop_8x
+
+.Lctr_4x:
+ add w4, w4, #8
+ cmp w4, #4
+ blt .Lctr_loop_1x
+
+ sub w4, w4, #4
+
+ /* construct CTRs */
+ inc_le128(v0) /* +0 */
+ inc_le128(v1) /* +1 */
+ inc_le128(v2) /* +2 */
+ inc_le128(v3) /* +3 */
+
+ ld1 {v8.16b-v11.16b}, [x2], #64
+
+ SM4_CRYPT_BLK4(v0, v1, v2, v3)
+
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+
+ st1 {v0.16b-v3.16b}, [x1], #64
+
+ cbz w4, .Lctr_end
+
+.Lctr_loop_1x:
+ sub w4, w4, #1
+
+ /* construct CTRs */
+ inc_le128(v0)
+
+ ld1 {v8.16b}, [x2], #16
+
+ SM4_CRYPT_BLK(v0)
+
+ eor v0.16b, v0.16b, v8.16b
+ st1 {v0.16b}, [x1], #16
+
+ cbnz w4, .Lctr_loop_1x
+
+.Lctr_end:
+ /* store new CTR */
+ rev x7, x7
+ rev x8, x8
+ stp x7, x8, [x3]
+
+ ret
+SYM_FUNC_END(sm4_ce_ctr_enc)
+
+
+#define tweak_next(vt, vin, RTMP) \
+ sshr RTMP.2d, vin.2d, #63; \
+ and RTMP.16b, RTMP.16b, RMASK.16b; \
+ add vt.2d, vin.2d, vin.2d; \
+ ext RTMP.16b, RTMP.16b, RTMP.16b, #8; \
+ eor vt.16b, vt.16b, RTMP.16b;
+
+.align 3
+SYM_FUNC_START(sm4_ce_xts_enc)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: tweak (big endian, 128 bit)
+ * w4: nbytes
+ * x5: round key array for IV
+ */
+ ld1 {v8.16b}, [x3]
+
+ cbz x5, .Lxts_enc_nofirst
+
+ SM4_PREPARE(x5)
+
+ /* Generate first tweak */
+ SM4_CRYPT_BLK(v8)
+
+.Lxts_enc_nofirst:
+ SM4_PREPARE(x0)
+
+ ands w5, w4, #15
+ lsr w4, w4, #4
+ sub w6, w4, #1
+ csel w4, w4, w6, eq
+ uxtw x5, w5
+
+ movi RMASK.2s, #0x1
+ movi RTMP0.2s, #0x87
+ uzp1 RMASK.4s, RMASK.4s, RTMP0.4s
+
+ cbz w4, .Lxts_enc_cts
+
+.Lxts_enc_loop_8x:
+ sub w4, w4, #8
+ tbnz w4, #31, .Lxts_enc_4x
+
+ tweak_next( v9, v8, RTMP0)
+ tweak_next(v10, v9, RTMP1)
+ tweak_next(v11, v10, RTMP2)
+ tweak_next(v12, v11, RTMP3)
+ tweak_next(v13, v12, RTMP0)
+ tweak_next(v14, v13, RTMP1)
+ tweak_next(v15, v14, RTMP2)
+
+ ld1 {v0.16b-v3.16b}, [x2], #64
+ ld1 {v4.16b-v7.16b}, [x2], #64
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+ eor v4.16b, v4.16b, v12.16b
+ eor v5.16b, v5.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ eor v7.16b, v7.16b, v15.16b
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)
+
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+ eor v4.16b, v4.16b, v12.16b
+ eor v5.16b, v5.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ eor v7.16b, v7.16b, v15.16b
+ st1 {v0.16b-v3.16b}, [x1], #64
+ st1 {v4.16b-v7.16b}, [x1], #64
+
+ tweak_next(v8, v15, RTMP3)
+
+ cbz w4, .Lxts_enc_cts
+ b .Lxts_enc_loop_8x
+
+.Lxts_enc_4x:
+ add w4, w4, #8
+ cmp w4, #4
+ blt .Lxts_enc_loop_1x
+
+ sub w4, w4, #4
+
+ tweak_next( v9, v8, RTMP0)
+ tweak_next(v10, v9, RTMP1)
+ tweak_next(v11, v10, RTMP2)
+
+ ld1 {v0.16b-v3.16b}, [x2], #64
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+
+ SM4_CRYPT_BLK4(v0, v1, v2, v3)
+
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+ st1 {v0.16b-v3.16b}, [x1], #64
+
+ tweak_next(v8, v11, RTMP3)
+
+ cbz w4, .Lxts_enc_cts
+
+.Lxts_enc_loop_1x:
+ sub w4, w4, #1
+
+ ld1 {v0.16b}, [x2], #16
+ eor v0.16b, v0.16b, v8.16b
+
+ SM4_CRYPT_BLK(v0)
+
+ eor v0.16b, v0.16b, v8.16b
+ st1 {v0.16b}, [x1], #16
+
+ tweak_next(v8, v8, RTMP0)
+
+ cbnz w4, .Lxts_enc_loop_1x
+
+.Lxts_enc_cts:
+ cbz x5, .Lxts_enc_end
+
+ /* cipher text stealing */
+
+ tweak_next(v9, v8, RTMP0)
+ ld1 {v0.16b}, [x2]
+ eor v0.16b, v0.16b, v8.16b
+ SM4_CRYPT_BLK(v0)
+ eor v0.16b, v0.16b, v8.16b
+
+ /* load permute table */
+ adr_l x6, .Lcts_permute_table
+ add x7, x6, #32
+ add x6, x6, x5
+ sub x7, x7, x5
+ ld1 {v3.16b}, [x6]
+ ld1 {v4.16b}, [x7]
+
+ /* overlapping loads */
+ add x2, x2, x5
+ ld1 {v1.16b}, [x2]
+
+ /* create Cn from En-1 */
+ tbl v2.16b, {v0.16b}, v3.16b
+ /* padding Pn with En-1 at the end */
+ tbx v0.16b, {v1.16b}, v4.16b
+
+ eor v0.16b, v0.16b, v9.16b
+ SM4_CRYPT_BLK(v0)
+ eor v0.16b, v0.16b, v9.16b
+
+
+ /* overlapping stores */
+ add x5, x1, x5
+ st1 {v2.16b}, [x5]
+ st1 {v0.16b}, [x1]
+
+ b .Lxts_enc_ret
+
+.Lxts_enc_end:
+ /* store new tweak */
+ st1 {v8.16b}, [x3]
+
+.Lxts_enc_ret:
+ ret
+SYM_FUNC_END(sm4_ce_xts_enc)
+
+.align 3
+SYM_FUNC_START(sm4_ce_xts_dec)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: tweak (big endian, 128 bit)
+ * w4: nbytes
+ * x5: round key array for IV
+ */
+ ld1 {v8.16b}, [x3]
+
+ cbz x5, .Lxts_dec_nofirst
+
+ SM4_PREPARE(x5)
+
+ /* Generate first tweak */
+ SM4_CRYPT_BLK(v8)
+
+.Lxts_dec_nofirst:
+ SM4_PREPARE(x0)
+
+ ands w5, w4, #15
+ lsr w4, w4, #4
+ sub w6, w4, #1
+ csel w4, w4, w6, eq
+ uxtw x5, w5
+
+ movi RMASK.2s, #0x1
+ movi RTMP0.2s, #0x87
+ uzp1 RMASK.4s, RMASK.4s, RTMP0.4s
+
+ cbz w4, .Lxts_dec_cts
+
+.Lxts_dec_loop_8x:
+ sub w4, w4, #8
+ tbnz w4, #31, .Lxts_dec_4x
+
+ tweak_next( v9, v8, RTMP0)
+ tweak_next(v10, v9, RTMP1)
+ tweak_next(v11, v10, RTMP2)
+ tweak_next(v12, v11, RTMP3)
+ tweak_next(v13, v12, RTMP0)
+ tweak_next(v14, v13, RTMP1)
+ tweak_next(v15, v14, RTMP2)
+
+ ld1 {v0.16b-v3.16b}, [x2], #64
+ ld1 {v4.16b-v7.16b}, [x2], #64
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+ eor v4.16b, v4.16b, v12.16b
+ eor v5.16b, v5.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ eor v7.16b, v7.16b, v15.16b
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)
+
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+ eor v4.16b, v4.16b, v12.16b
+ eor v5.16b, v5.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ eor v7.16b, v7.16b, v15.16b
+ st1 {v0.16b-v3.16b}, [x1], #64
+ st1 {v4.16b-v7.16b}, [x1], #64
+
+ tweak_next(v8, v15, RTMP3)
+
+ cbz w4, .Lxts_dec_cts
+ b .Lxts_dec_loop_8x
+
+.Lxts_dec_4x:
+ add w4, w4, #8
+ cmp w4, #4
+ blt .Lxts_dec_loop_1x
+
+ sub w4, w4, #4
+
+ tweak_next( v9, v8, RTMP0)
+ tweak_next(v10, v9, RTMP1)
+ tweak_next(v11, v10, RTMP2)
+
+ ld1 {v0.16b-v3.16b}, [x2], #64
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+
+ SM4_CRYPT_BLK4(v0, v1, v2, v3)
+
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+ st1 {v0.16b-v3.16b}, [x1], #64
+
+ tweak_next(v8, v11, RTMP3)
+
+ cbz w4, .Lxts_dec_cts
+
+.Lxts_dec_loop_1x:
+ sub w4, w4, #1
+
+ ld1 {v0.16b}, [x2], #16
+ eor v0.16b, v0.16b, v8.16b
+
+ SM4_CRYPT_BLK(v0)
+
+ eor v0.16b, v0.16b, v8.16b
+ st1 {v0.16b}, [x1], #16
+
+ tweak_next(v8, v8, RTMP0)
+
+ cbnz w4, .Lxts_dec_loop_1x
+
+.Lxts_dec_cts:
+ cbz x5, .Lxts_dec_end
+
+ /* cipher text stealing */
+
+ tweak_next(v9, v8, RTMP0)
+ ld1 {v0.16b}, [x2]
+ eor v0.16b, v0.16b, v9.16b
+ SM4_CRYPT_BLK(v0)
+ eor v0.16b, v0.16b, v9.16b
+
+ /* load permute table */
+ adr_l x6, .Lcts_permute_table
+ add x7, x6, #32
+ add x6, x6, x5
+ sub x7, x7, x5
+ ld1 {v3.16b}, [x6]
+ ld1 {v4.16b}, [x7]
+
+ /* overlapping loads */
+ add x2, x2, x5
+ ld1 {v1.16b}, [x2]
+
+ /* create Cn from En-1 */
+ tbl v2.16b, {v0.16b}, v3.16b
+ /* padding Pn with En-1 at the end */
+ tbx v0.16b, {v1.16b}, v4.16b
+
+ eor v0.16b, v0.16b, v8.16b
+ SM4_CRYPT_BLK(v0)
+ eor v0.16b, v0.16b, v8.16b
+
+
+ /* overlapping stores */
+ add x5, x1, x5
+ st1 {v2.16b}, [x5]
+ st1 {v0.16b}, [x1]
+
+ b .Lxts_dec_ret
+
+.Lxts_dec_end:
+ /* store new tweak */
+ st1 {v8.16b}, [x3]
+
+.Lxts_dec_ret:
+ ret
+SYM_FUNC_END(sm4_ce_xts_dec)
+
+.align 3
+SYM_FUNC_START(sm4_ce_mac_update)
+ /* input:
+ * x0: round key array, CTX
+ * x1: digest
+ * x2: src
+ * w3: nblocks
+ * w4: enc_before
+ * w5: enc_after
+ */
+ SM4_PREPARE(x0)
+
+ ld1 {RMAC.16b}, [x1]
+
+ cbz w4, .Lmac_update
+
+ SM4_CRYPT_BLK(RMAC)
+
+.Lmac_update:
+ cbz w3, .Lmac_ret
+
+ sub w6, w3, #1
+ cmp w5, wzr
+ csel w3, w3, w6, ne
+
+ cbz w3, .Lmac_end
+
+.Lmac_loop_4x:
+ cmp w3, #4
+ blt .Lmac_loop_1x
+
+ sub w3, w3, #4
+
+ ld1 {v0.16b-v3.16b}, [x2], #64
+
+ eor RMAC.16b, RMAC.16b, v0.16b
+ SM4_CRYPT_BLK(RMAC)
+ eor RMAC.16b, RMAC.16b, v1.16b
+ SM4_CRYPT_BLK(RMAC)
+ eor RMAC.16b, RMAC.16b, v2.16b
+ SM4_CRYPT_BLK(RMAC)
+ eor RMAC.16b, RMAC.16b, v3.16b
+ SM4_CRYPT_BLK(RMAC)
+
+ cbz w3, .Lmac_end
+ b .Lmac_loop_4x
+
+.Lmac_loop_1x:
+ sub w3, w3, #1
+
+ ld1 {v0.16b}, [x2], #16
+
+ eor RMAC.16b, RMAC.16b, v0.16b
+ SM4_CRYPT_BLK(RMAC)
+
+ cbnz w3, .Lmac_loop_1x
+
+
+.Lmac_end:
+ cbnz w5, .Lmac_ret
+
+ ld1 {v0.16b}, [x2], #16
+ eor RMAC.16b, RMAC.16b, v0.16b
+
+.Lmac_ret:
+ st1 {RMAC.16b}, [x1]
+ ret
+SYM_FUNC_END(sm4_ce_mac_update)
+
+
+ .section ".rodata", "a"
+ .align 4
+.Lbswap128_mask:
+ .byte 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
+ .byte 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03
+
+.Lcts_permute_table:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
diff --git a/arch/arm64/crypto/sm4-ce-gcm-core.S b/arch/arm64/crypto/sm4-ce-gcm-core.S
new file mode 100644
index 000000000000..347f25d75727
--- /dev/null
+++ b/arch/arm64/crypto/sm4-ce-gcm-core.S
@@ -0,0 +1,742 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
+ * as specified in rfc8998
+ * https://datatracker.ietf.org/doc/html/rfc8998
+ *
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+#include <asm/assembler.h>
+#include "sm4-ce-asm.h"
+
+.arch armv8-a+crypto
+
+.irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31
+ .set .Lv\b\().4s, \b
+.endr
+
+.macro sm4e, vd, vn
+ .inst 0xcec08400 | (.L\vn << 5) | .L\vd
+.endm
+
+/* Register macros */
+
+/* Used for both encryption and decryption */
+#define RHASH v21
+#define RRCONST v22
+#define RZERO v23
+
+/* Helper macros. */
+
+/*
+ * input: m0, m1
+ * output: r0:r1 (low 128-bits in r0, high in r1)
+ */
+#define PMUL_128x128(r0, r1, m0, m1, T0, T1) \
+ ext T0.16b, m1.16b, m1.16b, #8; \
+ pmull r0.1q, m0.1d, m1.1d; \
+ pmull T1.1q, m0.1d, T0.1d; \
+ pmull2 T0.1q, m0.2d, T0.2d; \
+ pmull2 r1.1q, m0.2d, m1.2d; \
+ eor T0.16b, T0.16b, T1.16b; \
+ ext T1.16b, RZERO.16b, T0.16b, #8; \
+ ext T0.16b, T0.16b, RZERO.16b, #8; \
+ eor r0.16b, r0.16b, T1.16b; \
+ eor r1.16b, r1.16b, T0.16b;
+
+#define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1, \
+ r2, r3, m2, m3, T2, T3, \
+ r4, r5, m4, m5, T4, T5, \
+ r6, r7, m6, m7, T6, T7) \
+ ext T0.16b, m1.16b, m1.16b, #8; \
+ ext T2.16b, m3.16b, m3.16b, #8; \
+ ext T4.16b, m5.16b, m5.16b, #8; \
+ ext T6.16b, m7.16b, m7.16b, #8; \
+ pmull r0.1q, m0.1d, m1.1d; \
+ pmull r2.1q, m2.1d, m3.1d; \
+ pmull r4.1q, m4.1d, m5.1d; \
+ pmull r6.1q, m6.1d, m7.1d; \
+ pmull T1.1q, m0.1d, T0.1d; \
+ pmull T3.1q, m2.1d, T2.1d; \
+ pmull T5.1q, m4.1d, T4.1d; \
+ pmull T7.1q, m6.1d, T6.1d; \
+ pmull2 T0.1q, m0.2d, T0.2d; \
+ pmull2 T2.1q, m2.2d, T2.2d; \
+ pmull2 T4.1q, m4.2d, T4.2d; \
+ pmull2 T6.1q, m6.2d, T6.2d; \
+ pmull2 r1.1q, m0.2d, m1.2d; \
+ pmull2 r3.1q, m2.2d, m3.2d; \
+ pmull2 r5.1q, m4.2d, m5.2d; \
+ pmull2 r7.1q, m6.2d, m7.2d; \
+ eor T0.16b, T0.16b, T1.16b; \
+ eor T2.16b, T2.16b, T3.16b; \
+ eor T4.16b, T4.16b, T5.16b; \
+ eor T6.16b, T6.16b, T7.16b; \
+ ext T1.16b, RZERO.16b, T0.16b, #8; \
+ ext T3.16b, RZERO.16b, T2.16b, #8; \
+ ext T5.16b, RZERO.16b, T4.16b, #8; \
+ ext T7.16b, RZERO.16b, T6.16b, #8; \
+ ext T0.16b, T0.16b, RZERO.16b, #8; \
+ ext T2.16b, T2.16b, RZERO.16b, #8; \
+ ext T4.16b, T4.16b, RZERO.16b, #8; \
+ ext T6.16b, T6.16b, RZERO.16b, #8; \
+ eor r0.16b, r0.16b, T1.16b; \
+ eor r2.16b, r2.16b, T3.16b; \
+ eor r4.16b, r4.16b, T5.16b; \
+ eor r6.16b, r6.16b, T7.16b; \
+ eor r1.16b, r1.16b, T0.16b; \
+ eor r3.16b, r3.16b, T2.16b; \
+ eor r5.16b, r5.16b, T4.16b; \
+ eor r7.16b, r7.16b, T6.16b;
+
+/*
+ * input: r0:r1 (low 128-bits in r0, high in r1)
+ * output: a
+ */
+#define REDUCTION(a, r0, r1, rconst, T0, T1) \
+ pmull2 T0.1q, r1.2d, rconst.2d; \
+ ext T1.16b, T0.16b, RZERO.16b, #8; \
+ ext T0.16b, RZERO.16b, T0.16b, #8; \
+ eor r1.16b, r1.16b, T1.16b; \
+ eor r0.16b, r0.16b, T0.16b; \
+ pmull T0.1q, r1.1d, rconst.1d; \
+ eor a.16b, r0.16b, T0.16b;
+
+#define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1) \
+ rev32 b0.16b, b0.16b; \
+ ext T0.16b, m1.16b, m1.16b, #8; \
+ sm4e b0.4s, v24.4s; \
+ pmull r0.1q, m0.1d, m1.1d; \
+ sm4e b0.4s, v25.4s; \
+ pmull T1.1q, m0.1d, T0.1d; \
+ sm4e b0.4s, v26.4s; \
+ pmull2 T0.1q, m0.2d, T0.2d; \
+ sm4e b0.4s, v27.4s; \
+ pmull2 r1.1q, m0.2d, m1.2d; \
+ sm4e b0.4s, v28.4s; \
+ eor T0.16b, T0.16b, T1.16b; \
+ sm4e b0.4s, v29.4s; \
+ ext T1.16b, RZERO.16b, T0.16b, #8; \
+ sm4e b0.4s, v30.4s; \
+ ext T0.16b, T0.16b, RZERO.16b, #8; \
+ sm4e b0.4s, v31.4s; \
+ eor r0.16b, r0.16b, T1.16b; \
+ rev64 b0.4s, b0.4s; \
+ eor r1.16b, r1.16b, T0.16b; \
+ ext b0.16b, b0.16b, b0.16b, #8; \
+ rev32 b0.16b, b0.16b;
+
+#define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2, \
+ r0, r1, m0, m1, T0, T1, \
+ r2, r3, m2, m3, T2, T3, \
+ r4, r5, m4, m5, T4, T5) \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ ext T0.16b, m1.16b, m1.16b, #8; \
+ ext T2.16b, m3.16b, m3.16b, #8; \
+ ext T4.16b, m5.16b, m5.16b, #8; \
+ sm4e b0.4s, v24.4s; \
+ sm4e b1.4s, v24.4s; \
+ sm4e b2.4s, v24.4s; \
+ pmull r0.1q, m0.1d, m1.1d; \
+ pmull r2.1q, m2.1d, m3.1d; \
+ pmull r4.1q, m4.1d, m5.1d; \
+ sm4e b0.4s, v25.4s; \
+ sm4e b1.4s, v25.4s; \
+ sm4e b2.4s, v25.4s; \
+ pmull T1.1q, m0.1d, T0.1d; \
+ pmull T3.1q, m2.1d, T2.1d; \
+ pmull T5.1q, m4.1d, T4.1d; \
+ sm4e b0.4s, v26.4s; \
+ sm4e b1.4s, v26.4s; \
+ sm4e b2.4s, v26.4s; \
+ pmull2 T0.1q, m0.2d, T0.2d; \
+ pmull2 T2.1q, m2.2d, T2.2d; \
+ pmull2 T4.1q, m4.2d, T4.2d; \
+ sm4e b0.4s, v27.4s; \
+ sm4e b1.4s, v27.4s; \
+ sm4e b2.4s, v27.4s; \
+ pmull2 r1.1q, m0.2d, m1.2d; \
+ pmull2 r3.1q, m2.2d, m3.2d; \
+ pmull2 r5.1q, m4.2d, m5.2d; \
+ sm4e b0.4s, v28.4s; \
+ sm4e b1.4s, v28.4s; \
+ sm4e b2.4s, v28.4s; \
+ eor T0.16b, T0.16b, T1.16b; \
+ eor T2.16b, T2.16b, T3.16b; \
+ eor T4.16b, T4.16b, T5.16b; \
+ sm4e b0.4s, v29.4s; \
+ sm4e b1.4s, v29.4s; \
+ sm4e b2.4s, v29.4s; \
+ ext T1.16b, RZERO.16b, T0.16b, #8; \
+ ext T3.16b, RZERO.16b, T2.16b, #8; \
+ ext T5.16b, RZERO.16b, T4.16b, #8; \
+ sm4e b0.4s, v30.4s; \
+ sm4e b1.4s, v30.4s; \
+ sm4e b2.4s, v30.4s; \
+ ext T0.16b, T0.16b, RZERO.16b, #8; \
+ ext T2.16b, T2.16b, RZERO.16b, #8; \
+ ext T4.16b, T4.16b, RZERO.16b, #8; \
+ sm4e b0.4s, v31.4s; \
+ sm4e b1.4s, v31.4s; \
+ sm4e b2.4s, v31.4s; \
+ eor r0.16b, r0.16b, T1.16b; \
+ eor r2.16b, r2.16b, T3.16b; \
+ eor r4.16b, r4.16b, T5.16b; \
+ rev64 b0.4s, b0.4s; \
+ rev64 b1.4s, b1.4s; \
+ rev64 b2.4s, b2.4s; \
+ eor r1.16b, r1.16b, T0.16b; \
+ eor r3.16b, r3.16b, T2.16b; \
+ eor r5.16b, r5.16b, T4.16b; \
+ ext b0.16b, b0.16b, b0.16b, #8; \
+ ext b1.16b, b1.16b, b1.16b, #8; \
+ ext b2.16b, b2.16b, b2.16b, #8; \
+ eor r0.16b, r0.16b, r2.16b; \
+ eor r1.16b, r1.16b, r3.16b; \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ eor r0.16b, r0.16b, r4.16b; \
+ eor r1.16b, r1.16b, r5.16b;
+
+#define inc32_le128(vctr) \
+ mov vctr.d[1], x9; \
+ add w6, w9, #1; \
+ mov vctr.d[0], x8; \
+ bfi x9, x6, #0, #32; \
+ rev64 vctr.16b, vctr.16b;
+
+#define GTAG_HASH_LENGTHS(vctr0, vlen) \
+ ld1 {vlen.16b}, [x7]; \
+ /* construct CTR0 */ \
+ /* the lower 32-bits of initial IV is always be32(1) */ \
+ mov x6, #0x1; \
+ bfi x9, x6, #0, #32; \
+ mov vctr0.d[0], x8; \
+ mov vctr0.d[1], x9; \
+ rbit vlen.16b, vlen.16b; \
+ rev64 vctr0.16b, vctr0.16b; \
+ /* authtag = GCTR(CTR0, GHASH) */ \
+ eor RHASH.16b, RHASH.16b, vlen.16b; \
+ SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1, \
+ RTMP0, RTMP1); \
+ REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3); \
+ rbit RHASH.16b, RHASH.16b; \
+ eor RHASH.16b, RHASH.16b, vctr0.16b;
+
+
+/* Register macros for encrypt and ghash */
+
+/* can be the same as input v0-v3 */
+#define RR1 v0
+#define RR3 v1
+#define RR5 v2
+#define RR7 v3
+
+#define RR0 v4
+#define RR2 v5
+#define RR4 v6
+#define RR6 v7
+
+#define RTMP0 v8
+#define RTMP1 v9
+#define RTMP2 v10
+#define RTMP3 v11
+#define RTMP4 v12
+#define RTMP5 v13
+#define RTMP6 v14
+#define RTMP7 v15
+
+#define RH1 v16
+#define RH2 v17
+#define RH3 v18
+#define RH4 v19
+
+.align 3
+SYM_FUNC_START(sm4_ce_pmull_ghash_setup)
+ /* input:
+ * x0: round key array, CTX
+ * x1: ghash table
+ */
+ SM4_PREPARE(x0)
+
+ adr_l x2, .Lghash_rconst
+ ld1r {RRCONST.2d}, [x2]
+
+ eor RZERO.16b, RZERO.16b, RZERO.16b
+
+ /* H = E(K, 0^128) */
+ rev32 v0.16b, RZERO.16b
+ SM4_CRYPT_BLK_BE(v0)
+
+ /* H ^ 1 */
+ rbit RH1.16b, v0.16b
+
+ /* H ^ 2 */
+ PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1)
+ REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3)
+
+ /* H ^ 3 */
+ PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1)
+ REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3)
+
+ /* H ^ 4 */
+ PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1)
+ REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3)
+
+ st1 {RH1.16b-RH4.16b}, [x1]
+
+ ret
+SYM_FUNC_END(sm4_ce_pmull_ghash_setup)
+
+.align 3
+SYM_FUNC_START(pmull_ghash_update)
+ /* input:
+ * x0: ghash table
+ * x1: ghash result
+ * x2: src
+ * w3: nblocks
+ */
+ ld1 {RH1.16b-RH4.16b}, [x0]
+
+ ld1 {RHASH.16b}, [x1]
+ rbit RHASH.16b, RHASH.16b
+
+ adr_l x4, .Lghash_rconst
+ ld1r {RRCONST.2d}, [x4]
+
+ eor RZERO.16b, RZERO.16b, RZERO.16b
+
+.Lghash_loop_4x:
+ cmp w3, #4
+ blt .Lghash_loop_1x
+
+ sub w3, w3, #4
+
+ ld1 {v0.16b-v3.16b}, [x2], #64
+
+ rbit v0.16b, v0.16b
+ rbit v1.16b, v1.16b
+ rbit v2.16b, v2.16b
+ rbit v3.16b, v3.16b
+
+ /*
+ * (in0 ^ HASH) * H^4 => rr0:rr1
+ * (in1) * H^3 => rr2:rr3
+ * (in2) * H^2 => rr4:rr5
+ * (in3) * H^1 => rr6:rr7
+ */
+ eor RHASH.16b, RHASH.16b, v0.16b
+
+ PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
+ RR2, RR3, v1, RH3, RTMP2, RTMP3,
+ RR4, RR5, v2, RH2, RTMP4, RTMP5,
+ RR6, RR7, v3, RH1, RTMP6, RTMP7)
+
+ eor RR0.16b, RR0.16b, RR2.16b
+ eor RR1.16b, RR1.16b, RR3.16b
+ eor RR0.16b, RR0.16b, RR4.16b
+ eor RR1.16b, RR1.16b, RR5.16b
+ eor RR0.16b, RR0.16b, RR6.16b
+ eor RR1.16b, RR1.16b, RR7.16b
+
+ REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)
+
+ cbz w3, .Lghash_end
+ b .Lghash_loop_4x
+
+.Lghash_loop_1x:
+ sub w3, w3, #1
+
+ ld1 {v0.16b}, [x2], #16
+ rbit v0.16b, v0.16b
+ eor RHASH.16b, RHASH.16b, v0.16b
+
+ PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
+ REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)
+
+ cbnz w3, .Lghash_loop_1x
+
+.Lghash_end:
+ rbit RHASH.16b, RHASH.16b
+ st1 {RHASH.2d}, [x1]
+
+ ret
+SYM_FUNC_END(pmull_ghash_update)
+
+.align 3
+SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_enc)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: ctr (big endian, 128 bit)
+ * w4: nbytes
+ * x5: ghash result
+ * x6: ghash table
+ * x7: lengths (only for last block)
+ */
+ SM4_PREPARE(x0)
+
+ ldp x8, x9, [x3]
+ rev x8, x8
+ rev x9, x9
+
+ ld1 {RH1.16b-RH4.16b}, [x6]
+
+ ld1 {RHASH.16b}, [x5]
+ rbit RHASH.16b, RHASH.16b
+
+ adr_l x6, .Lghash_rconst
+ ld1r {RRCONST.2d}, [x6]
+
+ eor RZERO.16b, RZERO.16b, RZERO.16b
+
+ cbz w4, .Lgcm_enc_hash_len
+
+.Lgcm_enc_loop_4x:
+ cmp w4, #(4 * 16)
+ blt .Lgcm_enc_loop_1x
+
+ sub w4, w4, #(4 * 16)
+
+ /* construct CTRs */
+ inc32_le128(v0) /* +0 */
+ inc32_le128(v1) /* +1 */
+ inc32_le128(v2) /* +2 */
+ inc32_le128(v3) /* +3 */
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64
+
+ SM4_CRYPT_BLK4(v0, v1, v2, v3)
+
+ eor v0.16b, v0.16b, RTMP0.16b
+ eor v1.16b, v1.16b, RTMP1.16b
+ eor v2.16b, v2.16b, RTMP2.16b
+ eor v3.16b, v3.16b, RTMP3.16b
+ st1 {v0.16b-v3.16b}, [x1], #64
+
+ /* ghash update */
+
+ rbit v0.16b, v0.16b
+ rbit v1.16b, v1.16b
+ rbit v2.16b, v2.16b
+ rbit v3.16b, v3.16b
+
+ /*
+ * (in0 ^ HASH) * H^4 => rr0:rr1
+ * (in1) * H^3 => rr2:rr3
+ * (in2) * H^2 => rr4:rr5
+ * (in3) * H^1 => rr6:rr7
+ */
+ eor RHASH.16b, RHASH.16b, v0.16b
+
+ PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
+ RR2, RR3, v1, RH3, RTMP2, RTMP3,
+ RR4, RR5, v2, RH2, RTMP4, RTMP5,
+ RR6, RR7, v3, RH1, RTMP6, RTMP7)
+
+ eor RR0.16b, RR0.16b, RR2.16b
+ eor RR1.16b, RR1.16b, RR3.16b
+ eor RR0.16b, RR0.16b, RR4.16b
+ eor RR1.16b, RR1.16b, RR5.16b
+ eor RR0.16b, RR0.16b, RR6.16b
+ eor RR1.16b, RR1.16b, RR7.16b
+
+ REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)
+
+ cbz w4, .Lgcm_enc_hash_len
+ b .Lgcm_enc_loop_4x
+
+.Lgcm_enc_loop_1x:
+ cmp w4, #16
+ blt .Lgcm_enc_tail
+
+ sub w4, w4, #16
+
+ /* construct CTRs */
+ inc32_le128(v0)
+
+ ld1 {RTMP0.16b}, [x2], #16
+
+ SM4_CRYPT_BLK(v0)
+
+ eor v0.16b, v0.16b, RTMP0.16b
+ st1 {v0.16b}, [x1], #16
+
+ /* ghash update */
+ rbit v0.16b, v0.16b
+ eor RHASH.16b, RHASH.16b, v0.16b
+ PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
+ REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)
+
+ cbz w4, .Lgcm_enc_hash_len
+ b .Lgcm_enc_loop_1x
+
+.Lgcm_enc_tail:
+ /* construct CTRs */
+ inc32_le128(v0)
+ SM4_CRYPT_BLK(v0)
+
+ /* load permute table */
+ adr_l x0, .Lcts_permute_table
+ add x0, x0, #32
+ sub x0, x0, w4, uxtw
+ ld1 {v3.16b}, [x0]
+
+.Lgcm_enc_tail_loop:
+ /* do encrypt */
+ ldrb w0, [x2], #1 /* get 1 byte from input */
+ umov w6, v0.b[0] /* get top crypted byte */
+ eor w6, w6, w0 /* w6 = CTR ^ input */
+ strb w6, [x1], #1 /* store out byte */
+
+ /* shift right out one byte */
+ ext v0.16b, v0.16b, v0.16b, #1
+ /* the last ciphertext is placed in high bytes */
+ ins v0.b[15], w6
+
+ subs w4, w4, #1
+ bne .Lgcm_enc_tail_loop
+
+ /* padding last block with zeros */
+ tbl v0.16b, {v0.16b}, v3.16b
+
+ /* ghash update */
+ rbit v0.16b, v0.16b
+ eor RHASH.16b, RHASH.16b, v0.16b
+ PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
+ REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)
+
+.Lgcm_enc_hash_len:
+ cbz x7, .Lgcm_enc_end
+
+ GTAG_HASH_LENGTHS(v1, v3)
+
+ b .Lgcm_enc_ret
+
+.Lgcm_enc_end:
+ /* store new CTR */
+ rev x8, x8
+ rev x9, x9
+ stp x8, x9, [x3]
+
+ rbit RHASH.16b, RHASH.16b
+
+.Lgcm_enc_ret:
+ /* store new MAC */
+ st1 {RHASH.2d}, [x5]
+
+ ret
+SYM_FUNC_END(sm4_ce_pmull_gcm_enc)
+
+#undef RR1
+#undef RR3
+#undef RR5
+#undef RR7
+#undef RR0
+#undef RR2
+#undef RR4
+#undef RR6
+#undef RTMP0
+#undef RTMP1
+#undef RTMP2
+#undef RTMP3
+#undef RTMP4
+#undef RTMP5
+#undef RTMP6
+#undef RTMP7
+#undef RH1
+#undef RH2
+#undef RH3
+#undef RH4
+
+
+/* Register macros for decrypt */
+
+/* v0-v2 for building CTRs, v3-v5 for saving inputs */
+
+#define RR1 v6
+#define RR3 v7
+#define RR5 v8
+
+#define RR0 v9
+#define RR2 v10
+#define RR4 v11
+
+#define RTMP0 v12
+#define RTMP1 v13
+#define RTMP2 v14
+#define RTMP3 v15
+#define RTMP4 v16
+#define RTMP5 v17
+
+#define RH1 v18
+#define RH2 v19
+#define RH3 v20
+
+.align 3
+SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_dec)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: ctr (big endian, 128 bit)
+ * w4: nbytes
+ * x5: ghash result
+ * x6: ghash table
+ * x7: lengths (only for last block)
+ */
+ SM4_PREPARE(x0)
+
+ ldp x8, x9, [x3]
+ rev x8, x8
+ rev x9, x9
+
+ ld1 {RH1.16b-RH3.16b}, [x6]
+
+ ld1 {RHASH.16b}, [x5]
+ rbit RHASH.16b, RHASH.16b
+
+ adr_l x6, .Lghash_rconst
+ ld1r {RRCONST.2d}, [x6]
+
+ eor RZERO.16b, RZERO.16b, RZERO.16b
+
+ cbz w4, .Lgcm_dec_hash_len
+
+.Lgcm_dec_loop_3x:
+ cmp w4, #(3 * 16)
+ blt .Lgcm_dec_loop_1x
+
+ sub w4, w4, #(3 * 16)
+
+ ld1 {v3.16b-v5.16b}, [x2], #(3 * 16)
+
+ /* construct CTRs */
+ inc32_le128(v0) /* +0 */
+ rbit v6.16b, v3.16b
+ inc32_le128(v1) /* +1 */
+ rbit v7.16b, v4.16b
+ inc32_le128(v2) /* +2 */
+ rbit v8.16b, v5.16b
+
+ eor RHASH.16b, RHASH.16b, v6.16b
+
+ /* decrypt & ghash update */
+ SM4_CRYPT_PMUL_128x128_BLK3(v0, v1, v2,
+ RR0, RR1, RHASH, RH3, RTMP0, RTMP1,
+ RR2, RR3, v7, RH2, RTMP2, RTMP3,
+ RR4, RR5, v8, RH1, RTMP4, RTMP5)
+
+ eor v0.16b, v0.16b, v3.16b
+ eor v1.16b, v1.16b, v4.16b
+ eor v2.16b, v2.16b, v5.16b
+
+ REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)
+
+ st1 {v0.16b-v2.16b}, [x1], #(3 * 16)
+
+ cbz w4, .Lgcm_dec_hash_len
+ b .Lgcm_dec_loop_3x
+
+.Lgcm_dec_loop_1x:
+ cmp w4, #16
+ blt .Lgcm_dec_tail
+
+ sub w4, w4, #16
+
+ ld1 {v3.16b}, [x2], #16
+
+ /* construct CTRs */
+ inc32_le128(v0)
+ rbit v6.16b, v3.16b
+
+ eor RHASH.16b, RHASH.16b, v6.16b
+
+ SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
+
+ eor v0.16b, v0.16b, v3.16b
+
+ REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)
+
+ st1 {v0.16b}, [x1], #16
+
+ cbz w4, .Lgcm_dec_hash_len
+ b .Lgcm_dec_loop_1x
+
+.Lgcm_dec_tail:
+ /* construct CTRs */
+ inc32_le128(v0)
+ SM4_CRYPT_BLK(v0)
+
+ /* load permute table */
+ adr_l x0, .Lcts_permute_table
+ add x0, x0, #32
+ sub x0, x0, w4, uxtw
+ ld1 {v3.16b}, [x0]
+
+.Lgcm_dec_tail_loop:
+ /* do decrypt */
+ ldrb w0, [x2], #1 /* get 1 byte from input */
+ umov w6, v0.b[0] /* get top crypted byte */
+ eor w6, w6, w0 /* w6 = CTR ^ input */
+ strb w6, [x1], #1 /* store out byte */
+
+ /* shift right out one byte */
+ ext v0.16b, v0.16b, v0.16b, #1
+ /* the last ciphertext is placed in high bytes */
+ ins v0.b[15], w0
+
+ subs w4, w4, #1
+ bne .Lgcm_dec_tail_loop
+
+ /* padding last block with zeros */
+ tbl v0.16b, {v0.16b}, v3.16b
+
+ /* ghash update */
+ rbit v0.16b, v0.16b
+ eor RHASH.16b, RHASH.16b, v0.16b
+ PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
+ REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)
+
+.Lgcm_dec_hash_len:
+ cbz x7, .Lgcm_dec_end
+
+ GTAG_HASH_LENGTHS(v1, v3)
+
+ b .Lgcm_dec_ret
+
+.Lgcm_dec_end:
+ /* store new CTR */
+ rev x8, x8
+ rev x9, x9
+ stp x8, x9, [x3]
+
+ rbit RHASH.16b, RHASH.16b
+
+.Lgcm_dec_ret:
+ /* store new MAC */
+ st1 {RHASH.2d}, [x5]
+
+ ret
+SYM_FUNC_END(sm4_ce_pmull_gcm_dec)
+
+ .section ".rodata", "a"
+ .align 4
+.Lcts_permute_table:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
+.Lghash_rconst:
+ .quad 0x87
diff --git a/arch/arm64/crypto/sm4-ce-gcm-glue.c b/arch/arm64/crypto/sm4-ce-gcm-glue.c
new file mode 100644
index 000000000000..73bfb6972d3a
--- /dev/null
+++ b/arch/arm64/crypto/sm4-ce-gcm-glue.c
@@ -0,0 +1,285 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
+ * as specified in rfc8998
+ * https://datatracker.ietf.org/doc/html/rfc8998
+ *
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
+#include <asm/neon.h>
+#include <crypto/b128ops.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/sm4.h>
+#include "sm4-ce.h"
+
+asmlinkage void sm4_ce_pmull_ghash_setup(const u32 *rkey_enc, u8 *ghash_table);
+asmlinkage void pmull_ghash_update(const u8 *ghash_table, u8 *ghash,
+ const u8 *src, unsigned int nblocks);
+asmlinkage void sm4_ce_pmull_gcm_enc(const u32 *rkey_enc, u8 *dst,
+ const u8 *src, u8 *iv,
+ unsigned int nbytes, u8 *ghash,
+ const u8 *ghash_table, const u8 *lengths);
+asmlinkage void sm4_ce_pmull_gcm_dec(const u32 *rkey_enc, u8 *dst,
+ const u8 *src, u8 *iv,
+ unsigned int nbytes, u8 *ghash,
+ const u8 *ghash_table, const u8 *lengths);
+
+#define GHASH_BLOCK_SIZE 16
+#define GCM_IV_SIZE 12
+
+struct sm4_gcm_ctx {
+ struct sm4_ctx key;
+ u8 ghash_table[16 * 4];
+};
+
+
+static int gcm_setkey(struct crypto_aead *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_gcm_ctx *ctx = crypto_aead_ctx(tfm);
+
+ if (key_len != SM4_KEY_SIZE)
+ return -EINVAL;
+
+ kernel_neon_begin();
+
+ sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec,
+ crypto_sm4_fk, crypto_sm4_ck);
+ sm4_ce_pmull_ghash_setup(ctx->key.rkey_enc, ctx->ghash_table);
+
+ kernel_neon_end();
+ return 0;
+}
+
+static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
+{
+ switch (authsize) {
+ case 4:
+ case 8:
+ case 12 ... 16:
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+static void gcm_calculate_auth_mac(struct aead_request *req, u8 ghash[])
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct sm4_gcm_ctx *ctx = crypto_aead_ctx(aead);
+ u8 __aligned(8) buffer[GHASH_BLOCK_SIZE];
+ u32 assoclen = req->assoclen;
+ struct scatter_walk walk;
+ unsigned int buflen = 0;
+
+ scatterwalk_start(&walk, req->src);
+
+ do {
+ u32 n = scatterwalk_clamp(&walk, assoclen);
+ u8 *p, *ptr;
+
+ if (!n) {
+ scatterwalk_start(&walk, sg_next(walk.sg));
+ n = scatterwalk_clamp(&walk, assoclen);
+ }
+
+ p = ptr = scatterwalk_map(&walk);
+ assoclen -= n;
+ scatterwalk_advance(&walk, n);
+
+ if (n + buflen < GHASH_BLOCK_SIZE) {
+ memcpy(&buffer[buflen], ptr, n);
+ buflen += n;
+ } else {
+ unsigned int nblocks;
+
+ if (buflen) {
+ unsigned int l = GHASH_BLOCK_SIZE - buflen;
+
+ memcpy(&buffer[buflen], ptr, l);
+ ptr += l;
+ n -= l;
+
+ pmull_ghash_update(ctx->ghash_table, ghash,
+ buffer, 1);
+ }
+
+ nblocks = n / GHASH_BLOCK_SIZE;
+ if (nblocks) {
+ pmull_ghash_update(ctx->ghash_table, ghash,
+ ptr, nblocks);
+ ptr += nblocks * GHASH_BLOCK_SIZE;
+ }
+
+ buflen = n % GHASH_BLOCK_SIZE;
+ if (buflen)
+ memcpy(&buffer[0], ptr, buflen);
+ }
+
+ scatterwalk_unmap(p);
+ scatterwalk_done(&walk, 0, assoclen);
+ } while (assoclen);
+
+ /* padding with '0' */
+ if (buflen) {
+ memset(&buffer[buflen], 0, GHASH_BLOCK_SIZE - buflen);
+ pmull_ghash_update(ctx->ghash_table, ghash, buffer, 1);
+ }
+}
+
+static int gcm_crypt(struct aead_request *req, struct skcipher_walk *walk,
+ u8 ghash[], int err,
+ void (*sm4_ce_pmull_gcm_crypt)(const u32 *rkey_enc,
+ u8 *dst, const u8 *src, u8 *iv,
+ unsigned int nbytes, u8 *ghash,
+ const u8 *ghash_table, const u8 *lengths))
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct sm4_gcm_ctx *ctx = crypto_aead_ctx(aead);
+ u8 __aligned(8) iv[SM4_BLOCK_SIZE];
+ be128 __aligned(8) lengths;
+
+ memset(ghash, 0, SM4_BLOCK_SIZE);
+
+ lengths.a = cpu_to_be64(req->assoclen * 8);
+ lengths.b = cpu_to_be64(walk->total * 8);
+
+ memcpy(iv, req->iv, GCM_IV_SIZE);
+ put_unaligned_be32(2, iv + GCM_IV_SIZE);
+
+ kernel_neon_begin();
+
+ if (req->assoclen)
+ gcm_calculate_auth_mac(req, ghash);
+
+ while (walk->nbytes) {
+ unsigned int tail = walk->nbytes % SM4_BLOCK_SIZE;
+ const u8 *src = walk->src.virt.addr;
+ u8 *dst = walk->dst.virt.addr;
+
+ if (walk->nbytes == walk->total) {
+ sm4_ce_pmull_gcm_crypt(ctx->key.rkey_enc, dst, src, iv,
+ walk->nbytes, ghash,
+ ctx->ghash_table,
+ (const u8 *)&lengths);
+
+ kernel_neon_end();
+
+ return skcipher_walk_done(walk, 0);
+ }
+
+ sm4_ce_pmull_gcm_crypt(ctx->key.rkey_enc, dst, src, iv,
+ walk->nbytes - tail, ghash,
+ ctx->ghash_table, NULL);
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(walk, tail);
+
+ kernel_neon_begin();
+ }
+
+ sm4_ce_pmull_gcm_crypt(ctx->key.rkey_enc, NULL, NULL, iv,
+ walk->nbytes, ghash, ctx->ghash_table,
+ (const u8 *)&lengths);
+
+ kernel_neon_end();
+
+ return err;
+}
+
+static int gcm_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ u8 __aligned(8) ghash[SM4_BLOCK_SIZE];
+ struct skcipher_walk walk;
+ int err;
+
+ err = skcipher_walk_aead_encrypt(&walk, req, false);
+ err = gcm_crypt(req, &walk, ghash, err, sm4_ce_pmull_gcm_enc);
+ if (err)
+ return err;
+
+ /* copy authtag to end of dst */
+ scatterwalk_map_and_copy(ghash, req->dst, req->assoclen + req->cryptlen,
+ crypto_aead_authsize(aead), 1);
+
+ return 0;
+}
+
+static int gcm_decrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ unsigned int authsize = crypto_aead_authsize(aead);
+ u8 __aligned(8) ghash[SM4_BLOCK_SIZE];
+ u8 authtag[SM4_BLOCK_SIZE];
+ struct skcipher_walk walk;
+ int err;
+
+ err = skcipher_walk_aead_decrypt(&walk, req, false);
+ err = gcm_crypt(req, &walk, ghash, err, sm4_ce_pmull_gcm_dec);
+ if (err)
+ return err;
+
+ /* compare calculated auth tag with the stored one */
+ scatterwalk_map_and_copy(authtag, req->src,
+ req->assoclen + req->cryptlen - authsize,
+ authsize, 0);
+
+ if (crypto_memneq(authtag, ghash, authsize))
+ return -EBADMSG;
+
+ return 0;
+}
+
+static struct aead_alg sm4_gcm_alg = {
+ .base = {
+ .cra_name = "gcm(sm4)",
+ .cra_driver_name = "gcm-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_gcm_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .ivsize = GCM_IV_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .maxauthsize = SM4_BLOCK_SIZE,
+ .setkey = gcm_setkey,
+ .setauthsize = gcm_setauthsize,
+ .encrypt = gcm_encrypt,
+ .decrypt = gcm_decrypt,
+};
+
+static int __init sm4_ce_gcm_init(void)
+{
+ if (!cpu_have_named_feature(PMULL))
+ return -ENODEV;
+
+ return crypto_register_aead(&sm4_gcm_alg);
+}
+
+static void __exit sm4_ce_gcm_exit(void)
+{
+ crypto_unregister_aead(&sm4_gcm_alg);
+}
+
+static const struct cpu_feature __maybe_unused sm4_ce_gcm_cpu_feature[] = {
+ { cpu_feature(PMULL) },
+ {}
+};
+MODULE_DEVICE_TABLE(cpu, sm4_ce_gcm_cpu_feature);
+
+module_cpu_feature_match(SM4, sm4_ce_gcm_init);
+module_exit(sm4_ce_gcm_exit);
+
+MODULE_DESCRIPTION("Synchronous SM4 in GCM mode using ARMv8 Crypto Extensions");
+MODULE_ALIAS_CRYPTO("gcm(sm4)");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c
index 2754c875d39c..43741bed874e 100644
--- a/arch/arm64/crypto/sm4-ce-glue.c
+++ b/arch/arm64/crypto/sm4-ce-glue.c
@@ -1,74 +1,779 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm, using ARMv8 Crypto Extensions
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2022, Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
#include <asm/neon.h>
#include <asm/simd.h>
-#include <crypto/sm4.h>
+#include <crypto/b128ops.h>
#include <crypto/internal/simd.h>
-#include <linux/module.h>
-#include <linux/cpufeature.h>
-#include <linux/crypto.h>
-#include <linux/types.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/internal/hash.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/xts.h>
+#include <crypto/sm4.h>
-MODULE_ALIAS_CRYPTO("sm4");
-MODULE_ALIAS_CRYPTO("sm4-ce");
-MODULE_DESCRIPTION("SM4 symmetric cipher using ARMv8 Crypto Extensions");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
+#define BYTES2BLKS(nbytes) ((nbytes) >> 4)
+
+asmlinkage void sm4_ce_expand_key(const u8 *key, u32 *rkey_enc, u32 *rkey_dec,
+ const u32 *fk, const u32 *ck);
+asmlinkage void sm4_ce_crypt_block(const u32 *rkey, u8 *dst, const u8 *src);
+asmlinkage void sm4_ce_crypt(const u32 *rkey, u8 *dst, const u8 *src,
+ unsigned int nblks);
+asmlinkage void sm4_ce_cbc_enc(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblocks);
+asmlinkage void sm4_ce_cbc_dec(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblocks);
+asmlinkage void sm4_ce_cbc_cts_enc(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nbytes);
+asmlinkage void sm4_ce_cbc_cts_dec(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nbytes);
+asmlinkage void sm4_ce_ctr_enc(const u32 *rkey, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblks);
+asmlinkage void sm4_ce_xts_enc(const u32 *rkey1, u8 *dst, const u8 *src,
+ u8 *tweak, unsigned int nbytes,
+ const u32 *rkey2_enc);
+asmlinkage void sm4_ce_xts_dec(const u32 *rkey1, u8 *dst, const u8 *src,
+ u8 *tweak, unsigned int nbytes,
+ const u32 *rkey2_enc);
+asmlinkage void sm4_ce_mac_update(const u32 *rkey_enc, u8 *digest,
+ const u8 *src, unsigned int nblocks,
+ bool enc_before, bool enc_after);
+
+EXPORT_SYMBOL(sm4_ce_expand_key);
+EXPORT_SYMBOL(sm4_ce_crypt_block);
+EXPORT_SYMBOL(sm4_ce_cbc_enc);
-asmlinkage void sm4_ce_do_crypt(const u32 *rk, void *out, const void *in);
+struct sm4_xts_ctx {
+ struct sm4_ctx key1;
+ struct sm4_ctx key2;
+};
+
+struct sm4_mac_tfm_ctx {
+ struct sm4_ctx key;
+ u8 __aligned(8) consts[];
+};
-static void sm4_ce_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+struct sm4_mac_desc_ctx {
+ unsigned int len;
+ u8 digest[SM4_BLOCK_SIZE];
+};
+
+static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int key_len)
{
- const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ if (key_len != SM4_KEY_SIZE)
+ return -EINVAL;
+
+ kernel_neon_begin();
+ sm4_ce_expand_key(key, ctx->rkey_enc, ctx->rkey_dec,
+ crypto_sm4_fk, crypto_sm4_ck);
+ kernel_neon_end();
+ return 0;
+}
+
+static int sm4_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int ret;
+
+ if (key_len != SM4_KEY_SIZE * 2)
+ return -EINVAL;
+
+ ret = xts_verify_key(tfm, key, key_len);
+ if (ret)
+ return ret;
+
+ kernel_neon_begin();
+ sm4_ce_expand_key(key, ctx->key1.rkey_enc,
+ ctx->key1.rkey_dec, crypto_sm4_fk, crypto_sm4_ck);
+ sm4_ce_expand_key(&key[SM4_KEY_SIZE], ctx->key2.rkey_enc,
+ ctx->key2.rkey_dec, crypto_sm4_fk, crypto_sm4_ck);
+ kernel_neon_end();
+
+ return 0;
+}
+
+static int sm4_ecb_do_crypt(struct skcipher_request *req, const u32 *rkey)
+{
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
+
+ kernel_neon_begin();
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_crypt(rkey, dst, src, nblks);
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_ecb_do_crypt(req, ctx->rkey_enc);
+}
+
+static int sm4_ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_ecb_do_crypt(req, ctx->rkey_dec);
+}
+
+static int sm4_cbc_crypt(struct skcipher_request *req,
+ struct sm4_ctx *ctx, bool encrypt)
+{
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+ if (err)
+ return err;
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblocks;
+
+ nblocks = nbytes / SM4_BLOCK_SIZE;
+ if (nblocks) {
+ kernel_neon_begin();
+
+ if (encrypt)
+ sm4_ce_cbc_enc(ctx->rkey_enc, dst, src,
+ walk.iv, nblocks);
+ else
+ sm4_ce_cbc_dec(ctx->rkey_dec, dst, src,
+ walk.iv, nblocks);
+
+ kernel_neon_end();
+ }
+
+ err = skcipher_walk_done(&walk, nbytes % SM4_BLOCK_SIZE);
+ }
+
+ return err;
+}
+
+static int sm4_cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_cbc_crypt(req, ctx, true);
+}
+
+static int sm4_cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_cbc_crypt(req, ctx, false);
+}
+
+static int sm4_cbc_cts_crypt(struct skcipher_request *req, bool encrypt)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct scatterlist *src = req->src;
+ struct scatterlist *dst = req->dst;
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
+ struct skcipher_walk walk;
+ int cbc_blocks;
+ int err;
+
+ if (req->cryptlen < SM4_BLOCK_SIZE)
+ return -EINVAL;
+
+ if (req->cryptlen == SM4_BLOCK_SIZE)
+ return sm4_cbc_crypt(req, ctx, encrypt);
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
+ NULL, NULL);
+
+ /* handle the CBC cryption part */
+ cbc_blocks = DIV_ROUND_UP(req->cryptlen, SM4_BLOCK_SIZE) - 2;
+ if (cbc_blocks) {
+ skcipher_request_set_crypt(&subreq, src, dst,
+ cbc_blocks * SM4_BLOCK_SIZE,
+ req->iv);
+
+ err = sm4_cbc_crypt(&subreq, ctx, encrypt);
+ if (err)
+ return err;
+
+ dst = src = scatterwalk_ffwd(sg_src, src, subreq.cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst,
+ subreq.cryptlen);
+ }
+
+ /* handle ciphertext stealing */
+ skcipher_request_set_crypt(&subreq, src, dst,
+ req->cryptlen - cbc_blocks * SM4_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false);
+ if (err)
+ return err;
+
+ kernel_neon_begin();
+
+ if (encrypt)
+ sm4_ce_cbc_cts_enc(ctx->rkey_enc, walk.dst.virt.addr,
+ walk.src.virt.addr, walk.iv, walk.nbytes);
+ else
+ sm4_ce_cbc_cts_dec(ctx->rkey_dec, walk.dst.virt.addr,
+ walk.src.virt.addr, walk.iv, walk.nbytes);
+
+ kernel_neon_end();
+
+ return skcipher_walk_done(&walk, 0);
+}
+
+static int sm4_cbc_cts_encrypt(struct skcipher_request *req)
+{
+ return sm4_cbc_cts_crypt(req, true);
+}
+
+static int sm4_cbc_cts_decrypt(struct skcipher_request *req)
+{
+ return sm4_cbc_cts_crypt(req, false);
+}
+
+static int sm4_ctr_crypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblks;
- if (!crypto_simd_usable()) {
- crypto_sm4_encrypt(tfm, out, in);
- } else {
kernel_neon_begin();
- sm4_ce_do_crypt(ctx->rkey_enc, out, in);
+
+ nblks = BYTES2BLKS(nbytes);
+ if (nblks) {
+ sm4_ce_ctr_enc(ctx->rkey_enc, dst, src, walk.iv, nblks);
+ dst += nblks * SM4_BLOCK_SIZE;
+ src += nblks * SM4_BLOCK_SIZE;
+ nbytes -= nblks * SM4_BLOCK_SIZE;
+ }
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ sm4_ce_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_inc(walk.iv, SM4_BLOCK_SIZE);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
kernel_neon_end();
+
+ err = skcipher_walk_done(&walk, nbytes);
}
+
+ return err;
}
-static void sm4_ce_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+static int sm4_xts_crypt(struct skcipher_request *req, bool encrypt)
{
- const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int tail = req->cryptlen % SM4_BLOCK_SIZE;
+ const u32 *rkey2_enc = ctx->key2.rkey_enc;
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
+ struct scatterlist *src, *dst;
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ if (req->cryptlen < SM4_BLOCK_SIZE)
+ return -EINVAL;
+
+ err = skcipher_walk_virt(&walk, req, false);
+ if (err)
+ return err;
+
+ if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
+ int nblocks = DIV_ROUND_UP(req->cryptlen, SM4_BLOCK_SIZE) - 2;
- if (!crypto_simd_usable()) {
- crypto_sm4_decrypt(tfm, out, in);
+ skcipher_walk_abort(&walk);
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq,
+ skcipher_request_flags(req),
+ NULL, NULL);
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ nblocks * SM4_BLOCK_SIZE, req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false);
+ if (err)
+ return err;
} else {
+ tail = 0;
+ }
+
+ while ((nbytes = walk.nbytes) >= SM4_BLOCK_SIZE) {
+ if (nbytes < walk.total)
+ nbytes &= ~(SM4_BLOCK_SIZE - 1);
+
kernel_neon_begin();
- sm4_ce_do_crypt(ctx->rkey_dec, out, in);
+
+ if (encrypt)
+ sm4_ce_xts_enc(ctx->key1.rkey_enc, walk.dst.virt.addr,
+ walk.src.virt.addr, walk.iv, nbytes,
+ rkey2_enc);
+ else
+ sm4_ce_xts_dec(ctx->key1.rkey_dec, walk.dst.virt.addr,
+ walk.src.virt.addr, walk.iv, nbytes,
+ rkey2_enc);
+
kernel_neon_end();
+
+ rkey2_enc = NULL;
+
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ if (err)
+ return err;
}
+
+ if (likely(tail == 0))
+ return 0;
+
+ /* handle ciphertext stealing */
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst, subreq.cryptlen);
+
+ skcipher_request_set_crypt(&subreq, src, dst, SM4_BLOCK_SIZE + tail,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false);
+ if (err)
+ return err;
+
+ kernel_neon_begin();
+
+ if (encrypt)
+ sm4_ce_xts_enc(ctx->key1.rkey_enc, walk.dst.virt.addr,
+ walk.src.virt.addr, walk.iv, walk.nbytes,
+ rkey2_enc);
+ else
+ sm4_ce_xts_dec(ctx->key1.rkey_dec, walk.dst.virt.addr,
+ walk.src.virt.addr, walk.iv, walk.nbytes,
+ rkey2_enc);
+
+ kernel_neon_end();
+
+ return skcipher_walk_done(&walk, 0);
}
-static struct crypto_alg sm4_ce_alg = {
- .cra_name = "sm4",
- .cra_driver_name = "sm4-ce",
- .cra_priority = 200,
- .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
- .cra_blocksize = SM4_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct crypto_sm4_ctx),
- .cra_module = THIS_MODULE,
- .cra_u.cipher = {
- .cia_min_keysize = SM4_KEY_SIZE,
- .cia_max_keysize = SM4_KEY_SIZE,
- .cia_setkey = crypto_sm4_set_key,
- .cia_encrypt = sm4_ce_encrypt,
- .cia_decrypt = sm4_ce_decrypt
+static int sm4_xts_encrypt(struct skcipher_request *req)
+{
+ return sm4_xts_crypt(req, true);
+}
+
+static int sm4_xts_decrypt(struct skcipher_request *req)
+{
+ return sm4_xts_crypt(req, false);
+}
+
+static struct skcipher_alg sm4_algs[] = {
+ {
+ .base = {
+ .cra_name = "ecb(sm4)",
+ .cra_driver_name = "ecb-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_ecb_encrypt,
+ .decrypt = sm4_ecb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "cbc(sm4)",
+ .cra_driver_name = "cbc-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_cbc_encrypt,
+ .decrypt = sm4_cbc_decrypt,
+ }, {
+ .base = {
+ .cra_name = "ctr(sm4)",
+ .cra_driver_name = "ctr-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_ctr_crypt,
+ .decrypt = sm4_ctr_crypt,
+ }, {
+ .base = {
+ .cra_name = "cts(cbc(sm4))",
+ .cra_driver_name = "cts-cbc-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .walksize = SM4_BLOCK_SIZE * 2,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_cbc_cts_encrypt,
+ .decrypt = sm4_cbc_cts_decrypt,
+ }, {
+ .base = {
+ .cra_name = "xts(sm4)",
+ .cra_driver_name = "xts-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_xts_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE * 2,
+ .max_keysize = SM4_KEY_SIZE * 2,
+ .ivsize = SM4_BLOCK_SIZE,
+ .walksize = SM4_BLOCK_SIZE * 2,
+ .setkey = sm4_xts_setkey,
+ .encrypt = sm4_xts_encrypt,
+ .decrypt = sm4_xts_decrypt,
}
};
-static int __init sm4_ce_mod_init(void)
+static int sm4_cbcmac_setkey(struct crypto_shash *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
+
+ if (key_len != SM4_KEY_SIZE)
+ return -EINVAL;
+
+ kernel_neon_begin();
+ sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec,
+ crypto_sm4_fk, crypto_sm4_ck);
+ kernel_neon_end();
+
+ return 0;
+}
+
+static int sm4_cmac_setkey(struct crypto_shash *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
+ be128 *consts = (be128 *)ctx->consts;
+ u64 a, b;
+
+ if (key_len != SM4_KEY_SIZE)
+ return -EINVAL;
+
+ memset(consts, 0, SM4_BLOCK_SIZE);
+
+ kernel_neon_begin();
+
+ sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec,
+ crypto_sm4_fk, crypto_sm4_ck);
+
+ /* encrypt the zero block */
+ sm4_ce_crypt_block(ctx->key.rkey_enc, (u8 *)consts, (const u8 *)consts);
+
+ kernel_neon_end();
+
+ /* gf(2^128) multiply zero-ciphertext with u and u^2 */
+ a = be64_to_cpu(consts[0].a);
+ b = be64_to_cpu(consts[0].b);
+ consts[0].a = cpu_to_be64((a << 1) | (b >> 63));
+ consts[0].b = cpu_to_be64((b << 1) ^ ((a >> 63) ? 0x87 : 0));
+
+ a = be64_to_cpu(consts[0].a);
+ b = be64_to_cpu(consts[0].b);
+ consts[1].a = cpu_to_be64((a << 1) | (b >> 63));
+ consts[1].b = cpu_to_be64((b << 1) ^ ((a >> 63) ? 0x87 : 0));
+
+ return 0;
+}
+
+static int sm4_xcbc_setkey(struct crypto_shash *tfm, const u8 *key,
+ unsigned int key_len)
{
- return crypto_register_alg(&sm4_ce_alg);
+ struct sm4_mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
+ u8 __aligned(8) key2[SM4_BLOCK_SIZE];
+ static u8 const ks[3][SM4_BLOCK_SIZE] = {
+ { [0 ... SM4_BLOCK_SIZE - 1] = 0x1},
+ { [0 ... SM4_BLOCK_SIZE - 1] = 0x2},
+ { [0 ... SM4_BLOCK_SIZE - 1] = 0x3},
+ };
+
+ if (key_len != SM4_KEY_SIZE)
+ return -EINVAL;
+
+ kernel_neon_begin();
+
+ sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec,
+ crypto_sm4_fk, crypto_sm4_ck);
+
+ sm4_ce_crypt_block(ctx->key.rkey_enc, key2, ks[0]);
+ sm4_ce_crypt(ctx->key.rkey_enc, ctx->consts, ks[1], 2);
+
+ sm4_ce_expand_key(key2, ctx->key.rkey_enc, ctx->key.rkey_dec,
+ crypto_sm4_fk, crypto_sm4_ck);
+
+ kernel_neon_end();
+
+ return 0;
}
-static void __exit sm4_ce_mod_fini(void)
+static int sm4_mac_init(struct shash_desc *desc)
{
- crypto_unregister_alg(&sm4_ce_alg);
+ struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ memset(ctx->digest, 0, SM4_BLOCK_SIZE);
+ ctx->len = 0;
+
+ return 0;
}
-module_cpu_feature_match(SM4, sm4_ce_mod_init);
-module_exit(sm4_ce_mod_fini);
+static int sm4_mac_update(struct shash_desc *desc, const u8 *p,
+ unsigned int len)
+{
+ struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+ struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc);
+ unsigned int l, nblocks;
+
+ if (len == 0)
+ return 0;
+
+ if (ctx->len || ctx->len + len < SM4_BLOCK_SIZE) {
+ l = min(len, SM4_BLOCK_SIZE - ctx->len);
+
+ crypto_xor(ctx->digest + ctx->len, p, l);
+ ctx->len += l;
+ len -= l;
+ p += l;
+ }
+
+ if (len && (ctx->len % SM4_BLOCK_SIZE) == 0) {
+ kernel_neon_begin();
+
+ if (len < SM4_BLOCK_SIZE && ctx->len == SM4_BLOCK_SIZE) {
+ sm4_ce_crypt_block(tctx->key.rkey_enc,
+ ctx->digest, ctx->digest);
+ ctx->len = 0;
+ } else {
+ nblocks = len / SM4_BLOCK_SIZE;
+ len %= SM4_BLOCK_SIZE;
+
+ sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, p,
+ nblocks, (ctx->len == SM4_BLOCK_SIZE),
+ (len != 0));
+
+ p += nblocks * SM4_BLOCK_SIZE;
+
+ if (len == 0)
+ ctx->len = SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_end();
+
+ if (len) {
+ crypto_xor(ctx->digest, p, len);
+ ctx->len = len;
+ }
+ }
+
+ return 0;
+}
+
+static int sm4_cmac_final(struct shash_desc *desc, u8 *out)
+{
+ struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+ struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc);
+ const u8 *consts = tctx->consts;
+
+ if (ctx->len != SM4_BLOCK_SIZE) {
+ ctx->digest[ctx->len] ^= 0x80;
+ consts += SM4_BLOCK_SIZE;
+ }
+
+ kernel_neon_begin();
+ sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, consts, 1,
+ false, true);
+ kernel_neon_end();
+
+ memcpy(out, ctx->digest, SM4_BLOCK_SIZE);
+
+ return 0;
+}
+
+static int sm4_cbcmac_final(struct shash_desc *desc, u8 *out)
+{
+ struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+ struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ if (ctx->len) {
+ kernel_neon_begin();
+ sm4_ce_crypt_block(tctx->key.rkey_enc, ctx->digest,
+ ctx->digest);
+ kernel_neon_end();
+ }
+
+ memcpy(out, ctx->digest, SM4_BLOCK_SIZE);
+
+ return 0;
+}
+
+static struct shash_alg sm4_mac_algs[] = {
+ {
+ .base = {
+ .cra_name = "cmac(sm4)",
+ .cra_driver_name = "cmac-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_mac_tfm_ctx)
+ + SM4_BLOCK_SIZE * 2,
+ .cra_module = THIS_MODULE,
+ },
+ .digestsize = SM4_BLOCK_SIZE,
+ .init = sm4_mac_init,
+ .update = sm4_mac_update,
+ .final = sm4_cmac_final,
+ .setkey = sm4_cmac_setkey,
+ .descsize = sizeof(struct sm4_mac_desc_ctx),
+ }, {
+ .base = {
+ .cra_name = "xcbc(sm4)",
+ .cra_driver_name = "xcbc-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_mac_tfm_ctx)
+ + SM4_BLOCK_SIZE * 2,
+ .cra_module = THIS_MODULE,
+ },
+ .digestsize = SM4_BLOCK_SIZE,
+ .init = sm4_mac_init,
+ .update = sm4_mac_update,
+ .final = sm4_cmac_final,
+ .setkey = sm4_xcbc_setkey,
+ .descsize = sizeof(struct sm4_mac_desc_ctx),
+ }, {
+ .base = {
+ .cra_name = "cbcmac(sm4)",
+ .cra_driver_name = "cbcmac-sm4-ce",
+ .cra_priority = 400,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_mac_tfm_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .digestsize = SM4_BLOCK_SIZE,
+ .init = sm4_mac_init,
+ .update = sm4_mac_update,
+ .final = sm4_cbcmac_final,
+ .setkey = sm4_cbcmac_setkey,
+ .descsize = sizeof(struct sm4_mac_desc_ctx),
+ }
+};
+
+static int __init sm4_init(void)
+{
+ int err;
+
+ err = crypto_register_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
+ if (err)
+ return err;
+
+ err = crypto_register_shashes(sm4_mac_algs, ARRAY_SIZE(sm4_mac_algs));
+ if (err)
+ goto out_err;
+
+ return 0;
+
+out_err:
+ crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
+ return err;
+}
+
+static void __exit sm4_exit(void)
+{
+ crypto_unregister_shashes(sm4_mac_algs, ARRAY_SIZE(sm4_mac_algs));
+ crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
+}
+
+module_cpu_feature_match(SM4, sm4_init);
+module_exit(sm4_exit);
+
+MODULE_DESCRIPTION("SM4 ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
+MODULE_ALIAS_CRYPTO("sm4-ce");
+MODULE_ALIAS_CRYPTO("sm4");
+MODULE_ALIAS_CRYPTO("ecb(sm4)");
+MODULE_ALIAS_CRYPTO("cbc(sm4)");
+MODULE_ALIAS_CRYPTO("ctr(sm4)");
+MODULE_ALIAS_CRYPTO("cts(cbc(sm4))");
+MODULE_ALIAS_CRYPTO("xts(sm4)");
+MODULE_ALIAS_CRYPTO("cmac(sm4)");
+MODULE_ALIAS_CRYPTO("xcbc(sm4)");
+MODULE_ALIAS_CRYPTO("cbcmac(sm4)");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/crypto/sm4-ce.h b/arch/arm64/crypto/sm4-ce.h
new file mode 100644
index 000000000000..1e235c4371eb
--- /dev/null
+++ b/arch/arm64/crypto/sm4-ce.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 common functions for Crypto Extensions
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+void sm4_ce_expand_key(const u8 *key, u32 *rkey_enc, u32 *rkey_dec,
+ const u32 *fk, const u32 *ck);
+
+void sm4_ce_crypt_block(const u32 *rkey, u8 *dst, const u8 *src);
+
+void sm4_ce_cbc_enc(const u32 *rkey_enc, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblocks);
diff --git a/arch/arm64/crypto/sm4-neon-core.S b/arch/arm64/crypto/sm4-neon-core.S
new file mode 100644
index 000000000000..734dc7193610
--- /dev/null
+++ b/arch/arm64/crypto/sm4-neon-core.S
@@ -0,0 +1,566 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm for ARMv8 NEON
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2022, Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/* Register macros */
+
+#define RTMP0 v8
+#define RTMP1 v9
+#define RTMP2 v10
+#define RTMP3 v11
+
+#define RTMP4 v12
+#define RTMP5 v13
+#define RTMP6 v14
+#define RTMP7 v15
+
+#define RX0 v12
+#define RX1 v13
+#define RKEY v14
+#define RIV v15
+
+/* Helper macros. */
+
+#define SM4_PREPARE() \
+ adr_l x5, crypto_sm4_sbox; \
+ ld1 {v16.16b-v19.16b}, [x5], #64; \
+ ld1 {v20.16b-v23.16b}, [x5], #64; \
+ ld1 {v24.16b-v27.16b}, [x5], #64; \
+ ld1 {v28.16b-v31.16b}, [x5];
+
+#define transpose_4x4(s0, s1, s2, s3) \
+ zip1 RTMP0.4s, s0.4s, s1.4s; \
+ zip1 RTMP1.4s, s2.4s, s3.4s; \
+ zip2 RTMP2.4s, s0.4s, s1.4s; \
+ zip2 RTMP3.4s, s2.4s, s3.4s; \
+ zip1 s0.2d, RTMP0.2d, RTMP1.2d; \
+ zip2 s1.2d, RTMP0.2d, RTMP1.2d; \
+ zip1 s2.2d, RTMP2.2d, RTMP3.2d; \
+ zip2 s3.2d, RTMP2.2d, RTMP3.2d;
+
+#define transpose_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \
+ zip1 RTMP0.4s, s0.4s, s1.4s; \
+ zip1 RTMP1.4s, s2.4s, s3.4s; \
+ zip2 RTMP2.4s, s0.4s, s1.4s; \
+ zip2 RTMP3.4s, s2.4s, s3.4s; \
+ zip1 RTMP4.4s, s4.4s, s5.4s; \
+ zip1 RTMP5.4s, s6.4s, s7.4s; \
+ zip2 RTMP6.4s, s4.4s, s5.4s; \
+ zip2 RTMP7.4s, s6.4s, s7.4s; \
+ zip1 s0.2d, RTMP0.2d, RTMP1.2d; \
+ zip2 s1.2d, RTMP0.2d, RTMP1.2d; \
+ zip1 s2.2d, RTMP2.2d, RTMP3.2d; \
+ zip2 s3.2d, RTMP2.2d, RTMP3.2d; \
+ zip1 s4.2d, RTMP4.2d, RTMP5.2d; \
+ zip2 s5.2d, RTMP4.2d, RTMP5.2d; \
+ zip1 s6.2d, RTMP6.2d, RTMP7.2d; \
+ zip2 s7.2d, RTMP6.2d, RTMP7.2d;
+
+#define rotate_clockwise_4x4(s0, s1, s2, s3) \
+ zip1 RTMP0.4s, s1.4s, s0.4s; \
+ zip2 RTMP1.4s, s1.4s, s0.4s; \
+ zip1 RTMP2.4s, s3.4s, s2.4s; \
+ zip2 RTMP3.4s, s3.4s, s2.4s; \
+ zip1 s0.2d, RTMP2.2d, RTMP0.2d; \
+ zip2 s1.2d, RTMP2.2d, RTMP0.2d; \
+ zip1 s2.2d, RTMP3.2d, RTMP1.2d; \
+ zip2 s3.2d, RTMP3.2d, RTMP1.2d;
+
+#define rotate_clockwise_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \
+ zip1 RTMP0.4s, s1.4s, s0.4s; \
+ zip1 RTMP2.4s, s3.4s, s2.4s; \
+ zip2 RTMP1.4s, s1.4s, s0.4s; \
+ zip2 RTMP3.4s, s3.4s, s2.4s; \
+ zip1 RTMP4.4s, s5.4s, s4.4s; \
+ zip1 RTMP6.4s, s7.4s, s6.4s; \
+ zip2 RTMP5.4s, s5.4s, s4.4s; \
+ zip2 RTMP7.4s, s7.4s, s6.4s; \
+ zip1 s0.2d, RTMP2.2d, RTMP0.2d; \
+ zip2 s1.2d, RTMP2.2d, RTMP0.2d; \
+ zip1 s2.2d, RTMP3.2d, RTMP1.2d; \
+ zip2 s3.2d, RTMP3.2d, RTMP1.2d; \
+ zip1 s4.2d, RTMP6.2d, RTMP4.2d; \
+ zip2 s5.2d, RTMP6.2d, RTMP4.2d; \
+ zip1 s6.2d, RTMP7.2d, RTMP5.2d; \
+ zip2 s7.2d, RTMP7.2d, RTMP5.2d;
+
+#define ROUND4(round, s0, s1, s2, s3) \
+ dup RX0.4s, RKEY.s[round]; \
+ /* rk ^ s1 ^ s2 ^ s3 */ \
+ eor RTMP1.16b, s2.16b, s3.16b; \
+ eor RX0.16b, RX0.16b, s1.16b; \
+ eor RX0.16b, RX0.16b, RTMP1.16b; \
+ \
+ /* sbox, non-linear part */ \
+ movi RTMP3.16b, #64; /* sizeof(sbox) / 4 */ \
+ tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \
+ sub RX0.16b, RX0.16b, RTMP3.16b; \
+ tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \
+ sub RX0.16b, RX0.16b, RTMP3.16b; \
+ tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \
+ sub RX0.16b, RX0.16b, RTMP3.16b; \
+ tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \
+ \
+ /* linear part */ \
+ shl RTMP1.4s, RTMP0.4s, #8; \
+ shl RTMP2.4s, RTMP0.4s, #16; \
+ shl RTMP3.4s, RTMP0.4s, #24; \
+ sri RTMP1.4s, RTMP0.4s, #(32-8); \
+ sri RTMP2.4s, RTMP0.4s, #(32-16); \
+ sri RTMP3.4s, RTMP0.4s, #(32-24); \
+ /* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */ \
+ eor RTMP1.16b, RTMP1.16b, RTMP0.16b; \
+ eor RTMP1.16b, RTMP1.16b, RTMP2.16b; \
+ /* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */ \
+ eor RTMP3.16b, RTMP3.16b, RTMP0.16b; \
+ shl RTMP2.4s, RTMP1.4s, 2; \
+ sri RTMP2.4s, RTMP1.4s, #(32-2); \
+ eor RTMP3.16b, RTMP3.16b, RTMP2.16b; \
+ /* s0 ^= RTMP3 */ \
+ eor s0.16b, s0.16b, RTMP3.16b;
+
+#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3) \
+ mov x6, 8; \
+4: \
+ ld1 {RKEY.4s}, [x0], #16; \
+ subs x6, x6, #1; \
+ \
+ ROUND4(0, b0, b1, b2, b3); \
+ ROUND4(1, b1, b2, b3, b0); \
+ ROUND4(2, b2, b3, b0, b1); \
+ ROUND4(3, b3, b0, b1, b2); \
+ \
+ bne 4b; \
+ \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ \
+ rotate_clockwise_4x4(b0, b1, b2, b3); \
+ \
+ /* repoint to rkey */ \
+ sub x0, x0, #128;
+
+#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);
+
+#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3) \
+ /* rk ^ s1 ^ s2 ^ s3 */ \
+ dup RX0.4s, RKEY.s[round]; \
+ eor RTMP0.16b, s2.16b, s3.16b; \
+ mov RX1.16b, RX0.16b; \
+ eor RTMP1.16b, t2.16b, t3.16b; \
+ eor RX0.16b, RX0.16b, s1.16b; \
+ eor RX1.16b, RX1.16b, t1.16b; \
+ eor RX0.16b, RX0.16b, RTMP0.16b; \
+ eor RX1.16b, RX1.16b, RTMP1.16b; \
+ \
+ /* sbox, non-linear part */ \
+ movi RTMP3.16b, #64; /* sizeof(sbox) / 4 */ \
+ tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b; \
+ tbl RTMP1.16b, {v16.16b-v19.16b}, RX1.16b; \
+ sub RX0.16b, RX0.16b, RTMP3.16b; \
+ sub RX1.16b, RX1.16b, RTMP3.16b; \
+ tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b; \
+ tbx RTMP1.16b, {v20.16b-v23.16b}, RX1.16b; \
+ sub RX0.16b, RX0.16b, RTMP3.16b; \
+ sub RX1.16b, RX1.16b, RTMP3.16b; \
+ tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b; \
+ tbx RTMP1.16b, {v24.16b-v27.16b}, RX1.16b; \
+ sub RX0.16b, RX0.16b, RTMP3.16b; \
+ sub RX1.16b, RX1.16b, RTMP3.16b; \
+ tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b; \
+ tbx RTMP1.16b, {v28.16b-v31.16b}, RX1.16b; \
+ \
+ /* linear part */ \
+ shl RX0.4s, RTMP0.4s, #8; \
+ shl RX1.4s, RTMP1.4s, #8; \
+ shl RTMP2.4s, RTMP0.4s, #16; \
+ shl RTMP3.4s, RTMP1.4s, #16; \
+ sri RX0.4s, RTMP0.4s, #(32 - 8); \
+ sri RX1.4s, RTMP1.4s, #(32 - 8); \
+ sri RTMP2.4s, RTMP0.4s, #(32 - 16); \
+ sri RTMP3.4s, RTMP1.4s, #(32 - 16); \
+ /* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */ \
+ eor RX0.16b, RX0.16b, RTMP0.16b; \
+ eor RX1.16b, RX1.16b, RTMP1.16b; \
+ eor RX0.16b, RX0.16b, RTMP2.16b; \
+ eor RX1.16b, RX1.16b, RTMP3.16b; \
+ /* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */ \
+ shl RTMP2.4s, RTMP0.4s, #24; \
+ shl RTMP3.4s, RTMP1.4s, #24; \
+ sri RTMP2.4s, RTMP0.4s, #(32 - 24); \
+ sri RTMP3.4s, RTMP1.4s, #(32 - 24); \
+ eor RTMP0.16b, RTMP0.16b, RTMP2.16b; \
+ eor RTMP1.16b, RTMP1.16b, RTMP3.16b; \
+ shl RTMP2.4s, RX0.4s, #2; \
+ shl RTMP3.4s, RX1.4s, #2; \
+ sri RTMP2.4s, RX0.4s, #(32 - 2); \
+ sri RTMP3.4s, RX1.4s, #(32 - 2); \
+ eor RTMP0.16b, RTMP0.16b, RTMP2.16b; \
+ eor RTMP1.16b, RTMP1.16b, RTMP3.16b; \
+ /* s0/t0 ^= RTMP0/1 */ \
+ eor s0.16b, s0.16b, RTMP0.16b; \
+ eor t0.16b, t0.16b, RTMP1.16b;
+
+#define SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7) \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ rev32 b4.16b, b4.16b; \
+ rev32 b5.16b, b5.16b; \
+ rev32 b6.16b, b6.16b; \
+ rev32 b7.16b, b7.16b; \
+ \
+ mov x6, 8; \
+8: \
+ ld1 {RKEY.4s}, [x0], #16; \
+ subs x6, x6, #1; \
+ \
+ ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7); \
+ ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4); \
+ ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5); \
+ ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6); \
+ \
+ bne 8b; \
+ \
+ rev32 b0.16b, b0.16b; \
+ rev32 b1.16b, b1.16b; \
+ rev32 b2.16b, b2.16b; \
+ rev32 b3.16b, b3.16b; \
+ rev32 b4.16b, b4.16b; \
+ rev32 b5.16b, b5.16b; \
+ rev32 b6.16b, b6.16b; \
+ rev32 b7.16b, b7.16b; \
+ \
+ /* repoint to rkey */ \
+ sub x0, x0, #128;
+
+#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
+ SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7); \
+ rotate_clockwise_4x4_2x(b0, b1, b2, b3, b4, b5, b6, b7); \
+
+
+.align 3
+SYM_FUNC_START(sm4_neon_crypt)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * w3: nblocks
+ */
+ SM4_PREPARE()
+
+.Lcrypt_loop_8x:
+ sub w3, w3, #8
+ tbnz w3, #31, .Lcrypt_4x
+
+ ld4 {v0.4s-v3.4s}, [x2], #64
+ ld4 {v4.4s-v7.4s}, [x2], #64
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)
+
+ st1 {v0.16b-v3.16b}, [x1], #64
+ st1 {v4.16b-v7.16b}, [x1], #64
+
+ cbz w3, .Lcrypt_end
+ b .Lcrypt_loop_8x
+
+.Lcrypt_4x:
+ add w3, w3, #8
+ cmp w3, #4
+ blt .Lcrypt_tail
+
+ sub w3, w3, #4
+
+ ld4 {v0.4s-v3.4s}, [x2], #64
+
+ SM4_CRYPT_BLK4(v0, v1, v2, v3)
+
+ st1 {v0.16b-v3.16b}, [x1], #64
+
+ cbz w3, .Lcrypt_end
+
+.Lcrypt_tail:
+ cmp w3, #2
+ ld1 {v0.16b}, [x2], #16
+ blt .Lcrypt_tail_load_done
+ ld1 {v1.16b}, [x2], #16
+ beq .Lcrypt_tail_load_done
+ ld1 {v2.16b}, [x2], #16
+
+.Lcrypt_tail_load_done:
+ transpose_4x4(v0, v1, v2, v3)
+
+ SM4_CRYPT_BLK4(v0, v1, v2, v3)
+
+ cmp w3, #2
+ st1 {v0.16b}, [x1], #16
+ blt .Lcrypt_end
+ st1 {v1.16b}, [x1], #16
+ beq .Lcrypt_end
+ st1 {v2.16b}, [x1], #16
+
+.Lcrypt_end:
+ ret
+SYM_FUNC_END(sm4_neon_crypt)
+
+.align 3
+SYM_FUNC_START(sm4_neon_cbc_dec)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: iv (big endian, 128 bit)
+ * w4: nblocks
+ */
+ SM4_PREPARE()
+
+ ld1 {RIV.16b}, [x3]
+
+.Lcbc_dec_loop_8x:
+ sub w4, w4, #8
+ tbnz w4, #31, .Lcbc_dec_4x
+
+ ld4 {v0.4s-v3.4s}, [x2], #64
+ ld4 {v4.4s-v7.4s}, [x2]
+
+ SM4_CRYPT_BLK8_norotate(v0, v1, v2, v3, v4, v5, v6, v7)
+
+ /* Avoid overwriting the RIV register */
+ rotate_clockwise_4x4(v0, v1, v2, v3)
+ rotate_clockwise_4x4(v4, v5, v6, v7)
+
+ sub x2, x2, #64
+
+ eor v0.16b, v0.16b, RIV.16b
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64
+ ld1 {RTMP4.16b-RTMP7.16b}, [x2], #64
+
+ eor v1.16b, v1.16b, RTMP0.16b
+ eor v2.16b, v2.16b, RTMP1.16b
+ eor v3.16b, v3.16b, RTMP2.16b
+ eor v4.16b, v4.16b, RTMP3.16b
+ eor v5.16b, v5.16b, RTMP4.16b
+ eor v6.16b, v6.16b, RTMP5.16b
+ eor v7.16b, v7.16b, RTMP6.16b
+
+ mov RIV.16b, RTMP7.16b
+
+ st1 {v0.16b-v3.16b}, [x1], #64
+ st1 {v4.16b-v7.16b}, [x1], #64
+
+ cbz w4, .Lcbc_dec_end
+ b .Lcbc_dec_loop_8x
+
+.Lcbc_dec_4x:
+ add w4, w4, #8
+ cmp w4, #4
+ blt .Lcbc_dec_tail
+
+ sub w4, w4, #4
+
+ ld1 {v0.16b-v3.16b}, [x2], #64
+
+ rev32 v4.16b, v0.16b
+ rev32 v5.16b, v1.16b
+ rev32 v6.16b, v2.16b
+ rev32 v7.16b, v3.16b
+
+ transpose_4x4(v4, v5, v6, v7)
+
+ SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)
+
+ eor v4.16b, v4.16b, RIV.16b
+ eor v5.16b, v5.16b, v0.16b
+ eor v6.16b, v6.16b, v1.16b
+ eor v7.16b, v7.16b, v2.16b
+
+ mov RIV.16b, v3.16b
+
+ st1 {v4.16b-v7.16b}, [x1], #64
+
+ cbz w4, .Lcbc_dec_end
+
+.Lcbc_dec_tail:
+ cmp w4, #2
+ ld1 {v0.16b}, [x2], #16
+ blt .Lcbc_dec_tail_load_done
+ ld1 {v1.16b}, [x2], #16
+ beq .Lcbc_dec_tail_load_done
+ ld1 {v2.16b}, [x2], #16
+
+.Lcbc_dec_tail_load_done:
+ rev32 v4.16b, v0.16b
+ rev32 v5.16b, v1.16b
+ rev32 v6.16b, v2.16b
+
+ transpose_4x4(v4, v5, v6, v7)
+
+ SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)
+
+ cmp w4, #2
+ eor v4.16b, v4.16b, RIV.16b
+ mov RIV.16b, v0.16b
+ st1 {v4.16b}, [x1], #16
+ blt .Lcbc_dec_end
+
+ eor v5.16b, v5.16b, v0.16b
+ mov RIV.16b, v1.16b
+ st1 {v5.16b}, [x1], #16
+ beq .Lcbc_dec_end
+
+ eor v6.16b, v6.16b, v1.16b
+ mov RIV.16b, v2.16b
+ st1 {v6.16b}, [x1], #16
+
+.Lcbc_dec_end:
+ /* store new IV */
+ st1 {RIV.16b}, [x3]
+
+ ret
+SYM_FUNC_END(sm4_neon_cbc_dec)
+
+.align 3
+SYM_FUNC_START(sm4_neon_ctr_crypt)
+ /* input:
+ * x0: round key array, CTX
+ * x1: dst
+ * x2: src
+ * x3: ctr (big endian, 128 bit)
+ * w4: nblocks
+ */
+ SM4_PREPARE()
+
+ ldp x7, x8, [x3]
+ rev x7, x7
+ rev x8, x8
+
+.Lctr_crypt_loop_8x:
+ sub w4, w4, #8
+ tbnz w4, #31, .Lctr_crypt_4x
+
+#define inc_le128(vctr) \
+ mov vctr.d[1], x8; \
+ mov vctr.d[0], x7; \
+ adds x8, x8, #1; \
+ rev64 vctr.16b, vctr.16b; \
+ adc x7, x7, xzr;
+
+ /* construct CTRs */
+ inc_le128(v0) /* +0 */
+ inc_le128(v1) /* +1 */
+ inc_le128(v2) /* +2 */
+ inc_le128(v3) /* +3 */
+ inc_le128(v4) /* +4 */
+ inc_le128(v5) /* +5 */
+ inc_le128(v6) /* +6 */
+ inc_le128(v7) /* +7 */
+
+ transpose_4x4_2x(v0, v1, v2, v3, v4, v5, v6, v7)
+
+ SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)
+
+ ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64
+ ld1 {RTMP4.16b-RTMP7.16b}, [x2], #64
+
+ eor v0.16b, v0.16b, RTMP0.16b
+ eor v1.16b, v1.16b, RTMP1.16b
+ eor v2.16b, v2.16b, RTMP2.16b
+ eor v3.16b, v3.16b, RTMP3.16b
+ eor v4.16b, v4.16b, RTMP4.16b
+ eor v5.16b, v5.16b, RTMP5.16b
+ eor v6.16b, v6.16b, RTMP6.16b
+ eor v7.16b, v7.16b, RTMP7.16b
+
+ st1 {v0.16b-v3.16b}, [x1], #64
+ st1 {v4.16b-v7.16b}, [x1], #64
+
+ cbz w4, .Lctr_crypt_end
+ b .Lctr_crypt_loop_8x
+
+.Lctr_crypt_4x:
+ add w4, w4, #8
+ cmp w4, #4
+ blt .Lctr_crypt_tail
+
+ sub w4, w4, #4
+
+ /* construct CTRs */
+ inc_le128(v0) /* +0 */
+ inc_le128(v1) /* +1 */
+ inc_le128(v2) /* +2 */
+ inc_le128(v3) /* +3 */
+
+ ld1 {v4.16b-v7.16b}, [x2], #64
+
+ transpose_4x4(v0, v1, v2, v3)
+
+ SM4_CRYPT_BLK4(v0, v1, v2, v3)
+
+ eor v0.16b, v0.16b, v4.16b
+ eor v1.16b, v1.16b, v5.16b
+ eor v2.16b, v2.16b, v6.16b
+ eor v3.16b, v3.16b, v7.16b
+
+ st1 {v0.16b-v3.16b}, [x1], #64
+
+ cbz w4, .Lctr_crypt_end
+
+.Lctr_crypt_tail:
+ /* inc_le128 will change the sign bit */
+ ld1 {v4.16b}, [x2], #16
+ inc_le128(v0)
+ cmp w4, #2
+ blt .Lctr_crypt_tail_load_done
+
+ ld1 {v5.16b}, [x2], #16
+ inc_le128(v1)
+ cmp w4, #2
+ beq .Lctr_crypt_tail_load_done
+
+ ld1 {v6.16b}, [x2], #16
+ inc_le128(v2)
+
+.Lctr_crypt_tail_load_done:
+ transpose_4x4(v0, v1, v2, v3)
+
+ SM4_CRYPT_BLK4(v0, v1, v2, v3)
+
+ cmp w4, #2
+
+ eor v0.16b, v0.16b, v4.16b
+ st1 {v0.16b}, [x1], #16
+ blt .Lctr_crypt_end
+
+ eor v1.16b, v1.16b, v5.16b
+ st1 {v1.16b}, [x1], #16
+ beq .Lctr_crypt_end
+
+ eor v2.16b, v2.16b, v6.16b
+ st1 {v2.16b}, [x1], #16
+
+.Lctr_crypt_end:
+ /* store new CTR */
+ rev x7, x7
+ rev x8, x8
+ stp x7, x8, [x3]
+
+ ret
+SYM_FUNC_END(sm4_neon_ctr_crypt)
diff --git a/arch/arm64/crypto/sm4-neon-glue.c b/arch/arm64/crypto/sm4-neon-glue.c
new file mode 100644
index 000000000000..e3500aca2d18
--- /dev/null
+++ b/arch/arm64/crypto/sm4-neon-glue.c
@@ -0,0 +1,257 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm, using ARMv8 NEON
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
+ *
+ * Copyright (C) 2022, Alibaba Group.
+ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+ */
+
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/cpufeature.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/sm4.h>
+
+asmlinkage void sm4_neon_crypt(const u32 *rkey, u8 *dst, const u8 *src,
+ unsigned int nblocks);
+asmlinkage void sm4_neon_cbc_dec(const u32 *rkey_dec, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblocks);
+asmlinkage void sm4_neon_ctr_crypt(const u32 *rkey_enc, u8 *dst, const u8 *src,
+ u8 *iv, unsigned int nblocks);
+
+static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_expandkey(ctx, key, key_len);
+}
+
+static int sm4_ecb_do_crypt(struct skcipher_request *req, const u32 *rkey)
+{
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblocks;
+
+ nblocks = nbytes / SM4_BLOCK_SIZE;
+ if (nblocks) {
+ kernel_neon_begin();
+
+ sm4_neon_crypt(rkey, dst, src, nblocks);
+
+ kernel_neon_end();
+ }
+
+ err = skcipher_walk_done(&walk, nbytes % SM4_BLOCK_SIZE);
+ }
+
+ return err;
+}
+
+static int sm4_ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_ecb_do_crypt(req, ctx->rkey_enc);
+}
+
+static int sm4_ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return sm4_ecb_do_crypt(req, ctx->rkey_dec);
+}
+
+static int sm4_cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *iv = walk.iv;
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ while (nbytes >= SM4_BLOCK_SIZE) {
+ crypto_xor_cpy(dst, src, iv, SM4_BLOCK_SIZE);
+ sm4_crypt_block(ctx->rkey_enc, dst, dst);
+ iv = dst;
+ src += SM4_BLOCK_SIZE;
+ dst += SM4_BLOCK_SIZE;
+ nbytes -= SM4_BLOCK_SIZE;
+ }
+ if (iv != walk.iv)
+ memcpy(walk.iv, iv, SM4_BLOCK_SIZE);
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int sm4_cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblocks;
+
+ nblocks = nbytes / SM4_BLOCK_SIZE;
+ if (nblocks) {
+ kernel_neon_begin();
+
+ sm4_neon_cbc_dec(ctx->rkey_dec, dst, src,
+ walk.iv, nblocks);
+
+ kernel_neon_end();
+ }
+
+ err = skcipher_walk_done(&walk, nbytes % SM4_BLOCK_SIZE);
+ }
+
+ return err;
+}
+
+static int sm4_ctr_crypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int nblocks;
+
+ nblocks = nbytes / SM4_BLOCK_SIZE;
+ if (nblocks) {
+ kernel_neon_begin();
+
+ sm4_neon_ctr_crypt(ctx->rkey_enc, dst, src,
+ walk.iv, nblocks);
+
+ kernel_neon_end();
+
+ dst += nblocks * SM4_BLOCK_SIZE;
+ src += nblocks * SM4_BLOCK_SIZE;
+ nbytes -= nblocks * SM4_BLOCK_SIZE;
+ }
+
+ /* tail */
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ u8 keystream[SM4_BLOCK_SIZE];
+
+ sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv);
+ crypto_inc(walk.iv, SM4_BLOCK_SIZE);
+ crypto_xor_cpy(dst, src, keystream, nbytes);
+ nbytes = 0;
+ }
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static struct skcipher_alg sm4_algs[] = {
+ {
+ .base = {
+ .cra_name = "ecb(sm4)",
+ .cra_driver_name = "ecb-sm4-neon",
+ .cra_priority = 200,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_ecb_encrypt,
+ .decrypt = sm4_ecb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "cbc(sm4)",
+ .cra_driver_name = "cbc-sm4-neon",
+ .cra_priority = 200,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_cbc_encrypt,
+ .decrypt = sm4_cbc_decrypt,
+ }, {
+ .base = {
+ .cra_name = "ctr(sm4)",
+ .cra_driver_name = "ctr-sm4-neon",
+ .cra_priority = 200,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct sm4_ctx),
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = SM4_KEY_SIZE,
+ .max_keysize = SM4_KEY_SIZE,
+ .ivsize = SM4_BLOCK_SIZE,
+ .chunksize = SM4_BLOCK_SIZE,
+ .setkey = sm4_setkey,
+ .encrypt = sm4_ctr_crypt,
+ .decrypt = sm4_ctr_crypt,
+ }
+};
+
+static int __init sm4_init(void)
+{
+ return crypto_register_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
+}
+
+static void __exit sm4_exit(void)
+{
+ crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
+}
+
+module_init(sm4_init);
+module_exit(sm4_exit);
+
+MODULE_DESCRIPTION("SM4 ECB/CBC/CTR using ARMv8 NEON");
+MODULE_ALIAS_CRYPTO("sm4-neon");
+MODULE_ALIAS_CRYPTO("sm4");
+MODULE_ALIAS_CRYPTO("ecb(sm4)");
+MODULE_ALIAS_CRYPTO("cbc(sm4)");
+MODULE_ALIAS_CRYPTO("ctr(sm4)");
+MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
+MODULE_LICENSE("GPL v2");