Diffstat (limited to 'arch/arm/crypto')
30 files changed, 79 insertions, 7280 deletions
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index 23e4ea067ddb..1e5f3cdf691c 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -46,30 +46,6 @@ config CRYPTO_NHPOLY1305_NEON Architecture: arm using: - NEON (Advanced SIMD) extensions -config CRYPTO_POLY1305_ARM - tristate - select CRYPTO_HASH - select CRYPTO_ARCH_HAVE_LIB_POLY1305 - default CRYPTO_LIB_POLY1305_INTERNAL - help - Poly1305 authenticator algorithm (RFC7539) - - Architecture: arm optionally using - - NEON (Advanced SIMD) extensions - -config CRYPTO_BLAKE2S_ARM - bool "Hash functions: BLAKE2s" - select CRYPTO_ARCH_HAVE_LIB_BLAKE2S - help - BLAKE2s cryptographic hash function (RFC 7693) - - Architecture: arm - - This is faster than the generic implementations of BLAKE2s and - BLAKE2b, but slower than the NEON implementation of BLAKE2b. - There is no NEON implementation of BLAKE2s, since NEON doesn't - really help with it. - config CRYPTO_BLAKE2B_NEON tristate "Hash functions: BLAKE2b (NEON)" depends on KERNEL_MODE_NEON @@ -86,68 +62,6 @@ config CRYPTO_BLAKE2B_NEON much faster than the SHA-2 family and slightly faster than SHA-1. -config CRYPTO_SHA1_ARM - tristate "Hash functions: SHA-1" - select CRYPTO_SHA1 - select CRYPTO_HASH - help - SHA-1 secure hash algorithm (FIPS 180) - - Architecture: arm - -config CRYPTO_SHA1_ARM_NEON - tristate "Hash functions: SHA-1 (NEON)" - depends on KERNEL_MODE_NEON - select CRYPTO_SHA1_ARM - select CRYPTO_SHA1 - select CRYPTO_HASH - help - SHA-1 secure hash algorithm (FIPS 180) - - Architecture: arm using - - NEON (Advanced SIMD) extensions - -config CRYPTO_SHA1_ARM_CE - tristate "Hash functions: SHA-1 (ARMv8 Crypto Extensions)" - depends on KERNEL_MODE_NEON - select CRYPTO_SHA1_ARM - select CRYPTO_HASH - help - SHA-1 secure hash algorithm (FIPS 180) - - Architecture: arm using ARMv8 Crypto Extensions - -config CRYPTO_SHA2_ARM_CE - tristate "Hash functions: SHA-224 and SHA-256 (ARMv8 Crypto Extensions)" - depends on KERNEL_MODE_NEON - select CRYPTO_SHA256_ARM - select CRYPTO_HASH - help - SHA-224 and SHA-256 secure hash algorithms (FIPS 180) - - Architecture: arm using - - ARMv8 Crypto Extensions - -config CRYPTO_SHA256_ARM - tristate "Hash functions: SHA-224 and SHA-256 (NEON)" - select CRYPTO_HASH - depends on !CPU_V7M - help - SHA-224 and SHA-256 secure hash algorithms (FIPS 180) - - Architecture: arm using - - NEON (Advanced SIMD) extensions - -config CRYPTO_SHA512_ARM - tristate "Hash functions: SHA-384 and SHA-512 (NEON)" - select CRYPTO_HASH - depends on !CPU_V7M - help - SHA-384 and SHA-512 secure hash algorithms (FIPS 180) - - Architecture: arm using - - NEON (Advanced SIMD) extensions - config CRYPTO_AES_ARM tristate "Ciphers: AES" select CRYPTO_ALGAPI @@ -172,7 +86,6 @@ config CRYPTO_AES_ARM_BS select CRYPTO_AES_ARM select CRYPTO_SKCIPHER select CRYPTO_LIB_AES - select CRYPTO_SIMD help Length-preserving ciphers: AES cipher algorithms (FIPS-197) with block cipher modes: @@ -200,7 +113,6 @@ config CRYPTO_AES_ARM_CE depends on KERNEL_MODE_NEON select CRYPTO_SKCIPHER select CRYPTO_LIB_AES - select CRYPTO_SIMD help Length-preserving ciphers: AES cipher algorithms (FIPS-197) with block cipher modes: @@ -214,17 +126,5 @@ config CRYPTO_AES_ARM_CE Architecture: arm using: - ARMv8 Crypto Extensions -config CRYPTO_CHACHA20_NEON - tristate - select CRYPTO_SKCIPHER - select CRYPTO_ARCH_HAVE_LIB_CHACHA - default CRYPTO_LIB_CHACHA_INTERNAL - help - Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12 - stream cipher algorithms - - Architecture: arm using: 
- - NEON (Advanced SIMD) extensions - endmenu diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index 3d0e23ff9e74..4f23999ae17d 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -5,56 +5,17 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o -obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o -obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o -obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o -obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o -obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o -obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o -obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o -obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o -obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o aes-arm-y := aes-cipher-core.o aes-cipher-glue.o aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o -sha1-arm-y := sha1-armv4-large.o sha1_glue.o -sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o -sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o -sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y) -sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o -sha512-arm-y := sha512-core.o sha512-glue.o $(sha512-arm-neon-y) -libblake2s-arm-y:= blake2s-core.o blake2s-glue.o blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o -sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o -sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o -chacha-neon-y := chacha-scalar-core.o chacha-glue.o -chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o -poly1305-arm-y := poly1305-core.o poly1305-glue.o nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o curve25519-neon-y := curve25519-core.o curve25519-glue.o - -quiet_cmd_perl = PERL $@ - cmd_perl = $(PERL) $(<) > $(@) - -$(obj)/%-core.S: $(src)/%-armv4.pl - $(call cmd,perl) - -clean-files += poly1305-core.S sha256-core.S sha512-core.S - -aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1 - -AFLAGS_sha256-core.o += $(aflags-thumb2-y) -AFLAGS_sha512-core.o += $(aflags-thumb2-y) - -# massage the perlasm code a bit so we only get the NEON routine if we need it -poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5 -poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7 -AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y) diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c index 1cf61f51e766..00591895d540 100644 --- a/arch/arm/crypto/aes-ce-glue.c +++ b/arch/arm/crypto/aes-ce-glue.c @@ -10,8 +10,6 @@ #include <asm/simd.h> #include <linux/unaligned.h> #include <crypto/aes.h> -#include <crypto/ctr.h> -#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/cpufeature.h> @@ -418,29 +416,6 @@ static int ctr_encrypt(struct skcipher_request *req) return err; } -static void ctr_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst) -{ - struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); - unsigned long flags; - - /* - * Temporarily disable interrupts to avoid races where - * cachelines are evicted when the CPU 
is interrupted - * to do something else. - */ - local_irq_save(flags); - aes_encrypt(ctx, dst, src); - local_irq_restore(flags); -} - -static int ctr_encrypt_sync(struct skcipher_request *req) -{ - if (!crypto_simd_usable()) - return crypto_ctr_encrypt_walk(req, ctr_encrypt_one); - - return ctr_encrypt(req); -} - static int xts_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); @@ -586,10 +561,9 @@ static int xts_decrypt(struct skcipher_request *req) } static struct skcipher_alg aes_algs[] = { { - .base.cra_name = "__ecb(aes)", - .base.cra_driver_name = "__ecb-aes-ce", + .base.cra_name = "ecb(aes)", + .base.cra_driver_name = "ecb-aes-ce", .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct crypto_aes_ctx), .base.cra_module = THIS_MODULE, @@ -600,10 +574,9 @@ static struct skcipher_alg aes_algs[] = { { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(aes)", - .base.cra_driver_name = "__cbc-aes-ce", + .base.cra_name = "cbc(aes)", + .base.cra_driver_name = "cbc-aes-ce", .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct crypto_aes_ctx), .base.cra_module = THIS_MODULE, @@ -615,10 +588,9 @@ static struct skcipher_alg aes_algs[] = { { .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, }, { - .base.cra_name = "__cts(cbc(aes))", - .base.cra_driver_name = "__cts-cbc-aes-ce", + .base.cra_name = "cts(cbc(aes))", + .base.cra_driver_name = "cts-cbc-aes-ce", .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct crypto_aes_ctx), .base.cra_module = THIS_MODULE, @@ -631,10 +603,9 @@ static struct skcipher_alg aes_algs[] = { { .encrypt = cts_cbc_encrypt, .decrypt = cts_cbc_decrypt, }, { - .base.cra_name = "__ctr(aes)", - .base.cra_driver_name = "__ctr-aes-ce", + .base.cra_name = "ctr(aes)", + .base.cra_driver_name = "ctr-aes-ce", .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct crypto_aes_ctx), .base.cra_module = THIS_MODULE, @@ -647,25 +618,9 @@ static struct skcipher_alg aes_algs[] = { { .encrypt = ctr_encrypt, .decrypt = ctr_encrypt, }, { - .base.cra_name = "ctr(aes)", - .base.cra_driver_name = "ctr-aes-ce-sync", - .base.cra_priority = 300 - 1, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct crypto_aes_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .chunksize = AES_BLOCK_SIZE, - .setkey = ce_aes_setkey, - .encrypt = ctr_encrypt_sync, - .decrypt = ctr_encrypt_sync, -}, { - .base.cra_name = "__xts(aes)", - .base.cra_driver_name = "__xts-aes-ce", + .base.cra_name = "xts(aes)", + .base.cra_driver_name = "xts-aes-ce", .base.cra_priority = 300, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct crypto_aes_xts_ctx), .base.cra_module = THIS_MODULE, @@ -679,51 +634,14 @@ static struct skcipher_alg aes_algs[] = { { .decrypt = xts_decrypt, } }; -static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)]; - static void aes_exit(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(aes_simd_algs) && aes_simd_algs[i]; i++) - simd_skcipher_free(aes_simd_algs[i]); - crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); } static int 
__init aes_init(void) { - struct simd_skcipher_alg *simd; - const char *basename; - const char *algname; - const char *drvname; - int err; - int i; - - err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); - if (err) - return err; - - for (i = 0; i < ARRAY_SIZE(aes_algs); i++) { - if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL)) - continue; - - algname = aes_algs[i].base.cra_name + 2; - drvname = aes_algs[i].base.cra_driver_name + 2; - basename = aes_algs[i].base.cra_driver_name; - simd = simd_skcipher_create_compat(aes_algs + i, algname, drvname, basename); - err = PTR_ERR(simd); - if (IS_ERR(simd)) - goto unregister_simds; - - aes_simd_algs[i] = simd; - } - - return 0; - -unregister_simds: - aes_exit(); - return err; + return crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); } module_cpu_feature_match(AES, aes_init); diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c index f6be80b5938b..df5afe601e4a 100644 --- a/arch/arm/crypto/aes-neonbs-glue.c +++ b/arch/arm/crypto/aes-neonbs-glue.c @@ -8,8 +8,6 @@ #include <asm/neon.h> #include <asm/simd.h> #include <crypto/aes.h> -#include <crypto/ctr.h> -#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <crypto/xts.h> @@ -59,11 +57,6 @@ struct aesbs_xts_ctx { struct crypto_aes_ctx tweak_key; }; -struct aesbs_ctr_ctx { - struct aesbs_ctx key; /* must be first member */ - struct crypto_aes_ctx fallback; -}; - static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { @@ -200,25 +193,6 @@ static int cbc_decrypt(struct skcipher_request *req) return err; } -static int aesbs_ctr_setkey_sync(struct crypto_skcipher *tfm, const u8 *in_key, - unsigned int key_len) -{ - struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - err = aes_expandkey(&ctx->fallback, in_key, key_len); - if (err) - return err; - - ctx->key.rounds = 6 + key_len / 4; - - kernel_neon_begin(); - aesbs_convert_key(ctx->key.rk, ctx->fallback.key_enc, ctx->key.rounds); - kernel_neon_end(); - - return 0; -} - static int ctr_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); @@ -232,7 +206,7 @@ static int ctr_encrypt(struct skcipher_request *req) while (walk.nbytes > 0) { const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; - int bytes = walk.nbytes; + unsigned int bytes = walk.nbytes; if (unlikely(bytes < AES_BLOCK_SIZE)) src = dst = memcpy(buf + sizeof(buf) - bytes, @@ -254,21 +228,6 @@ static int ctr_encrypt(struct skcipher_request *req) return err; } -static void ctr_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst) -{ - struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); - - __aes_arm_encrypt(ctx->fallback.key_enc, ctx->key.rounds, src, dst); -} - -static int ctr_encrypt_sync(struct skcipher_request *req) -{ - if (!crypto_simd_usable()) - return crypto_ctr_encrypt_walk(req, ctr_encrypt_one); - - return ctr_encrypt(req); -} - static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key, unsigned int key_len) { @@ -374,13 +333,12 @@ static int xts_decrypt(struct skcipher_request *req) } static struct skcipher_alg aes_algs[] = { { - .base.cra_name = "__ecb(aes)", - .base.cra_driver_name = "__ecb-aes-neonbs", + .base.cra_name = "ecb(aes)", + .base.cra_driver_name = "ecb-aes-neonbs", .base.cra_priority = 250, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct aesbs_ctx), .base.cra_module = THIS_MODULE, - 
.base.cra_flags = CRYPTO_ALG_INTERNAL, .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, @@ -389,13 +347,12 @@ static struct skcipher_alg aes_algs[] = { { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(aes)", - .base.cra_driver_name = "__cbc-aes-neonbs", + .base.cra_name = "cbc(aes)", + .base.cra_driver_name = "cbc-aes-neonbs", .base.cra_priority = 250, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct aesbs_cbc_ctx), .base.cra_module = THIS_MODULE, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, @@ -405,13 +362,12 @@ static struct skcipher_alg aes_algs[] = { { .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, }, { - .base.cra_name = "__ctr(aes)", - .base.cra_driver_name = "__ctr-aes-neonbs", + .base.cra_name = "ctr(aes)", + .base.cra_driver_name = "ctr-aes-neonbs", .base.cra_priority = 250, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct aesbs_ctx), .base.cra_module = THIS_MODULE, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .min_keysize = AES_MIN_KEY_SIZE, .max_keysize = AES_MAX_KEY_SIZE, @@ -422,29 +378,12 @@ static struct skcipher_alg aes_algs[] = { { .encrypt = ctr_encrypt, .decrypt = ctr_encrypt, }, { - .base.cra_name = "ctr(aes)", - .base.cra_driver_name = "ctr-aes-neonbs-sync", - .base.cra_priority = 250 - 1, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct aesbs_ctr_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .chunksize = AES_BLOCK_SIZE, - .walksize = 8 * AES_BLOCK_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = aesbs_ctr_setkey_sync, - .encrypt = ctr_encrypt_sync, - .decrypt = ctr_encrypt_sync, -}, { - .base.cra_name = "__xts(aes)", - .base.cra_driver_name = "__xts-aes-neonbs", + .base.cra_name = "xts(aes)", + .base.cra_driver_name = "xts-aes-neonbs", .base.cra_priority = 250, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct aesbs_xts_ctx), .base.cra_module = THIS_MODULE, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .min_keysize = 2 * AES_MIN_KEY_SIZE, .max_keysize = 2 * AES_MAX_KEY_SIZE, @@ -455,55 +394,18 @@ static struct skcipher_alg aes_algs[] = { { .decrypt = xts_decrypt, } }; -static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)]; - static void aes_exit(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++) - if (aes_simd_algs[i]) - simd_skcipher_free(aes_simd_algs[i]); - crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); } static int __init aes_init(void) { - struct simd_skcipher_alg *simd; - const char *basename; - const char *algname; - const char *drvname; - int err; - int i; - if (!(elf_hwcap & HWCAP_NEON)) return -ENODEV; - err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); - if (err) - return err; - - for (i = 0; i < ARRAY_SIZE(aes_algs); i++) { - if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL)) - continue; - - algname = aes_algs[i].base.cra_name + 2; - drvname = aes_algs[i].base.cra_driver_name + 2; - basename = aes_algs[i].base.cra_driver_name; - simd = simd_skcipher_create_compat(aes_algs + i, algname, drvname, basename); - err = PTR_ERR(simd); - if (IS_ERR(simd)) - goto unregister_simds; - - aes_simd_algs[i] = simd; - } - return 0; - -unregister_simds: - aes_exit(); - return err; + return crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); } -late_initcall(aes_init); +module_init(aes_init); module_exit(aes_exit); diff --git a/arch/arm/crypto/blake2b-neon-glue.c 
b/arch/arm/crypto/blake2b-neon-glue.c index 4b59d027ba4a..2ff443a91724 100644 --- a/arch/arm/crypto/blake2b-neon-glue.c +++ b/arch/arm/crypto/blake2b-neon-glue.c @@ -7,7 +7,6 @@ #include <crypto/internal/blake2b.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <linux/module.h> #include <linux/sizes.h> @@ -21,11 +20,6 @@ asmlinkage void blake2b_compress_neon(struct blake2b_state *state, static void blake2b_compress_arch(struct blake2b_state *state, const u8 *block, size_t nblocks, u32 inc) { - if (!crypto_simd_usable()) { - blake2b_compress_generic(state, block, nblocks, inc); - return; - } - do { const size_t blocks = min_t(size_t, nblocks, SZ_4K / BLAKE2B_BLOCK_SIZE); @@ -42,12 +36,14 @@ static void blake2b_compress_arch(struct blake2b_state *state, static int crypto_blake2b_update_neon(struct shash_desc *desc, const u8 *in, unsigned int inlen) { - return crypto_blake2b_update(desc, in, inlen, blake2b_compress_arch); + return crypto_blake2b_update_bo(desc, in, inlen, blake2b_compress_arch); } -static int crypto_blake2b_final_neon(struct shash_desc *desc, u8 *out) +static int crypto_blake2b_finup_neon(struct shash_desc *desc, const u8 *in, + unsigned int inlen, u8 *out) { - return crypto_blake2b_final(desc, out, blake2b_compress_arch); + return crypto_blake2b_finup(desc, in, inlen, out, + blake2b_compress_arch); } #define BLAKE2B_ALG(name, driver_name, digest_size) \ @@ -55,7 +51,9 @@ static int crypto_blake2b_final_neon(struct shash_desc *desc, u8 *out) .base.cra_name = name, \ .base.cra_driver_name = driver_name, \ .base.cra_priority = 200, \ - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \ + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY | \ + CRYPTO_AHASH_ALG_BLOCK_ONLY | \ + CRYPTO_AHASH_ALG_FINAL_NONZERO, \ .base.cra_blocksize = BLAKE2B_BLOCK_SIZE, \ .base.cra_ctxsize = sizeof(struct blake2b_tfm_ctx), \ .base.cra_module = THIS_MODULE, \ @@ -63,8 +61,9 @@ static int crypto_blake2b_final_neon(struct shash_desc *desc, u8 *out) .setkey = crypto_blake2b_setkey, \ .init = crypto_blake2b_init, \ .update = crypto_blake2b_update_neon, \ - .final = crypto_blake2b_final_neon, \ + .finup = crypto_blake2b_finup_neon, \ .descsize = sizeof(struct blake2b_state), \ + .statesize = BLAKE2B_STATE_SIZE, \ } static struct shash_alg blake2b_neon_algs[] = { diff --git a/arch/arm/crypto/blake2s-core.S b/arch/arm/crypto/blake2s-core.S deleted file mode 100644 index df40e46601f1..000000000000 --- a/arch/arm/crypto/blake2s-core.S +++ /dev/null @@ -1,306 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * BLAKE2s digest algorithm, ARM scalar implementation - * - * Copyright 2020 Google LLC - * - * Author: Eric Biggers <ebiggers@google.com> - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> - - // Registers used to hold message words temporarily. There aren't - // enough ARM registers to hold the whole message block, so we have to - // load the words on-demand. 
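For reference: the SZ_4K cap retained in blake2b_compress_arch earlier in this patch bounds how much data is processed per kernel_neon_begin()/kernel_neon_end() section, so preemption is never disabled for arbitrarily long inputs. Reassembled from the hunk, with the elided middle filled in as a sketch, the pattern is roughly:

	do {
		/* at most 4 KiB of input per NEON section */
		const size_t blocks = min_t(size_t, nblocks,
					    SZ_4K / BLAKE2B_BLOCK_SIZE);

		kernel_neon_begin();
		blake2b_compress_neon(state, block, blocks, inc);
		kernel_neon_end();

		nblocks -= blocks;
		block += blocks * BLAKE2B_BLOCK_SIZE;
	} while (nblocks);
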
- M_0 .req r12 - M_1 .req r14 - -// The BLAKE2s initialization vector -.Lblake2s_IV: - .word 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A - .word 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 - -.macro __ldrd a, b, src, offset -#if __LINUX_ARM_ARCH__ >= 6 - ldrd \a, \b, [\src, #\offset] -#else - ldr \a, [\src, #\offset] - ldr \b, [\src, #\offset + 4] -#endif -.endm - -.macro __strd a, b, dst, offset -#if __LINUX_ARM_ARCH__ >= 6 - strd \a, \b, [\dst, #\offset] -#else - str \a, [\dst, #\offset] - str \b, [\dst, #\offset + 4] -#endif -.endm - -.macro _le32_bswap a, tmp -#ifdef __ARMEB__ - rev_l \a, \tmp -#endif -.endm - -.macro _le32_bswap_8x a, b, c, d, e, f, g, h, tmp - _le32_bswap \a, \tmp - _le32_bswap \b, \tmp - _le32_bswap \c, \tmp - _le32_bswap \d, \tmp - _le32_bswap \e, \tmp - _le32_bswap \f, \tmp - _le32_bswap \g, \tmp - _le32_bswap \h, \tmp -.endm - -// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals. -// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two -// columns/diagonals. s0-s1 are the word offsets to the message words the first -// column/diagonal needs, and likewise s2-s3 for the second column/diagonal. -// M_0 and M_1 are free to use, and the message block can be found at sp + 32. -// -// Note that to save instructions, the rotations don't happen when the -// pseudocode says they should, but rather they are delayed until the values are -// used. See the comment above _blake2s_round(). -.macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3 - - ldr M_0, [sp, #32 + 4 * \s0] - ldr M_1, [sp, #32 + 4 * \s2] - - // a += b + m[blake2s_sigma[r][2*i + 0]]; - add \a0, \a0, \b0, ror #brot - add \a1, \a1, \b1, ror #brot - add \a0, \a0, M_0 - add \a1, \a1, M_1 - - // d = ror32(d ^ a, 16); - eor \d0, \a0, \d0, ror #drot - eor \d1, \a1, \d1, ror #drot - - // c += d; - add \c0, \c0, \d0, ror #16 - add \c1, \c1, \d1, ror #16 - - // b = ror32(b ^ c, 12); - eor \b0, \c0, \b0, ror #brot - eor \b1, \c1, \b1, ror #brot - - ldr M_0, [sp, #32 + 4 * \s1] - ldr M_1, [sp, #32 + 4 * \s3] - - // a += b + m[blake2s_sigma[r][2*i + 1]]; - add \a0, \a0, \b0, ror #12 - add \a1, \a1, \b1, ror #12 - add \a0, \a0, M_0 - add \a1, \a1, M_1 - - // d = ror32(d ^ a, 8); - eor \d0, \a0, \d0, ror#16 - eor \d1, \a1, \d1, ror#16 - - // c += d; - add \c0, \c0, \d0, ror#8 - add \c1, \c1, \d1, ror#8 - - // b = ror32(b ^ c, 7); - eor \b0, \c0, \b0, ror#12 - eor \b1, \c1, \b1, ror#12 -.endm - -// Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9] -// are in r0..r9. The stack pointer points to 8 bytes of scratch space for -// spilling v[8..9], then to v[9..15], then to the message block. r10-r12 and -// r14 are free to use. The macro arguments s0-s15 give the order in which the -// message words are used in this round. -// -// All rotates are performed using the implicit rotate operand accepted by the -// 'add' and 'eor' instructions. This is faster than using explicit rotate -// instructions. To make this work, we allow the values in the second and last -// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the -// wrong rotation amount. The rotation amount is then fixed up just in time -// when the values are used. 'brot' is the number of bits the values in row 'b' -// need to be rotated right to arrive at the correct values, and 'drot' -// similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such -// that they end up as (7, 8) after every round. 
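For reference, with every rotation applied eagerly, the mixing step that _blake2s_quarterround implements is the standard BLAKE2s G function from RFC 7693. A plain C rendering (ror32() as in <linux/bitops.h>; illustration only, not part of this patch):

	static void blake2s_g(u32 v[16], int a, int b, int c, int d,
			      u32 m0, u32 m1)
	{
		v[a] = v[a] + v[b] + m0;
		v[d] = ror32(v[d] ^ v[a], 16);
		v[c] = v[c] + v[d];
		v[b] = ror32(v[b] ^ v[c], 12);
		v[a] = v[a] + v[b] + m1;
		v[d] = ror32(v[d] ^ v[a], 8);
		v[c] = v[c] + v[d];
		v[b] = ror32(v[b] ^ v[c], 7);
	}

The assembly below computes the same values but leaves the final rotations of rows 'b' and 'd' pending, folding them into the next 'add'/'eor' as described above.
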
-.macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \ - s8, s9, s10, s11, s12, s13, s14, s15 - - // Mix first two columns: - // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]). - __ldrd r10, r11, sp, 16 // load v[12] and v[13] - _blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \ - \s0, \s1, \s2, \s3 - __strd r8, r9, sp, 0 - __strd r10, r11, sp, 16 - - // Mix second two columns: - // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]). - __ldrd r8, r9, sp, 8 // load v[10] and v[11] - __ldrd r10, r11, sp, 24 // load v[14] and v[15] - _blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \ - \s4, \s5, \s6, \s7 - str r10, [sp, #24] // store v[14] - // v[10], v[11], and v[15] are used below, so no need to store them yet. - - .set brot, 7 - .set drot, 8 - - // Mix first two diagonals: - // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]). - ldr r10, [sp, #16] // load v[12] - _blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \ - \s8, \s9, \s10, \s11 - __strd r8, r9, sp, 8 - str r11, [sp, #28] - str r10, [sp, #16] - - // Mix second two diagonals: - // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]). - __ldrd r8, r9, sp, 0 // load v[8] and v[9] - __ldrd r10, r11, sp, 20 // load v[13] and v[14] - _blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \ - \s12, \s13, \s14, \s15 - __strd r10, r11, sp, 20 -.endm - -// -// void blake2s_compress(struct blake2s_state *state, -// const u8 *block, size_t nblocks, u32 inc); -// -// Only the first three fields of struct blake2s_state are used: -// u32 h[8]; (inout) -// u32 t[2]; (inout) -// u32 f[2]; (in) -// - .align 5 -ENTRY(blake2s_compress) - push {r0-r2,r4-r11,lr} // keep this an even number - -.Lnext_block: - // r0 is 'state' - // r1 is 'block' - // r3 is 'inc' - - // Load and increment the counter t[0..1]. - __ldrd r10, r11, r0, 32 - adds r10, r10, r3 - adc r11, r11, #0 - __strd r10, r11, r0, 32 - - // _blake2s_round is very short on registers, so copy the message block - // to the stack to save a register during the rounds. This also has the - // advantage that misalignment only needs to be dealt with in one place. - sub sp, sp, #64 - mov r12, sp - tst r1, #3 - bne .Lcopy_block_misaligned - ldmia r1!, {r2-r9} - _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14 - stmia r12!, {r2-r9} - ldmia r1!, {r2-r9} - _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14 - stmia r12, {r2-r9} -.Lcopy_block_done: - str r1, [sp, #68] // Update message pointer - - // Calculate v[8..15]. Push v[9..15] onto the stack, and leave space - // for spilling v[8..9]. Leave v[8..9] in r8-r9. - mov r14, r0 // r14 = state - adr r12, .Lblake2s_IV - ldmia r12!, {r8-r9} // load IV[0..1] - __ldrd r0, r1, r14, 40 // load f[0..1] - ldm r12, {r2-r7} // load IV[3..7] - eor r4, r4, r10 // v[12] = IV[4] ^ t[0] - eor r5, r5, r11 // v[13] = IV[5] ^ t[1] - eor r6, r6, r0 // v[14] = IV[6] ^ f[0] - eor r7, r7, r1 // v[15] = IV[7] ^ f[1] - push {r2-r7} // push v[9..15] - sub sp, sp, #8 // leave space for v[8..9] - - // Load h[0..7] == v[0..7]. - ldm r14, {r0-r7} - - // Execute the rounds. Each round is provided the order in which it - // needs to use the message words. 
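The word orders passed to the ten _blake2s_round invocations below are exactly the rows of the BLAKE2s message schedule from RFC 7693, i.e. (shown here for reference, not part of the original file):

	static const u8 blake2s_sigma[10][16] = {
		{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
		{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
		{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
		{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
		{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
		{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
		{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
		{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
		{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
		{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
	};
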
- .set brot, 0 - .set drot, 0 - _blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - _blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 - _blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 - _blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 - _blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 - _blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 - _blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 - _blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 - _blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 - _blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 - - // Fold the final state matrix into the hash chaining value: - // - // for (i = 0; i < 8; i++) - // h[i] ^= v[i] ^ v[i + 8]; - // - ldr r14, [sp, #96] // r14 = &h[0] - add sp, sp, #8 // v[8..9] are already loaded. - pop {r10-r11} // load v[10..11] - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - eor r3, r3, r11 - ldm r14, {r8-r11} // load h[0..3] - eor r0, r0, r8 - eor r1, r1, r9 - eor r2, r2, r10 - eor r3, r3, r11 - stmia r14!, {r0-r3} // store new h[0..3] - ldm r14, {r0-r3} // load old h[4..7] - pop {r8-r11} // load v[12..15] - eor r0, r0, r4, ror #brot - eor r1, r1, r5, ror #brot - eor r2, r2, r6, ror #brot - eor r3, r3, r7, ror #brot - eor r0, r0, r8, ror #drot - eor r1, r1, r9, ror #drot - eor r2, r2, r10, ror #drot - eor r3, r3, r11, ror #drot - add sp, sp, #64 // skip copy of message block - stm r14, {r0-r3} // store new h[4..7] - - // Advance to the next block, if there is one. Note that if there are - // multiple blocks, then 'inc' (the counter increment amount) must be - // 64. So we can simply set it to 64 without re-loading it. - ldm sp, {r0, r1, r2} // load (state, block, nblocks) - mov r3, #64 // set 'inc' - subs r2, r2, #1 // nblocks-- - str r2, [sp, #8] - bne .Lnext_block // nblocks != 0? - - pop {r0-r2,r4-r11,pc} - - // The next message block (pointed to by r1) isn't 4-byte aligned, so it - // can't be loaded using ldmia. Copy it to the stack buffer (pointed to - // by r12) using an alternative method. r2-r9 are free to use. -.Lcopy_block_misaligned: - mov r2, #64 -1: -#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - ldr r3, [r1], #4 - _le32_bswap r3, r4 -#else - ldrb r3, [r1, #0] - ldrb r4, [r1, #1] - ldrb r5, [r1, #2] - ldrb r6, [r1, #3] - add r1, r1, #4 - orr r3, r3, r4, lsl #8 - orr r3, r3, r5, lsl #16 - orr r3, r3, r6, lsl #24 -#endif - subs r2, r2, #4 - str r3, [r12], #4 - bne 1b - b .Lcopy_block_done -ENDPROC(blake2s_compress) diff --git a/arch/arm/crypto/blake2s-glue.c b/arch/arm/crypto/blake2s-glue.c deleted file mode 100644 index 0238a70d9581..000000000000 --- a/arch/arm/crypto/blake2s-glue.c +++ /dev/null @@ -1,7 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later - -#include <crypto/internal/blake2s.h> -#include <linux/module.h> - -/* defined in blake2s-core.S */ -EXPORT_SYMBOL(blake2s_compress); diff --git a/arch/arm/crypto/chacha-glue.c b/arch/arm/crypto/chacha-glue.c deleted file mode 100644 index 50e635512046..000000000000 --- a/arch/arm/crypto/chacha-glue.c +++ /dev/null @@ -1,352 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ARM NEON accelerated ChaCha and XChaCha stream ciphers, - * including ChaCha20 (RFC7539) - * - * Copyright (C) 2016-2019 Linaro, Ltd. 
<ard.biesheuvel@linaro.org> - * Copyright (C) 2015 Martin Willi - */ - -#include <crypto/algapi.h> -#include <crypto/internal/chacha.h> -#include <crypto/internal/simd.h> -#include <crypto/internal/skcipher.h> -#include <linux/jump_label.h> -#include <linux/kernel.h> -#include <linux/module.h> - -#include <asm/cputype.h> -#include <asm/hwcap.h> -#include <asm/neon.h> -#include <asm/simd.h> - -asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src, - int nrounds); -asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src, - int nrounds, unsigned int nbytes); -asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds); -asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds); - -asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes, - const u32 *state, int nrounds); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon); - -static inline bool neon_usable(void) -{ - return static_branch_likely(&use_neon) && crypto_simd_usable(); -} - -static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - u8 buf[CHACHA_BLOCK_SIZE]; - - while (bytes > CHACHA_BLOCK_SIZE) { - unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U); - - chacha_4block_xor_neon(state, dst, src, nrounds, l); - bytes -= l; - src += l; - dst += l; - state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); - } - if (bytes) { - const u8 *s = src; - u8 *d = dst; - - if (bytes != CHACHA_BLOCK_SIZE) - s = d = memcpy(buf, src, bytes); - chacha_block_xor_neon(state, d, s, nrounds); - if (d != dst) - memcpy(dst, buf, bytes); - state[12]++; - } -} - -void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) -{ - if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) { - hchacha_block_arm(state, stream, nrounds); - } else { - kernel_neon_begin(); - hchacha_block_neon(state, stream, nrounds); - kernel_neon_end(); - } -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, - int nrounds) -{ - if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() || - bytes <= CHACHA_BLOCK_SIZE) { - chacha_doarm(dst, src, bytes, state, nrounds); - state[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE); - return; - } - - do { - unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - - kernel_neon_begin(); - chacha_doneon(state, dst, src, todo, nrounds); - kernel_neon_end(); - - bytes -= todo; - src += todo; - dst += todo; - } while (bytes); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -static int chacha_stream_xor(struct skcipher_request *req, - const struct chacha_ctx *ctx, const u8 *iv, - bool neon) -{ - struct skcipher_walk walk; - u32 state[16]; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - chacha_init(state, ctx->key, iv); - - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); - - if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) { - chacha_doarm(walk.dst.virt.addr, walk.src.virt.addr, - nbytes, state, ctx->nrounds); - state[12] += DIV_ROUND_UP(nbytes, CHACHA_BLOCK_SIZE); - } else { - kernel_neon_begin(); - chacha_doneon(state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, ctx->nrounds); - kernel_neon_end(); - } - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } - - return err; -} - -static int do_chacha(struct skcipher_request *req, bool neon) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx 
*ctx = crypto_skcipher_ctx(tfm); - - return chacha_stream_xor(req, ctx, req->iv, neon); -} - -static int chacha_arm(struct skcipher_request *req) -{ - return do_chacha(req, false); -} - -static int chacha_neon(struct skcipher_request *req) -{ - return do_chacha(req, neon_usable()); -} - -static int do_xchacha(struct skcipher_request *req, bool neon) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - struct chacha_ctx subctx; - u32 state[16]; - u8 real_iv[16]; - - chacha_init(state, ctx->key, req->iv); - - if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) { - hchacha_block_arm(state, subctx.key, ctx->nrounds); - } else { - kernel_neon_begin(); - hchacha_block_neon(state, subctx.key, ctx->nrounds); - kernel_neon_end(); - } - subctx.nrounds = ctx->nrounds; - - memcpy(&real_iv[0], req->iv + 24, 8); - memcpy(&real_iv[8], req->iv + 16, 8); - return chacha_stream_xor(req, &subctx, real_iv, neon); -} - -static int xchacha_arm(struct skcipher_request *req) -{ - return do_xchacha(req, false); -} - -static int xchacha_neon(struct skcipher_request *req) -{ - return do_xchacha(req, neon_usable()); -} - -static struct skcipher_alg arm_algs[] = { - { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-arm", - .base.cra_priority = 200, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = chacha_arm, - .decrypt = chacha_arm, - }, { - .base.cra_name = "xchacha20", - .base.cra_driver_name = "xchacha20-arm", - .base.cra_priority = 200, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = xchacha_arm, - .decrypt = xchacha_arm, - }, { - .base.cra_name = "xchacha12", - .base.cra_driver_name = "xchacha12-arm", - .base.cra_priority = 200, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha12_setkey, - .encrypt = xchacha_arm, - .decrypt = xchacha_arm, - }, -}; - -static struct skcipher_alg neon_algs[] = { - { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 4 * CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = chacha_neon, - .decrypt = chacha_neon, - }, { - .base.cra_name = "xchacha20", - .base.cra_driver_name = "xchacha20-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 4 * CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = xchacha_neon, - .decrypt = xchacha_neon, - }, { - .base.cra_name = "xchacha12", - 
.base.cra_driver_name = "xchacha12-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 4 * CHACHA_BLOCK_SIZE, - .setkey = chacha12_setkey, - .encrypt = xchacha_neon, - .decrypt = xchacha_neon, - } -}; - -static int __init chacha_simd_mod_init(void) -{ - int err = 0; - - if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER)) { - err = crypto_register_skciphers(arm_algs, ARRAY_SIZE(arm_algs)); - if (err) - return err; - } - - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) { - int i; - - switch (read_cpuid_part()) { - case ARM_CPU_PART_CORTEX_A7: - case ARM_CPU_PART_CORTEX_A5: - /* - * The Cortex-A7 and Cortex-A5 do not perform well with - * the NEON implementation but do incredibly with the - * scalar one and use less power. - */ - for (i = 0; i < ARRAY_SIZE(neon_algs); i++) - neon_algs[i].base.cra_priority = 0; - break; - default: - static_branch_enable(&use_neon); - } - - if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER)) { - err = crypto_register_skciphers(neon_algs, ARRAY_SIZE(neon_algs)); - if (err) - crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs)); - } - } - return err; -} - -static void __exit chacha_simd_mod_fini(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER)) { - crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs)); - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) - crypto_unregister_skciphers(neon_algs, ARRAY_SIZE(neon_algs)); - } -} - -module_init(chacha_simd_mod_init); -module_exit(chacha_simd_mod_fini); - -MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (scalar and NEON accelerated)"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("chacha20"); -MODULE_ALIAS_CRYPTO("chacha20-arm"); -MODULE_ALIAS_CRYPTO("xchacha20"); -MODULE_ALIAS_CRYPTO("xchacha20-arm"); -MODULE_ALIAS_CRYPTO("xchacha12"); -MODULE_ALIAS_CRYPTO("xchacha12-arm"); -#ifdef CONFIG_KERNEL_MODE_NEON -MODULE_ALIAS_CRYPTO("chacha20-neon"); -MODULE_ALIAS_CRYPTO("xchacha20-neon"); -MODULE_ALIAS_CRYPTO("xchacha12-neon"); -#endif diff --git a/arch/arm/crypto/chacha-neon-core.S b/arch/arm/crypto/chacha-neon-core.S deleted file mode 100644 index 13d12f672656..000000000000 --- a/arch/arm/crypto/chacha-neon-core.S +++ /dev/null @@ -1,643 +0,0 @@ -/* - * ChaCha/XChaCha NEON helper functions - * - * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - - /* - * NEON doesn't have a rotate instruction. 
The alternatives are, more or less: - * - * (a) vshl.u32 + vsri.u32 (needs temporary register) - * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register) - * (c) vrev32.16 (16-bit rotations only) - * (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only, - * needs index vector) - * - * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations, - * the only choices are (a) and (b). We use (a) since it takes two-thirds the - * cycles of (b) on both Cortex-A7 and Cortex-A53. - * - * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest - * and doesn't need a temporary register. - * - * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence - * is twice as fast as (a), even when doing (a) on multiple registers - * simultaneously to eliminate the stall between vshl and vsri. Also, it - * parallelizes better when temporary registers are scarce. - * - * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as - * (a), so the need to load the rotation table actually makes the vtbl method - * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it - * seems to be a good compromise to get a more significant speed boost on some - * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7. - */ - -#include <linux/linkage.h> -#include <asm/cache.h> - - .text - .fpu neon - .align 5 - -/* - * chacha_permute - permute one block - * - * Permute one 64-byte block where the state matrix is stored in the four NEON - * registers q0-q3. It performs matrix operations on four words in parallel, - * but requires shuffling to rearrange the words after each round. - * - * The round count is given in r3. - * - * Clobbers: r3, ip, q4-q5 - */ -chacha_permute: - - adr ip, .Lrol8_table - vld1.8 {d10}, [ip, :64] - -.Ldoubleround: - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vadd.i32 q0, q0, q1 - veor q3, q3, q0 - vrev32.16 q3, q3 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #12 - vsri.u32 q1, q4, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vadd.i32 q0, q0, q1 - veor q3, q3, q0 - vtbl.8 d6, {d6}, d10 - vtbl.8 d7, {d7}, d10 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #7 - vsri.u32 q1, q4, #25 - - // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vext.8 q1, q1, q1, #4 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vext.8 q2, q2, q2, #8 - // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vext.8 q3, q3, q3, #12 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vadd.i32 q0, q0, q1 - veor q3, q3, q0 - vrev32.16 q3, q3 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #12 - vsri.u32 q1, q4, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vadd.i32 q0, q0, q1 - veor q3, q3, q0 - vtbl.8 d6, {d6}, d10 - vtbl.8 d7, {d7}, d10 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #7 - vsri.u32 q1, q4, #25 - - // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vext.8 q1, q1, q1, #12 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vext.8 q2, q2, q2, #8 - // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vext.8 q3, q3, q3, #4 - - subs r3, r3, #2 - bne .Ldoubleround - - bx lr -ENDPROC(chacha_permute) - -ENTRY(chacha_block_xor_neon) - // r0: Input state matrix, s - // r1: 1 data block output, o - // r2: 1 data block input, i - // r3: nrounds - push {lr} - - // x0..3 = s0..3 - add ip, r0, #0x20 - vld1.32 {q0-q1}, [r0] - vld1.32 {q2-q3}, [ip] - - vmov q8, q0 - vmov q9, q1 - vmov q10, q2 - vmov q11, q3 
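Referring back to the rotate-strategy comment at the top of this file: expressed with NEON intrinsics (an illustration only, not kernel code), strategies (a) and (c) look like this:

	#include <arm_neon.h>

	/* (a) vshl + vsri: two instructions and one temporary */
	static inline uint32x4_t rotl32x4_12(uint32x4_t x)
	{
		uint32x4_t t = vshlq_n_u32(x, 12);

		return vsriq_n_u32(t, x, 32 - 12);
	}

	/* (c) vrev32.16: a 16-bit rotation is a half-word swap per lane */
	static inline uint32x4_t rotl32x4_16(uint32x4_t x)
	{
		return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
	}
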
- - bl chacha_permute - - add ip, r2, #0x20 - vld1.8 {q4-q5}, [r2] - vld1.8 {q6-q7}, [ip] - - // o0 = i0 ^ (x0 + s0) - vadd.i32 q0, q0, q8 - veor q0, q0, q4 - - // o1 = i1 ^ (x1 + s1) - vadd.i32 q1, q1, q9 - veor q1, q1, q5 - - // o2 = i2 ^ (x2 + s2) - vadd.i32 q2, q2, q10 - veor q2, q2, q6 - - // o3 = i3 ^ (x3 + s3) - vadd.i32 q3, q3, q11 - veor q3, q3, q7 - - add ip, r1, #0x20 - vst1.8 {q0-q1}, [r1] - vst1.8 {q2-q3}, [ip] - - pop {pc} -ENDPROC(chacha_block_xor_neon) - -ENTRY(hchacha_block_neon) - // r0: Input state matrix, s - // r1: output (8 32-bit words) - // r2: nrounds - push {lr} - - vld1.32 {q0-q1}, [r0]! - vld1.32 {q2-q3}, [r0] - - mov r3, r2 - bl chacha_permute - - vst1.32 {q0}, [r1]! - vst1.32 {q3}, [r1] - - pop {pc} -ENDPROC(hchacha_block_neon) - - .align 4 -.Lctrinc: .word 0, 1, 2, 3 -.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6 - - .align 5 -ENTRY(chacha_4block_xor_neon) - push {r4, lr} - mov r4, sp // preserve the stack pointer - sub ip, sp, #0x20 // allocate a 32 byte buffer - bic ip, ip, #0x1f // aligned to 32 bytes - mov sp, ip - - // r0: Input state matrix, s - // r1: 4 data blocks output, o - // r2: 4 data blocks input, i - // r3: nrounds - - // - // This function encrypts four consecutive ChaCha blocks by loading - // the state matrix in NEON registers four times. The algorithm performs - // each operation on the corresponding word of each state matrix, hence - // requires no word shuffling. The words are re-interleaved before the - // final addition of the original state and the XORing step. - // - - // x0..15[0-3] = s0..15[0-3] - add ip, r0, #0x20 - vld1.32 {q0-q1}, [r0] - vld1.32 {q2-q3}, [ip] - - adr lr, .Lctrinc - vdup.32 q15, d7[1] - vdup.32 q14, d7[0] - vld1.32 {q4}, [lr, :128] - vdup.32 q13, d6[1] - vdup.32 q12, d6[0] - vdup.32 q11, d5[1] - vdup.32 q10, d5[0] - vadd.u32 q12, q12, q4 // x12 += counter values 0-3 - vdup.32 q9, d4[1] - vdup.32 q8, d4[0] - vdup.32 q7, d3[1] - vdup.32 q6, d3[0] - vdup.32 q5, d2[1] - vdup.32 q4, d2[0] - vdup.32 q3, d1[1] - vdup.32 q2, d1[0] - vdup.32 q1, d0[1] - vdup.32 q0, d0[0] - - adr ip, .Lrol8_table - b 1f - -.Ldoubleround4: - vld1.32 {q8-q9}, [sp, :256] -1: - // x0 += x4, x12 = rotl32(x12 ^ x0, 16) - // x1 += x5, x13 = rotl32(x13 ^ x1, 16) - // x2 += x6, x14 = rotl32(x14 ^ x2, 16) - // x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vadd.i32 q0, q0, q4 - vadd.i32 q1, q1, q5 - vadd.i32 q2, q2, q6 - vadd.i32 q3, q3, q7 - - veor q12, q12, q0 - veor q13, q13, q1 - veor q14, q14, q2 - veor q15, q15, q3 - - vrev32.16 q12, q12 - vrev32.16 q13, q13 - vrev32.16 q14, q14 - vrev32.16 q15, q15 - - // x8 += x12, x4 = rotl32(x4 ^ x8, 12) - // x9 += x13, x5 = rotl32(x5 ^ x9, 12) - // x10 += x14, x6 = rotl32(x6 ^ x10, 12) - // x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vadd.i32 q8, q8, q12 - vadd.i32 q9, q9, q13 - vadd.i32 q10, q10, q14 - vadd.i32 q11, q11, q15 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q4, q8 - veor q9, q5, q9 - vshl.u32 q4, q8, #12 - vshl.u32 q5, q9, #12 - vsri.u32 q4, q8, #20 - vsri.u32 q5, q9, #20 - - veor q8, q6, q10 - veor q9, q7, q11 - vshl.u32 q6, q8, #12 - vshl.u32 q7, q9, #12 - vsri.u32 q6, q8, #20 - vsri.u32 q7, q9, #20 - - // x0 += x4, x12 = rotl32(x12 ^ x0, 8) - // x1 += x5, x13 = rotl32(x13 ^ x1, 8) - // x2 += x6, x14 = rotl32(x14 ^ x2, 8) - // x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vld1.8 {d16}, [ip, :64] - vadd.i32 q0, q0, q4 - vadd.i32 q1, q1, q5 - vadd.i32 q2, q2, q6 - vadd.i32 q3, q3, q7 - - veor q12, q12, q0 - veor q13, q13, q1 - veor q14, q14, q2 - veor q15, q15, q3 - - vtbl.8 d24, {d24}, d16 - vtbl.8 d25, {d25}, d16 
- vtbl.8 d26, {d26}, d16 - vtbl.8 d27, {d27}, d16 - vtbl.8 d28, {d28}, d16 - vtbl.8 d29, {d29}, d16 - vtbl.8 d30, {d30}, d16 - vtbl.8 d31, {d31}, d16 - - vld1.32 {q8-q9}, [sp, :256] - - // x8 += x12, x4 = rotl32(x4 ^ x8, 7) - // x9 += x13, x5 = rotl32(x5 ^ x9, 7) - // x10 += x14, x6 = rotl32(x6 ^ x10, 7) - // x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vadd.i32 q8, q8, q12 - vadd.i32 q9, q9, q13 - vadd.i32 q10, q10, q14 - vadd.i32 q11, q11, q15 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q4, q8 - veor q9, q5, q9 - vshl.u32 q4, q8, #7 - vshl.u32 q5, q9, #7 - vsri.u32 q4, q8, #25 - vsri.u32 q5, q9, #25 - - veor q8, q6, q10 - veor q9, q7, q11 - vshl.u32 q6, q8, #7 - vshl.u32 q7, q9, #7 - vsri.u32 q6, q8, #25 - vsri.u32 q7, q9, #25 - - vld1.32 {q8-q9}, [sp, :256] - - // x0 += x5, x15 = rotl32(x15 ^ x0, 16) - // x1 += x6, x12 = rotl32(x12 ^ x1, 16) - // x2 += x7, x13 = rotl32(x13 ^ x2, 16) - // x3 += x4, x14 = rotl32(x14 ^ x3, 16) - vadd.i32 q0, q0, q5 - vadd.i32 q1, q1, q6 - vadd.i32 q2, q2, q7 - vadd.i32 q3, q3, q4 - - veor q15, q15, q0 - veor q12, q12, q1 - veor q13, q13, q2 - veor q14, q14, q3 - - vrev32.16 q15, q15 - vrev32.16 q12, q12 - vrev32.16 q13, q13 - vrev32.16 q14, q14 - - // x10 += x15, x5 = rotl32(x5 ^ x10, 12) - // x11 += x12, x6 = rotl32(x6 ^ x11, 12) - // x8 += x13, x7 = rotl32(x7 ^ x8, 12) - // x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vadd.i32 q10, q10, q15 - vadd.i32 q11, q11, q12 - vadd.i32 q8, q8, q13 - vadd.i32 q9, q9, q14 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q7, q8 - veor q9, q4, q9 - vshl.u32 q7, q8, #12 - vshl.u32 q4, q9, #12 - vsri.u32 q7, q8, #20 - vsri.u32 q4, q9, #20 - - veor q8, q5, q10 - veor q9, q6, q11 - vshl.u32 q5, q8, #12 - vshl.u32 q6, q9, #12 - vsri.u32 q5, q8, #20 - vsri.u32 q6, q9, #20 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 8) - // x1 += x6, x12 = rotl32(x12 ^ x1, 8) - // x2 += x7, x13 = rotl32(x13 ^ x2, 8) - // x3 += x4, x14 = rotl32(x14 ^ x3, 8) - vld1.8 {d16}, [ip, :64] - vadd.i32 q0, q0, q5 - vadd.i32 q1, q1, q6 - vadd.i32 q2, q2, q7 - vadd.i32 q3, q3, q4 - - veor q15, q15, q0 - veor q12, q12, q1 - veor q13, q13, q2 - veor q14, q14, q3 - - vtbl.8 d30, {d30}, d16 - vtbl.8 d31, {d31}, d16 - vtbl.8 d24, {d24}, d16 - vtbl.8 d25, {d25}, d16 - vtbl.8 d26, {d26}, d16 - vtbl.8 d27, {d27}, d16 - vtbl.8 d28, {d28}, d16 - vtbl.8 d29, {d29}, d16 - - vld1.32 {q8-q9}, [sp, :256] - - // x10 += x15, x5 = rotl32(x5 ^ x10, 7) - // x11 += x12, x6 = rotl32(x6 ^ x11, 7) - // x8 += x13, x7 = rotl32(x7 ^ x8, 7) - // x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vadd.i32 q10, q10, q15 - vadd.i32 q11, q11, q12 - vadd.i32 q8, q8, q13 - vadd.i32 q9, q9, q14 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q7, q8 - veor q9, q4, q9 - vshl.u32 q7, q8, #7 - vshl.u32 q4, q9, #7 - vsri.u32 q7, q8, #25 - vsri.u32 q4, q9, #25 - - veor q8, q5, q10 - veor q9, q6, q11 - vshl.u32 q5, q8, #7 - vshl.u32 q6, q9, #7 - vsri.u32 q5, q8, #25 - vsri.u32 q6, q9, #25 - - subs r3, r3, #2 - bne .Ldoubleround4 - - // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15. - // x8..9[0-3] are on the stack. - - // Re-interleave the words in the first two rows of each block (x0..7). - // Also add the counter values 0-3 to x12[0-3]. - vld1.32 {q8}, [lr, :128] // load counter values 0-3 - vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1) - vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3) - vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5) - vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7) - vadd.u32 q12, q8 // x12 += counter values 0-3 - vswp d1, d4 - vswp d3, d6 - vld1.32 {q8-q9}, [r0]! 
// load s0..7 - vswp d9, d12 - vswp d11, d14 - - // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1) - // after XORing the first 32 bytes. - vswp q1, q4 - - // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7) - - // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block) - vadd.u32 q0, q0, q8 - vadd.u32 q2, q2, q8 - vadd.u32 q4, q4, q8 - vadd.u32 q3, q3, q8 - - // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block) - vadd.u32 q1, q1, q9 - vadd.u32 q6, q6, q9 - vadd.u32 q5, q5, q9 - vadd.u32 q7, q7, q9 - - // XOR first 32 bytes using keystream from first two rows of first block - vld1.8 {q8-q9}, [r2]! - veor q8, q8, q0 - veor q9, q9, q1 - vst1.8 {q8-q9}, [r1]! - - // Re-interleave the words in the last two rows of each block (x8..15). - vld1.32 {q8-q9}, [sp, :256] - mov sp, r4 // restore original stack pointer - ldr r4, [r4, #8] // load number of bytes - vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13) - vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15) - vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9) - vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11) - vld1.32 {q0-q1}, [r0] // load s8..15 - vswp d25, d28 - vswp d27, d30 - vswp d17, d20 - vswp d19, d22 - - // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15) - - // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block) - vadd.u32 q8, q8, q0 - vadd.u32 q10, q10, q0 - vadd.u32 q9, q9, q0 - vadd.u32 q11, q11, q0 - - // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block) - vadd.u32 q12, q12, q1 - vadd.u32 q14, q14, q1 - vadd.u32 q13, q13, q1 - vadd.u32 q15, q15, q1 - - // XOR the rest of the data with the keystream - - vld1.8 {q0-q1}, [r2]! - subs r4, r4, #96 - veor q0, q0, q8 - veor q1, q1, q12 - ble .Lle96 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - subs r4, r4, #32 - veor q0, q0, q2 - veor q1, q1, q6 - ble .Lle128 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - subs r4, r4, #32 - veor q0, q0, q10 - veor q1, q1, q14 - ble .Lle160 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - subs r4, r4, #32 - veor q0, q0, q4 - veor q1, q1, q5 - ble .Lle192 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - subs r4, r4, #32 - veor q0, q0, q9 - veor q1, q1, q13 - ble .Lle224 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - subs r4, r4, #32 - veor q0, q0, q3 - veor q1, q1, q7 - blt .Llt256 -.Lout: - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2] - veor q0, q0, q11 - veor q1, q1, q15 - vst1.8 {q0-q1}, [r1] - - pop {r4, pc} - -.Lle192: - vmov q4, q9 - vmov q5, q13 - -.Lle160: - // nothing to do - -.Lfinalblock: - // Process the final block if processing less than 4 full blocks. - // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the - // previous 32 byte output block that still needs to be written at - // [r1] in q0-q1. 
- beq .Lfullblock - -.Lpartialblock: - adr lr, .Lpermute + 32 - add r2, r2, r4 - add lr, lr, r4 - add r4, r4, r1 - - vld1.8 {q2-q3}, [lr] - vld1.8 {q6-q7}, [r2] - - add r4, r4, #32 - - vtbl.8 d4, {q4-q5}, d4 - vtbl.8 d5, {q4-q5}, d5 - vtbl.8 d6, {q4-q5}, d6 - vtbl.8 d7, {q4-q5}, d7 - - veor q6, q6, q2 - veor q7, q7, q3 - - vst1.8 {q6-q7}, [r4] // overlapping stores - vst1.8 {q0-q1}, [r1] - pop {r4, pc} - -.Lfullblock: - vmov q11, q4 - vmov q15, q5 - b .Lout -.Lle96: - vmov q4, q2 - vmov q5, q6 - b .Lfinalblock -.Lle128: - vmov q4, q10 - vmov q5, q14 - b .Lfinalblock -.Lle224: - vmov q4, q3 - vmov q5, q7 - b .Lfinalblock -.Llt256: - vmov q4, q11 - vmov q5, q15 - b .Lpartialblock -ENDPROC(chacha_4block_xor_neon) - - .align L1_CACHE_SHIFT -.Lpermute: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f - .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 - .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f - .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 - .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f diff --git a/arch/arm/crypto/chacha-scalar-core.S b/arch/arm/crypto/chacha-scalar-core.S deleted file mode 100644 index 083fe1ab96d0..000000000000 --- a/arch/arm/crypto/chacha-scalar-core.S +++ /dev/null @@ -1,443 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2018 Google, Inc. - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> - -/* - * Design notes: - * - * 16 registers would be needed to hold the state matrix, but only 14 are - * available because 'sp' and 'pc' cannot be used. So we spill the elements - * (x8, x9) to the stack and swap them out with (x10, x11). This adds one - * 'ldrd' and one 'strd' instruction per round. - * - * All rotates are performed using the implicit rotate operand accepted by the - * 'add' and 'eor' instructions. This is faster than using explicit rotate - * instructions. To make this work, we allow the values in the second and last - * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the - * wrong rotation amount. The rotation amount is then fixed up just in time - * when the values are used. 'brot' is the number of bits the values in row 'b' - * need to be rotated right to arrive at the correct values, and 'drot' - * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such - * that they end up as (25, 24) after every round. 
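(Illustrative gloss on the deferred-rotation bookkeeping above, not from the original file: rol32(x, 7) yields the same bits as ror32(x, 25), so after deferring "b = rol32(b ^ c, 7)" the register holds the un-rotated value and brot = 32 - 7 = 25; likewise drot = 32 - 8 = 24 after the deferred rol32(d, 8). The next instruction that consumes the value applies the owed rotation for free via the flexible second operand, e.g. "add a, a, b, ror #brot".)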
- */ - - // ChaCha state registers - X0 .req r0 - X1 .req r1 - X2 .req r2 - X3 .req r3 - X4 .req r4 - X5 .req r5 - X6 .req r6 - X7 .req r7 - X8_X10 .req r8 // shared by x8 and x10 - X9_X11 .req r9 // shared by x9 and x11 - X12 .req r10 - X13 .req r11 - X14 .req r12 - X15 .req r14 - -.macro _le32_bswap_4x a, b, c, d, tmp -#ifdef __ARMEB__ - rev_l \a, \tmp - rev_l \b, \tmp - rev_l \c, \tmp - rev_l \d, \tmp -#endif -.endm - -.macro __ldrd a, b, src, offset -#if __LINUX_ARM_ARCH__ >= 6 - ldrd \a, \b, [\src, #\offset] -#else - ldr \a, [\src, #\offset] - ldr \b, [\src, #\offset + 4] -#endif -.endm - -.macro __strd a, b, dst, offset -#if __LINUX_ARM_ARCH__ >= 6 - strd \a, \b, [\dst, #\offset] -#else - str \a, [\dst, #\offset] - str \b, [\dst, #\offset + 4] -#endif -.endm - -.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2 - - // a += b; d ^= a; d = rol(d, 16); - add \a1, \a1, \b1, ror #brot - add \a2, \a2, \b2, ror #brot - eor \d1, \a1, \d1, ror #drot - eor \d2, \a2, \d2, ror #drot - // drot == 32 - 16 == 16 - - // c += d; b ^= c; b = rol(b, 12); - add \c1, \c1, \d1, ror #16 - add \c2, \c2, \d2, ror #16 - eor \b1, \c1, \b1, ror #brot - eor \b2, \c2, \b2, ror #brot - // brot == 32 - 12 == 20 - - // a += b; d ^= a; d = rol(d, 8); - add \a1, \a1, \b1, ror #20 - add \a2, \a2, \b2, ror #20 - eor \d1, \a1, \d1, ror #16 - eor \d2, \a2, \d2, ror #16 - // drot == 32 - 8 == 24 - - // c += d; b ^= c; b = rol(b, 7); - add \c1, \c1, \d1, ror #24 - add \c2, \c2, \d2, ror #24 - eor \b1, \c1, \b1, ror #20 - eor \b2, \c2, \b2, ror #20 - // brot == 32 - 7 == 25 -.endm - -.macro _doubleround - - // column round - - // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13) - _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13 - - // save (x8, x9); restore (x10, x11) - __strd X8_X10, X9_X11, sp, 0 - __ldrd X8_X10, X9_X11, sp, 8 - - // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15) - _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15 - - .set brot, 25 - .set drot, 24 - - // diagonal round - - // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12) - _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12 - - // save (x10, x11); restore (x8, x9) - __strd X8_X10, X9_X11, sp, 8 - __ldrd X8_X10, X9_X11, sp, 0 - - // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14) - _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14 -.endm - -.macro _chacha_permute nrounds - .set brot, 0 - .set drot, 0 - .rept \nrounds / 2 - _doubleround - .endr -.endm - -.macro _chacha nrounds - -.Lnext_block\@: - // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN - // Registers contain x0-x9,x12-x15. - - // Do the core ChaCha permutation to update x0-x15. - _chacha_permute \nrounds - - add sp, #8 - // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN - // Registers contain x0-x9,x12-x15. - // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. - - // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15). - push {X8_X10, X9_X11, X12, X13, X14, X15} - - // Load (OUT, IN, LEN). - ldr r14, [sp, #96] - ldr r12, [sp, #100] - ldr r11, [sp, #104] - - orr r10, r14, r12 - - // Use slow path if fewer than 64 bytes remain. - cmp r11, #64 - blt .Lxor_slowpath\@ - - // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on - // ARMv6+, since ldmia and stmia (used below) still require alignment. - tst r10, #3 - bne .Lxor_slowpath\@ - - // Fast path: XOR 64 bytes of aligned data. - - // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN - // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT. 
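In C, the deferred-rotation scheme that _halfround implements looks like this (a sketch for exposition, not the kernel macro; brot/drot enter as 0 on the first round and as 25/24 on every later one):

#include <stdint.h>

/* rotate right; the '& 31' keeps n == 0 (the first round) well-defined */
static inline uint32_t ror32(uint32_t v, unsigned int n)
{
	return (v >> (n & 31)) | (v << (-n & 31));
}

/* the textbook quarterround, for comparison */
static void quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	*a += *b; *d = ror32(*d ^ *a, 16);	/* rol 16 == ror 16 */
	*c += *d; *b = ror32(*b ^ *c, 20);	/* rol 12 == ror 20 */
	*a += *b; *d = ror32(*d ^ *a, 24);	/* rol  8 == ror 24 */
	*c += *d; *b = ror32(*b ^ *c, 25);	/* rol  7 == ror 25 */
}

/*
 * The deferred form: b arrives rotated right by brot, d by drot, and no
 * explicit rotate instruction is ever issued.  Every use un-rotates on
 * the fly, which is free on ARM because 'add' and 'eor' accept a
 * rotated second operand.  On exit b needs ror 25 and d needs ror 24,
 * matching the '.set brot, 25' / '.set drot, 24' in _doubleround.
 */
static void quarterround_deferred(uint32_t *a, uint32_t *b, uint32_t *c,
				  uint32_t *d, unsigned int brot,
				  unsigned int drot)
{
	*a += ror32(*b, brot); *d = *a ^ ror32(*d, drot); /* d: ror 16 next */
	*c += ror32(*d, 16);   *b = *c ^ ror32(*b, brot); /* b: ror 20 next */
	*a += ror32(*b, 20);   *d = *a ^ ror32(*d, 16);   /* d: ror 24 next */
	*c += ror32(*d, 24);   *b = *c ^ ror32(*b, 20);   /* b: ror 25 next */
}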
- // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. - - // x0-x3 - __ldrd r8, r9, sp, 32 - __ldrd r10, r11, sp, 40 - add X0, X0, r8 - add X1, X1, r9 - add X2, X2, r10 - add X3, X3, r11 - _le32_bswap_4x X0, X1, X2, X3, r8 - ldmia r12!, {r8-r11} - eor X0, X0, r8 - eor X1, X1, r9 - eor X2, X2, r10 - eor X3, X3, r11 - stmia r14!, {X0-X3} - - // x4-x7 - __ldrd r8, r9, sp, 48 - __ldrd r10, r11, sp, 56 - add X4, r8, X4, ror #brot - add X5, r9, X5, ror #brot - ldmia r12!, {X0-X3} - add X6, r10, X6, ror #brot - add X7, r11, X7, ror #brot - _le32_bswap_4x X4, X5, X6, X7, r8 - eor X4, X4, X0 - eor X5, X5, X1 - eor X6, X6, X2 - eor X7, X7, X3 - stmia r14!, {X4-X7} - - // x8-x15 - pop {r0-r7} // (x8-x9,x12-x15,x10-x11) - __ldrd r8, r9, sp, 32 - __ldrd r10, r11, sp, 40 - add r0, r0, r8 // x8 - add r1, r1, r9 // x9 - add r6, r6, r10 // x10 - add r7, r7, r11 // x11 - _le32_bswap_4x r0, r1, r6, r7, r8 - ldmia r12!, {r8-r11} - eor r0, r0, r8 // x8 - eor r1, r1, r9 // x9 - eor r6, r6, r10 // x10 - eor r7, r7, r11 // x11 - stmia r14!, {r0,r1,r6,r7} - ldmia r12!, {r0,r1,r6,r7} - __ldrd r8, r9, sp, 48 - __ldrd r10, r11, sp, 56 - add r2, r8, r2, ror #drot // x12 - add r3, r9, r3, ror #drot // x13 - add r4, r10, r4, ror #drot // x14 - add r5, r11, r5, ror #drot // x15 - _le32_bswap_4x r2, r3, r4, r5, r9 - ldr r9, [sp, #72] // load LEN - eor r2, r2, r0 // x12 - eor r3, r3, r1 // x13 - eor r4, r4, r6 // x14 - eor r5, r5, r7 // x15 - subs r9, #64 // decrement and check LEN - stmia r14!, {r2-r5} - - beq .Ldone\@ - -.Lprepare_for_next_block\@: - - // Stack: x0-x15 OUT IN LEN - - // Increment block counter (x12) - add r8, #1 - - // Store updated (OUT, IN, LEN) - str r14, [sp, #64] - str r12, [sp, #68] - str r9, [sp, #72] - - mov r14, sp - - // Store updated block counter (x12) - str r8, [sp, #48] - - sub sp, #16 - - // Reload state and do next block - ldmia r14!, {r0-r11} // load x0-x11 - __strd r10, r11, sp, 8 // store x10-x11 before state - ldmia r14, {r10-r12,r14} // load x12-x15 - b .Lnext_block\@ - -.Lxor_slowpath\@: - // Slow path: < 64 bytes remaining, or unaligned input or output buffer. - // We handle it by storing the 64 bytes of keystream to the stack, then - // XOR-ing the needed portion with the data. - - // Allocate keystream buffer - sub sp, #64 - mov r14, sp - - // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN - // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0. - // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. 
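The structure of this slow path, in C (a sketch with invented names): one full block of keystream is written to a scratch buffer, then up to 64 bytes of data are XORed against it, a word at a time where possible and a byte at a time for the tail:

#include <stdint.h>
#include <string.h>

static void xor_keystream(unsigned char *dst, const unsigned char *src,
			  const unsigned char ks[64], unsigned int len)
{
	unsigned int n = len < 64 ? len : 64;
	unsigned int i = 0;

	/* word at a time (the asm needs 4-byte alignment of src/dst for
	 * this on older ARM; memcpy sidesteps the question in C) */
	for (; i + 4 <= n; i += 4) {
		uint32_t a, b;

		memcpy(&a, src + i, 4);
		memcpy(&b, ks + i, 4);
		a ^= b;
		memcpy(dst + i, &a, 4);
	}
	/* then byte at a time for whatever is left */
	for (; i < n; i++)
		dst[i] = src[i] ^ ks[i];
}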
- - // Save keystream for x0-x3 - __ldrd r8, r9, sp, 96 - __ldrd r10, r11, sp, 104 - add X0, X0, r8 - add X1, X1, r9 - add X2, X2, r10 - add X3, X3, r11 - _le32_bswap_4x X0, X1, X2, X3, r8 - stmia r14!, {X0-X3} - - // Save keystream for x4-x7 - __ldrd r8, r9, sp, 112 - __ldrd r10, r11, sp, 120 - add X4, r8, X4, ror #brot - add X5, r9, X5, ror #brot - add X6, r10, X6, ror #brot - add X7, r11, X7, ror #brot - _le32_bswap_4x X4, X5, X6, X7, r8 - add r8, sp, #64 - stmia r14!, {X4-X7} - - // Save keystream for x8-x15 - ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11) - __ldrd r8, r9, sp, 128 - __ldrd r10, r11, sp, 136 - add r0, r0, r8 // x8 - add r1, r1, r9 // x9 - add r6, r6, r10 // x10 - add r7, r7, r11 // x11 - _le32_bswap_4x r0, r1, r6, r7, r8 - stmia r14!, {r0,r1,r6,r7} - __ldrd r8, r9, sp, 144 - __ldrd r10, r11, sp, 152 - add r2, r8, r2, ror #drot // x12 - add r3, r9, r3, ror #drot // x13 - add r4, r10, r4, ror #drot // x14 - add r5, r11, r5, ror #drot // x15 - _le32_bswap_4x r2, r3, r4, r5, r9 - stmia r14, {r2-r5} - - // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN - // Registers: r8 is block counter, r12 is IN. - - ldr r9, [sp, #168] // LEN - ldr r14, [sp, #160] // OUT - cmp r9, #64 - mov r0, sp - movle r1, r9 - movgt r1, #64 - // r1 is number of bytes to XOR, in range [1, 64] - -.if __LINUX_ARM_ARCH__ < 6 - orr r2, r12, r14 - tst r2, #3 // IN or OUT misaligned? - bne .Lxor_next_byte\@ -.endif - - // XOR a word at a time -.rept 16 - subs r1, #4 - blt .Lxor_words_done\@ - ldr r2, [r12], #4 - ldr r3, [r0], #4 - eor r2, r2, r3 - str r2, [r14], #4 -.endr - b .Lxor_slowpath_done\@ -.Lxor_words_done\@: - ands r1, r1, #3 - beq .Lxor_slowpath_done\@ - - // XOR a byte at a time -.Lxor_next_byte\@: - ldrb r2, [r12], #1 - ldrb r3, [r0], #1 - eor r2, r2, r3 - strb r2, [r14], #1 - subs r1, #1 - bne .Lxor_next_byte\@ - -.Lxor_slowpath_done\@: - subs r9, #64 - add sp, #96 - bgt .Lprepare_for_next_block\@ - -.Ldone\@: -.endm // _chacha - -/* - * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes, - * const u32 *state, int nrounds); - */ -ENTRY(chacha_doarm) - cmp r2, #0 // len == 0? - reteq lr - - ldr ip, [sp] - cmp ip, #12 - - push {r0-r2,r4-r11,lr} - - // Push state x0-x15 onto stack. - // Also store an extra copy of x10-x11 just before the state. - - add X12, r3, #48 - ldm X12, {X12,X13,X14,X15} - push {X12,X13,X14,X15} - sub sp, sp, #64 - - __ldrd X8_X10, X9_X11, r3, 40 - __strd X8_X10, X9_X11, sp, 8 - __strd X8_X10, X9_X11, sp, 56 - ldm r3, {X0-X9_X11} - __strd X0, X1, sp, 16 - __strd X2, X3, sp, 24 - __strd X4, X5, sp, 32 - __strd X6, X7, sp, 40 - __strd X8_X10, X9_X11, sp, 48 - - beq 1f - _chacha 20 - -0: add sp, #76 - pop {r4-r11, pc} - -1: _chacha 12 - b 0b -ENDPROC(chacha_doarm) - -/* - * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds); - */ -ENTRY(hchacha_block_arm) - push {r1,r4-r11,lr} - - cmp r2, #12 // ChaCha12 ? 
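For reference, what hchacha_block_arm computes, in C (a sketch): run the bare 20- or 12-round permutation, with no final addition of the input state, and emit words 0..3 and 12..15 of the result. This is the HChaCha construction used to derive XChaCha subkeys.

#include <stdint.h>

static inline uint32_t rol32(uint32_t v, unsigned int n)
{
	return (v << n) | (v >> (32 - n));
}

static void chacha_permute(uint32_t x[16], int nrounds)
{
#define QR(a, b, c, d) do {				\
	x[a] += x[b]; x[d] = rol32(x[d] ^ x[a], 16);	\
	x[c] += x[d]; x[b] = rol32(x[b] ^ x[c], 12);	\
	x[a] += x[b]; x[d] = rol32(x[d] ^ x[a],  8);	\
	x[c] += x[d]; x[b] = rol32(x[b] ^ x[c],  7);	\
} while (0)
	int i;

	for (i = 0; i < nrounds; i += 2) {
		QR(0, 4,  8, 12); QR(1, 5,  9, 13);	/* column round */
		QR(2, 6, 10, 14); QR(3, 7, 11, 15);
		QR(0, 5, 10, 15); QR(1, 6, 11, 12);	/* diagonal round */
		QR(2, 7,  8, 13); QR(3, 4,  9, 14);
	}
#undef QR
}

static void hchacha_block(const uint32_t state[16], uint32_t out[8],
			  int nrounds)
{
	uint32_t x[16];
	int i;

	for (i = 0; i < 16; i++)
		x[i] = state[i];
	chacha_permute(x, nrounds);
	for (i = 0; i < 4; i++) {
		out[i] = x[i];		/* words 0..3 */
		out[i + 4] = x[i + 12];	/* words 12..15 */
	}
}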
- - mov r14, r0 - ldmia r14!, {r0-r11} // load x0-x11 - push {r10-r11} // store x10-x11 to stack - ldm r14, {r10-r12,r14} // load x12-x15 - sub sp, #8 - - beq 1f - _chacha_permute 20 - - // Skip over (unused0-unused1, x10-x11) -0: add sp, #16 - - // Fix up rotations of x12-x15 - ror X12, X12, #drot - ror X13, X13, #drot - pop {r4} // load 'out' - ror X14, X14, #drot - ror X15, X15, #drot - - // Store (x0-x3,x12-x15) to 'out' - stm r4, {X0,X1,X2,X3,X12,X13,X14,X15} - - pop {r4-r11,pc} - -1: _chacha_permute 12 - b 0b -ENDPROC(hchacha_block_arm) diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c index aabfcf522a2c..a52dcc8c1e33 100644 --- a/arch/arm/crypto/ghash-ce-glue.c +++ b/arch/arm/crypto/ghash-ce-glue.c @@ -8,22 +8,22 @@ #include <asm/hwcap.h> #include <asm/neon.h> -#include <asm/simd.h> -#include <linux/unaligned.h> #include <crypto/aes.h> -#include <crypto/gcm.h> #include <crypto/b128ops.h> -#include <crypto/cryptd.h> +#include <crypto/gcm.h> +#include <crypto/gf128mul.h> +#include <crypto/ghash.h> #include <crypto/internal/aead.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> -#include <crypto/gf128mul.h> #include <crypto/scatterwalk.h> #include <linux/cpufeature.h> -#include <linux/crypto.h> +#include <linux/errno.h> #include <linux/jump_label.h> +#include <linux/kernel.h> #include <linux/module.h> +#include <linux/string.h> +#include <linux/unaligned.h> MODULE_DESCRIPTION("GHASH hash function using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ardb@kernel.org>"); @@ -32,9 +32,6 @@ MODULE_ALIAS_CRYPTO("ghash"); MODULE_ALIAS_CRYPTO("gcm(aes)"); MODULE_ALIAS_CRYPTO("rfc4106(gcm(aes))"); -#define GHASH_BLOCK_SIZE 16 -#define GHASH_DIGEST_SIZE 16 - #define RFC4106_NONCE_SIZE 4 struct ghash_key { @@ -49,10 +46,8 @@ struct gcm_key { u8 nonce[]; // for RFC4106 nonce }; -struct ghash_desc_ctx { +struct arm_ghash_desc_ctx { u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)]; - u8 buf[GHASH_BLOCK_SIZE]; - u32 count; }; asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src, @@ -65,9 +60,9 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_p64); static int ghash_init(struct shash_desc *desc) { - struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); + struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc); - *ctx = (struct ghash_desc_ctx){}; + *ctx = (struct arm_ghash_desc_ctx){}; return 0; } @@ -85,52 +80,49 @@ static void ghash_do_update(int blocks, u64 dg[], const char *src, static int ghash_update(struct shash_desc *desc, const u8 *src, unsigned int len) { - struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); - unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; - - ctx->count += len; + struct ghash_key *key = crypto_shash_ctx(desc->tfm); + struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc); + int blocks; - if ((partial + len) >= GHASH_BLOCK_SIZE) { - struct ghash_key *key = crypto_shash_ctx(desc->tfm); - int blocks; + blocks = len / GHASH_BLOCK_SIZE; + ghash_do_update(blocks, ctx->digest, src, key, NULL); + return len - blocks * GHASH_BLOCK_SIZE; +} - if (partial) { - int p = GHASH_BLOCK_SIZE - partial; +static int ghash_export(struct shash_desc *desc, void *out) +{ + struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc); + u8 *dst = out; - memcpy(ctx->buf + partial, src, p); - src += p; - len -= p; - } + put_unaligned_be64(ctx->digest[1], dst); + put_unaligned_be64(ctx->digest[0], dst + 8); + return 0; +} - blocks = len / GHASH_BLOCK_SIZE; - len %= GHASH_BLOCK_SIZE; +static 
int ghash_import(struct shash_desc *desc, const void *in) +{ + struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc); + const u8 *src = in; - ghash_do_update(blocks, ctx->digest, src, key, - partial ? ctx->buf : NULL); - src += blocks * GHASH_BLOCK_SIZE; - partial = 0; - } - if (len) - memcpy(ctx->buf + partial, src, len); + ctx->digest[1] = get_unaligned_be64(src); + ctx->digest[0] = get_unaligned_be64(src + 8); return 0; } -static int ghash_final(struct shash_desc *desc, u8 *dst) +static int ghash_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *dst) { - struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); - unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; + struct ghash_key *key = crypto_shash_ctx(desc->tfm); + struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc); - if (partial) { - struct ghash_key *key = crypto_shash_ctx(desc->tfm); + if (len) { + u8 buf[GHASH_BLOCK_SIZE] = {}; - memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial); - ghash_do_update(1, ctx->digest, ctx->buf, key, NULL); + memcpy(buf, src, len); + ghash_do_update(1, ctx->digest, buf, key, NULL); + memzero_explicit(buf, sizeof(buf)); } - put_unaligned_be64(ctx->digest[1], dst); - put_unaligned_be64(ctx->digest[0], dst + 8); - - *ctx = (struct ghash_desc_ctx){}; - return 0; + return ghash_export(desc, dst); } static void ghash_reflect(u64 h[], const be128 *k) @@ -175,13 +167,17 @@ static struct shash_alg ghash_alg = { .digestsize = GHASH_DIGEST_SIZE, .init = ghash_init, .update = ghash_update, - .final = ghash_final, + .finup = ghash_finup, .setkey = ghash_setkey, - .descsize = sizeof(struct ghash_desc_ctx), + .export = ghash_export, + .import = ghash_import, + .descsize = sizeof(struct arm_ghash_desc_ctx), + .statesize = sizeof(struct ghash_desc_ctx), .base.cra_name = "ghash", .base.cra_driver_name = "ghash-ce", .base.cra_priority = 300, + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .base.cra_blocksize = GHASH_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct ghash_key) + sizeof(u64[2]), .base.cra_module = THIS_MODULE, @@ -317,9 +313,6 @@ static int gcm_encrypt(struct aead_request *req, const u8 *iv, u32 assoclen) u8 *tag, *dst; int tail, err; - if (WARN_ON_ONCE(!may_use_simd())) - return -EBUSY; - err = skcipher_walk_aead_encrypt(&walk, req, false); kernel_neon_begin(); @@ -409,9 +402,6 @@ static int gcm_decrypt(struct aead_request *req, const u8 *iv, u32 assoclen) u8 *tag, *dst; int tail, err, ret; - if (WARN_ON_ONCE(!may_use_simd())) - return -EBUSY; - scatterwalk_map_and_copy(otag, req->src, req->assoclen + req->cryptlen - authsize, authsize, 0); diff --git a/arch/arm/crypto/poly1305-armv4.pl b/arch/arm/crypto/poly1305-armv4.pl deleted file mode 100644 index 6d79498d3115..000000000000 --- a/arch/arm/crypto/poly1305-armv4.pl +++ /dev/null @@ -1,1236 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause -# -# ==================================================================== -# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL -# project. -# ==================================================================== -# -# IALU(*)/gcc-4.4 NEON -# -# ARM11xx(ARMv6) 7.78/+100% - -# Cortex-A5 6.35/+130% 3.00 -# Cortex-A8 6.25/+115% 2.36 -# Cortex-A9 5.10/+95% 2.55 -# Cortex-A15 3.85/+85% 1.25(**) -# Snapdragon S4 5.70/+100% 1.48(**) -# -# (*) this is for -march=armv6, i.e. 
with bunch of ldrb loading data; -# (**) these are trade-off results, they can be improved by ~8% but at -# the cost of 15/12% regression on Cortex-A5/A7, it's even possible -# to improve Cortex-A9 result, but then A5/A7 loose more than 20%; - -$flavour = shift; -if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } -else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -($ctx,$inp,$len,$padbit)=map("r$_",(0..3)); - -$code.=<<___; -#ifndef __KERNEL__ -# include "arm_arch.h" -#else -# define __ARM_ARCH__ __LINUX_ARM_ARCH__ -# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ -# define poly1305_init poly1305_init_arm -# define poly1305_blocks poly1305_blocks_arm -# define poly1305_emit poly1305_emit_arm -.globl poly1305_blocks_neon -#endif - -#if defined(__thumb2__) -.syntax unified -.thumb -#else -.code 32 -#endif - -.text - -.globl poly1305_emit -.globl poly1305_blocks -.globl poly1305_init -.type poly1305_init,%function -.align 5 -poly1305_init: -.Lpoly1305_init: - stmdb sp!,{r4-r11} - - eor r3,r3,r3 - cmp $inp,#0 - str r3,[$ctx,#0] @ zero hash value - str r3,[$ctx,#4] - str r3,[$ctx,#8] - str r3,[$ctx,#12] - str r3,[$ctx,#16] - str r3,[$ctx,#36] @ clear is_base2_26 - add $ctx,$ctx,#20 - -#ifdef __thumb2__ - it eq -#endif - moveq r0,#0 - beq .Lno_key - -#if __ARM_MAX_ARCH__>=7 - mov r3,#-1 - str r3,[$ctx,#28] @ impossible key power value -# ifndef __KERNEL__ - adr r11,.Lpoly1305_init - ldr r12,.LOPENSSL_armcap -# endif -#endif - ldrb r4,[$inp,#0] - mov r10,#0x0fffffff - ldrb r5,[$inp,#1] - and r3,r10,#-4 @ 0x0ffffffc - ldrb r6,[$inp,#2] - ldrb r7,[$inp,#3] - orr r4,r4,r5,lsl#8 - ldrb r5,[$inp,#4] - orr r4,r4,r6,lsl#16 - ldrb r6,[$inp,#5] - orr r4,r4,r7,lsl#24 - ldrb r7,[$inp,#6] - and r4,r4,r10 - -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -# if !defined(_WIN32) - ldr r12,[r11,r12] @ OPENSSL_armcap_P -# endif -# if defined(__APPLE__) || defined(_WIN32) - ldr r12,[r12] -# endif -#endif - ldrb r8,[$inp,#7] - orr r5,r5,r6,lsl#8 - ldrb r6,[$inp,#8] - orr r5,r5,r7,lsl#16 - ldrb r7,[$inp,#9] - orr r5,r5,r8,lsl#24 - ldrb r8,[$inp,#10] - and r5,r5,r3 - -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - tst r12,#ARMV7_NEON @ check for NEON -# ifdef __thumb2__ - adr r9,.Lpoly1305_blocks_neon - adr r11,.Lpoly1305_blocks - it ne - movne r11,r9 - adr r12,.Lpoly1305_emit - orr r11,r11,#1 @ thumb-ify addresses - orr r12,r12,#1 -# else - add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init) - ite eq - addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init) - addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init) -# endif -#endif - ldrb r9,[$inp,#11] - orr r6,r6,r7,lsl#8 - ldrb r7,[$inp,#12] - orr r6,r6,r8,lsl#16 - ldrb r8,[$inp,#13] - orr r6,r6,r9,lsl#24 - ldrb r9,[$inp,#14] - and r6,r6,r3 - - ldrb r10,[$inp,#15] - orr r7,r7,r8,lsl#8 - str r4,[$ctx,#0] - orr r7,r7,r9,lsl#16 - str r5,[$ctx,#4] - orr r7,r7,r10,lsl#24 - str r6,[$ctx,#8] - and r7,r7,r3 - str r7,[$ctx,#12] -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - stmia r2,{r11,r12} @ fill functions table - mov r0,#1 -#else - mov r0,#0 -#endif -.Lno_key: - ldmia sp!,{r4-r11} -#if __ARM_ARCH__>=5 - ret @ bx lr -#else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb 
ISA:-) -#endif -.size poly1305_init,.-poly1305_init -___ -{ -my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12)); -my ($s1,$s2,$s3)=($r1,$r2,$r3); - -$code.=<<___; -.type poly1305_blocks,%function -.align 5 -poly1305_blocks: -.Lpoly1305_blocks: - stmdb sp!,{r3-r11,lr} - - ands $len,$len,#-16 - beq .Lno_data - - add $len,$len,$inp @ end pointer - sub sp,sp,#32 - -#if __ARM_ARCH__<7 - ldmia $ctx,{$h0-$r3} @ load context - add $ctx,$ctx,#20 - str $len,[sp,#16] @ offload stuff - str $ctx,[sp,#12] -#else - ldr lr,[$ctx,#36] @ is_base2_26 - ldmia $ctx!,{$h0-$h4} @ load hash value - str $len,[sp,#16] @ offload stuff - str $ctx,[sp,#12] - - adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 - mov $r1,$h1,lsr#6 - adcs $r1,$r1,$h2,lsl#20 - mov $r2,$h2,lsr#12 - adcs $r2,$r2,$h3,lsl#14 - mov $r3,$h3,lsr#18 - adcs $r3,$r3,$h4,lsl#8 - mov $len,#0 - teq lr,#0 - str $len,[$ctx,#16] @ clear is_base2_26 - adc $len,$len,$h4,lsr#24 - - itttt ne - movne $h0,$r0 @ choose between radixes - movne $h1,$r1 - movne $h2,$r2 - movne $h3,$r3 - ldmia $ctx,{$r0-$r3} @ load key - it ne - movne $h4,$len -#endif - - mov lr,$inp - cmp $padbit,#0 - str $r1,[sp,#20] - str $r2,[sp,#24] - str $r3,[sp,#28] - b .Loop - -.align 4 -.Loop: -#if __ARM_ARCH__<7 - ldrb r0,[lr],#16 @ load input -# ifdef __thumb2__ - it hi -# endif - addhi $h4,$h4,#1 @ 1<<128 - ldrb r1,[lr,#-15] - ldrb r2,[lr,#-14] - ldrb r3,[lr,#-13] - orr r1,r0,r1,lsl#8 - ldrb r0,[lr,#-12] - orr r2,r1,r2,lsl#16 - ldrb r1,[lr,#-11] - orr r3,r2,r3,lsl#24 - ldrb r2,[lr,#-10] - adds $h0,$h0,r3 @ accumulate input - - ldrb r3,[lr,#-9] - orr r1,r0,r1,lsl#8 - ldrb r0,[lr,#-8] - orr r2,r1,r2,lsl#16 - ldrb r1,[lr,#-7] - orr r3,r2,r3,lsl#24 - ldrb r2,[lr,#-6] - adcs $h1,$h1,r3 - - ldrb r3,[lr,#-5] - orr r1,r0,r1,lsl#8 - ldrb r0,[lr,#-4] - orr r2,r1,r2,lsl#16 - ldrb r1,[lr,#-3] - orr r3,r2,r3,lsl#24 - ldrb r2,[lr,#-2] - adcs $h2,$h2,r3 - - ldrb r3,[lr,#-1] - orr r1,r0,r1,lsl#8 - str lr,[sp,#8] @ offload input pointer - orr r2,r1,r2,lsl#16 - add $s1,$r1,$r1,lsr#2 - orr r3,r2,r3,lsl#24 -#else - ldr r0,[lr],#16 @ load input - it hi - addhi $h4,$h4,#1 @ padbit - ldr r1,[lr,#-12] - ldr r2,[lr,#-8] - ldr r3,[lr,#-4] -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif - adds $h0,$h0,r0 @ accumulate input - str lr,[sp,#8] @ offload input pointer - adcs $h1,$h1,r1 - add $s1,$r1,$r1,lsr#2 - adcs $h2,$h2,r2 -#endif - add $s2,$r2,$r2,lsr#2 - adcs $h3,$h3,r3 - add $s3,$r3,$r3,lsr#2 - - umull r2,r3,$h1,$r0 - adc $h4,$h4,#0 - umull r0,r1,$h0,$r0 - umlal r2,r3,$h4,$s1 - umlal r0,r1,$h3,$s1 - ldr $r1,[sp,#20] @ reload $r1 - umlal r2,r3,$h2,$s3 - umlal r0,r1,$h1,$s3 - umlal r2,r3,$h3,$s2 - umlal r0,r1,$h2,$s2 - umlal r2,r3,$h0,$r1 - str r0,[sp,#0] @ future $h0 - mul r0,$s2,$h4 - ldr $r2,[sp,#24] @ reload $r2 - adds r2,r2,r1 @ d1+=d0>>32 - eor r1,r1,r1 - adc lr,r3,#0 @ future $h2 - str r2,[sp,#4] @ future $h1 - - mul r2,$s3,$h4 - eor r3,r3,r3 - umlal r0,r1,$h3,$s3 - ldr $r3,[sp,#28] @ reload $r3 - umlal r2,r3,$h3,$r0 - umlal r0,r1,$h2,$r0 - umlal r2,r3,$h2,$r1 - umlal r0,r1,$h1,$r1 - umlal r2,r3,$h1,$r2 - umlal r0,r1,$h0,$r2 - umlal r2,r3,$h0,$r3 - ldr $h0,[sp,#0] - mul $h4,$r0,$h4 - ldr $h1,[sp,#4] - - adds $h2,lr,r0 @ d2+=d1>>32 - ldr lr,[sp,#8] @ reload input pointer - adc r1,r1,#0 - adds $h3,r2,r1 @ d3+=d2>>32 - ldr r0,[sp,#16] @ reload end pointer - adc r3,r3,#0 - add $h4,$h4,r3 @ h4+=d3>>32 - - and r1,$h4,#-4 - and $h4,$h4,#3 - add r1,r1,r1,lsr#2 @ *=5 - adds $h0,$h0,r1 - adcs $h1,$h1,#0 - adcs $h2,$h2,#0 - adcs $h3,$h3,#0 - adc $h4,$h4,#0 - - cmp r0,lr @ done yet? 
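The reduction at the bottom of this loop folds the bits at and above 2^130 back into the low limbs, using 2^130 ≡ 5 (mod 2^130 - 5); that is what the 'and r1,$h4,#-4 ... add r1,r1,r1,lsr#2' sequence computes (4k + k = 5k for k = h4 >> 2). In C, as a sketch over the same five 32-bit limbs:

#include <stdint.h>

static void poly1305_fold(uint32_t h[5])
{
	uint32_t k = h[4] >> 2;			/* value / 2^130 */
	uint64_t t;

	h[4] &= 3;				/* value mod 2^130 */
	t = (uint64_t)h[0] + (uint64_t)k * 5;	/* fold back in, times 5 */
	h[0] = (uint32_t)t;
	t = (uint64_t)h[1] + (t >> 32);		/* then ripple the carry up */
	h[1] = (uint32_t)t;
	t = (uint64_t)h[2] + (t >> 32);
	h[2] = (uint32_t)t;
	t = (uint64_t)h[3] + (t >> 32);
	h[3] = (uint32_t)t;
	h[4] += (uint32_t)(t >> 32);
}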
- bhi .Loop - - ldr $ctx,[sp,#12] - add sp,sp,#32 - stmdb $ctx,{$h0-$h4} @ store the result - -.Lno_data: -#if __ARM_ARCH__>=5 - ldmia sp!,{r3-r11,pc} -#else - ldmia sp!,{r3-r11,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) -#endif -.size poly1305_blocks,.-poly1305_blocks -___ -} -{ -my ($ctx,$mac,$nonce)=map("r$_",(0..2)); -my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11)); -my $g4=$ctx; - -$code.=<<___; -.type poly1305_emit,%function -.align 5 -poly1305_emit: -.Lpoly1305_emit: - stmdb sp!,{r4-r11} - - ldmia $ctx,{$h0-$h4} - -#if __ARM_ARCH__>=7 - ldr ip,[$ctx,#36] @ is_base2_26 - - adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 - mov $g1,$h1,lsr#6 - adcs $g1,$g1,$h2,lsl#20 - mov $g2,$h2,lsr#12 - adcs $g2,$g2,$h3,lsl#14 - mov $g3,$h3,lsr#18 - adcs $g3,$g3,$h4,lsl#8 - mov $g4,#0 - adc $g4,$g4,$h4,lsr#24 - - tst ip,ip - itttt ne - movne $h0,$g0 - movne $h1,$g1 - movne $h2,$g2 - movne $h3,$g3 - it ne - movne $h4,$g4 -#endif - - adds $g0,$h0,#5 @ compare to modulus - adcs $g1,$h1,#0 - adcs $g2,$h2,#0 - adcs $g3,$h3,#0 - adc $g4,$h4,#0 - tst $g4,#4 @ did it carry/borrow? - -#ifdef __thumb2__ - it ne -#endif - movne $h0,$g0 - ldr $g0,[$nonce,#0] -#ifdef __thumb2__ - it ne -#endif - movne $h1,$g1 - ldr $g1,[$nonce,#4] -#ifdef __thumb2__ - it ne -#endif - movne $h2,$g2 - ldr $g2,[$nonce,#8] -#ifdef __thumb2__ - it ne -#endif - movne $h3,$g3 - ldr $g3,[$nonce,#12] - - adds $h0,$h0,$g0 - adcs $h1,$h1,$g1 - adcs $h2,$h2,$g2 - adc $h3,$h3,$g3 - -#if __ARM_ARCH__>=7 -# ifdef __ARMEB__ - rev $h0,$h0 - rev $h1,$h1 - rev $h2,$h2 - rev $h3,$h3 -# endif - str $h0,[$mac,#0] - str $h1,[$mac,#4] - str $h2,[$mac,#8] - str $h3,[$mac,#12] -#else - strb $h0,[$mac,#0] - mov $h0,$h0,lsr#8 - strb $h1,[$mac,#4] - mov $h1,$h1,lsr#8 - strb $h2,[$mac,#8] - mov $h2,$h2,lsr#8 - strb $h3,[$mac,#12] - mov $h3,$h3,lsr#8 - - strb $h0,[$mac,#1] - mov $h0,$h0,lsr#8 - strb $h1,[$mac,#5] - mov $h1,$h1,lsr#8 - strb $h2,[$mac,#9] - mov $h2,$h2,lsr#8 - strb $h3,[$mac,#13] - mov $h3,$h3,lsr#8 - - strb $h0,[$mac,#2] - mov $h0,$h0,lsr#8 - strb $h1,[$mac,#6] - mov $h1,$h1,lsr#8 - strb $h2,[$mac,#10] - mov $h2,$h2,lsr#8 - strb $h3,[$mac,#14] - mov $h3,$h3,lsr#8 - - strb $h0,[$mac,#3] - strb $h1,[$mac,#7] - strb $h2,[$mac,#11] - strb $h3,[$mac,#15] -#endif - ldmia sp!,{r4-r11} -#if __ARM_ARCH__>=5 - ret @ bx lr -#else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) -#endif -.size poly1305_emit,.-poly1305_emit -___ -{ -my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9)); -my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14)); -my ($T0,$T1,$MASK) = map("q$_",(15,4,0)); - -my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7)); - -$code.=<<___; -#if __ARM_MAX_ARCH__>=7 -.fpu neon - -.type poly1305_init_neon,%function -.align 5 -poly1305_init_neon: -.Lpoly1305_init_neon: - ldr r3,[$ctx,#48] @ first table element - cmp r3,#-1 @ is value impossible? 
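The final comparison in poly1305_emit above relies on a small trick: compute g = h + 5 and test bit 2 of the top limb. If that bit is set, the addition carried past 2^130, so h was at least 2^130 - 5 and the reduced value is g with 2^130 stripped; otherwise h was already reduced. Either way the tag is the low 128 bits plus the encrypted nonce. A C sketch with illustrative names:

#include <stdint.h>

static void poly1305_emit_tail(uint32_t h[5], uint8_t mac[16],
			       const uint32_t s[4])
{
	uint32_t g[5];
	uint64_t t = 5;
	int i;

	for (i = 0; i < 5; i++) {	/* g = h + 5 */
		t += h[i];
		g[i] = (uint32_t)t;
		t >>= 32;
	}
	if (g[4] & 4)			/* carried past 2^130? */
		for (i = 0; i < 4; i++)	/* (the asm selects with
					 * conditional moves instead) */
			h[i] = g[i];

	t = 0;				/* tag = (h + s) mod 2^128 */
	for (i = 0; i < 4; i++) {
		t += (uint64_t)h[i] + s[i];
		mac[4 * i + 0] = (uint8_t)t;
		mac[4 * i + 1] = (uint8_t)(t >> 8);
		mac[4 * i + 2] = (uint8_t)(t >> 16);
		mac[4 * i + 3] = (uint8_t)(t >> 24);
		t >>= 32;
	}
}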
- bne .Lno_init_neon - - ldr r4,[$ctx,#20] @ load key base 2^32 - ldr r5,[$ctx,#24] - ldr r6,[$ctx,#28] - ldr r7,[$ctx,#32] - - and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 - mov r3,r4,lsr#26 - mov r4,r5,lsr#20 - orr r3,r3,r5,lsl#6 - mov r5,r6,lsr#14 - orr r4,r4,r6,lsl#12 - mov r6,r7,lsr#8 - orr r5,r5,r7,lsl#18 - and r3,r3,#0x03ffffff - and r4,r4,#0x03ffffff - and r5,r5,#0x03ffffff - - vdup.32 $R0,r2 @ r^1 in both lanes - add r2,r3,r3,lsl#2 @ *5 - vdup.32 $R1,r3 - add r3,r4,r4,lsl#2 - vdup.32 $S1,r2 - vdup.32 $R2,r4 - add r4,r5,r5,lsl#2 - vdup.32 $S2,r3 - vdup.32 $R3,r5 - add r5,r6,r6,lsl#2 - vdup.32 $S3,r4 - vdup.32 $R4,r6 - vdup.32 $S4,r5 - - mov $zeros,#2 @ counter - -.Lsquare_neon: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - - vmull.u32 $D0,$R0,${R0}[1] - vmull.u32 $D1,$R1,${R0}[1] - vmull.u32 $D2,$R2,${R0}[1] - vmull.u32 $D3,$R3,${R0}[1] - vmull.u32 $D4,$R4,${R0}[1] - - vmlal.u32 $D0,$R4,${S1}[1] - vmlal.u32 $D1,$R0,${R1}[1] - vmlal.u32 $D2,$R1,${R1}[1] - vmlal.u32 $D3,$R2,${R1}[1] - vmlal.u32 $D4,$R3,${R1}[1] - - vmlal.u32 $D0,$R3,${S2}[1] - vmlal.u32 $D1,$R4,${S2}[1] - vmlal.u32 $D3,$R1,${R2}[1] - vmlal.u32 $D2,$R0,${R2}[1] - vmlal.u32 $D4,$R2,${R2}[1] - - vmlal.u32 $D0,$R2,${S3}[1] - vmlal.u32 $D3,$R0,${R3}[1] - vmlal.u32 $D1,$R3,${S3}[1] - vmlal.u32 $D2,$R4,${S3}[1] - vmlal.u32 $D4,$R1,${R3}[1] - - vmlal.u32 $D3,$R4,${S4}[1] - vmlal.u32 $D0,$R1,${S4}[1] - vmlal.u32 $D1,$R2,${S4}[1] - vmlal.u32 $D2,$R3,${S4}[1] - vmlal.u32 $D4,$R0,${R4}[1] - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - @ and P. Schwabe - @ - @ H0>>+H1>>+H2>>+H3>>+H4 - @ H3>>+H4>>*5+H0>>+H1 - @ - @ Trivia. - @ - @ Result of multiplication of n-bit number by m-bit number is - @ n+m bits wide. However! Even though 2^n is a n+1-bit number, - @ m-bit number multiplied by 2^n is still n+m bits wide. - @ - @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, - @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit - @ one is n+1 bits wide. - @ - @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that - @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 - @ can be 27. However! In cases when their width exceeds 26 bits - @ they are limited by 2^26+2^6. This in turn means that *sum* - @ of the products with these values can still be viewed as sum - @ of 52-bit numbers as long as the amount of addends is not a - @ power of 2. For example, - @ - @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, - @ - @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or - @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than - @ 8 * (2^52) or 2^55. However, the value is then multiplied by - @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), - @ which is less than 32 * (2^52) or 2^57. And when processing - @ data we are looking at triple as many addends... - @ - @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and - @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the - @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while - @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 - @ instruction accepts 2x32-bit input and writes 2x64-bit result. 
- @ This means that result of reduction have to be compressed upon - @ loop wrap-around. This can be done in the process of reduction - @ to minimize amount of instructions [as well as amount of - @ 128-bit instructions, which benefits low-end processors], but - @ one has to watch for H2 (which is narrower than H0) and 5*H4 - @ not being wider than 58 bits, so that result of right shift - @ by 26 bits fits in 32 bits. This is also useful on x86, - @ because it allows to use paddd in place for paddq, which - @ benefits Atom, where paddq is ridiculously slow. - - vshr.u64 $T0,$D3,#26 - vmovn.i64 $D3#lo,$D3 - vshr.u64 $T1,$D0,#26 - vmovn.i64 $D0#lo,$D0 - vadd.i64 $D4,$D4,$T0 @ h3 -> h4 - vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff - vadd.i64 $D1,$D1,$T1 @ h0 -> h1 - vbic.i32 $D0#lo,#0xfc000000 - - vshrn.u64 $T0#lo,$D4,#26 - vmovn.i64 $D4#lo,$D4 - vshr.u64 $T1,$D1,#26 - vmovn.i64 $D1#lo,$D1 - vadd.i64 $D2,$D2,$T1 @ h1 -> h2 - vbic.i32 $D4#lo,#0xfc000000 - vbic.i32 $D1#lo,#0xfc000000 - - vadd.i32 $D0#lo,$D0#lo,$T0#lo - vshl.u32 $T0#lo,$T0#lo,#2 - vshrn.u64 $T1#lo,$D2,#26 - vmovn.i64 $D2#lo,$D2 - vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0 - vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 - vbic.i32 $D2#lo,#0xfc000000 - - vshr.u32 $T0#lo,$D0#lo,#26 - vbic.i32 $D0#lo,#0xfc000000 - vshr.u32 $T1#lo,$D3#lo,#26 - vbic.i32 $D3#lo,#0xfc000000 - vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 - vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 - - subs $zeros,$zeros,#1 - beq .Lsquare_break_neon - - add $tbl0,$ctx,#(48+0*9*4) - add $tbl1,$ctx,#(48+1*9*4) - - vtrn.32 $R0,$D0#lo @ r^2:r^1 - vtrn.32 $R2,$D2#lo - vtrn.32 $R3,$D3#lo - vtrn.32 $R1,$D1#lo - vtrn.32 $R4,$D4#lo - - vshl.u32 $S2,$R2,#2 @ *5 - vshl.u32 $S3,$R3,#2 - vshl.u32 $S1,$R1,#2 - vshl.u32 $S4,$R4,#2 - vadd.i32 $S2,$S2,$R2 - vadd.i32 $S1,$S1,$R1 - vadd.i32 $S3,$S3,$R3 - vadd.i32 $S4,$S4,$R4 - - vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! - vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! - vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! - vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vst1.32 {${S4}[0]},[$tbl0,:32] - vst1.32 {${S4}[1]},[$tbl1,:32] - - b .Lsquare_neon - -.align 4 -.Lsquare_break_neon: - add $tbl0,$ctx,#(48+2*4*9) - add $tbl1,$ctx,#(48+3*4*9) - - vmov $R0,$D0#lo @ r^4:r^3 - vshl.u32 $S1,$D1#lo,#2 @ *5 - vmov $R1,$D1#lo - vshl.u32 $S2,$D2#lo,#2 - vmov $R2,$D2#lo - vshl.u32 $S3,$D3#lo,#2 - vmov $R3,$D3#lo - vshl.u32 $S4,$D4#lo,#2 - vmov $R4,$D4#lo - vadd.i32 $S1,$S1,$D1#lo - vadd.i32 $S2,$S2,$D2#lo - vadd.i32 $S3,$S3,$D3#lo - vadd.i32 $S4,$S4,$D4#lo - - vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! - vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! - vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! - vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vst1.32 {${S4}[0]},[$tbl0] - vst1.32 {${S4}[1]},[$tbl1] - -.Lno_init_neon: - ret @ bx lr -.size poly1305_init_neon,.-poly1305_init_neon - -.type poly1305_blocks_neon,%function -.align 5 -poly1305_blocks_neon: -.Lpoly1305_blocks_neon: - ldr ip,[$ctx,#36] @ is_base2_26 - - cmp $len,#64 - blo .Lpoly1305_blocks - - stmdb sp!,{r4-r7} - vstmdb sp!,{d8-d15} @ ABI specification says so - - tst ip,ip @ is_base2_26? 
- bne .Lbase2_26_neon - - stmdb sp!,{r1-r3,lr} - bl .Lpoly1305_init_neon - - ldr r4,[$ctx,#0] @ load hash value base 2^32 - ldr r5,[$ctx,#4] - ldr r6,[$ctx,#8] - ldr r7,[$ctx,#12] - ldr ip,[$ctx,#16] - - and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 - mov r3,r4,lsr#26 - veor $D0#lo,$D0#lo,$D0#lo - mov r4,r5,lsr#20 - orr r3,r3,r5,lsl#6 - veor $D1#lo,$D1#lo,$D1#lo - mov r5,r6,lsr#14 - orr r4,r4,r6,lsl#12 - veor $D2#lo,$D2#lo,$D2#lo - mov r6,r7,lsr#8 - orr r5,r5,r7,lsl#18 - veor $D3#lo,$D3#lo,$D3#lo - and r3,r3,#0x03ffffff - orr r6,r6,ip,lsl#24 - veor $D4#lo,$D4#lo,$D4#lo - and r4,r4,#0x03ffffff - mov r1,#1 - and r5,r5,#0x03ffffff - str r1,[$ctx,#36] @ set is_base2_26 - - vmov.32 $D0#lo[0],r2 - vmov.32 $D1#lo[0],r3 - vmov.32 $D2#lo[0],r4 - vmov.32 $D3#lo[0],r5 - vmov.32 $D4#lo[0],r6 - adr $zeros,.Lzeros - - ldmia sp!,{r1-r3,lr} - b .Lhash_loaded - -.align 4 -.Lbase2_26_neon: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ load hash value - - veor $D0#lo,$D0#lo,$D0#lo - veor $D1#lo,$D1#lo,$D1#lo - veor $D2#lo,$D2#lo,$D2#lo - veor $D3#lo,$D3#lo,$D3#lo - veor $D4#lo,$D4#lo,$D4#lo - vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! - adr $zeros,.Lzeros - vld1.32 {$D4#lo[0]},[$ctx] - sub $ctx,$ctx,#16 @ rewind - -.Lhash_loaded: - add $in2,$inp,#32 - mov $padbit,$padbit,lsl#24 - tst $len,#31 - beq .Leven - - vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]! - vmov.32 $H4#lo[0],$padbit - sub $len,$len,#16 - add $in2,$inp,#32 - -# ifdef __ARMEB__ - vrev32.8 $H0,$H0 - vrev32.8 $H3,$H3 - vrev32.8 $H1,$H1 - vrev32.8 $H2,$H2 -# endif - vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26 - vshl.u32 $H3#lo,$H3#lo,#18 - - vsri.u32 $H3#lo,$H2#lo,#14 - vshl.u32 $H2#lo,$H2#lo,#12 - vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi - - vbic.i32 $H3#lo,#0xfc000000 - vsri.u32 $H2#lo,$H1#lo,#20 - vshl.u32 $H1#lo,$H1#lo,#6 - - vbic.i32 $H2#lo,#0xfc000000 - vsri.u32 $H1#lo,$H0#lo,#26 - vadd.i32 $H3#hi,$H3#lo,$D3#lo - - vbic.i32 $H0#lo,#0xfc000000 - vbic.i32 $H1#lo,#0xfc000000 - vadd.i32 $H2#hi,$H2#lo,$D2#lo - - vadd.i32 $H0#hi,$H0#lo,$D0#lo - vadd.i32 $H1#hi,$H1#lo,$D1#lo - - mov $tbl1,$zeros - add $tbl0,$ctx,#48 - - cmp $len,$len - b .Long_tail - -.align 4 -.Leven: - subs $len,$len,#64 - it lo - movlo $in2,$zeros - - vmov.i32 $H4,#1<<24 @ padbit, yes, always - vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] - add $inp,$inp,#64 - vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) - add $in2,$in2,#64 - itt hi - addhi $tbl1,$ctx,#(48+1*9*4) - addhi $tbl0,$ctx,#(48+3*9*4) - -# ifdef __ARMEB__ - vrev32.8 $H0,$H0 - vrev32.8 $H3,$H3 - vrev32.8 $H1,$H1 - vrev32.8 $H2,$H2 -# endif - vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 - vshl.u32 $H3,$H3,#18 - - vsri.u32 $H3,$H2,#14 - vshl.u32 $H2,$H2,#12 - - vbic.i32 $H3,#0xfc000000 - vsri.u32 $H2,$H1,#20 - vshl.u32 $H1,$H1,#6 - - vbic.i32 $H2,#0xfc000000 - vsri.u32 $H1,$H0,#26 - - vbic.i32 $H0,#0xfc000000 - vbic.i32 $H1,#0xfc000000 - - bls .Lskip_loop - - vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2 - vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 - vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! 
- b .Loop_neon - -.align 5 -.Loop_neon: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - @ \___________________/ - @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - @ \___________________/ \____________________/ - @ - @ Note that we start with inp[2:3]*r^2. This is because it - @ doesn't depend on reduction in previous iteration. - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ inp[2:3]*r^2 - - vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1] - vmull.u32 $D2,$H2#hi,${R0}[1] - vadd.i32 $H0#lo,$H0#lo,$D0#lo - vmull.u32 $D0,$H0#hi,${R0}[1] - vadd.i32 $H3#lo,$H3#lo,$D3#lo - vmull.u32 $D3,$H3#hi,${R0}[1] - vmlal.u32 $D2,$H1#hi,${R1}[1] - vadd.i32 $H1#lo,$H1#lo,$D1#lo - vmull.u32 $D1,$H1#hi,${R0}[1] - - vadd.i32 $H4#lo,$H4#lo,$D4#lo - vmull.u32 $D4,$H4#hi,${R0}[1] - subs $len,$len,#64 - vmlal.u32 $D0,$H4#hi,${S1}[1] - it lo - movlo $in2,$zeros - vmlal.u32 $D3,$H2#hi,${R1}[1] - vld1.32 ${S4}[1],[$tbl1,:32] - vmlal.u32 $D1,$H0#hi,${R1}[1] - vmlal.u32 $D4,$H3#hi,${R1}[1] - - vmlal.u32 $D0,$H3#hi,${S2}[1] - vmlal.u32 $D3,$H1#hi,${R2}[1] - vmlal.u32 $D4,$H2#hi,${R2}[1] - vmlal.u32 $D1,$H4#hi,${S2}[1] - vmlal.u32 $D2,$H0#hi,${R2}[1] - - vmlal.u32 $D3,$H0#hi,${R3}[1] - vmlal.u32 $D0,$H2#hi,${S3}[1] - vmlal.u32 $D4,$H1#hi,${R3}[1] - vmlal.u32 $D1,$H3#hi,${S3}[1] - vmlal.u32 $D2,$H4#hi,${S3}[1] - - vmlal.u32 $D3,$H4#hi,${S4}[1] - vmlal.u32 $D0,$H1#hi,${S4}[1] - vmlal.u32 $D4,$H0#hi,${R4}[1] - vmlal.u32 $D1,$H2#hi,${S4}[1] - vmlal.u32 $D2,$H3#hi,${S4}[1] - - vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) - add $in2,$in2,#64 - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ (hash+inp[0:1])*r^4 and accumulate - - vmlal.u32 $D3,$H3#lo,${R0}[0] - vmlal.u32 $D0,$H0#lo,${R0}[0] - vmlal.u32 $D4,$H4#lo,${R0}[0] - vmlal.u32 $D1,$H1#lo,${R0}[0] - vmlal.u32 $D2,$H2#lo,${R0}[0] - vld1.32 ${S4}[0],[$tbl0,:32] - - vmlal.u32 $D3,$H2#lo,${R1}[0] - vmlal.u32 $D0,$H4#lo,${S1}[0] - vmlal.u32 $D4,$H3#lo,${R1}[0] - vmlal.u32 $D1,$H0#lo,${R1}[0] - vmlal.u32 $D2,$H1#lo,${R1}[0] - - vmlal.u32 $D3,$H1#lo,${R2}[0] - vmlal.u32 $D0,$H3#lo,${S2}[0] - vmlal.u32 $D4,$H2#lo,${R2}[0] - vmlal.u32 $D1,$H4#lo,${S2}[0] - vmlal.u32 $D2,$H0#lo,${R2}[0] - - vmlal.u32 $D3,$H0#lo,${R3}[0] - vmlal.u32 $D0,$H2#lo,${S3}[0] - vmlal.u32 $D4,$H1#lo,${R3}[0] - vmlal.u32 $D1,$H3#lo,${S3}[0] - vmlal.u32 $D3,$H4#lo,${S4}[0] - - vmlal.u32 $D2,$H4#lo,${S3}[0] - vmlal.u32 $D0,$H1#lo,${S4}[0] - vmlal.u32 $D4,$H0#lo,${R4}[0] - vmov.i32 $H4,#1<<24 @ padbit, yes, always - vmlal.u32 $D1,$H2#lo,${S4}[0] - vmlal.u32 $D2,$H3#lo,${S4}[0] - - vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] - add $inp,$inp,#64 -# ifdef __ARMEB__ - vrev32.8 $H0,$H0 - vrev32.8 $H1,$H1 - vrev32.8 $H2,$H2 - vrev32.8 $H3,$H3 -# endif - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ lazy reduction interleaved with base 2^32 -> base 2^26 of - @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4. 
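The lazy reduction the comment above introduces, modelled in C over one lane (a sketch): each step moves the bits above 26 of one base-2^26 limb into the next limb, and the carry out of h4 wraps around to h0 multiplied by 5, again because 2^130 ≡ 5 (mod 2^130 - 5). Two passes over the (h3, h0) pair and their successors leave every limb within a bit of 26 bits, which is all the width analysis earlier requires; the asm runs the same chain on 64-bit NEON lanes, interleaved with the radix conversion of the next input blocks.

#include <stdint.h>

static void lazy_reduce(uint64_t h[5])
{
	uint64_t c;

	c = h[3] >> 26; h[3] &= 0x3ffffff; h[4] += c;	  /* h3 -> h4 */
	c = h[0] >> 26; h[0] &= 0x3ffffff; h[1] += c;	  /* h0 -> h1 */
	c = h[4] >> 26; h[4] &= 0x3ffffff; h[0] += c * 5; /* h4 -> h0 */
	c = h[1] >> 26; h[1] &= 0x3ffffff; h[2] += c;	  /* h1 -> h2 */
	c = h[2] >> 26; h[2] &= 0x3ffffff; h[3] += c;	  /* h2 -> h3 */
	c = h[0] >> 26; h[0] &= 0x3ffffff; h[1] += c;	  /* h0 -> h1 */
	c = h[3] >> 26; h[3] &= 0x3ffffff; h[4] += c;	  /* h3 -> h4 */
}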
- - vshr.u64 $T0,$D3,#26 - vmovn.i64 $D3#lo,$D3 - vshr.u64 $T1,$D0,#26 - vmovn.i64 $D0#lo,$D0 - vadd.i64 $D4,$D4,$T0 @ h3 -> h4 - vbic.i32 $D3#lo,#0xfc000000 - vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 - vadd.i64 $D1,$D1,$T1 @ h0 -> h1 - vshl.u32 $H3,$H3,#18 - vbic.i32 $D0#lo,#0xfc000000 - - vshrn.u64 $T0#lo,$D4,#26 - vmovn.i64 $D4#lo,$D4 - vshr.u64 $T1,$D1,#26 - vmovn.i64 $D1#lo,$D1 - vadd.i64 $D2,$D2,$T1 @ h1 -> h2 - vsri.u32 $H3,$H2,#14 - vbic.i32 $D4#lo,#0xfc000000 - vshl.u32 $H2,$H2,#12 - vbic.i32 $D1#lo,#0xfc000000 - - vadd.i32 $D0#lo,$D0#lo,$T0#lo - vshl.u32 $T0#lo,$T0#lo,#2 - vbic.i32 $H3,#0xfc000000 - vshrn.u64 $T1#lo,$D2,#26 - vmovn.i64 $D2#lo,$D2 - vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec] - vsri.u32 $H2,$H1,#20 - vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 - vshl.u32 $H1,$H1,#6 - vbic.i32 $D2#lo,#0xfc000000 - vbic.i32 $H2,#0xfc000000 - - vshrn.u64 $T0#lo,$D0,#26 @ re-narrow - vmovn.i64 $D0#lo,$D0 - vsri.u32 $H1,$H0,#26 - vbic.i32 $H0,#0xfc000000 - vshr.u32 $T1#lo,$D3#lo,#26 - vbic.i32 $D3#lo,#0xfc000000 - vbic.i32 $D0#lo,#0xfc000000 - vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 - vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 - vbic.i32 $H1,#0xfc000000 - - bhi .Loop_neon - -.Lskip_loop: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - add $tbl1,$ctx,#(48+0*9*4) - add $tbl0,$ctx,#(48+1*9*4) - adds $len,$len,#32 - it ne - movne $len,#0 - bne .Long_tail - - vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi - vadd.i32 $H0#hi,$H0#lo,$D0#lo - vadd.i32 $H3#hi,$H3#lo,$D3#lo - vadd.i32 $H1#hi,$H1#lo,$D1#lo - vadd.i32 $H4#hi,$H4#lo,$D4#lo - -.Long_tail: - vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1 - vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2 - - vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant - vmull.u32 $D2,$H2#hi,$R0 - vadd.i32 $H0#lo,$H0#lo,$D0#lo - vmull.u32 $D0,$H0#hi,$R0 - vadd.i32 $H3#lo,$H3#lo,$D3#lo - vmull.u32 $D3,$H3#hi,$R0 - vadd.i32 $H1#lo,$H1#lo,$D1#lo - vmull.u32 $D1,$H1#hi,$R0 - vadd.i32 $H4#lo,$H4#lo,$D4#lo - vmull.u32 $D4,$H4#hi,$R0 - - vmlal.u32 $D0,$H4#hi,$S1 - vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! - vmlal.u32 $D3,$H2#hi,$R1 - vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! - vmlal.u32 $D1,$H0#hi,$R1 - vmlal.u32 $D4,$H3#hi,$R1 - vmlal.u32 $D2,$H1#hi,$R1 - - vmlal.u32 $D3,$H1#hi,$R2 - vld1.32 ${S4}[1],[$tbl1,:32] - vmlal.u32 $D0,$H3#hi,$S2 - vld1.32 ${S4}[0],[$tbl0,:32] - vmlal.u32 $D4,$H2#hi,$R2 - vmlal.u32 $D1,$H4#hi,$S2 - vmlal.u32 $D2,$H0#hi,$R2 - - vmlal.u32 $D3,$H0#hi,$R3 - it ne - addne $tbl1,$ctx,#(48+2*9*4) - vmlal.u32 $D0,$H2#hi,$S3 - it ne - addne $tbl0,$ctx,#(48+3*9*4) - vmlal.u32 $D4,$H1#hi,$R3 - vmlal.u32 $D1,$H3#hi,$S3 - vmlal.u32 $D2,$H4#hi,$S3 - - vmlal.u32 $D3,$H4#hi,$S4 - vorn $MASK,$MASK,$MASK @ all-ones, can be redundant - vmlal.u32 $D0,$H1#hi,$S4 - vshr.u64 $MASK,$MASK,#38 - vmlal.u32 $D4,$H0#hi,$R4 - vmlal.u32 $D1,$H2#hi,$S4 - vmlal.u32 $D2,$H3#hi,$S4 - - beq .Lshort_tail - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ (hash+inp[0:1])*r^4:r^3 and accumulate - - vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3 - vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 - - vmlal.u32 $D2,$H2#lo,$R0 - vmlal.u32 $D0,$H0#lo,$R0 - vmlal.u32 $D3,$H3#lo,$R0 - vmlal.u32 $D1,$H1#lo,$R0 - vmlal.u32 $D4,$H4#lo,$R0 - - vmlal.u32 $D0,$H4#lo,$S1 - vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! 
- vmlal.u32 $D3,$H2#lo,$R1 - vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! - vmlal.u32 $D1,$H0#lo,$R1 - vmlal.u32 $D4,$H3#lo,$R1 - vmlal.u32 $D2,$H1#lo,$R1 - - vmlal.u32 $D3,$H1#lo,$R2 - vld1.32 ${S4}[1],[$tbl1,:32] - vmlal.u32 $D0,$H3#lo,$S2 - vld1.32 ${S4}[0],[$tbl0,:32] - vmlal.u32 $D4,$H2#lo,$R2 - vmlal.u32 $D1,$H4#lo,$S2 - vmlal.u32 $D2,$H0#lo,$R2 - - vmlal.u32 $D3,$H0#lo,$R3 - vmlal.u32 $D0,$H2#lo,$S3 - vmlal.u32 $D4,$H1#lo,$R3 - vmlal.u32 $D1,$H3#lo,$S3 - vmlal.u32 $D2,$H4#lo,$S3 - - vmlal.u32 $D3,$H4#lo,$S4 - vorn $MASK,$MASK,$MASK @ all-ones - vmlal.u32 $D0,$H1#lo,$S4 - vshr.u64 $MASK,$MASK,#38 - vmlal.u32 $D4,$H0#lo,$R4 - vmlal.u32 $D1,$H2#lo,$S4 - vmlal.u32 $D2,$H3#lo,$S4 - -.Lshort_tail: - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ horizontal addition - - vadd.i64 $D3#lo,$D3#lo,$D3#hi - vadd.i64 $D0#lo,$D0#lo,$D0#hi - vadd.i64 $D4#lo,$D4#lo,$D4#hi - vadd.i64 $D1#lo,$D1#lo,$D1#hi - vadd.i64 $D2#lo,$D2#lo,$D2#hi - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ lazy reduction, but without narrowing - - vshr.u64 $T0,$D3,#26 - vand.i64 $D3,$D3,$MASK - vshr.u64 $T1,$D0,#26 - vand.i64 $D0,$D0,$MASK - vadd.i64 $D4,$D4,$T0 @ h3 -> h4 - vadd.i64 $D1,$D1,$T1 @ h0 -> h1 - - vshr.u64 $T0,$D4,#26 - vand.i64 $D4,$D4,$MASK - vshr.u64 $T1,$D1,#26 - vand.i64 $D1,$D1,$MASK - vadd.i64 $D2,$D2,$T1 @ h1 -> h2 - - vadd.i64 $D0,$D0,$T0 - vshl.u64 $T0,$T0,#2 - vshr.u64 $T1,$D2,#26 - vand.i64 $D2,$D2,$MASK - vadd.i64 $D0,$D0,$T0 @ h4 -> h0 - vadd.i64 $D3,$D3,$T1 @ h2 -> h3 - - vshr.u64 $T0,$D0,#26 - vand.i64 $D0,$D0,$MASK - vshr.u64 $T1,$D3,#26 - vand.i64 $D3,$D3,$MASK - vadd.i64 $D1,$D1,$T0 @ h0 -> h1 - vadd.i64 $D4,$D4,$T1 @ h3 -> h4 - - cmp $len,#0 - bne .Leven - - @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - @ store hash value - - vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! - vst1.32 {$D4#lo[0]},[$ctx] - - vldmia sp!,{d8-d15} @ epilogue - ldmia sp!,{r4-r7} - ret @ bx lr -.size poly1305_blocks_neon,.-poly1305_blocks_neon - -.align 5 -.Lzeros: -.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -#ifndef __KERNEL__ -.LOPENSSL_armcap: -# ifdef _WIN32 -.word OPENSSL_armcap_P -# else -.word OPENSSL_armcap_P-.Lpoly1305_init -# endif -.comm OPENSSL_armcap_P,4,4 -.hidden OPENSSL_armcap_P -#endif -#endif -___ -} } -$code.=<<___; -.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm" -.align 2 -___ - -foreach (split("\n",$code)) { - s/\`([^\`]*)\`/eval $1/geo; - - s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or - s/\bret\b/bx lr/go or - s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 - - print $_,"\n"; -} -close STDOUT; # enforce flush diff --git a/arch/arm/crypto/poly1305-glue.c b/arch/arm/crypto/poly1305-glue.c deleted file mode 100644 index 4464ffbf8fd1..000000000000 --- a/arch/arm/crypto/poly1305-glue.c +++ /dev/null @@ -1,274 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM - * - * Copyright (C) 2019 Linaro Ltd. 
<ard.biesheuvel@linaro.org> - */ - -#include <asm/hwcap.h> -#include <asm/neon.h> -#include <asm/simd.h> -#include <linux/unaligned.h> -#include <crypto/algapi.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/poly1305.h> -#include <crypto/internal/simd.h> -#include <linux/cpufeature.h> -#include <linux/crypto.h> -#include <linux/jump_label.h> -#include <linux/module.h> - -void poly1305_init_arm(void *state, const u8 *key); -void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit); -void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit); -void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce); - -void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit) -{ -} - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); - -void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) -{ - poly1305_init_arm(&dctx->h, key); - dctx->s[0] = get_unaligned_le32(key + 16); - dctx->s[1] = get_unaligned_le32(key + 20); - dctx->s[2] = get_unaligned_le32(key + 24); - dctx->s[3] = get_unaligned_le32(key + 28); - dctx->buflen = 0; -} -EXPORT_SYMBOL(poly1305_init_arch); - -static int arm_poly1305_init(struct shash_desc *desc) -{ - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - - dctx->buflen = 0; - dctx->rset = 0; - dctx->sset = false; - - return 0; -} - -static void arm_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, - u32 len, u32 hibit, bool do_neon) -{ - if (unlikely(!dctx->sset)) { - if (!dctx->rset) { - poly1305_init_arm(&dctx->h, src); - src += POLY1305_BLOCK_SIZE; - len -= POLY1305_BLOCK_SIZE; - dctx->rset = 1; - } - if (len >= POLY1305_BLOCK_SIZE) { - dctx->s[0] = get_unaligned_le32(src + 0); - dctx->s[1] = get_unaligned_le32(src + 4); - dctx->s[2] = get_unaligned_le32(src + 8); - dctx->s[3] = get_unaligned_le32(src + 12); - src += POLY1305_BLOCK_SIZE; - len -= POLY1305_BLOCK_SIZE; - dctx->sset = true; - } - if (len < POLY1305_BLOCK_SIZE) - return; - } - - len &= ~(POLY1305_BLOCK_SIZE - 1); - - if (static_branch_likely(&have_neon) && likely(do_neon)) - poly1305_blocks_neon(&dctx->h, src, len, hibit); - else - poly1305_blocks_arm(&dctx->h, src, len, hibit); -} - -static void arm_poly1305_do_update(struct poly1305_desc_ctx *dctx, - const u8 *src, u32 len, bool do_neon) -{ - if (unlikely(dctx->buflen)) { - u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen); - - memcpy(dctx->buf + dctx->buflen, src, bytes); - src += bytes; - len -= bytes; - dctx->buflen += bytes; - - if (dctx->buflen == POLY1305_BLOCK_SIZE) { - arm_poly1305_blocks(dctx, dctx->buf, - POLY1305_BLOCK_SIZE, 1, false); - dctx->buflen = 0; - } - } - - if (likely(len >= POLY1305_BLOCK_SIZE)) { - arm_poly1305_blocks(dctx, src, len, 1, do_neon); - src += round_down(len, POLY1305_BLOCK_SIZE); - len %= POLY1305_BLOCK_SIZE; - } - - if (unlikely(len)) { - dctx->buflen = len; - memcpy(dctx->buf, src, len); - } -} - -static int arm_poly1305_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) -{ - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - - arm_poly1305_do_update(dctx, src, srclen, false); - return 0; -} - -static int __maybe_unused arm_poly1305_update_neon(struct shash_desc *desc, - const u8 *src, - unsigned int srclen) -{ - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - bool do_neon = crypto_simd_usable() && srclen > 128; - - if (static_branch_likely(&have_neon) && do_neon) - kernel_neon_begin(); - arm_poly1305_do_update(dctx, src, srclen, do_neon); - if 
(static_branch_likely(&have_neon) && do_neon) - kernel_neon_end(); - return 0; -} - -void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, - unsigned int nbytes) -{ - bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - crypto_simd_usable(); - - if (unlikely(dctx->buflen)) { - u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen); - - memcpy(dctx->buf + dctx->buflen, src, bytes); - src += bytes; - nbytes -= bytes; - dctx->buflen += bytes; - - if (dctx->buflen == POLY1305_BLOCK_SIZE) { - poly1305_blocks_arm(&dctx->h, dctx->buf, - POLY1305_BLOCK_SIZE, 1); - dctx->buflen = 0; - } - } - - if (likely(nbytes >= POLY1305_BLOCK_SIZE)) { - unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE); - - if (static_branch_likely(&have_neon) && do_neon) { - do { - unsigned int todo = min_t(unsigned int, len, SZ_4K); - - kernel_neon_begin(); - poly1305_blocks_neon(&dctx->h, src, todo, 1); - kernel_neon_end(); - - len -= todo; - src += todo; - } while (len); - } else { - poly1305_blocks_arm(&dctx->h, src, len, 1); - src += len; - } - nbytes %= POLY1305_BLOCK_SIZE; - } - - if (unlikely(nbytes)) { - dctx->buflen = nbytes; - memcpy(dctx->buf, src, nbytes); - } -} -EXPORT_SYMBOL(poly1305_update_arch); - -void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) -{ - if (unlikely(dctx->buflen)) { - dctx->buf[dctx->buflen++] = 1; - memset(dctx->buf + dctx->buflen, 0, - POLY1305_BLOCK_SIZE - dctx->buflen); - poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); - } - - poly1305_emit_arm(&dctx->h, dst, dctx->s); - *dctx = (struct poly1305_desc_ctx){}; -} -EXPORT_SYMBOL(poly1305_final_arch); - -static int arm_poly1305_final(struct shash_desc *desc, u8 *dst) -{ - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - - if (unlikely(!dctx->sset)) - return -ENOKEY; - - poly1305_final_arch(dctx, dst); - return 0; -} - -static struct shash_alg arm_poly1305_algs[] = {{ - .init = arm_poly1305_init, - .update = arm_poly1305_update, - .final = arm_poly1305_final, - .digestsize = POLY1305_DIGEST_SIZE, - .descsize = sizeof(struct poly1305_desc_ctx), - - .base.cra_name = "poly1305", - .base.cra_driver_name = "poly1305-arm", - .base.cra_priority = 150, - .base.cra_blocksize = POLY1305_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -#ifdef CONFIG_KERNEL_MODE_NEON -}, { - .init = arm_poly1305_init, - .update = arm_poly1305_update_neon, - .final = arm_poly1305_final, - .digestsize = POLY1305_DIGEST_SIZE, - .descsize = sizeof(struct poly1305_desc_ctx), - - .base.cra_name = "poly1305", - .base.cra_driver_name = "poly1305-neon", - .base.cra_priority = 200, - .base.cra_blocksize = POLY1305_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -#endif -}}; - -static int __init arm_poly1305_mod_init(void) -{ - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && - (elf_hwcap & HWCAP_NEON)) - static_branch_enable(&have_neon); - else if (IS_REACHABLE(CONFIG_CRYPTO_HASH)) - /* register only the first entry */ - return crypto_register_shash(&arm_poly1305_algs[0]); - - return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? 
- crypto_register_shashes(arm_poly1305_algs, - ARRAY_SIZE(arm_poly1305_algs)) : 0; -} - -static void __exit arm_poly1305_mod_exit(void) -{ - if (!IS_REACHABLE(CONFIG_CRYPTO_HASH)) - return; - if (!static_branch_likely(&have_neon)) { - crypto_unregister_shash(&arm_poly1305_algs[0]); - return; - } - crypto_unregister_shashes(arm_poly1305_algs, - ARRAY_SIZE(arm_poly1305_algs)); -} - -module_init(arm_poly1305_mod_init); -module_exit(arm_poly1305_mod_exit); - -MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("poly1305"); -MODULE_ALIAS_CRYPTO("poly1305-arm"); -MODULE_ALIAS_CRYPTO("poly1305-neon"); diff --git a/arch/arm/crypto/sha1-armv4-large.S b/arch/arm/crypto/sha1-armv4-large.S deleted file mode 100644 index 1c8b685149f2..000000000000 --- a/arch/arm/crypto/sha1-armv4-large.S +++ /dev/null @@ -1,507 +0,0 @@ -#define __ARM_ARCH__ __LINUX_ARM_ARCH__ -@ SPDX-License-Identifier: GPL-2.0 - -@ This code is taken from the OpenSSL project but the author (Andy Polyakov) -@ has relicensed it under the GPLv2. Therefore this program is free software; -@ you can redistribute it and/or modify it under the terms of the GNU General -@ Public License version 2 as published by the Free Software Foundation. -@ -@ The original headers, including the original license headers, are -@ included below for completeness. - -@ ==================================================================== -@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL -@ project. The module is, however, dual licensed under OpenSSL and -@ CRYPTOGAMS licenses depending on where you obtain it. For further -@ details see https://www.openssl.org/~appro/cryptogams/. -@ ==================================================================== - -@ sha1_block procedure for ARMv4. -@ -@ January 2007. - -@ Size/performance trade-off -@ ==================================================================== -@ impl size in bytes comp cycles[*] measured performance -@ ==================================================================== -@ thumb 304 3212 4420 -@ armv4-small 392/+29% 1958/+64% 2250/+96% -@ armv4-compact 740/+89% 1552/+26% 1840/+22% -@ armv4-large 1420/+92% 1307/+19% 1370/+34%[***] -@ full unroll ~5100/+260% ~1260/+4% ~1300/+5% -@ ==================================================================== -@ thumb = same as 'small' but in Thumb instructions[**] and -@ with recurring code in two private functions; -@ small = detached Xload/update, loops are folded; -@ compact = detached Xload/update, 5x unroll; -@ large = interleaved Xload/update, 5x unroll; -@ full unroll = interleaved Xload/update, full unroll, estimated[!]; -@ -@ [*] Manually counted instructions in "grand" loop body. Measured -@ performance is affected by prologue and epilogue overhead, -@ i-cache availability, branch penalties, etc. -@ [**] While each Thumb instruction is twice smaller, they are not as -@ diverse as ARM ones: e.g., there are only two arithmetic -@ instructions with 3 arguments, no [fixed] rotate, addressing -@ modes are limited. As result it takes more instructions to do -@ the same job in Thumb, therefore the code is never twice as -@ small and always slower. -@ [***] which is also ~35% better than compiler generated code. Dual- -@ issue Cortex A8 core was measured to process input block in -@ ~990 cycles. - -@ August 2010. -@ -@ Rescheduling for dual-issue pipeline resulted in 13% improvement on -@ Cortex A8 core and in absolute terms ~870 cycles per input block -@ [or 13.6 cycles per byte]. 
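For reference against the unrolled round code below: each of the 80 rounds computes e += rol(a,5) + f(b,c,d) + K + W[i] followed by b = rol(b,30), with f and K fixed per 20-round group, and the ldr/eor/ror#31 clusters compute the standard message schedule W[i] = rol(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1). The assembly additionally keeps b, c and d permanently rotated by 30 (hence the 'ror #2' flexible operands) so the rotate costs nothing. A C sketch of one round:

#include <stdint.h>

static inline uint32_t rol32(uint32_t v, unsigned int n)
{
	return (v << n) | (v >> (32 - n));
}

static void sha1_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
		       uint32_t *e, uint32_t w, uint32_t k, int group)
{
	uint32_t f;

	switch (group) {
	case 0:				/* rounds 0..19:  Ch(b,c,d)  */
		f = (*b & (*c ^ *d)) ^ *d;
		break;
	case 2:				/* rounds 40..59: Maj(b,c,d) */
		f = (*b & *c) | (*d & (*b | *c));
		break;
	default:			/* rounds 20..39 and 60..79  */
		f = *b ^ *c ^ *d;
		break;
	}
	*e += rol32(*a, 5) + f + k + w;
	*b = rol32(*b, 30);
	/* the caller then rotates the roles: (a,b,c,d,e) <- (e,a,b,c,d) */
}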
- -@ February 2011. -@ -@ Profiler-assisted and platform-specific optimization resulted in 10% -@ improvement on Cortex A8 core and 12.2 cycles per byte. - -#include <linux/linkage.h> - -.text - -.align 2 -ENTRY(sha1_block_data_order) - stmdb sp!,{r4-r12,lr} - add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 - ldmia r0,{r3,r4,r5,r6,r7} -.Lloop: - ldr r8,.LK_00_19 - mov r14,sp - sub sp,sp,#15*4 - mov r5,r5,ror#30 - mov r6,r6,ror#30 - mov r7,r7,ror#30 @ [6] -.L_00_15: -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r7,r8,r7,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r5,r6 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r7,r8,r7,ror#2 @ E+=K_00_19 - eor r10,r5,r6 @ F_xx_xx - add r7,r7,r3,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r4,r10,ror#2 - add r7,r7,r9 @ E+=X[i] - eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r7,r7,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r6,r8,r6,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r4,r5 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r6,r8,r6,ror#2 @ E+=K_00_19 - eor r10,r4,r5 @ F_xx_xx - add r6,r6,r7,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r3,r10,ror#2 - add r6,r6,r9 @ E+=X[i] - eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r6,r6,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r5,r8,r5,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r3,r4 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r5,r8,r5,ror#2 @ E+=K_00_19 - eor r10,r3,r4 @ F_xx_xx - add r5,r5,r6,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r7,r10,ror#2 - add r5,r5,r9 @ E+=X[i] - eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r5,r5,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r4,r8,r4,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r7,r3 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r4,r8,r4,ror#2 @ E+=K_00_19 - eor r10,r7,r3 @ F_xx_xx - add r4,r4,r5,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r6,r10,ror#2 - add r4,r4,r9 @ E+=X[i] - eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r4,r4,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r3,r8,r3,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r6,r7 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r3,r8,r3,ror#2 @ E+=K_00_19 - eor r10,r6,r7 @ F_xx_xx - add r3,r3,r4,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r5,r10,ror#2 - add r3,r3,r9 @ E+=X[i] - eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! 
- add r3,r3,r10 @ E+=F_00_19(B,C,D) - cmp r14,sp - bne .L_00_15 @ [((11+4)*5+2)*3] - sub sp,sp,#25*4 -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r7,r8,r7,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r5,r6 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r7,r8,r7,ror#2 @ E+=K_00_19 - eor r10,r5,r6 @ F_xx_xx - add r7,r7,r3,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r4,r10,ror#2 - add r7,r7,r9 @ E+=X[i] - eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r7,r7,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r3,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) - add r6,r6,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r7,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) - add r5,r5,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r6,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) - add r4,r4,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r5,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) - add r3,r3,r10 @ E+=F_00_19(B,C,D) - - ldr r8,.LK_20_39 @ [+15+16*4] - cmn sp,#0 @ [+3], clear carry to denote 20_39 -.L_20_39_or_60_79: - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r7,r8,r7,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r5,r6 @ F_xx_xx - mov r9,r9,ror#31 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r4,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r7,r7,r9 @ E+=X[i] - add r7,r7,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- eor r10,r3,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - add r6,r6,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r7,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - add r5,r5,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r6,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - add r4,r4,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r5,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - add r3,r3,r10 @ E+=F_20_39(B,C,D) - ARM( teq r14,sp ) @ preserve carry - THUMB( mov r11,sp ) - THUMB( teq r14,r11 ) @ preserve carry - bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] - bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes - - ldr r8,.LK_40_59 - sub sp,sp,#20*4 @ [+2] -.L_40_59: - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r7,r8,r7,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r5,r6 @ F_xx_xx - mov r9,r9,ror#31 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r4,r10,ror#2 @ F_xx_xx - and r11,r5,r6 @ F_xx_xx - add r7,r7,r9 @ E+=X[i] - add r7,r7,r10 @ E+=F_40_59(B,C,D) - add r7,r7,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r3,r10,ror#2 @ F_xx_xx - and r11,r4,r5 @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - add r6,r6,r10 @ E+=F_40_59(B,C,D) - add r6,r6,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r7,r10,ror#2 @ F_xx_xx - and r11,r3,r4 @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - add r5,r5,r10 @ E+=F_40_59(B,C,D) - add r5,r5,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- and r10,r6,r10,ror#2 @ F_xx_xx - and r11,r7,r3 @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - add r4,r4,r10 @ E+=F_40_59(B,C,D) - add r4,r4,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r5,r10,ror#2 @ F_xx_xx - and r11,r6,r7 @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - add r3,r3,r10 @ E+=F_40_59(B,C,D) - add r3,r3,r11,ror#2 - cmp r14,sp - bne .L_40_59 @ [+((12+5)*5+2)*4] - - ldr r8,.LK_60_79 - sub sp,sp,#20*4 - cmp sp,#0 @ set carry to denote 60_79 - b .L_20_39_or_60_79 @ [+4], spare 300 bytes -.L_done: - add sp,sp,#80*4 @ "deallocate" stack frame - ldmia r0,{r8,r9,r10,r11,r12} - add r3,r8,r3 - add r4,r9,r4 - add r5,r10,r5,ror#2 - add r6,r11,r6,ror#2 - add r7,r12,r7,ror#2 - stmia r0,{r3,r4,r5,r6,r7} - teq r1,r2 - bne .Lloop @ [+18], total 1307 - - ldmia sp!,{r4-r12,pc} -.align 2 -.LK_00_19: .word 0x5a827999 -.LK_20_39: .word 0x6ed9eba1 -.LK_40_59: .word 0x8f1bbcdc -.LK_60_79: .word 0xca62c1d6 -ENDPROC(sha1_block_data_order) -.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>" -.align 2 diff --git a/arch/arm/crypto/sha1-armv7-neon.S b/arch/arm/crypto/sha1-armv7-neon.S deleted file mode 100644 index 28d816a6a530..000000000000 --- a/arch/arm/crypto/sha1-armv7-neon.S +++ /dev/null @@ -1,634 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function - * - * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi> - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> - -.syntax unified -.fpu neon - -.text - - -/* Context structure */ - -#define state_h0 0 -#define state_h1 4 -#define state_h2 8 -#define state_h3 12 -#define state_h4 16 - - -/* Constants */ - -#define K1 0x5A827999 -#define K2 0x6ED9EBA1 -#define K3 0x8F1BBCDC -#define K4 0xCA62C1D6 -.align 4 -.LK_VEC: -.LK1: .long K1, K1, K1, K1 -.LK2: .long K2, K2, K2, K2 -.LK3: .long K3, K3, K3, K3 -.LK4: .long K4, K4, K4, K4 - - -/* Register macros */ - -#define RSTATE r0 -#define RDATA r1 -#define RNBLKS r2 -#define ROLDSTACK r3 -#define RWK lr - -#define _a r4 -#define _b r5 -#define _c r6 -#define _d r7 -#define _e r8 - -#define RT0 r9 -#define RT1 r10 -#define RT2 r11 -#define RT3 r12 - -#define W0 q0 -#define W1 q7 -#define W2 q2 -#define W3 q3 -#define W4 q4 -#define W5 q6 -#define W6 q5 -#define W7 q1 - -#define tmp0 q8 -#define tmp1 q9 -#define tmp2 q10 -#define tmp3 q11 - -#define qK1 q12 -#define qK2 q13 -#define qK3 q14 -#define qK4 q15 - -#ifdef CONFIG_CPU_BIG_ENDIAN -#define ARM_LE(code...) -#else -#define ARM_LE(code...) code -#endif - -/* Round function macros. 
*/ - -#define WK_offs(i) (((i) & 15) * 4) - -#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ - W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - ldr RT3, [sp, WK_offs(i)]; \ - pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ - bic RT0, d, b; \ - add e, e, a, ror #(32 - 5); \ - and RT1, c, b; \ - pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ - add RT0, RT0, RT3; \ - add e, e, RT1; \ - ror b, #(32 - 30); \ - pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ - add e, e, RT0; - -#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ - W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - ldr RT3, [sp, WK_offs(i)]; \ - pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ - eor RT0, d, b; \ - add e, e, a, ror #(32 - 5); \ - eor RT0, RT0, c; \ - pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ - add e, e, RT3; \ - ror b, #(32 - 30); \ - pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ - add e, e, RT0; \ - -#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ - W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - ldr RT3, [sp, WK_offs(i)]; \ - pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ - eor RT0, b, c; \ - and RT1, b, c; \ - add e, e, a, ror #(32 - 5); \ - pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ - and RT0, RT0, d; \ - add RT1, RT1, RT3; \ - add e, e, RT0; \ - ror b, #(32 - 30); \ - pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ - add e, e, RT1; - -#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ - W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ - W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) - -#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\ - W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ - W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) - -#define R(a,b,c,d,e,f,i) \ - _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\ - W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) - -#define dummy(...) - - -/* Input expansion macros. 
*/ - -/********* Precalc macros for rounds 0-15 *************************************/ - -#define W_PRECALC_00_15() \ - add RWK, sp, #(WK_offs(0)); \ - \ - vld1.32 {W0, W7}, [RDATA]!; \ - ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \ - vld1.32 {W6, W5}, [RDATA]!; \ - vadd.u32 tmp0, W0, curK; \ - ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \ - ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \ - vadd.u32 tmp1, W7, curK; \ - ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \ - vadd.u32 tmp2, W6, curK; \ - vst1.32 {tmp0, tmp1}, [RWK]!; \ - vadd.u32 tmp3, W5, curK; \ - vst1.32 {tmp2, tmp3}, [RWK]; \ - -#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vld1.32 {W0, W7}, [RDATA]!; \ - -#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - add RWK, sp, #(WK_offs(0)); \ - -#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \ - -#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vld1.32 {W6, W5}, [RDATA]!; \ - -#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vadd.u32 tmp0, W0, curK; \ - -#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \ - -#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \ - -#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vadd.u32 tmp1, W7, curK; \ - -#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \ - -#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vadd.u32 tmp2, W6, curK; \ - -#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vst1.32 {tmp0, tmp1}, [RWK]!; \ - -#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vadd.u32 tmp3, W5, curK; \ - -#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vst1.32 {tmp2, tmp3}, [RWK]; \ - - -/********* Precalc macros for rounds 16-31 ************************************/ - -#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - veor tmp0, tmp0; \ - vext.8 W, W_m16, W_m12, #8; \ - -#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - add RWK, sp, #(WK_offs(i)); \ - vext.8 tmp0, W_m04, tmp0, #4; \ - -#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - veor tmp0, tmp0, W_m16; \ - veor.32 W, W, W_m08; \ - -#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - veor tmp1, tmp1; \ - veor W, W, tmp0; \ - -#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vshl.u32 tmp0, W, #1; \ - -#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vext.8 tmp1, tmp1, W, #(16-12); \ - vshr.u32 W, W, #31; \ - -#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vorr tmp0, tmp0, W; \ - vshr.u32 W, tmp1, #30; \ - -#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vshl.u32 tmp1, tmp1, #2; \ - -#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - veor tmp0, tmp0, W; \ - -#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - veor W, tmp0, tmp1; \ - -#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vadd.u32 tmp0, W, curK; \ - -#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - 
vst1.32 {tmp0}, [RWK]; - - -/********* Precalc macros for rounds 32-79 ************************************/ - -#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - veor W, W_m28; \ - -#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vext.8 tmp0, W_m08, W_m04, #8; \ - -#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - veor W, W_m16; \ - -#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - veor W, tmp0; \ - -#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - add RWK, sp, #(WK_offs(i&~3)); \ - -#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vshl.u32 tmp1, W, #2; \ - -#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vshr.u32 tmp0, W, #30; \ - -#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vorr W, tmp0, tmp1; \ - -#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vadd.u32 tmp0, W, curK; \ - -#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ - vst1.32 {tmp0}, [RWK]; - - -/* - * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA. - * - * unsigned int - * sha1_transform_neon (void *ctx, const unsigned char *data, - * unsigned int nblks) - */ -.align 3 -ENTRY(sha1_transform_neon) - /* input: - * r0: ctx, CTX - * r1: data (64*nblks bytes) - * r2: nblks - */ - - cmp RNBLKS, #0; - beq .Ldo_nothing; - - push {r4-r12, lr}; - /*vpush {q4-q7};*/ - - adr RT3, .LK_VEC; - - mov ROLDSTACK, sp; - - /* Align stack. */ - sub RT0, sp, #(16*4); - and RT0, #(~(16-1)); - mov sp, RT0; - - vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */ - - /* Get the values of the chaining variables. */ - ldm RSTATE, {_a-_e}; - - vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */ - -#undef curK -#define curK qK1 - /* Precalc 0-15. */ - W_PRECALC_00_15(); - -.Loop: - /* Transform 0-15 + Precalc 16-31. 
*/ - _R( _a, _b, _c, _d, _e, F1, 0, - WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16, - W4, W5, W6, W7, W0, _, _, _ ); - _R( _e, _a, _b, _c, _d, F1, 1, - WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16, - W4, W5, W6, W7, W0, _, _, _ ); - _R( _d, _e, _a, _b, _c, F1, 2, - WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16, - W4, W5, W6, W7, W0, _, _, _ ); - _R( _c, _d, _e, _a, _b, F1, 3, - WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16, - W4, W5, W6, W7, W0, _, _, _ ); - -#undef curK -#define curK qK2 - _R( _b, _c, _d, _e, _a, F1, 4, - WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20, - W3, W4, W5, W6, W7, _, _, _ ); - _R( _a, _b, _c, _d, _e, F1, 5, - WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20, - W3, W4, W5, W6, W7, _, _, _ ); - _R( _e, _a, _b, _c, _d, F1, 6, - WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20, - W3, W4, W5, W6, W7, _, _, _ ); - _R( _d, _e, _a, _b, _c, F1, 7, - WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20, - W3, W4, W5, W6, W7, _, _, _ ); - - _R( _c, _d, _e, _a, _b, F1, 8, - WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24, - W2, W3, W4, W5, W6, _, _, _ ); - _R( _b, _c, _d, _e, _a, F1, 9, - WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24, - W2, W3, W4, W5, W6, _, _, _ ); - _R( _a, _b, _c, _d, _e, F1, 10, - WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24, - W2, W3, W4, W5, W6, _, _, _ ); - _R( _e, _a, _b, _c, _d, F1, 11, - WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24, - W2, W3, W4, W5, W6, _, _, _ ); - - _R( _d, _e, _a, _b, _c, F1, 12, - WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28, - W1, W2, W3, W4, W5, _, _, _ ); - _R( _c, _d, _e, _a, _b, F1, 13, - WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28, - W1, W2, W3, W4, W5, _, _, _ ); - _R( _b, _c, _d, _e, _a, F1, 14, - WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28, - W1, W2, W3, W4, W5, _, _, _ ); - _R( _a, _b, _c, _d, _e, F1, 15, - WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28, - W1, W2, W3, W4, W5, _, _, _ ); - - /* Transform 16-63 + Precalc 32-79. 
*/ - _R( _e, _a, _b, _c, _d, F1, 16, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32, - W0, W1, W2, W3, W4, W5, W6, W7); - _R( _d, _e, _a, _b, _c, F1, 17, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32, - W0, W1, W2, W3, W4, W5, W6, W7); - _R( _c, _d, _e, _a, _b, F1, 18, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32, - W0, W1, W2, W3, W4, W5, W6, W7); - _R( _b, _c, _d, _e, _a, F1, 19, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32, - W0, W1, W2, W3, W4, W5, W6, W7); - - _R( _a, _b, _c, _d, _e, F2, 20, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36, - W7, W0, W1, W2, W3, W4, W5, W6); - _R( _e, _a, _b, _c, _d, F2, 21, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36, - W7, W0, W1, W2, W3, W4, W5, W6); - _R( _d, _e, _a, _b, _c, F2, 22, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36, - W7, W0, W1, W2, W3, W4, W5, W6); - _R( _c, _d, _e, _a, _b, F2, 23, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36, - W7, W0, W1, W2, W3, W4, W5, W6); - -#undef curK -#define curK qK3 - _R( _b, _c, _d, _e, _a, F2, 24, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40, - W6, W7, W0, W1, W2, W3, W4, W5); - _R( _a, _b, _c, _d, _e, F2, 25, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40, - W6, W7, W0, W1, W2, W3, W4, W5); - _R( _e, _a, _b, _c, _d, F2, 26, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40, - W6, W7, W0, W1, W2, W3, W4, W5); - _R( _d, _e, _a, _b, _c, F2, 27, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40, - W6, W7, W0, W1, W2, W3, W4, W5); - - _R( _c, _d, _e, _a, _b, F2, 28, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44, - W5, W6, W7, W0, W1, W2, W3, W4); - _R( _b, _c, _d, _e, _a, F2, 29, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44, - W5, W6, W7, W0, W1, W2, W3, W4); - _R( _a, _b, _c, _d, _e, F2, 30, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44, - W5, W6, W7, W0, W1, W2, W3, W4); - _R( _e, _a, _b, _c, _d, F2, 31, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44, - W5, W6, W7, W0, W1, W2, W3, W4); - - _R( _d, _e, _a, _b, _c, F2, 32, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48, - W4, W5, W6, W7, W0, W1, W2, W3); - _R( _c, _d, _e, _a, _b, F2, 33, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48, - W4, W5, W6, W7, W0, W1, W2, W3); - _R( _b, _c, _d, _e, _a, F2, 34, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48, - W4, W5, W6, W7, W0, W1, W2, W3); - _R( _a, _b, _c, _d, _e, F2, 35, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48, - W4, W5, W6, W7, W0, W1, W2, W3); - - _R( _e, _a, _b, _c, _d, F2, 36, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52, - W3, W4, W5, W6, W7, W0, W1, W2); - _R( _d, _e, _a, _b, _c, F2, 37, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52, - W3, W4, W5, W6, W7, W0, W1, W2); - _R( _c, _d, _e, _a, _b, F2, 38, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52, - W3, W4, W5, W6, W7, W0, W1, W2); - _R( _b, _c, _d, _e, _a, F2, 39, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52, - W3, W4, W5, W6, W7, W0, W1, W2); - - _R( _a, _b, _c, _d, _e, F3, 40, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56, - W2, W3, W4, W5, W6, W7, W0, W1); - _R( _e, _a, _b, _c, _d, F3, 41, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56, - W2, W3, W4, W5, W6, W7, W0, W1); - _R( _d, _e, _a, _b, _c, F3, 42, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56, - W2, W3, W4, W5, W6, W7, W0, W1); - _R( _c, _d, _e, _a, _b, F3, 43, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56, - W2, W3, W4, W5, W6, W7, W0, W1); - -#undef curK -#define curK qK4 - _R( _b, _c, _d, 
_e, _a, F3, 44, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60, - W1, W2, W3, W4, W5, W6, W7, W0); - _R( _a, _b, _c, _d, _e, F3, 45, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60, - W1, W2, W3, W4, W5, W6, W7, W0); - _R( _e, _a, _b, _c, _d, F3, 46, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60, - W1, W2, W3, W4, W5, W6, W7, W0); - _R( _d, _e, _a, _b, _c, F3, 47, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60, - W1, W2, W3, W4, W5, W6, W7, W0); - - _R( _c, _d, _e, _a, _b, F3, 48, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64, - W0, W1, W2, W3, W4, W5, W6, W7); - _R( _b, _c, _d, _e, _a, F3, 49, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64, - W0, W1, W2, W3, W4, W5, W6, W7); - _R( _a, _b, _c, _d, _e, F3, 50, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64, - W0, W1, W2, W3, W4, W5, W6, W7); - _R( _e, _a, _b, _c, _d, F3, 51, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64, - W0, W1, W2, W3, W4, W5, W6, W7); - - _R( _d, _e, _a, _b, _c, F3, 52, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68, - W7, W0, W1, W2, W3, W4, W5, W6); - _R( _c, _d, _e, _a, _b, F3, 53, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68, - W7, W0, W1, W2, W3, W4, W5, W6); - _R( _b, _c, _d, _e, _a, F3, 54, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68, - W7, W0, W1, W2, W3, W4, W5, W6); - _R( _a, _b, _c, _d, _e, F3, 55, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68, - W7, W0, W1, W2, W3, W4, W5, W6); - - _R( _e, _a, _b, _c, _d, F3, 56, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72, - W6, W7, W0, W1, W2, W3, W4, W5); - _R( _d, _e, _a, _b, _c, F3, 57, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72, - W6, W7, W0, W1, W2, W3, W4, W5); - _R( _c, _d, _e, _a, _b, F3, 58, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72, - W6, W7, W0, W1, W2, W3, W4, W5); - _R( _b, _c, _d, _e, _a, F3, 59, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72, - W6, W7, W0, W1, W2, W3, W4, W5); - - subs RNBLKS, #1; - - _R( _a, _b, _c, _d, _e, F4, 60, - WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76, - W5, W6, W7, W0, W1, W2, W3, W4); - _R( _e, _a, _b, _c, _d, F4, 61, - WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76, - W5, W6, W7, W0, W1, W2, W3, W4); - _R( _d, _e, _a, _b, _c, F4, 62, - WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76, - W5, W6, W7, W0, W1, W2, W3, W4); - _R( _c, _d, _e, _a, _b, F4, 63, - WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76, - W5, W6, W7, W0, W1, W2, W3, W4); - - beq .Lend; - - /* Transform 64-79 + Precalc 0-15 of next block. 
*/ -#undef curK -#define curK qK1 - _R( _b, _c, _d, _e, _a, F4, 64, - WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _a, _b, _c, _d, _e, F4, 65, - WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _e, _a, _b, _c, _d, F4, 66, - WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _d, _e, _a, _b, _c, F4, 67, - WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - - _R( _c, _d, _e, _a, _b, F4, 68, - dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _b, _c, _d, _e, _a, F4, 69, - dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _a, _b, _c, _d, _e, F4, 70, - WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _e, _a, _b, _c, _d, F4, 71, - WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - - _R( _d, _e, _a, _b, _c, F4, 72, - dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _c, _d, _e, _a, _b, F4, 73, - dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _b, _c, _d, _e, _a, F4, 74, - WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _a, _b, _c, _d, _e, F4, 75, - WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - - _R( _e, _a, _b, _c, _d, F4, 76, - WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _d, _e, _a, _b, _c, F4, 77, - WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _c, _d, _e, _a, _b, F4, 78, - WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ ); - _R( _b, _c, _d, _e, _a, F4, 79, - WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ ); - - /* Update the chaining variables. */ - ldm RSTATE, {RT0-RT3}; - add _a, RT0; - ldr RT0, [RSTATE, #state_h4]; - add _b, RT1; - add _c, RT2; - add _d, RT3; - add _e, RT0; - stm RSTATE, {_a-_e}; - - b .Loop; - -.Lend: - /* Transform 64-79 */ - R( _b, _c, _d, _e, _a, F4, 64 ); - R( _a, _b, _c, _d, _e, F4, 65 ); - R( _e, _a, _b, _c, _d, F4, 66 ); - R( _d, _e, _a, _b, _c, F4, 67 ); - R( _c, _d, _e, _a, _b, F4, 68 ); - R( _b, _c, _d, _e, _a, F4, 69 ); - R( _a, _b, _c, _d, _e, F4, 70 ); - R( _e, _a, _b, _c, _d, F4, 71 ); - R( _d, _e, _a, _b, _c, F4, 72 ); - R( _c, _d, _e, _a, _b, F4, 73 ); - R( _b, _c, _d, _e, _a, F4, 74 ); - R( _a, _b, _c, _d, _e, F4, 75 ); - R( _e, _a, _b, _c, _d, F4, 76 ); - R( _d, _e, _a, _b, _c, F4, 77 ); - R( _c, _d, _e, _a, _b, F4, 78 ); - R( _b, _c, _d, _e, _a, F4, 79 ); - - mov sp, ROLDSTACK; - - /* Update the chaining variables. */ - ldm RSTATE, {RT0-RT3}; - add _a, RT0; - ldr RT0, [RSTATE, #state_h4]; - add _b, RT1; - add _c, RT2; - add _d, RT3; - /*vpop {q4-q7};*/ - add _e, RT0; - stm RSTATE, {_a-_e}; - - pop {r4-r12, pc}; - -.Ldo_nothing: - bx lr -ENDPROC(sha1_transform_neon) diff --git a/arch/arm/crypto/sha1-ce-core.S b/arch/arm/crypto/sha1-ce-core.S deleted file mode 100644 index 8a702e051738..000000000000 --- a/arch/arm/crypto/sha1-ce-core.S +++ /dev/null @@ -1,123 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions - * - * Copyright (C) 2015 Linaro Ltd. 
- * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org> - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> - - .text - .arch armv8-a - .fpu crypto-neon-fp-armv8 - - k0 .req q0 - k1 .req q1 - k2 .req q2 - k3 .req q3 - - ta0 .req q4 - ta1 .req q5 - tb0 .req q5 - tb1 .req q4 - - dga .req q6 - dgb .req q7 - dgbs .req s28 - - dg0 .req q12 - dg1a0 .req q13 - dg1a1 .req q14 - dg1b0 .req q14 - dg1b1 .req q13 - - .macro add_only, op, ev, rc, s0, dg1 - .ifnb \s0 - vadd.u32 tb\ev, q\s0, \rc - .endif - sha1h.32 dg1b\ev, dg0 - .ifb \dg1 - sha1\op\().32 dg0, dg1a\ev, ta\ev - .else - sha1\op\().32 dg0, \dg1, ta\ev - .endif - .endm - - .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 - sha1su0.32 q\s0, q\s1, q\s2 - add_only \op, \ev, \rc, \s1, \dg1 - sha1su1.32 q\s0, q\s3 - .endm - - .align 6 -.Lsha1_rcon: - .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999 - .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1 - .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc - .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6 - - /* - * void sha1_ce_transform(struct sha1_state *sst, u8 const *src, - * int blocks); - */ -ENTRY(sha1_ce_transform) - /* load round constants */ - adr ip, .Lsha1_rcon - vld1.32 {k0-k1}, [ip, :128]! - vld1.32 {k2-k3}, [ip, :128] - - /* load state */ - vld1.32 {dga}, [r0] - vldr dgbs, [r0, #16] - - /* load input */ -0: vld1.32 {q8-q9}, [r1]! - vld1.32 {q10-q11}, [r1]! - subs r2, r2, #1 - -#ifndef CONFIG_CPU_BIG_ENDIAN - vrev32.8 q8, q8 - vrev32.8 q9, q9 - vrev32.8 q10, q10 - vrev32.8 q11, q11 -#endif - - vadd.u32 ta0, q8, k0 - vmov dg0, dga - - add_update c, 0, k0, 8, 9, 10, 11, dgb - add_update c, 1, k0, 9, 10, 11, 8 - add_update c, 0, k0, 10, 11, 8, 9 - add_update c, 1, k0, 11, 8, 9, 10 - add_update c, 0, k1, 8, 9, 10, 11 - - add_update p, 1, k1, 9, 10, 11, 8 - add_update p, 0, k1, 10, 11, 8, 9 - add_update p, 1, k1, 11, 8, 9, 10 - add_update p, 0, k1, 8, 9, 10, 11 - add_update p, 1, k2, 9, 10, 11, 8 - - add_update m, 0, k2, 10, 11, 8, 9 - add_update m, 1, k2, 11, 8, 9, 10 - add_update m, 0, k2, 8, 9, 10, 11 - add_update m, 1, k2, 9, 10, 11, 8 - add_update m, 0, k3, 10, 11, 8, 9 - - add_update p, 1, k3, 11, 8, 9, 10 - add_only p, 0, k3, 9 - add_only p, 1, k3, 10 - add_only p, 0, k3, 11 - add_only p, 1 - - /* update state */ - vadd.u32 dga, dga, dg0 - vadd.u32 dgb, dgb, dg1a0 - bne 0b - - /* store new state */ - vst1.32 {dga}, [r0] - vstr dgbs, [r0, #16] - bx lr -ENDPROC(sha1_ce_transform) diff --git a/arch/arm/crypto/sha1-ce-glue.c b/arch/arm/crypto/sha1-ce-glue.c deleted file mode 100644 index de9100c67b37..000000000000 --- a/arch/arm/crypto/sha1-ce-glue.c +++ /dev/null @@ -1,92 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions - * - * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org> - */ - -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <crypto/sha1.h> -#include <crypto/sha1_base.h> -#include <linux/cpufeature.h> -#include <linux/crypto.h> -#include <linux/module.h> - -#include <asm/hwcap.h> -#include <asm/neon.h> -#include <asm/simd.h> - -#include "sha1.h" - -MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); - -asmlinkage void sha1_ce_transform(struct sha1_state *sst, u8 const *src, - int blocks); - -static int sha1_ce_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - - if 
(!crypto_simd_usable() || - (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) - return sha1_update_arm(desc, data, len); - - kernel_neon_begin(); - sha1_base_do_update(desc, data, len, sha1_ce_transform); - kernel_neon_end(); - - return 0; -} - -static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - if (!crypto_simd_usable()) - return sha1_finup_arm(desc, data, len, out); - - kernel_neon_begin(); - if (len) - sha1_base_do_update(desc, data, len, sha1_ce_transform); - sha1_base_do_finalize(desc, sha1_ce_transform); - kernel_neon_end(); - - return sha1_base_finish(desc, out); -} - -static int sha1_ce_final(struct shash_desc *desc, u8 *out) -{ - return sha1_ce_finup(desc, NULL, 0, out); -} - -static struct shash_alg alg = { - .init = sha1_base_init, - .update = sha1_ce_update, - .final = sha1_ce_final, - .finup = sha1_ce_finup, - .descsize = sizeof(struct sha1_state), - .digestsize = SHA1_DIGEST_SIZE, - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-ce", - .cra_priority = 200, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init sha1_ce_mod_init(void) -{ - return crypto_register_shash(&alg); -} - -static void __exit sha1_ce_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_cpu_feature_match(SHA1, sha1_ce_mod_init); -module_exit(sha1_ce_mod_fini); diff --git a/arch/arm/crypto/sha1.h b/arch/arm/crypto/sha1.h deleted file mode 100644 index b1b7e21da2c3..000000000000 --- a/arch/arm/crypto/sha1.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ASM_ARM_CRYPTO_SHA1_H -#define ASM_ARM_CRYPTO_SHA1_H - -#include <linux/crypto.h> -#include <crypto/sha1.h> - -extern int sha1_update_arm(struct shash_desc *desc, const u8 *data, - unsigned int len); - -extern int sha1_finup_arm(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out); - -#endif diff --git a/arch/arm/crypto/sha1_glue.c b/arch/arm/crypto/sha1_glue.c deleted file mode 100644 index 95a727bcd664..000000000000 --- a/arch/arm/crypto/sha1_glue.c +++ /dev/null @@ -1,86 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Cryptographic API. - * Glue code for the SHA1 Secure Hash Algorithm assembler implementation - * - * This file is based on sha1_generic.c and sha1_ssse3_glue.c - * - * Copyright (c) Alan Smithee. 
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> - * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> - * Copyright (c) Mathias Krause <minipli@googlemail.com> - */ - -#include <crypto/internal/hash.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/types.h> -#include <crypto/sha1.h> -#include <crypto/sha1_base.h> -#include <asm/byteorder.h> - -#include "sha1.h" - -asmlinkage void sha1_block_data_order(struct sha1_state *digest, - const u8 *data, int rounds); - -int sha1_update_arm(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - /* make sure signature matches sha1_block_fn() */ - BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0); - - return sha1_base_do_update(desc, data, len, sha1_block_data_order); -} -EXPORT_SYMBOL_GPL(sha1_update_arm); - -static int sha1_final(struct shash_desc *desc, u8 *out) -{ - sha1_base_do_finalize(desc, sha1_block_data_order); - return sha1_base_finish(desc, out); -} - -int sha1_finup_arm(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - sha1_base_do_update(desc, data, len, sha1_block_data_order); - return sha1_final(desc, out); -} -EXPORT_SYMBOL_GPL(sha1_finup_arm); - -static struct shash_alg alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_update_arm, - .final = sha1_final, - .finup = sha1_finup_arm, - .descsize = sizeof(struct sha1_state), - .base = { - .cra_name = "sha1", - .cra_driver_name= "sha1-asm", - .cra_priority = 150, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - - -static int __init sha1_mod_init(void) -{ - return crypto_register_shash(&alg); -} - - -static void __exit sha1_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - - -module_init(sha1_mod_init); -module_exit(sha1_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm (ARM)"); -MODULE_ALIAS_CRYPTO("sha1"); -MODULE_AUTHOR("David McCullough <ucdevel@gmail.com>"); diff --git a/arch/arm/crypto/sha1_neon_glue.c b/arch/arm/crypto/sha1_neon_glue.c deleted file mode 100644 index 9c70b87e69f7..000000000000 --- a/arch/arm/crypto/sha1_neon_glue.c +++ /dev/null @@ -1,102 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using - * ARM NEON instructions. - * - * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi> - * - * This file is based on sha1_generic.c and sha1_ssse3_glue.c: - * Copyright (c) Alan Smithee. 
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> - * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> - * Copyright (c) Mathias Krause <minipli@googlemail.com> - * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com> - */ - -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/types.h> -#include <crypto/sha1.h> -#include <crypto/sha1_base.h> -#include <asm/neon.h> -#include <asm/simd.h> - -#include "sha1.h" - -asmlinkage void sha1_transform_neon(struct sha1_state *state_h, - const u8 *data, int rounds); - -static int sha1_neon_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable() || - (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) - return sha1_update_arm(desc, data, len); - - kernel_neon_begin(); - sha1_base_do_update(desc, data, len, sha1_transform_neon); - kernel_neon_end(); - - return 0; -} - -static int sha1_neon_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - if (!crypto_simd_usable()) - return sha1_finup_arm(desc, data, len, out); - - kernel_neon_begin(); - if (len) - sha1_base_do_update(desc, data, len, sha1_transform_neon); - sha1_base_do_finalize(desc, sha1_transform_neon); - kernel_neon_end(); - - return sha1_base_finish(desc, out); -} - -static int sha1_neon_final(struct shash_desc *desc, u8 *out) -{ - return sha1_neon_finup(desc, NULL, 0, out); -} - -static struct shash_alg alg = { - .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_base_init, - .update = sha1_neon_update, - .final = sha1_neon_final, - .finup = sha1_neon_finup, - .descsize = sizeof(struct sha1_state), - .base = { - .cra_name = "sha1", - .cra_driver_name = "sha1-neon", - .cra_priority = 250, - .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init sha1_neon_mod_init(void) -{ - if (!cpu_has_neon()) - return -ENODEV; - - return crypto_register_shash(&alg); -} - -static void __exit sha1_neon_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(sha1_neon_mod_init); -module_exit(sha1_neon_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, NEON accelerated"); -MODULE_ALIAS_CRYPTO("sha1"); diff --git a/arch/arm/crypto/sha2-ce-core.S b/arch/arm/crypto/sha2-ce-core.S deleted file mode 100644 index b6369d2440a1..000000000000 --- a/arch/arm/crypto/sha2-ce-core.S +++ /dev/null @@ -1,123 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * sha2-ce-core.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions - * - * Copyright (C) 2015 Linaro Ltd. - * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org> - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> - - .text - .arch armv8-a - .fpu crypto-neon-fp-armv8 - - k0 .req q7 - k1 .req q8 - rk .req r3 - - ta0 .req q9 - ta1 .req q10 - tb0 .req q10 - tb1 .req q9 - - dga .req q11 - dgb .req q12 - - dg0 .req q13 - dg1 .req q14 - dg2 .req q15 - - .macro add_only, ev, s0 - vmov dg2, dg0 - .ifnb \s0 - vld1.32 {k\ev}, [rk, :128]! 
- .endif - sha256h.32 dg0, dg1, tb\ev - sha256h2.32 dg1, dg2, tb\ev - .ifnb \s0 - vadd.u32 ta\ev, q\s0, k\ev - .endif - .endm - - .macro add_update, ev, s0, s1, s2, s3 - sha256su0.32 q\s0, q\s1 - add_only \ev, \s1 - sha256su1.32 q\s0, q\s2, q\s3 - .endm - - .align 6 -.Lsha256_rcon: - .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 - .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 - .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 - .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 - .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc - .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da - .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 - .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 - .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 - .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 - .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 - .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 - .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 - .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 - .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 - .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 - - /* - * void sha2_ce_transform(struct sha256_state *sst, u8 const *src, - int blocks); - */ -ENTRY(sha2_ce_transform) - /* load state */ - vld1.32 {dga-dgb}, [r0] - - /* load input */ -0: vld1.32 {q0-q1}, [r1]! - vld1.32 {q2-q3}, [r1]! - subs r2, r2, #1 - -#ifndef CONFIG_CPU_BIG_ENDIAN - vrev32.8 q0, q0 - vrev32.8 q1, q1 - vrev32.8 q2, q2 - vrev32.8 q3, q3 -#endif - - /* load first round constant */ - adr rk, .Lsha256_rcon - vld1.32 {k0}, [rk, :128]! - - vadd.u32 ta0, q0, k0 - vmov dg0, dga - vmov dg1, dgb - - add_update 1, 0, 1, 2, 3 - add_update 0, 1, 2, 3, 0 - add_update 1, 2, 3, 0, 1 - add_update 0, 3, 0, 1, 2 - add_update 1, 0, 1, 2, 3 - add_update 0, 1, 2, 3, 0 - add_update 1, 2, 3, 0, 1 - add_update 0, 3, 0, 1, 2 - add_update 1, 0, 1, 2, 3 - add_update 0, 1, 2, 3, 0 - add_update 1, 2, 3, 0, 1 - add_update 0, 3, 0, 1, 2 - - add_only 1, 1 - add_only 0, 2 - add_only 1, 3 - add_only 0 - - /* update state */ - vadd.u32 dga, dga, dg0 - vadd.u32 dgb, dgb, dg1 - bne 0b - - /* store new state */ - vst1.32 {dga-dgb}, [r0] - bx lr -ENDPROC(sha2_ce_transform) diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c deleted file mode 100644 index aeac45bfbf9f..000000000000 --- a/arch/arm/crypto/sha2-ce-glue.c +++ /dev/null @@ -1,109 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions - * - * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org> - */ - -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <crypto/sha2.h> -#include <crypto/sha256_base.h> -#include <linux/cpufeature.h> -#include <linux/crypto.h> -#include <linux/module.h> - -#include <asm/hwcap.h> -#include <asm/simd.h> -#include <asm/neon.h> -#include <linux/unaligned.h> - -#include "sha256_glue.h" - -MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); - -asmlinkage void sha2_ce_transform(struct sha256_state *sst, u8 const *src, - int blocks); - -static int sha2_ce_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable() || - (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) - return crypto_sha256_arm_update(desc, data, len); - - kernel_neon_begin(); 
- sha256_base_do_update(desc, data, len, - (sha256_block_fn *)sha2_ce_transform); - kernel_neon_end(); - - return 0; -} - -static int sha2_ce_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - if (!crypto_simd_usable()) - return crypto_sha256_arm_finup(desc, data, len, out); - - kernel_neon_begin(); - if (len) - sha256_base_do_update(desc, data, len, - (sha256_block_fn *)sha2_ce_transform); - sha256_base_do_finalize(desc, (sha256_block_fn *)sha2_ce_transform); - kernel_neon_end(); - - return sha256_base_finish(desc, out); -} - -static int sha2_ce_final(struct shash_desc *desc, u8 *out) -{ - return sha2_ce_finup(desc, NULL, 0, out); -} - -static struct shash_alg algs[] = { { - .init = sha224_base_init, - .update = sha2_ce_update, - .final = sha2_ce_final, - .finup = sha2_ce_finup, - .descsize = sizeof(struct sha256_state), - .digestsize = SHA224_DIGEST_SIZE, - .base = { - .cra_name = "sha224", - .cra_driver_name = "sha224-ce", - .cra_priority = 300, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .init = sha256_base_init, - .update = sha2_ce_update, - .final = sha2_ce_final, - .finup = sha2_ce_finup, - .descsize = sizeof(struct sha256_state), - .digestsize = SHA256_DIGEST_SIZE, - .base = { - .cra_name = "sha256", - .cra_driver_name = "sha256-ce", - .cra_priority = 300, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int __init sha2_ce_mod_init(void) -{ - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); -} - -static void __exit sha2_ce_mod_fini(void) -{ - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); -} - -module_cpu_feature_match(SHA2, sha2_ce_mod_init); -module_exit(sha2_ce_mod_fini); diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl deleted file mode 100644 index f3a2b54efd4e..000000000000 --- a/arch/arm/crypto/sha256-armv4.pl +++ /dev/null @@ -1,724 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 - -# This code is taken from the OpenSSL project but the author (Andy Polyakov) -# has relicensed it under the GPLv2. Therefore this program is free software; -# you can redistribute it and/or modify it under the terms of the GNU General -# Public License version 2 as published by the Free Software Foundation. -# -# The original headers, including the original license headers, are -# included below for completeness. - -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see https://www.openssl.org/~appro/cryptogams/. -# ==================================================================== - -# SHA256 block procedure for ARMv4. May 2007. - -# Performance is ~2x better than gcc 3.4 generated code and in "abso- -# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per -# byte [on single-issue Xscale PXA250 core]. - -# July 2010. -# -# Rescheduling for dual-issue pipeline resulted in 22% improvement on -# Cortex A8 core and ~20 cycles per processed byte. - -# February 2011. -# -# Profiler-assisted and platform-specific optimization resulted in 16% -# improvement on Cortex A8 core and ~15.4 cycles per processed byte. - -# September 2013. -# -# Add NEON implementation. On Cortex A8 it was measured to process one -# byte in 12.5 cycles or 23% faster than integer-only code. 
Snapdragon -# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only -# code (meaning that latter performs sub-optimally, nothing was done -# about it). - -# May 2014. -# -# Add ARMv8 code path performing at 2.0 cpb on Apple A7. - -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; - -$ctx="r0"; $t0="r0"; -$inp="r1"; $t4="r1"; -$len="r2"; $t1="r2"; -$T1="r3"; $t3="r3"; -$A="r4"; -$B="r5"; -$C="r6"; -$D="r7"; -$E="r8"; -$F="r9"; -$G="r10"; -$H="r11"; -@V=($A,$B,$C,$D,$E,$F,$G,$H); -$t2="r12"; -$Ktbl="r14"; - -@Sigma0=( 2,13,22); -@Sigma1=( 6,11,25); -@sigma0=( 7,18, 3); -@sigma1=(17,19,10); - -sub BODY_00_15 { -my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; - -$code.=<<___ if ($i<16); -#if __ARM_ARCH__>=7 - @ ldr $t1,[$inp],#4 @ $i -# if $i==15 - str $inp,[sp,#17*4] @ make room for $t4 -# endif - eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` - add $a,$a,$t2 @ h+=Maj(a,b,c) from the past - eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) -# ifndef __ARMEB__ - rev $t1,$t1 -# endif -#else - @ ldrb $t1,[$inp,#3] @ $i - add $a,$a,$t2 @ h+=Maj(a,b,c) from the past - ldrb $t2,[$inp,#2] - ldrb $t0,[$inp,#1] - orr $t1,$t1,$t2,lsl#8 - ldrb $t2,[$inp],#4 - orr $t1,$t1,$t0,lsl#16 -# if $i==15 - str $inp,[sp,#17*4] @ make room for $t4 -# endif - eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` - orr $t1,$t1,$t2,lsl#24 - eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) -#endif -___ -$code.=<<___; - ldr $t2,[$Ktbl],#4 @ *K256++ - add $h,$h,$t1 @ h+=X[i] - str $t1,[sp,#`$i%16`*4] - eor $t1,$f,$g - add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e) - and $t1,$t1,$e - add $h,$h,$t2 @ h+=K256[i] - eor $t1,$t1,$g @ Ch(e,f,g) - eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]` - add $h,$h,$t1 @ h+=Ch(e,f,g) -#if $i==31 - and $t2,$t2,#0xff - cmp $t2,#0xf2 @ done? 
-#endif -#if $i<15 -# if __ARM_ARCH__>=7 - ldr $t1,[$inp],#4 @ prefetch -# else - ldrb $t1,[$inp,#3] -# endif - eor $t2,$a,$b @ a^b, b^c in next round -#else - ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx - eor $t2,$a,$b @ a^b, b^c in next round - ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx -#endif - eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a) - and $t3,$t3,$t2 @ (b^c)&=(a^b) - add $d,$d,$h @ d+=h - eor $t3,$t3,$b @ Maj(a,b,c) - add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a) - @ add $h,$h,$t3 @ h+=Maj(a,b,c) -___ - ($t2,$t3)=($t3,$t2); -} - -sub BODY_16_XX { -my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; - -$code.=<<___; - @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i - @ ldr $t4,[sp,#`($i+14)%16`*4] - mov $t0,$t1,ror#$sigma0[0] - add $a,$a,$t2 @ h+=Maj(a,b,c) from the past - mov $t2,$t4,ror#$sigma1[0] - eor $t0,$t0,$t1,ror#$sigma0[1] - eor $t2,$t2,$t4,ror#$sigma1[1] - eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) - ldr $t1,[sp,#`($i+0)%16`*4] - eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14]) - ldr $t4,[sp,#`($i+9)%16`*4] - - add $t2,$t2,$t0 - eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15 - add $t1,$t1,$t2 - eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) - add $t1,$t1,$t4 @ X[i] -___ - &BODY_00_15(@_); -} - -$code=<<___; -#ifndef __KERNEL__ -# include "arm_arch.h" -#else -# define __ARM_ARCH__ __LINUX_ARM_ARCH__ -# define __ARM_MAX_ARCH__ 7 -#endif - -.text -#if __ARM_ARCH__<7 -.code 32 -#else -.syntax unified -# ifdef __thumb2__ -.thumb -# else -.code 32 -# endif -#endif - -.type K256,%object -.align 5 -K256: -.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 -.size K256,.-K256 -.word 0 @ terminator -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha256_block_data_order -#endif -.align 5 - -.global sha256_block_data_order -.type sha256_block_data_order,%function -sha256_block_data_order: -.Lsha256_block_data_order: -#if __ARM_ARCH__<7 - sub r3,pc,#8 @ sha256_block_data_order -#else - adr r3,.Lsha256_block_data_order -#endif -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - ldr r12,.LOPENSSL_armcap - ldr r12,[r3,r12] @ OPENSSL_armcap_P - tst r12,#ARMV8_SHA256 - bne .LARMv8 - tst r12,#ARMV7_NEON - bne .LNEON -#endif - add $len,$inp,$len,lsl#6 @ len to point at the end of inp - stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} - ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} - sub $Ktbl,r3,#256+32 @ K256 - sub sp,sp,#16*4 @ alloca(X[16]) -.Loop: -# if __ARM_ARCH__>=7 - ldr $t1,[$inp],#4 -# else - ldrb $t1,[$inp,#3] -# endif - eor $t3,$B,$C @ magic - eor $t2,$t2,$t2 -___ -for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } -$code.=".Lrounds_16_xx:\n"; -for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); } -$code.=<<___; -#if __ARM_ARCH__>=7 - ite eq @ Thumb2 thing, 
sanity check in ARM -#endif - ldreq $t3,[sp,#16*4] @ pull ctx - bne .Lrounds_16_xx - - add $A,$A,$t2 @ h+=Maj(a,b,c) from the past - ldr $t0,[$t3,#0] - ldr $t1,[$t3,#4] - ldr $t2,[$t3,#8] - add $A,$A,$t0 - ldr $t0,[$t3,#12] - add $B,$B,$t1 - ldr $t1,[$t3,#16] - add $C,$C,$t2 - ldr $t2,[$t3,#20] - add $D,$D,$t0 - ldr $t0,[$t3,#24] - add $E,$E,$t1 - ldr $t1,[$t3,#28] - add $F,$F,$t2 - ldr $inp,[sp,#17*4] @ pull inp - ldr $t2,[sp,#18*4] @ pull inp+len - add $G,$G,$t0 - add $H,$H,$t1 - stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H} - cmp $inp,$t2 - sub $Ktbl,$Ktbl,#256 @ rewind Ktbl - bne .Loop - - add sp,sp,#`16+3`*4 @ destroy frame -#if __ARM_ARCH__>=5 - ldmia sp!,{r4-r11,pc} -#else - ldmia sp!,{r4-r11,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) -#endif -.size sha256_block_data_order,.-sha256_block_data_order -___ -###################################################################### -# NEON stuff -# -{{{ -my @X=map("q$_",(0..3)); -my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25"); -my $Xfer=$t4; -my $j=0; - -sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } -sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } - -sub AUTOLOAD() # thunk [simplified] x86-style perlasm -{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; - my $arg = pop; - $arg = "#$arg" if ($arg*1 eq $arg); - $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; -} - -sub Xupdate() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); - my ($a,$b,$c,$d,$e,$f,$g,$h); - - &vext_8 ($T0,@X[0],@X[1],4); # X[1..4] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vext_8 ($T1,@X[2],@X[3],4); # X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vshr_u32 ($T2,$T0,$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - &vshr_u32 ($T1,$T0,$sigma0[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vsli_32 ($T2,$T0,32-$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vshr_u32 ($T3,$T0,$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &veor ($T1,$T1,$T2); - eval(shift(@insns)); - eval(shift(@insns)); - &vsli_32 ($T3,$T0,32-$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &veor ($T1,$T1,$T3); # sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &veor ($T5,$T5,$T4); - eval(shift(@insns)); - eval(shift(@insns)); - &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &veor ($T5,$T5,$T4); # sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &veor 
($T5,$T5,$T4); - eval(shift(@insns)); - eval(shift(@insns)); - &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vld1_32 ("{$T0}","[$Ktbl,:128]!"); - eval(shift(@insns)); - eval(shift(@insns)); - &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &veor ($T5,$T5,$T4); # sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - &vadd_i32 ($T0,$T0,@X[0]); - while($#insns>=2) { eval(shift(@insns)); } - &vst1_32 ("{$T0}","[$Xfer,:128]!"); - eval(shift(@insns)); - eval(shift(@insns)); - - push(@X,shift(@X)); # "rotate" X[] -} - -sub Xpreload() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); - my ($a,$b,$c,$d,$e,$f,$g,$h); - - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vld1_32 ("{$T0}","[$Ktbl,:128]!"); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vrev32_8 (@X[0],@X[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vadd_i32 ($T0,$T0,@X[0]); - foreach (@insns) { eval; } # remaining instructions - &vst1_32 ("{$T0}","[$Xfer,:128]!"); - - push(@X,shift(@X)); # "rotate" X[] -} - -sub body_00_15 () { - ( - '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. - '&add ($h,$h,$t1)', # h+=X[i]+K[i] - '&eor ($t1,$f,$g)', - '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', - '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past - '&and ($t1,$t1,$e)', - '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) - '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', - '&eor ($t1,$t1,$g)', # Ch(e,f,g) - '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e) - '&eor ($t2,$a,$b)', # a^b, b^c in next round - '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) - '&add ($h,$h,$t1)', # h+=Ch(e,f,g) - '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. - '&ldr ($t1,"[$Ktbl]") if ($j==15);'. - '&ldr ($t1,"[sp,#64]") if ($j==31)', - '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) - '&add ($d,$d,$h)', # d+=h - '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a) - '&eor ($t3,$t3,$b)', # Maj(a,b,c) - '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' - ) -} - -$code.=<<___; -#if __ARM_MAX_ARCH__>=7 -.arch armv7-a -.fpu neon - -.global sha256_block_data_order_neon -.type sha256_block_data_order_neon,%function -.align 4 -sha256_block_data_order_neon: -.LNEON: - stmdb sp!,{r4-r12,lr} - - sub $H,sp,#16*4+16 - adr $Ktbl,.Lsha256_block_data_order - sub $Ktbl,$Ktbl,#.Lsha256_block_data_order-K256 - bic $H,$H,#15 @ align for 128-bit stores - mov $t2,sp - mov sp,$H @ alloca - add $len,$inp,$len,lsl#6 @ len to point at the end of inp - - vld1.8 {@X[0]},[$inp]! - vld1.8 {@X[1]},[$inp]! - vld1.8 {@X[2]},[$inp]! - vld1.8 {@X[3]},[$inp]! - vld1.32 {$T0},[$Ktbl,:128]! - vld1.32 {$T1},[$Ktbl,:128]! - vld1.32 {$T2},[$Ktbl,:128]! - vld1.32 {$T3},[$Ktbl,:128]! - vrev32.8 @X[0],@X[0] @ yes, even on - str $ctx,[sp,#64] - vrev32.8 @X[1],@X[1] @ big-endian - str $inp,[sp,#68] - mov $Xfer,sp - vrev32.8 @X[2],@X[2] - str $len,[sp,#72] - vrev32.8 @X[3],@X[3] - str $t2,[sp,#76] @ save original sp - vadd.i32 $T0,$T0,@X[0] - vadd.i32 $T1,$T1,@X[1] - vst1.32 {$T0},[$Xfer,:128]! - vadd.i32 $T2,$T2,@X[2] - vst1.32 {$T1},[$Xfer,:128]! - vadd.i32 $T3,$T3,@X[3] - vst1.32 {$T2},[$Xfer,:128]! - vst1.32 {$T3},[$Xfer,:128]! 
- - ldmia $ctx,{$A-$H} - sub $Xfer,$Xfer,#64 - ldr $t1,[sp,#0] - eor $t2,$t2,$t2 - eor $t3,$B,$C - b .L_00_48 - -.align 4 -.L_00_48: -___ - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); -$code.=<<___; - teq $t1,#0 @ check for K256 terminator - ldr $t1,[sp,#0] - sub $Xfer,$Xfer,#64 - bne .L_00_48 - - ldr $inp,[sp,#68] - ldr $t0,[sp,#72] - sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl - teq $inp,$t0 - it eq - subeq $inp,$inp,#64 @ avoid SEGV - vld1.8 {@X[0]},[$inp]! @ load next input block - vld1.8 {@X[1]},[$inp]! - vld1.8 {@X[2]},[$inp]! - vld1.8 {@X[3]},[$inp]! - it ne - strne $inp,[sp,#68] - mov $Xfer,sp -___ - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); -$code.=<<___; - ldr $t0,[$t1,#0] - add $A,$A,$t2 @ h+=Maj(a,b,c) from the past - ldr $t2,[$t1,#4] - ldr $t3,[$t1,#8] - ldr $t4,[$t1,#12] - add $A,$A,$t0 @ accumulate - ldr $t0,[$t1,#16] - add $B,$B,$t2 - ldr $t2,[$t1,#20] - add $C,$C,$t3 - ldr $t3,[$t1,#24] - add $D,$D,$t4 - ldr $t4,[$t1,#28] - add $E,$E,$t0 - str $A,[$t1],#4 - add $F,$F,$t2 - str $B,[$t1],#4 - add $G,$G,$t3 - str $C,[$t1],#4 - add $H,$H,$t4 - str $D,[$t1],#4 - stmia $t1,{$E-$H} - - ittte ne - movne $Xfer,sp - ldrne $t1,[sp,#0] - eorne $t2,$t2,$t2 - ldreq sp,[sp,#76] @ restore original sp - itt ne - eorne $t3,$B,$C - bne .L_00_48 - - ldmia sp!,{r4-r12,pc} -.size sha256_block_data_order_neon,.-sha256_block_data_order_neon -#endif -___ -}}} -###################################################################### -# ARMv8 stuff -# -{{{ -my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2)); -my @MSG=map("q$_",(8..11)); -my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15)); -my $Ktbl="r3"; - -$code.=<<___; -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - -# ifdef __thumb2__ -# define INST(a,b,c,d) .byte c,d|0xc,a,b -# else -# define INST(a,b,c,d) .byte a,b,c,d -# endif - -.type sha256_block_data_order_armv8,%function -.align 5 -sha256_block_data_order_armv8: -.LARMv8: - vld1.32 {$ABCD,$EFGH},[$ctx] -# ifdef __thumb2__ - adr $Ktbl,.LARMv8 - sub $Ktbl,$Ktbl,#.LARMv8-K256 -# else - adrl $Ktbl,K256 -# endif - add $len,$inp,$len,lsl#6 @ len to point at the end of inp - -.Loop_v8: - vld1.8 {@MSG[0]-@MSG[1]},[$inp]! - vld1.8 {@MSG[2]-@MSG[3]},[$inp]! - vld1.32 {$W0},[$Ktbl]! - vrev32.8 @MSG[0],@MSG[0] - vrev32.8 @MSG[1],@MSG[1] - vrev32.8 @MSG[2],@MSG[2] - vrev32.8 @MSG[3],@MSG[3] - vmov $ABCD_SAVE,$ABCD @ offload - vmov $EFGH_SAVE,$EFGH - teq $inp,$len -___ -for($i=0;$i<12;$i++) { -$code.=<<___; - vld1.32 {$W1},[$Ktbl]! - vadd.i32 $W0,$W0,@MSG[0] - sha256su0 @MSG[0],@MSG[1] - vmov $abcd,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - sha256su1 @MSG[0],@MSG[2],@MSG[3] -___ - ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); -} -$code.=<<___; - vld1.32 {$W1},[$Ktbl]! - vadd.i32 $W0,$W0,@MSG[0] - vmov $abcd,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - - vld1.32 {$W0},[$Ktbl]! 
- vadd.i32 $W1,$W1,@MSG[1] - vmov $abcd,$ABCD - sha256h $ABCD,$EFGH,$W1 - sha256h2 $EFGH,$abcd,$W1 - - vld1.32 {$W1},[$Ktbl] - vadd.i32 $W0,$W0,@MSG[2] - sub $Ktbl,$Ktbl,#256-16 @ rewind - vmov $abcd,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - - vadd.i32 $W1,$W1,@MSG[3] - vmov $abcd,$ABCD - sha256h $ABCD,$EFGH,$W1 - sha256h2 $EFGH,$abcd,$W1 - - vadd.i32 $ABCD,$ABCD,$ABCD_SAVE - vadd.i32 $EFGH,$EFGH,$EFGH_SAVE - it ne - bne .Loop_v8 - - vst1.32 {$ABCD,$EFGH},[$ctx] - - ret @ bx lr -.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 -#endif -___ -}}} -$code.=<<___; -.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>" -.align 2 -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.comm OPENSSL_armcap_P,4,4 -#endif -___ - -open SELF,$0; -while(<SELF>) { - next if (/^#!/); - last if (!s/^#/@/ and !/^$/); - print; -} -close SELF; - -{ my %opcode = ( - "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40, - "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 ); - - sub unsha256 { - my ($mnemonic,$arg)=@_; - - if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) { - my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) - |(($2&7)<<17)|(($2&8)<<4) - |(($3&7)<<1) |(($3&8)<<2); - # since ARMv7 instructions are always encoded little-endian. - # correct solution is to use .inst directive, but older - # assemblers don't implement it:-( - sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", - $word&0xff,($word>>8)&0xff, - ($word>>16)&0xff,($word>>24)&0xff, - $mnemonic,$arg; - } - } -} - -foreach (split($/,$code)) { - - s/\`([^\`]*)\`/eval $1/geo; - - s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo; - - s/\bret\b/bx lr/go or - s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 - - print $_,"\n"; -} - -close STDOUT; # enforce flush diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c deleted file mode 100644 index f85933fdec75..000000000000 --- a/arch/arm/crypto/sha256_glue.c +++ /dev/null @@ -1,117 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Glue code for the SHA256 Secure Hash Algorithm assembly implementation - * using optimized ARM assembler and NEON instructions. - * - * Copyright © 2015 Google Inc. 
- * - * This file is based on sha256_ssse3_glue.c: - * Copyright (C) 2013 Intel Corporation - * Author: Tim Chen <tim.c.chen@linux.intel.com> - */ - -#include <crypto/internal/hash.h> -#include <linux/crypto.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/types.h> -#include <linux/string.h> -#include <crypto/sha2.h> -#include <crypto/sha256_base.h> -#include <asm/simd.h> -#include <asm/neon.h> - -#include "sha256_glue.h" - -asmlinkage void sha256_block_data_order(struct sha256_state *state, - const u8 *data, int num_blks); - -int crypto_sha256_arm_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - /* make sure casting to sha256_block_fn() is safe */ - BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0); - - return sha256_base_do_update(desc, data, len, sha256_block_data_order); -} -EXPORT_SYMBOL(crypto_sha256_arm_update); - -static int crypto_sha256_arm_final(struct shash_desc *desc, u8 *out) -{ - sha256_base_do_finalize(desc, sha256_block_data_order); - return sha256_base_finish(desc, out); -} - -int crypto_sha256_arm_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - sha256_base_do_update(desc, data, len, sha256_block_data_order); - return crypto_sha256_arm_final(desc, out); -} -EXPORT_SYMBOL(crypto_sha256_arm_finup); - -static struct shash_alg algs[] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = crypto_sha256_arm_update, - .final = crypto_sha256_arm_final, - .finup = crypto_sha256_arm_finup, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha256", - .cra_driver_name = "sha256-asm", - .cra_priority = 150, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = crypto_sha256_arm_update, - .final = crypto_sha256_arm_final, - .finup = crypto_sha256_arm_finup, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha224", - .cra_driver_name = "sha224-asm", - .cra_priority = 150, - .cra_blocksize = SHA224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int __init sha256_mod_init(void) -{ - int res = crypto_register_shashes(algs, ARRAY_SIZE(algs)); - - if (res < 0) - return res; - - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) { - res = crypto_register_shashes(sha256_neon_algs, - ARRAY_SIZE(sha256_neon_algs)); - - if (res < 0) - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); - } - - return res; -} - -static void __exit sha256_mod_fini(void) -{ - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); - - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) - crypto_unregister_shashes(sha256_neon_algs, - ARRAY_SIZE(sha256_neon_algs)); -} - -module_init(sha256_mod_init); -module_exit(sha256_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm (ARM), including NEON"); - -MODULE_ALIAS_CRYPTO("sha256"); diff --git a/arch/arm/crypto/sha256_glue.h b/arch/arm/crypto/sha256_glue.h deleted file mode 100644 index 9f0d578bab5f..000000000000 --- a/arch/arm/crypto/sha256_glue.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _CRYPTO_SHA256_GLUE_H -#define _CRYPTO_SHA256_GLUE_H - -#include <linux/crypto.h> - -extern struct shash_alg sha256_neon_algs[2]; - -int crypto_sha256_arm_update(struct shash_desc *desc, const u8 *data, - unsigned int len); - -int crypto_sha256_arm_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 
*hash); - -#endif /* _CRYPTO_SHA256_GLUE_H */ diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c deleted file mode 100644 index ccdcfff71910..000000000000 --- a/arch/arm/crypto/sha256_neon_glue.c +++ /dev/null @@ -1,92 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Glue code for the SHA256 Secure Hash Algorithm assembly implementation - * using NEON instructions. - * - * Copyright © 2015 Google Inc. - * - * This file is based on sha512_neon_glue.c: - * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi> - */ - -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <linux/types.h> -#include <linux/string.h> -#include <crypto/sha2.h> -#include <crypto/sha256_base.h> -#include <asm/byteorder.h> -#include <asm/simd.h> -#include <asm/neon.h> - -#include "sha256_glue.h" - -asmlinkage void sha256_block_data_order_neon(struct sha256_state *digest, - const u8 *data, int num_blks); - -static int crypto_sha256_neon_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable() || - (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) - return crypto_sha256_arm_update(desc, data, len); - - kernel_neon_begin(); - sha256_base_do_update(desc, data, len, sha256_block_data_order_neon); - kernel_neon_end(); - - return 0; -} - -static int crypto_sha256_neon_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - if (!crypto_simd_usable()) - return crypto_sha256_arm_finup(desc, data, len, out); - - kernel_neon_begin(); - if (len) - sha256_base_do_update(desc, data, len, - sha256_block_data_order_neon); - sha256_base_do_finalize(desc, sha256_block_data_order_neon); - kernel_neon_end(); - - return sha256_base_finish(desc, out); -} - -static int crypto_sha256_neon_final(struct shash_desc *desc, u8 *out) -{ - return crypto_sha256_neon_finup(desc, NULL, 0, out); -} - -struct shash_alg sha256_neon_algs[] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = crypto_sha256_neon_update, - .final = crypto_sha256_neon_final, - .finup = crypto_sha256_neon_finup, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha256", - .cra_driver_name = "sha256-neon", - .cra_priority = 250, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = crypto_sha256_neon_update, - .final = crypto_sha256_neon_final, - .finup = crypto_sha256_neon_finup, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha224", - .cra_driver_name = "sha224-neon", - .cra_priority = 250, - .cra_blocksize = SHA224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; diff --git a/arch/arm/crypto/sha512-armv4.pl b/arch/arm/crypto/sha512-armv4.pl deleted file mode 100644 index 2fc3516912fa..000000000000 --- a/arch/arm/crypto/sha512-armv4.pl +++ /dev/null @@ -1,657 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 - -# This code is taken from the OpenSSL project but the author (Andy Polyakov) -# has relicensed it under the GPLv2. Therefore this program is free software; -# you can redistribute it and/or modify it under the terms of the GNU General -# Public License version 2 as published by the Free Software Foundation. -# -# The original headers, including the original license headers, are -# included below for completeness. 
- -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see https://www.openssl.org/~appro/cryptogams/. -# ==================================================================== - -# SHA512 block procedure for ARMv4. September 2007. - -# This code is ~4.5 (four and a half) times faster than code generated -# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue -# Xscale PXA250 core]. -# -# July 2010. -# -# Rescheduling for dual-issue pipeline resulted in 6% improvement on -# Cortex A8 core and ~40 cycles per processed byte. - -# February 2011. -# -# Profiler-assisted and platform-specific optimization resulted in 7% -# improvement on Coxtex A8 core and ~38 cycles per byte. - -# March 2011. -# -# Add NEON implementation. On Cortex A8 it was measured to process -# one byte in 23.3 cycles or ~60% faster than integer-only code. - -# August 2012. -# -# Improve NEON performance by 12% on Snapdragon S4. In absolute -# terms it's 22.6 cycles per byte, which is disappointing result. -# Technical writers asserted that 3-way S4 pipeline can sustain -# multiple NEON instructions per cycle, but dual NEON issue could -# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html -# for further details. On side note Cortex-A15 processes one byte in -# 16 cycles. - -# Byte order [in]dependence. ========================================= -# -# Originally caller was expected to maintain specific *dword* order in -# h[0-7], namely with most significant dword at *lower* address, which -# was reflected in below two parameters as 0 and 4. Now caller is -# expected to maintain native byte order for whole 64-bit values. 
-$hi="HI"; -$lo="LO"; -# ==================================================================== - -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; - -$ctx="r0"; # parameter block -$inp="r1"; -$len="r2"; - -$Tlo="r3"; -$Thi="r4"; -$Alo="r5"; -$Ahi="r6"; -$Elo="r7"; -$Ehi="r8"; -$t0="r9"; -$t1="r10"; -$t2="r11"; -$t3="r12"; -############ r13 is stack pointer -$Ktbl="r14"; -############ r15 is program counter - -$Aoff=8*0; -$Boff=8*1; -$Coff=8*2; -$Doff=8*3; -$Eoff=8*4; -$Foff=8*5; -$Goff=8*6; -$Hoff=8*7; -$Xoff=8*8; - -sub BODY_00_15() { -my $magic = shift; -$code.=<<___; - @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) - @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 - @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 - mov $t0,$Elo,lsr#14 - str $Tlo,[sp,#$Xoff+0] - mov $t1,$Ehi,lsr#14 - str $Thi,[sp,#$Xoff+4] - eor $t0,$t0,$Ehi,lsl#18 - ldr $t2,[sp,#$Hoff+0] @ h.lo - eor $t1,$t1,$Elo,lsl#18 - ldr $t3,[sp,#$Hoff+4] @ h.hi - eor $t0,$t0,$Elo,lsr#18 - eor $t1,$t1,$Ehi,lsr#18 - eor $t0,$t0,$Ehi,lsl#14 - eor $t1,$t1,$Elo,lsl#14 - eor $t0,$t0,$Ehi,lsr#9 - eor $t1,$t1,$Elo,lsr#9 - eor $t0,$t0,$Elo,lsl#23 - eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e) - adds $Tlo,$Tlo,$t0 - ldr $t0,[sp,#$Foff+0] @ f.lo - adc $Thi,$Thi,$t1 @ T += Sigma1(e) - ldr $t1,[sp,#$Foff+4] @ f.hi - adds $Tlo,$Tlo,$t2 - ldr $t2,[sp,#$Goff+0] @ g.lo - adc $Thi,$Thi,$t3 @ T += h - ldr $t3,[sp,#$Goff+4] @ g.hi - - eor $t0,$t0,$t2 - str $Elo,[sp,#$Eoff+0] - eor $t1,$t1,$t3 - str $Ehi,[sp,#$Eoff+4] - and $t0,$t0,$Elo - str $Alo,[sp,#$Aoff+0] - and $t1,$t1,$Ehi - str $Ahi,[sp,#$Aoff+4] - eor $t0,$t0,$t2 - ldr $t2,[$Ktbl,#$lo] @ K[i].lo - eor $t1,$t1,$t3 @ Ch(e,f,g) - ldr $t3,[$Ktbl,#$hi] @ K[i].hi - - adds $Tlo,$Tlo,$t0 - ldr $Elo,[sp,#$Doff+0] @ d.lo - adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) - ldr $Ehi,[sp,#$Doff+4] @ d.hi - adds $Tlo,$Tlo,$t2 - and $t0,$t2,#0xff - adc $Thi,$Thi,$t3 @ T += K[i] - adds $Elo,$Elo,$Tlo - ldr $t2,[sp,#$Boff+0] @ b.lo - adc $Ehi,$Ehi,$Thi @ d += T - teq $t0,#$magic - - ldr $t3,[sp,#$Coff+0] @ c.lo -#if __ARM_ARCH__>=7 - it eq @ Thumb2 thing, sanity check in ARM -#endif - orreq $Ktbl,$Ktbl,#1 - @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) - @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 - @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 - mov $t0,$Alo,lsr#28 - mov $t1,$Ahi,lsr#28 - eor $t0,$t0,$Ahi,lsl#4 - eor $t1,$t1,$Alo,lsl#4 - eor $t0,$t0,$Ahi,lsr#2 - eor $t1,$t1,$Alo,lsr#2 - eor $t0,$t0,$Alo,lsl#30 - eor $t1,$t1,$Ahi,lsl#30 - eor $t0,$t0,$Ahi,lsr#7 - eor $t1,$t1,$Alo,lsr#7 - eor $t0,$t0,$Alo,lsl#25 - eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) - adds $Tlo,$Tlo,$t0 - and $t0,$Alo,$t2 - adc $Thi,$Thi,$t1 @ T += Sigma0(a) - - ldr $t1,[sp,#$Boff+4] @ b.hi - orr $Alo,$Alo,$t2 - ldr $t2,[sp,#$Coff+4] @ c.hi - and $Alo,$Alo,$t3 - and $t3,$Ahi,$t1 - orr $Ahi,$Ahi,$t1 - orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo - and $Ahi,$Ahi,$t2 - adds $Alo,$Alo,$Tlo - orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi - sub sp,sp,#8 - adc $Ahi,$Ahi,$Thi @ h += T - tst $Ktbl,#1 - add $Ktbl,$Ktbl,#8 -___ -} -$code=<<___; -#ifndef __KERNEL__ -# include "arm_arch.h" -# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} -# define VFP_ABI_POP vldmia sp!,{d8-d15} -#else -# define __ARM_ARCH__ __LINUX_ARM_ARCH__ -# define __ARM_MAX_ARCH__ 7 -# define VFP_ABI_PUSH -# define VFP_ABI_POP -#endif - -#ifdef __ARMEL__ -# define LO 0 -# define HI 4 -# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 -#else -# define HI 0 -# define LO 4 -# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 -#endif - -.text -#if __ARM_ARCH__<7 
-.code 32 -#else -.syntax unified -# ifdef __thumb2__ -.thumb -# else -.code 32 -# endif -#endif - -.type K512,%object -.align 5 -K512: -WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) -WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) -WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) -WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) -WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) -WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) -WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) -WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) -WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) -WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) -WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) -WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) -WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) -WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) -WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) -WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) -WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) -WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) -WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) -WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) -WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) -WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) -WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) -WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) -WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) -WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) -WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) -WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) -WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) -WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) -WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) -WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) -WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) -WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) -WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) -WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) -WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) -WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) -WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) -WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) -.size K512,.-K512 -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha512_block_data_order -.skip 32-4 -#else -.skip 32 -#endif - -.global sha512_block_data_order -.type sha512_block_data_order,%function -sha512_block_data_order: -.Lsha512_block_data_order: -#if __ARM_ARCH__<7 - sub r3,pc,#8 @ sha512_block_data_order -#else - adr r3,.Lsha512_block_data_order -#endif -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - ldr r12,.LOPENSSL_armcap - ldr r12,[r3,r12] @ OPENSSL_armcap_P - tst r12,#1 - bne .LNEON -#endif - add $len,$inp,$len,lsl#7 @ len to point at the end of inp - stmdb sp!,{r4-r12,lr} - sub $Ktbl,r3,#672 @ K512 - sub sp,sp,#9*8 - - ldr $Elo,[$ctx,#$Eoff+$lo] - ldr $Ehi,[$ctx,#$Eoff+$hi] - ldr $t0, [$ctx,#$Goff+$lo] - ldr $t1, [$ctx,#$Goff+$hi] - ldr $t2, [$ctx,#$Hoff+$lo] - ldr $t3, [$ctx,#$Hoff+$hi] -.Loop: - str $t0, [sp,#$Goff+0] - str $t1, [sp,#$Goff+4] - str $t2, [sp,#$Hoff+0] - str $t3, [sp,#$Hoff+4] - ldr $Alo,[$ctx,#$Aoff+$lo] - ldr $Ahi,[$ctx,#$Aoff+$hi] - ldr $Tlo,[$ctx,#$Boff+$lo] - ldr $Thi,[$ctx,#$Boff+$hi] - ldr $t0, [$ctx,#$Coff+$lo] - ldr $t1, [$ctx,#$Coff+$hi] - ldr $t2, [$ctx,#$Doff+$lo] - ldr $t3, [$ctx,#$Doff+$hi] - str $Tlo,[sp,#$Boff+0] - str $Thi,[sp,#$Boff+4] - 
str $t0, [sp,#$Coff+0] - str $t1, [sp,#$Coff+4] - str $t2, [sp,#$Doff+0] - str $t3, [sp,#$Doff+4] - ldr $Tlo,[$ctx,#$Foff+$lo] - ldr $Thi,[$ctx,#$Foff+$hi] - str $Tlo,[sp,#$Foff+0] - str $Thi,[sp,#$Foff+4] - -.L00_15: -#if __ARM_ARCH__<7 - ldrb $Tlo,[$inp,#7] - ldrb $t0, [$inp,#6] - ldrb $t1, [$inp,#5] - ldrb $t2, [$inp,#4] - ldrb $Thi,[$inp,#3] - ldrb $t3, [$inp,#2] - orr $Tlo,$Tlo,$t0,lsl#8 - ldrb $t0, [$inp,#1] - orr $Tlo,$Tlo,$t1,lsl#16 - ldrb $t1, [$inp],#8 - orr $Tlo,$Tlo,$t2,lsl#24 - orr $Thi,$Thi,$t3,lsl#8 - orr $Thi,$Thi,$t0,lsl#16 - orr $Thi,$Thi,$t1,lsl#24 -#else - ldr $Tlo,[$inp,#4] - ldr $Thi,[$inp],#8 -#ifdef __ARMEL__ - rev $Tlo,$Tlo - rev $Thi,$Thi -#endif -#endif -___ - &BODY_00_15(0x94); -$code.=<<___; - tst $Ktbl,#1 - beq .L00_15 - ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] - ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] - bic $Ktbl,$Ktbl,#1 -.L16_79: - @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) - @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 - @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 - mov $Tlo,$t0,lsr#1 - ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] - mov $Thi,$t1,lsr#1 - ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] - eor $Tlo,$Tlo,$t1,lsl#31 - eor $Thi,$Thi,$t0,lsl#31 - eor $Tlo,$Tlo,$t0,lsr#8 - eor $Thi,$Thi,$t1,lsr#8 - eor $Tlo,$Tlo,$t1,lsl#24 - eor $Thi,$Thi,$t0,lsl#24 - eor $Tlo,$Tlo,$t0,lsr#7 - eor $Thi,$Thi,$t1,lsr#7 - eor $Tlo,$Tlo,$t1,lsl#25 - - @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) - @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 - @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 - mov $t0,$t2,lsr#19 - mov $t1,$t3,lsr#19 - eor $t0,$t0,$t3,lsl#13 - eor $t1,$t1,$t2,lsl#13 - eor $t0,$t0,$t3,lsr#29 - eor $t1,$t1,$t2,lsr#29 - eor $t0,$t0,$t2,lsl#3 - eor $t1,$t1,$t3,lsl#3 - eor $t0,$t0,$t2,lsr#6 - eor $t1,$t1,$t3,lsr#6 - ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] - eor $t0,$t0,$t3,lsl#26 - - ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] - adds $Tlo,$Tlo,$t0 - ldr $t0,[sp,#`$Xoff+8*16`+0] - adc $Thi,$Thi,$t1 - - ldr $t1,[sp,#`$Xoff+8*16`+4] - adds $Tlo,$Tlo,$t2 - adc $Thi,$Thi,$t3 - adds $Tlo,$Tlo,$t0 - adc $Thi,$Thi,$t1 -___ - &BODY_00_15(0x17); -$code.=<<___; -#if __ARM_ARCH__>=7 - ittt eq @ Thumb2 thing, sanity check in ARM -#endif - ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0] - ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4] - beq .L16_79 - bic $Ktbl,$Ktbl,#1 - - ldr $Tlo,[sp,#$Boff+0] - ldr $Thi,[sp,#$Boff+4] - ldr $t0, [$ctx,#$Aoff+$lo] - ldr $t1, [$ctx,#$Aoff+$hi] - ldr $t2, [$ctx,#$Boff+$lo] - ldr $t3, [$ctx,#$Boff+$hi] - adds $t0,$Alo,$t0 - str $t0, [$ctx,#$Aoff+$lo] - adc $t1,$Ahi,$t1 - str $t1, [$ctx,#$Aoff+$hi] - adds $t2,$Tlo,$t2 - str $t2, [$ctx,#$Boff+$lo] - adc $t3,$Thi,$t3 - str $t3, [$ctx,#$Boff+$hi] - - ldr $Alo,[sp,#$Coff+0] - ldr $Ahi,[sp,#$Coff+4] - ldr $Tlo,[sp,#$Doff+0] - ldr $Thi,[sp,#$Doff+4] - ldr $t0, [$ctx,#$Coff+$lo] - ldr $t1, [$ctx,#$Coff+$hi] - ldr $t2, [$ctx,#$Doff+$lo] - ldr $t3, [$ctx,#$Doff+$hi] - adds $t0,$Alo,$t0 - str $t0, [$ctx,#$Coff+$lo] - adc $t1,$Ahi,$t1 - str $t1, [$ctx,#$Coff+$hi] - adds $t2,$Tlo,$t2 - str $t2, [$ctx,#$Doff+$lo] - adc $t3,$Thi,$t3 - str $t3, [$ctx,#$Doff+$hi] - - ldr $Tlo,[sp,#$Foff+0] - ldr $Thi,[sp,#$Foff+4] - ldr $t0, [$ctx,#$Eoff+$lo] - ldr $t1, [$ctx,#$Eoff+$hi] - ldr $t2, [$ctx,#$Foff+$lo] - ldr $t3, [$ctx,#$Foff+$hi] - adds $Elo,$Elo,$t0 - str $Elo,[$ctx,#$Eoff+$lo] - adc $Ehi,$Ehi,$t1 - str $Ehi,[$ctx,#$Eoff+$hi] - adds $t2,$Tlo,$t2 - str $t2, [$ctx,#$Foff+$lo] - adc $t3,$Thi,$t3 - str $t3, [$ctx,#$Foff+$hi] - - ldr $Alo,[sp,#$Goff+0] - ldr $Ahi,[sp,#$Goff+4] - ldr $Tlo,[sp,#$Hoff+0] - ldr $Thi,[sp,#$Hoff+4] - ldr $t0, [$ctx,#$Goff+$lo] - ldr $t1, 
[$ctx,#$Goff+$hi] - ldr $t2, [$ctx,#$Hoff+$lo] - ldr $t3, [$ctx,#$Hoff+$hi] - adds $t0,$Alo,$t0 - str $t0, [$ctx,#$Goff+$lo] - adc $t1,$Ahi,$t1 - str $t1, [$ctx,#$Goff+$hi] - adds $t2,$Tlo,$t2 - str $t2, [$ctx,#$Hoff+$lo] - adc $t3,$Thi,$t3 - str $t3, [$ctx,#$Hoff+$hi] - - add sp,sp,#640 - sub $Ktbl,$Ktbl,#640 - - teq $inp,$len - bne .Loop - - add sp,sp,#8*9 @ destroy frame -#if __ARM_ARCH__>=5 - ldmia sp!,{r4-r12,pc} -#else - ldmia sp!,{r4-r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) -#endif -.size sha512_block_data_order,.-sha512_block_data_order -___ - -{ -my @Sigma0=(28,34,39); -my @Sigma1=(14,18,41); -my @sigma0=(1, 8, 7); -my @sigma1=(19,61,6); - -my $Ktbl="r3"; -my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch - -my @X=map("d$_",(0..15)); -my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23)); - -sub NEON_00_15() { -my $i=shift; -my ($a,$b,$c,$d,$e,$f,$g,$h)=@_; -my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps - -$code.=<<___ if ($i<16 || $i&1); - vshr.u64 $t0,$e,#@Sigma1[0] @ $i -#if $i<16 - vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned -#endif - vshr.u64 $t1,$e,#@Sigma1[1] -#if $i>0 - vadd.i64 $a,$Maj @ h+=Maj from the past -#endif - vshr.u64 $t2,$e,#@Sigma1[2] -___ -$code.=<<___; - vld1.64 {$K},[$Ktbl,:64]! @ K[i++] - vsli.64 $t0,$e,#`64-@Sigma1[0]` - vsli.64 $t1,$e,#`64-@Sigma1[1]` - vmov $Ch,$e - vsli.64 $t2,$e,#`64-@Sigma1[2]` -#if $i<16 && defined(__ARMEL__) - vrev64.8 @X[$i],@X[$i] -#endif - veor $t1,$t0 - vbsl $Ch,$f,$g @ Ch(e,f,g) - vshr.u64 $t0,$a,#@Sigma0[0] - veor $t2,$t1 @ Sigma1(e) - vadd.i64 $T1,$Ch,$h - vshr.u64 $t1,$a,#@Sigma0[1] - vsli.64 $t0,$a,#`64-@Sigma0[0]` - vadd.i64 $T1,$t2 - vshr.u64 $t2,$a,#@Sigma0[2] - vadd.i64 $K,@X[$i%16] - vsli.64 $t1,$a,#`64-@Sigma0[1]` - veor $Maj,$a,$b - vsli.64 $t2,$a,#`64-@Sigma0[2]` - veor $h,$t0,$t1 - vadd.i64 $T1,$K - vbsl $Maj,$c,$b @ Maj(a,b,c) - veor $h,$t2 @ Sigma0(a) - vadd.i64 $d,$T1 - vadd.i64 $Maj,$T1 - @ vadd.i64 $h,$Maj -___ -} - -sub NEON_16_79() { -my $i=shift; - -if ($i&1) { &NEON_00_15($i,@_); return; } - -# 2x-vectorized, therefore runs every 2nd round -my @X=map("q$_",(0..7)); # view @X as 128-bit vector -my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps -my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15 -my $e=@_[4]; # $e from NEON_00_15 -$i /= 2; -$code.=<<___; - vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0] - vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1] - vadd.i64 @_[0],d30 @ h+=Maj from the past - vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2] - vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]` - vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1] - vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]` - veor $s1,$t0 - vshr.u64 $t0,$s0,#@sigma0[0] - veor $s1,$t1 @ sigma1(X[i+14]) - vshr.u64 $t1,$s0,#@sigma0[1] - vadd.i64 @X[$i%8],$s1 - vshr.u64 $s1,$s0,#@sigma0[2] - vsli.64 $t0,$s0,#`64-@sigma0[0]` - vsli.64 $t1,$s0,#`64-@sigma0[1]` - vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9] - veor $s1,$t0 - vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15 - vadd.i64 @X[$i%8],$s0 - vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15 - veor $s1,$t1 @ sigma0(X[i+1]) - vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15 - vadd.i64 @X[$i%8],$s1 -___ - &NEON_00_15(2*$i,@_); -} - -$code.=<<___; -#if __ARM_MAX_ARCH__>=7 -.arch armv7-a -.fpu neon - -.global sha512_block_data_order_neon -.type sha512_block_data_order_neon,%function -.align 4 -sha512_block_data_order_neon: -.LNEON: - dmb @ errata #451034 on early Cortex A8 - add $len,$inp,$len,lsl#7 @ len to point at 
the end of inp - VFP_ABI_PUSH - adr $Ktbl,.Lsha512_block_data_order - sub $Ktbl,$Ktbl,.Lsha512_block_data_order-K512 - vldmia $ctx,{$A-$H} @ load context -.Loop_neon: -___ -for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); } -$code.=<<___; - mov $cnt,#4 -.L16_79_neon: - subs $cnt,#1 -___ -for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); } -$code.=<<___; - bne .L16_79_neon - - vadd.i64 $A,d30 @ h+=Maj from the past - vldmia $ctx,{d24-d31} @ load context to temp - vadd.i64 q8,q12 @ vectorized accumulate - vadd.i64 q9,q13 - vadd.i64 q10,q14 - vadd.i64 q11,q15 - vstmia $ctx,{$A-$H} @ save context - teq $inp,$len - sub $Ktbl,#640 @ rewind K512 - bne .Loop_neon - - VFP_ABI_POP - ret @ bx lr -.size sha512_block_data_order_neon,.-sha512_block_data_order_neon -#endif -___ -} -$code.=<<___; -.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" -.align 2 -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.comm OPENSSL_armcap_P,4,4 -#endif -___ - -$code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 -$code =~ s/\bret\b/bx lr/gm; - -open SELF,$0; -while(<SELF>) { - next if (/^#!/); - last if (!s/^#/@/ and !/^$/); - print; -} -close SELF; - -print $code; -close STDOUT; # enforce flush diff --git a/arch/arm/crypto/sha512-glue.c b/arch/arm/crypto/sha512-glue.c deleted file mode 100644 index 1be5bd498af3..000000000000 --- a/arch/arm/crypto/sha512-glue.c +++ /dev/null @@ -1,114 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * sha512-glue.c - accelerated SHA-384/512 for ARM - * - * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org> - */ - -#include <crypto/internal/hash.h> -#include <crypto/sha2.h> -#include <crypto/sha512_base.h> -#include <linux/crypto.h> -#include <linux/module.h> - -#include <asm/hwcap.h> -#include <asm/neon.h> - -#include "sha512.h" - -MODULE_DESCRIPTION("Accelerated SHA-384/SHA-512 secure hash for ARM"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); - -MODULE_ALIAS_CRYPTO("sha384"); -MODULE_ALIAS_CRYPTO("sha512"); -MODULE_ALIAS_CRYPTO("sha384-arm"); -MODULE_ALIAS_CRYPTO("sha512-arm"); - -asmlinkage void sha512_block_data_order(struct sha512_state *state, - u8 const *src, int blocks); - -int sha512_arm_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha512_base_do_update(desc, data, len, sha512_block_data_order); -} - -static int sha512_arm_final(struct shash_desc *desc, u8 *out) -{ - sha512_base_do_finalize(desc, sha512_block_data_order); - return sha512_base_finish(desc, out); -} - -int sha512_arm_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - sha512_base_do_update(desc, data, len, sha512_block_data_order); - return sha512_arm_final(desc, out); -} - -static struct shash_alg sha512_arm_algs[] = { { - .init = sha384_base_init, - .update = sha512_arm_update, - .final = sha512_arm_final, - .finup = sha512_arm_finup, - .descsize = sizeof(struct sha512_state), - .digestsize = SHA384_DIGEST_SIZE, - .base = { - .cra_name = "sha384", - .cra_driver_name = "sha384-arm", - .cra_priority = 250, - .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .init = sha512_base_init, - .update = sha512_arm_update, - .final = sha512_arm_final, - .finup = sha512_arm_finup, - .descsize = sizeof(struct sha512_state), - .digestsize = SHA512_DIGEST_SIZE, - .base = { - .cra_name = "sha512", - .cra_driver_name = "sha512-arm", - 
.cra_priority = 250, - .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int __init sha512_arm_mod_init(void) -{ - int err; - - err = crypto_register_shashes(sha512_arm_algs, - ARRAY_SIZE(sha512_arm_algs)); - if (err) - return err; - - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) { - err = crypto_register_shashes(sha512_neon_algs, - ARRAY_SIZE(sha512_neon_algs)); - if (err) - goto err_unregister; - } - return 0; - -err_unregister: - crypto_unregister_shashes(sha512_arm_algs, - ARRAY_SIZE(sha512_arm_algs)); - - return err; -} - -static void __exit sha512_arm_mod_fini(void) -{ - crypto_unregister_shashes(sha512_arm_algs, - ARRAY_SIZE(sha512_arm_algs)); - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) - crypto_unregister_shashes(sha512_neon_algs, - ARRAY_SIZE(sha512_neon_algs)); -} - -module_init(sha512_arm_mod_init); -module_exit(sha512_arm_mod_fini); diff --git a/arch/arm/crypto/sha512-neon-glue.c b/arch/arm/crypto/sha512-neon-glue.c deleted file mode 100644 index c6e58fe475ac..000000000000 --- a/arch/arm/crypto/sha512-neon-glue.c +++ /dev/null @@ -1,92 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * sha512-neon-glue.c - accelerated SHA-384/512 for ARM NEON - * - * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org> - */ - -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <crypto/sha2.h> -#include <crypto/sha512_base.h> -#include <linux/crypto.h> -#include <linux/module.h> - -#include <asm/simd.h> -#include <asm/neon.h> - -#include "sha512.h" - -MODULE_ALIAS_CRYPTO("sha384-neon"); -MODULE_ALIAS_CRYPTO("sha512-neon"); - -asmlinkage void sha512_block_data_order_neon(struct sha512_state *state, - const u8 *src, int blocks); - -static int sha512_neon_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha512_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable() || - (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE) - return sha512_arm_update(desc, data, len); - - kernel_neon_begin(); - sha512_base_do_update(desc, data, len, sha512_block_data_order_neon); - kernel_neon_end(); - - return 0; -} - -static int sha512_neon_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - if (!crypto_simd_usable()) - return sha512_arm_finup(desc, data, len, out); - - kernel_neon_begin(); - if (len) - sha512_base_do_update(desc, data, len, - sha512_block_data_order_neon); - sha512_base_do_finalize(desc, sha512_block_data_order_neon); - kernel_neon_end(); - - return sha512_base_finish(desc, out); -} - -static int sha512_neon_final(struct shash_desc *desc, u8 *out) -{ - return sha512_neon_finup(desc, NULL, 0, out); -} - -struct shash_alg sha512_neon_algs[] = { { - .init = sha384_base_init, - .update = sha512_neon_update, - .final = sha512_neon_final, - .finup = sha512_neon_finup, - .descsize = sizeof(struct sha512_state), - .digestsize = SHA384_DIGEST_SIZE, - .base = { - .cra_name = "sha384", - .cra_driver_name = "sha384-neon", - .cra_priority = 300, - .cra_blocksize = SHA384_BLOCK_SIZE, - .cra_module = THIS_MODULE, - - } -}, { - .init = sha512_base_init, - .update = sha512_neon_update, - .final = sha512_neon_final, - .finup = sha512_neon_finup, - .descsize = sizeof(struct sha512_state), - .digestsize = SHA512_DIGEST_SIZE, - .base = { - .cra_name = "sha512", - .cra_driver_name = "sha512-neon", - .cra_priority = 300, - .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; diff --git a/arch/arm/crypto/sha512.h 
b/arch/arm/crypto/sha512.h deleted file mode 100644 index e14572be76d1..000000000000 --- a/arch/arm/crypto/sha512.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -int sha512_arm_update(struct shash_desc *desc, const u8 *data, - unsigned int len); - -int sha512_arm_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out); - -extern struct shash_alg sha512_neon_algs[2];
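
Stepping back from the individual files: every deleted glue layer here (sha256_glue.c, sha256_neon_glue.c, sha512-glue.c, sha512-neon-glue.c) follows the same shape. The scalar shashes register unconditionally; the NEON variants register at a higher cra_priority than the scalar ones (250 vs 150 for SHA-256, 300 vs 250 for SHA-512) only when cpu_has_neon() holds; and the NEON update path drops back to the scalar routine whenever crypto_simd_usable() is false or less than one full block would be hashed. A condensed sketch of that pattern, using hypothetical my_*() names rather than the kernel's real symbols:

/* Hypothetical condensation of the removed NEON-with-scalar-fallback
 * pattern; the my_*() names are illustrative, not kernel symbols. */
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/sha2.h>
#include <crypto/sha256_base.h>
#include <asm/neon.h>
#include <asm/simd.h>

asmlinkage void my_block_neon(struct sha256_state *state,
			      const u8 *data, int blocks);
int my_update_scalar(struct shash_desc *desc, const u8 *data,
		     unsigned int len);	/* plain ARM assembly path */

static int my_update_neon(struct shash_desc *desc, const u8 *data,
			  unsigned int len)
{
	struct sha256_state *sctx = shash_desc_ctx(desc);

	/* NEON only pays off when it is usable in this context and at
	 * least one full block will be hashed under it. */
	if (!crypto_simd_usable() ||
	    (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
		return my_update_scalar(desc, data, len);

	kernel_neon_begin();
	sha256_base_do_update(desc, data, len, my_block_neon);
	kernel_neon_end();
	return 0;
}

The kernel_neon_begin()/kernel_neon_end() bracket is what makes the NEON register file safe to touch in kernel context, and the cra_priority gap is what lets the crypto API prefer the NEON shashes automatically once both sets are registered.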