diff options
Diffstat (limited to 'lib/crypto/arm')
-rw-r--r-- | lib/crypto/arm/.gitignore | 4 | ||||
-rw-r--r-- | lib/crypto/arm/Kconfig | 24 | ||||
-rw-r--r-- | lib/crypto/arm/Makefile | 26 | ||||
-rw-r--r-- | lib/crypto/arm/blake2s-core.S | 306 | ||||
-rw-r--r-- | lib/crypto/arm/blake2s-glue.c | 7 | ||||
-rw-r--r-- | lib/crypto/arm/chacha-glue.c | 138 | ||||
-rw-r--r-- | lib/crypto/arm/chacha-neon-core.S | 643 | ||||
-rw-r--r-- | lib/crypto/arm/chacha-scalar-core.S | 444 | ||||
-rw-r--r-- | lib/crypto/arm/poly1305-armv4.pl | 1236 | ||||
-rw-r--r-- | lib/crypto/arm/poly1305-glue.c | 76 | ||||
-rw-r--r-- | lib/crypto/arm/sha1-armv4-large.S | 507 | ||||
-rw-r--r-- | lib/crypto/arm/sha1-armv7-neon.S | 633 | ||||
-rw-r--r-- | lib/crypto/arm/sha1-ce-core.S | 123 | ||||
-rw-r--r-- | lib/crypto/arm/sha1.h | 46 | ||||
-rw-r--r-- | lib/crypto/arm/sha256-armv4.pl | 724 | ||||
-rw-r--r-- | lib/crypto/arm/sha256-ce.S | 123 | ||||
-rw-r--r-- | lib/crypto/arm/sha256.h | 46 | ||||
-rw-r--r-- | lib/crypto/arm/sha512-armv4.pl | 657 | ||||
-rw-r--r-- | lib/crypto/arm/sha512.h | 38 |
19 files changed, 5801 insertions, 0 deletions
diff --git a/lib/crypto/arm/.gitignore b/lib/crypto/arm/.gitignore new file mode 100644 index 000000000000..f6c4e8ef80da --- /dev/null +++ b/lib/crypto/arm/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +poly1305-core.S +sha256-core.S +sha512-core.S diff --git a/lib/crypto/arm/Kconfig b/lib/crypto/arm/Kconfig new file mode 100644 index 000000000000..e8444fd0aae3 --- /dev/null +++ b/lib/crypto/arm/Kconfig @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_BLAKE2S_ARM + bool "Hash functions: BLAKE2s" + select CRYPTO_ARCH_HAVE_LIB_BLAKE2S + help + BLAKE2s cryptographic hash function (RFC 7693) + + Architecture: arm + + This is faster than the generic implementations of BLAKE2s and + BLAKE2b, but slower than the NEON implementation of BLAKE2b. + There is no NEON implementation of BLAKE2s, since NEON doesn't + really help with it. + +config CRYPTO_CHACHA20_NEON + tristate + default CRYPTO_LIB_CHACHA + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_ARM + tristate + default CRYPTO_LIB_POLY1305 + select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/arm/Makefile b/lib/crypto/arm/Makefile new file mode 100644 index 000000000000..4c042a4c77ed --- /dev/null +++ b/lib/crypto/arm/Makefile @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o +libblake2s-arm-y := blake2s-core.o blake2s-glue.o + +obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o +chacha-neon-y := chacha-scalar-core.o chacha-glue.o +chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o + +obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o +poly1305-arm-y := poly1305-core.o poly1305-glue.o + +quiet_cmd_perl = PERL $@ + cmd_perl = $(PERL) $(<) > $(@) + +$(obj)/%-core.S: $(src)/%-armv4.pl + $(call cmd,perl) + +clean-files += poly1305-core.S + +aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1 + +# massage the perlasm code a bit so we only get the NEON routine if we need it +poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5 +poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7 +AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y) diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S new file mode 100644 index 000000000000..df40e46601f1 --- /dev/null +++ b/lib/crypto/arm/blake2s-core.S @@ -0,0 +1,306 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * BLAKE2s digest algorithm, ARM scalar implementation + * + * Copyright 2020 Google LLC + * + * Author: Eric Biggers <ebiggers@google.com> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + // Registers used to hold message words temporarily. There aren't + // enough ARM registers to hold the whole message block, so we have to + // load the words on-demand. + M_0 .req r12 + M_1 .req r14 + +// The BLAKE2s initialization vector +.Lblake2s_IV: + .word 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A + .word 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 + +.macro __ldrd a, b, src, offset +#if __LINUX_ARM_ARCH__ >= 6 + ldrd \a, \b, [\src, #\offset] +#else + ldr \a, [\src, #\offset] + ldr \b, [\src, #\offset + 4] +#endif +.endm + +.macro __strd a, b, dst, offset +#if __LINUX_ARM_ARCH__ >= 6 + strd \a, \b, [\dst, #\offset] +#else + str \a, [\dst, #\offset] + str \b, [\dst, #\offset + 4] +#endif +.endm + +.macro _le32_bswap a, tmp +#ifdef __ARMEB__ + rev_l \a, \tmp +#endif +.endm + +.macro _le32_bswap_8x a, b, c, d, e, f, g, h, tmp + _le32_bswap \a, \tmp + _le32_bswap \b, \tmp + _le32_bswap \c, \tmp + _le32_bswap \d, \tmp + _le32_bswap \e, \tmp + _le32_bswap \f, \tmp + _le32_bswap \g, \tmp + _le32_bswap \h, \tmp +.endm + +// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals. +// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two +// columns/diagonals. s0-s1 are the word offsets to the message words the first +// column/diagonal needs, and likewise s2-s3 for the second column/diagonal. +// M_0 and M_1 are free to use, and the message block can be found at sp + 32. +// +// Note that to save instructions, the rotations don't happen when the +// pseudocode says they should, but rather they are delayed until the values are +// used. See the comment above _blake2s_round(). +.macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3 + + ldr M_0, [sp, #32 + 4 * \s0] + ldr M_1, [sp, #32 + 4 * \s2] + + // a += b + m[blake2s_sigma[r][2*i + 0]]; + add \a0, \a0, \b0, ror #brot + add \a1, \a1, \b1, ror #brot + add \a0, \a0, M_0 + add \a1, \a1, M_1 + + // d = ror32(d ^ a, 16); + eor \d0, \a0, \d0, ror #drot + eor \d1, \a1, \d1, ror #drot + + // c += d; + add \c0, \c0, \d0, ror #16 + add \c1, \c1, \d1, ror #16 + + // b = ror32(b ^ c, 12); + eor \b0, \c0, \b0, ror #brot + eor \b1, \c1, \b1, ror #brot + + ldr M_0, [sp, #32 + 4 * \s1] + ldr M_1, [sp, #32 + 4 * \s3] + + // a += b + m[blake2s_sigma[r][2*i + 1]]; + add \a0, \a0, \b0, ror #12 + add \a1, \a1, \b1, ror #12 + add \a0, \a0, M_0 + add \a1, \a1, M_1 + + // d = ror32(d ^ a, 8); + eor \d0, \a0, \d0, ror#16 + eor \d1, \a1, \d1, ror#16 + + // c += d; + add \c0, \c0, \d0, ror#8 + add \c1, \c1, \d1, ror#8 + + // b = ror32(b ^ c, 7); + eor \b0, \c0, \b0, ror#12 + eor \b1, \c1, \b1, ror#12 +.endm + +// Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9] +// are in r0..r9. The stack pointer points to 8 bytes of scratch space for +// spilling v[8..9], then to v[9..15], then to the message block. r10-r12 and +// r14 are free to use. The macro arguments s0-s15 give the order in which the +// message words are used in this round. +// +// All rotates are performed using the implicit rotate operand accepted by the +// 'add' and 'eor' instructions. This is faster than using explicit rotate +// instructions. To make this work, we allow the values in the second and last +// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the +// wrong rotation amount. The rotation amount is then fixed up just in time +// when the values are used. 'brot' is the number of bits the values in row 'b' +// need to be rotated right to arrive at the correct values, and 'drot' +// similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such +// that they end up as (7, 8) after every round. +.macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \ + s8, s9, s10, s11, s12, s13, s14, s15 + + // Mix first two columns: + // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]). + __ldrd r10, r11, sp, 16 // load v[12] and v[13] + _blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \ + \s0, \s1, \s2, \s3 + __strd r8, r9, sp, 0 + __strd r10, r11, sp, 16 + + // Mix second two columns: + // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]). + __ldrd r8, r9, sp, 8 // load v[10] and v[11] + __ldrd r10, r11, sp, 24 // load v[14] and v[15] + _blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \ + \s4, \s5, \s6, \s7 + str r10, [sp, #24] // store v[14] + // v[10], v[11], and v[15] are used below, so no need to store them yet. + + .set brot, 7 + .set drot, 8 + + // Mix first two diagonals: + // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]). + ldr r10, [sp, #16] // load v[12] + _blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \ + \s8, \s9, \s10, \s11 + __strd r8, r9, sp, 8 + str r11, [sp, #28] + str r10, [sp, #16] + + // Mix second two diagonals: + // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]). + __ldrd r8, r9, sp, 0 // load v[8] and v[9] + __ldrd r10, r11, sp, 20 // load v[13] and v[14] + _blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \ + \s12, \s13, \s14, \s15 + __strd r10, r11, sp, 20 +.endm + +// +// void blake2s_compress(struct blake2s_state *state, +// const u8 *block, size_t nblocks, u32 inc); +// +// Only the first three fields of struct blake2s_state are used: +// u32 h[8]; (inout) +// u32 t[2]; (inout) +// u32 f[2]; (in) +// + .align 5 +ENTRY(blake2s_compress) + push {r0-r2,r4-r11,lr} // keep this an even number + +.Lnext_block: + // r0 is 'state' + // r1 is 'block' + // r3 is 'inc' + + // Load and increment the counter t[0..1]. + __ldrd r10, r11, r0, 32 + adds r10, r10, r3 + adc r11, r11, #0 + __strd r10, r11, r0, 32 + + // _blake2s_round is very short on registers, so copy the message block + // to the stack to save a register during the rounds. This also has the + // advantage that misalignment only needs to be dealt with in one place. + sub sp, sp, #64 + mov r12, sp + tst r1, #3 + bne .Lcopy_block_misaligned + ldmia r1!, {r2-r9} + _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14 + stmia r12!, {r2-r9} + ldmia r1!, {r2-r9} + _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14 + stmia r12, {r2-r9} +.Lcopy_block_done: + str r1, [sp, #68] // Update message pointer + + // Calculate v[8..15]. Push v[9..15] onto the stack, and leave space + // for spilling v[8..9]. Leave v[8..9] in r8-r9. + mov r14, r0 // r14 = state + adr r12, .Lblake2s_IV + ldmia r12!, {r8-r9} // load IV[0..1] + __ldrd r0, r1, r14, 40 // load f[0..1] + ldm r12, {r2-r7} // load IV[3..7] + eor r4, r4, r10 // v[12] = IV[4] ^ t[0] + eor r5, r5, r11 // v[13] = IV[5] ^ t[1] + eor r6, r6, r0 // v[14] = IV[6] ^ f[0] + eor r7, r7, r1 // v[15] = IV[7] ^ f[1] + push {r2-r7} // push v[9..15] + sub sp, sp, #8 // leave space for v[8..9] + + // Load h[0..7] == v[0..7]. + ldm r14, {r0-r7} + + // Execute the rounds. Each round is provided the order in which it + // needs to use the message words. + .set brot, 0 + .set drot, 0 + _blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + _blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 + _blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 + _blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 + _blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 + _blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 + _blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 + _blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 + _blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 + _blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 + + // Fold the final state matrix into the hash chaining value: + // + // for (i = 0; i < 8; i++) + // h[i] ^= v[i] ^ v[i + 8]; + // + ldr r14, [sp, #96] // r14 = &h[0] + add sp, sp, #8 // v[8..9] are already loaded. + pop {r10-r11} // load v[10..11] + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + eor r3, r3, r11 + ldm r14, {r8-r11} // load h[0..3] + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + eor r3, r3, r11 + stmia r14!, {r0-r3} // store new h[0..3] + ldm r14, {r0-r3} // load old h[4..7] + pop {r8-r11} // load v[12..15] + eor r0, r0, r4, ror #brot + eor r1, r1, r5, ror #brot + eor r2, r2, r6, ror #brot + eor r3, r3, r7, ror #brot + eor r0, r0, r8, ror #drot + eor r1, r1, r9, ror #drot + eor r2, r2, r10, ror #drot + eor r3, r3, r11, ror #drot + add sp, sp, #64 // skip copy of message block + stm r14, {r0-r3} // store new h[4..7] + + // Advance to the next block, if there is one. Note that if there are + // multiple blocks, then 'inc' (the counter increment amount) must be + // 64. So we can simply set it to 64 without re-loading it. + ldm sp, {r0, r1, r2} // load (state, block, nblocks) + mov r3, #64 // set 'inc' + subs r2, r2, #1 // nblocks-- + str r2, [sp, #8] + bne .Lnext_block // nblocks != 0? + + pop {r0-r2,r4-r11,pc} + + // The next message block (pointed to by r1) isn't 4-byte aligned, so it + // can't be loaded using ldmia. Copy it to the stack buffer (pointed to + // by r12) using an alternative method. r2-r9 are free to use. +.Lcopy_block_misaligned: + mov r2, #64 +1: +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + ldr r3, [r1], #4 + _le32_bswap r3, r4 +#else + ldrb r3, [r1, #0] + ldrb r4, [r1, #1] + ldrb r5, [r1, #2] + ldrb r6, [r1, #3] + add r1, r1, #4 + orr r3, r3, r4, lsl #8 + orr r3, r3, r5, lsl #16 + orr r3, r3, r6, lsl #24 +#endif + subs r2, r2, #4 + str r3, [r12], #4 + bne 1b + b .Lcopy_block_done +ENDPROC(blake2s_compress) diff --git a/lib/crypto/arm/blake2s-glue.c b/lib/crypto/arm/blake2s-glue.c new file mode 100644 index 000000000000..0238a70d9581 --- /dev/null +++ b/lib/crypto/arm/blake2s-glue.c @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <crypto/internal/blake2s.h> +#include <linux/module.h> + +/* defined in blake2s-core.S */ +EXPORT_SYMBOL(blake2s_compress); diff --git a/lib/crypto/arm/chacha-glue.c b/lib/crypto/arm/chacha-glue.c new file mode 100644 index 000000000000..88ec96415283 --- /dev/null +++ b/lib/crypto/arm/chacha-glue.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ChaCha and HChaCha functions (ARM optimized) + * + * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org> + * Copyright (C) 2015 Martin Willi + */ + +#include <crypto/chacha.h> +#include <crypto/internal/simd.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/cputype.h> +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> + +asmlinkage void chacha_block_xor_neon(const struct chacha_state *state, + u8 *dst, const u8 *src, int nrounds); +asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state, + u8 *dst, const u8 *src, + int nrounds, unsigned int nbytes); +asmlinkage void hchacha_block_arm(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); +asmlinkage void hchacha_block_neon(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); + +asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes, + const struct chacha_state *state, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon); + +static inline bool neon_usable(void) +{ + return static_branch_likely(&use_neon) && crypto_simd_usable(); +} + +static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + u8 buf[CHACHA_BLOCK_SIZE]; + + while (bytes > CHACHA_BLOCK_SIZE) { + unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U); + + chacha_4block_xor_neon(state, dst, src, nrounds, l); + bytes -= l; + src += l; + dst += l; + state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); + } + if (bytes) { + const u8 *s = src; + u8 *d = dst; + + if (bytes != CHACHA_BLOCK_SIZE) + s = d = memcpy(buf, src, bytes); + chacha_block_xor_neon(state, d, s, nrounds); + if (d != dst) + memcpy(dst, buf, bytes); + state->x[12]++; + } +} + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) { + hchacha_block_arm(state, out, nrounds); + } else { + kernel_neon_begin(); + hchacha_block_neon(state, out, nrounds); + kernel_neon_end(); + } +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() || + bytes <= CHACHA_BLOCK_SIZE) { + chacha_doarm(dst, src, bytes, state, nrounds); + state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE); + return; + } + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_neon_begin(); + chacha_doneon(state, dst, src, todo, nrounds); + kernel_neon_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + /* We always can use at least the ARM scalar implementation. */ + return true; +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +static int __init chacha_arm_mod_init(void) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) { + switch (read_cpuid_part()) { + case ARM_CPU_PART_CORTEX_A7: + case ARM_CPU_PART_CORTEX_A5: + /* + * The Cortex-A7 and Cortex-A5 do not perform well with + * the NEON implementation but do incredibly with the + * scalar one and use less power. + */ + break; + default: + static_branch_enable(&use_neon); + } + } + return 0; +} +subsys_initcall(chacha_arm_mod_init); + +static void __exit chacha_arm_mod_exit(void) +{ +} +module_exit(chacha_arm_mod_exit); + +MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm/chacha-neon-core.S b/lib/crypto/arm/chacha-neon-core.S new file mode 100644 index 000000000000..ddd62b6294a5 --- /dev/null +++ b/lib/crypto/arm/chacha-neon-core.S @@ -0,0 +1,643 @@ +/* + * ChaCha/HChaCha NEON helper functions + * + * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + + /* + * NEON doesn't have a rotate instruction. The alternatives are, more or less: + * + * (a) vshl.u32 + vsri.u32 (needs temporary register) + * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register) + * (c) vrev32.16 (16-bit rotations only) + * (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only, + * needs index vector) + * + * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations, + * the only choices are (a) and (b). We use (a) since it takes two-thirds the + * cycles of (b) on both Cortex-A7 and Cortex-A53. + * + * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest + * and doesn't need a temporary register. + * + * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence + * is twice as fast as (a), even when doing (a) on multiple registers + * simultaneously to eliminate the stall between vshl and vsri. Also, it + * parallelizes better when temporary registers are scarce. + * + * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as + * (a), so the need to load the rotation table actually makes the vtbl method + * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it + * seems to be a good compromise to get a more significant speed boost on some + * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7. + */ + +#include <linux/linkage.h> +#include <asm/cache.h> + + .text + .fpu neon + .align 5 + +/* + * chacha_permute - permute one block + * + * Permute one 64-byte block where the state matrix is stored in the four NEON + * registers q0-q3. It performs matrix operations on four words in parallel, + * but requires shuffling to rearrange the words after each round. + * + * The round count is given in r3. + * + * Clobbers: r3, ip, q4-q5 + */ +chacha_permute: + + adr ip, .Lrol8_table + vld1.8 {d10}, [ip, :64] + +.Ldoubleround: + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vrev32.16 q3, q3 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #12 + vsri.u32 q1, q4, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vtbl.8 d6, {d6}, d10 + vtbl.8 d7, {d7}, d10 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #7 + vsri.u32 q1, q4, #25 + + // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vext.8 q1, q1, q1, #4 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vext.8 q2, q2, q2, #8 + // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vext.8 q3, q3, q3, #12 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vrev32.16 q3, q3 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #12 + vsri.u32 q1, q4, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vtbl.8 d6, {d6}, d10 + vtbl.8 d7, {d7}, d10 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #7 + vsri.u32 q1, q4, #25 + + // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vext.8 q1, q1, q1, #12 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vext.8 q2, q2, q2, #8 + // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vext.8 q3, q3, q3, #4 + + subs r3, r3, #2 + bne .Ldoubleround + + bx lr +ENDPROC(chacha_permute) + +ENTRY(chacha_block_xor_neon) + // r0: Input state matrix, s + // r1: 1 data block output, o + // r2: 1 data block input, i + // r3: nrounds + push {lr} + + // x0..3 = s0..3 + add ip, r0, #0x20 + vld1.32 {q0-q1}, [r0] + vld1.32 {q2-q3}, [ip] + + vmov q8, q0 + vmov q9, q1 + vmov q10, q2 + vmov q11, q3 + + bl chacha_permute + + add ip, r2, #0x20 + vld1.8 {q4-q5}, [r2] + vld1.8 {q6-q7}, [ip] + + // o0 = i0 ^ (x0 + s0) + vadd.i32 q0, q0, q8 + veor q0, q0, q4 + + // o1 = i1 ^ (x1 + s1) + vadd.i32 q1, q1, q9 + veor q1, q1, q5 + + // o2 = i2 ^ (x2 + s2) + vadd.i32 q2, q2, q10 + veor q2, q2, q6 + + // o3 = i3 ^ (x3 + s3) + vadd.i32 q3, q3, q11 + veor q3, q3, q7 + + add ip, r1, #0x20 + vst1.8 {q0-q1}, [r1] + vst1.8 {q2-q3}, [ip] + + pop {pc} +ENDPROC(chacha_block_xor_neon) + +ENTRY(hchacha_block_neon) + // r0: Input state matrix, s + // r1: output (8 32-bit words) + // r2: nrounds + push {lr} + + vld1.32 {q0-q1}, [r0]! + vld1.32 {q2-q3}, [r0] + + mov r3, r2 + bl chacha_permute + + vst1.32 {q0}, [r1]! + vst1.32 {q3}, [r1] + + pop {pc} +ENDPROC(hchacha_block_neon) + + .align 4 +.Lctrinc: .word 0, 1, 2, 3 +.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6 + + .align 5 +ENTRY(chacha_4block_xor_neon) + push {r4, lr} + mov r4, sp // preserve the stack pointer + sub ip, sp, #0x20 // allocate a 32 byte buffer + bic ip, ip, #0x1f // aligned to 32 bytes + mov sp, ip + + // r0: Input state matrix, s + // r1: 4 data blocks output, o + // r2: 4 data blocks input, i + // r3: nrounds + + // + // This function encrypts four consecutive ChaCha blocks by loading + // the state matrix in NEON registers four times. The algorithm performs + // each operation on the corresponding word of each state matrix, hence + // requires no word shuffling. The words are re-interleaved before the + // final addition of the original state and the XORing step. + // + + // x0..15[0-3] = s0..15[0-3] + add ip, r0, #0x20 + vld1.32 {q0-q1}, [r0] + vld1.32 {q2-q3}, [ip] + + adr lr, .Lctrinc + vdup.32 q15, d7[1] + vdup.32 q14, d7[0] + vld1.32 {q4}, [lr, :128] + vdup.32 q13, d6[1] + vdup.32 q12, d6[0] + vdup.32 q11, d5[1] + vdup.32 q10, d5[0] + vadd.u32 q12, q12, q4 // x12 += counter values 0-3 + vdup.32 q9, d4[1] + vdup.32 q8, d4[0] + vdup.32 q7, d3[1] + vdup.32 q6, d3[0] + vdup.32 q5, d2[1] + vdup.32 q4, d2[0] + vdup.32 q3, d1[1] + vdup.32 q2, d1[0] + vdup.32 q1, d0[1] + vdup.32 q0, d0[0] + + adr ip, .Lrol8_table + b 1f + +.Ldoubleround4: + vld1.32 {q8-q9}, [sp, :256] +1: + // x0 += x4, x12 = rotl32(x12 ^ x0, 16) + // x1 += x5, x13 = rotl32(x13 ^ x1, 16) + // x2 += x6, x14 = rotl32(x14 ^ x2, 16) + // x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vadd.i32 q0, q0, q4 + vadd.i32 q1, q1, q5 + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + + veor q12, q12, q0 + veor q13, q13, q1 + veor q14, q14, q2 + veor q15, q15, q3 + + vrev32.16 q12, q12 + vrev32.16 q13, q13 + vrev32.16 q14, q14 + vrev32.16 q15, q15 + + // x8 += x12, x4 = rotl32(x4 ^ x8, 12) + // x9 += x13, x5 = rotl32(x5 ^ x9, 12) + // x10 += x14, x6 = rotl32(x6 ^ x10, 12) + // x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vadd.i32 q10, q10, q14 + vadd.i32 q11, q11, q15 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q4, q8 + veor q9, q5, q9 + vshl.u32 q4, q8, #12 + vshl.u32 q5, q9, #12 + vsri.u32 q4, q8, #20 + vsri.u32 q5, q9, #20 + + veor q8, q6, q10 + veor q9, q7, q11 + vshl.u32 q6, q8, #12 + vshl.u32 q7, q9, #12 + vsri.u32 q6, q8, #20 + vsri.u32 q7, q9, #20 + + // x0 += x4, x12 = rotl32(x12 ^ x0, 8) + // x1 += x5, x13 = rotl32(x13 ^ x1, 8) + // x2 += x6, x14 = rotl32(x14 ^ x2, 8) + // x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vld1.8 {d16}, [ip, :64] + vadd.i32 q0, q0, q4 + vadd.i32 q1, q1, q5 + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + + veor q12, q12, q0 + veor q13, q13, q1 + veor q14, q14, q2 + veor q15, q15, q3 + + vtbl.8 d24, {d24}, d16 + vtbl.8 d25, {d25}, d16 + vtbl.8 d26, {d26}, d16 + vtbl.8 d27, {d27}, d16 + vtbl.8 d28, {d28}, d16 + vtbl.8 d29, {d29}, d16 + vtbl.8 d30, {d30}, d16 + vtbl.8 d31, {d31}, d16 + + vld1.32 {q8-q9}, [sp, :256] + + // x8 += x12, x4 = rotl32(x4 ^ x8, 7) + // x9 += x13, x5 = rotl32(x5 ^ x9, 7) + // x10 += x14, x6 = rotl32(x6 ^ x10, 7) + // x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vadd.i32 q10, q10, q14 + vadd.i32 q11, q11, q15 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q4, q8 + veor q9, q5, q9 + vshl.u32 q4, q8, #7 + vshl.u32 q5, q9, #7 + vsri.u32 q4, q8, #25 + vsri.u32 q5, q9, #25 + + veor q8, q6, q10 + veor q9, q7, q11 + vshl.u32 q6, q8, #7 + vshl.u32 q7, q9, #7 + vsri.u32 q6, q8, #25 + vsri.u32 q7, q9, #25 + + vld1.32 {q8-q9}, [sp, :256] + + // x0 += x5, x15 = rotl32(x15 ^ x0, 16) + // x1 += x6, x12 = rotl32(x12 ^ x1, 16) + // x2 += x7, x13 = rotl32(x13 ^ x2, 16) + // x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vadd.i32 q0, q0, q5 + vadd.i32 q1, q1, q6 + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q4 + + veor q15, q15, q0 + veor q12, q12, q1 + veor q13, q13, q2 + veor q14, q14, q3 + + vrev32.16 q15, q15 + vrev32.16 q12, q12 + vrev32.16 q13, q13 + vrev32.16 q14, q14 + + // x10 += x15, x5 = rotl32(x5 ^ x10, 12) + // x11 += x12, x6 = rotl32(x6 ^ x11, 12) + // x8 += x13, x7 = rotl32(x7 ^ x8, 12) + // x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vadd.i32 q10, q10, q15 + vadd.i32 q11, q11, q12 + vadd.i32 q8, q8, q13 + vadd.i32 q9, q9, q14 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q7, q8 + veor q9, q4, q9 + vshl.u32 q7, q8, #12 + vshl.u32 q4, q9, #12 + vsri.u32 q7, q8, #20 + vsri.u32 q4, q9, #20 + + veor q8, q5, q10 + veor q9, q6, q11 + vshl.u32 q5, q8, #12 + vshl.u32 q6, q9, #12 + vsri.u32 q5, q8, #20 + vsri.u32 q6, q9, #20 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 8) + // x1 += x6, x12 = rotl32(x12 ^ x1, 8) + // x2 += x7, x13 = rotl32(x13 ^ x2, 8) + // x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vld1.8 {d16}, [ip, :64] + vadd.i32 q0, q0, q5 + vadd.i32 q1, q1, q6 + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q4 + + veor q15, q15, q0 + veor q12, q12, q1 + veor q13, q13, q2 + veor q14, q14, q3 + + vtbl.8 d30, {d30}, d16 + vtbl.8 d31, {d31}, d16 + vtbl.8 d24, {d24}, d16 + vtbl.8 d25, {d25}, d16 + vtbl.8 d26, {d26}, d16 + vtbl.8 d27, {d27}, d16 + vtbl.8 d28, {d28}, d16 + vtbl.8 d29, {d29}, d16 + + vld1.32 {q8-q9}, [sp, :256] + + // x10 += x15, x5 = rotl32(x5 ^ x10, 7) + // x11 += x12, x6 = rotl32(x6 ^ x11, 7) + // x8 += x13, x7 = rotl32(x7 ^ x8, 7) + // x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vadd.i32 q10, q10, q15 + vadd.i32 q11, q11, q12 + vadd.i32 q8, q8, q13 + vadd.i32 q9, q9, q14 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q7, q8 + veor q9, q4, q9 + vshl.u32 q7, q8, #7 + vshl.u32 q4, q9, #7 + vsri.u32 q7, q8, #25 + vsri.u32 q4, q9, #25 + + veor q8, q5, q10 + veor q9, q6, q11 + vshl.u32 q5, q8, #7 + vshl.u32 q6, q9, #7 + vsri.u32 q5, q8, #25 + vsri.u32 q6, q9, #25 + + subs r3, r3, #2 + bne .Ldoubleround4 + + // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15. + // x8..9[0-3] are on the stack. + + // Re-interleave the words in the first two rows of each block (x0..7). + // Also add the counter values 0-3 to x12[0-3]. + vld1.32 {q8}, [lr, :128] // load counter values 0-3 + vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1) + vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3) + vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5) + vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7) + vadd.u32 q12, q8 // x12 += counter values 0-3 + vswp d1, d4 + vswp d3, d6 + vld1.32 {q8-q9}, [r0]! // load s0..7 + vswp d9, d12 + vswp d11, d14 + + // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1) + // after XORing the first 32 bytes. + vswp q1, q4 + + // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7) + + // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block) + vadd.u32 q0, q0, q8 + vadd.u32 q2, q2, q8 + vadd.u32 q4, q4, q8 + vadd.u32 q3, q3, q8 + + // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block) + vadd.u32 q1, q1, q9 + vadd.u32 q6, q6, q9 + vadd.u32 q5, q5, q9 + vadd.u32 q7, q7, q9 + + // XOR first 32 bytes using keystream from first two rows of first block + vld1.8 {q8-q9}, [r2]! + veor q8, q8, q0 + veor q9, q9, q1 + vst1.8 {q8-q9}, [r1]! + + // Re-interleave the words in the last two rows of each block (x8..15). + vld1.32 {q8-q9}, [sp, :256] + mov sp, r4 // restore original stack pointer + ldr r4, [r4, #8] // load number of bytes + vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13) + vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15) + vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9) + vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11) + vld1.32 {q0-q1}, [r0] // load s8..15 + vswp d25, d28 + vswp d27, d30 + vswp d17, d20 + vswp d19, d22 + + // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15) + + // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block) + vadd.u32 q8, q8, q0 + vadd.u32 q10, q10, q0 + vadd.u32 q9, q9, q0 + vadd.u32 q11, q11, q0 + + // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block) + vadd.u32 q12, q12, q1 + vadd.u32 q14, q14, q1 + vadd.u32 q13, q13, q1 + vadd.u32 q15, q15, q1 + + // XOR the rest of the data with the keystream + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #96 + veor q0, q0, q8 + veor q1, q1, q12 + ble .Lle96 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q2 + veor q1, q1, q6 + ble .Lle128 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q10 + veor q1, q1, q14 + ble .Lle160 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q4 + veor q1, q1, q5 + ble .Lle192 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q9 + veor q1, q1, q13 + ble .Lle224 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + subs r4, r4, #32 + veor q0, q0, q3 + veor q1, q1, q7 + blt .Llt256 +.Lout: + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2] + veor q0, q0, q11 + veor q1, q1, q15 + vst1.8 {q0-q1}, [r1] + + pop {r4, pc} + +.Lle192: + vmov q4, q9 + vmov q5, q13 + +.Lle160: + // nothing to do + +.Lfinalblock: + // Process the final block if processing less than 4 full blocks. + // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the + // previous 32 byte output block that still needs to be written at + // [r1] in q0-q1. + beq .Lfullblock + +.Lpartialblock: + adr lr, .Lpermute + 32 + add r2, r2, r4 + add lr, lr, r4 + add r4, r4, r1 + + vld1.8 {q2-q3}, [lr] + vld1.8 {q6-q7}, [r2] + + add r4, r4, #32 + + vtbl.8 d4, {q4-q5}, d4 + vtbl.8 d5, {q4-q5}, d5 + vtbl.8 d6, {q4-q5}, d6 + vtbl.8 d7, {q4-q5}, d7 + + veor q6, q6, q2 + veor q7, q7, q3 + + vst1.8 {q6-q7}, [r4] // overlapping stores + vst1.8 {q0-q1}, [r1] + pop {r4, pc} + +.Lfullblock: + vmov q11, q4 + vmov q15, q5 + b .Lout +.Lle96: + vmov q4, q2 + vmov q5, q6 + b .Lfinalblock +.Lle128: + vmov q4, q10 + vmov q5, q14 + b .Lfinalblock +.Lle224: + vmov q4, q3 + vmov q5, q7 + b .Lfinalblock +.Llt256: + vmov q4, q11 + vmov q5, q15 + b .Lpartialblock +ENDPROC(chacha_4block_xor_neon) + + .align L1_CACHE_SHIFT +.Lpermute: + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 + .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 + .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f diff --git a/lib/crypto/arm/chacha-scalar-core.S b/lib/crypto/arm/chacha-scalar-core.S new file mode 100644 index 000000000000..4951df05c158 --- /dev/null +++ b/lib/crypto/arm/chacha-scalar-core.S @@ -0,0 +1,444 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 Google, Inc. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * Design notes: + * + * 16 registers would be needed to hold the state matrix, but only 14 are + * available because 'sp' and 'pc' cannot be used. So we spill the elements + * (x8, x9) to the stack and swap them out with (x10, x11). This adds one + * 'ldrd' and one 'strd' instruction per round. + * + * All rotates are performed using the implicit rotate operand accepted by the + * 'add' and 'eor' instructions. This is faster than using explicit rotate + * instructions. To make this work, we allow the values in the second and last + * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the + * wrong rotation amount. The rotation amount is then fixed up just in time + * when the values are used. 'brot' is the number of bits the values in row 'b' + * need to be rotated right to arrive at the correct values, and 'drot' + * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such + * that they end up as (25, 24) after every round. + */ + + // ChaCha state registers + X0 .req r0 + X1 .req r1 + X2 .req r2 + X3 .req r3 + X4 .req r4 + X5 .req r5 + X6 .req r6 + X7 .req r7 + X8_X10 .req r8 // shared by x8 and x10 + X9_X11 .req r9 // shared by x9 and x11 + X12 .req r10 + X13 .req r11 + X14 .req r12 + X15 .req r14 + +.macro _le32_bswap_4x a, b, c, d, tmp +#ifdef __ARMEB__ + rev_l \a, \tmp + rev_l \b, \tmp + rev_l \c, \tmp + rev_l \d, \tmp +#endif +.endm + +.macro __ldrd a, b, src, offset +#if __LINUX_ARM_ARCH__ >= 6 + ldrd \a, \b, [\src, #\offset] +#else + ldr \a, [\src, #\offset] + ldr \b, [\src, #\offset + 4] +#endif +.endm + +.macro __strd a, b, dst, offset +#if __LINUX_ARM_ARCH__ >= 6 + strd \a, \b, [\dst, #\offset] +#else + str \a, [\dst, #\offset] + str \b, [\dst, #\offset + 4] +#endif +.endm + +.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2 + + // a += b; d ^= a; d = rol(d, 16); + add \a1, \a1, \b1, ror #brot + add \a2, \a2, \b2, ror #brot + eor \d1, \a1, \d1, ror #drot + eor \d2, \a2, \d2, ror #drot + // drot == 32 - 16 == 16 + + // c += d; b ^= c; b = rol(b, 12); + add \c1, \c1, \d1, ror #16 + add \c2, \c2, \d2, ror #16 + eor \b1, \c1, \b1, ror #brot + eor \b2, \c2, \b2, ror #brot + // brot == 32 - 12 == 20 + + // a += b; d ^= a; d = rol(d, 8); + add \a1, \a1, \b1, ror #20 + add \a2, \a2, \b2, ror #20 + eor \d1, \a1, \d1, ror #16 + eor \d2, \a2, \d2, ror #16 + // drot == 32 - 8 == 24 + + // c += d; b ^= c; b = rol(b, 7); + add \c1, \c1, \d1, ror #24 + add \c2, \c2, \d2, ror #24 + eor \b1, \c1, \b1, ror #20 + eor \b2, \c2, \b2, ror #20 + // brot == 32 - 7 == 25 +.endm + +.macro _doubleround + + // column round + + // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13) + _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13 + + // save (x8, x9); restore (x10, x11) + __strd X8_X10, X9_X11, sp, 0 + __ldrd X8_X10, X9_X11, sp, 8 + + // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15) + _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15 + + .set brot, 25 + .set drot, 24 + + // diagonal round + + // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12) + _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12 + + // save (x10, x11); restore (x8, x9) + __strd X8_X10, X9_X11, sp, 8 + __ldrd X8_X10, X9_X11, sp, 0 + + // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14) + _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14 +.endm + +.macro _chacha_permute nrounds + .set brot, 0 + .set drot, 0 + .rept \nrounds / 2 + _doubleround + .endr +.endm + +.macro _chacha nrounds + +.Lnext_block\@: + // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN + // Registers contain x0-x9,x12-x15. + + // Do the core ChaCha permutation to update x0-x15. + _chacha_permute \nrounds + + add sp, #8 + // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN + // Registers contain x0-x9,x12-x15. + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. + + // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15). + push {X8_X10, X9_X11, X12, X13, X14, X15} + + // Load (OUT, IN, LEN). + ldr r14, [sp, #96] + ldr r12, [sp, #100] + ldr r11, [sp, #104] + + orr r10, r14, r12 + + // Use slow path if fewer than 64 bytes remain. + cmp r11, #64 + blt .Lxor_slowpath\@ + + // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on + // ARMv6+, since ldmia and stmia (used below) still require alignment. + tst r10, #3 + bne .Lxor_slowpath\@ + + // Fast path: XOR 64 bytes of aligned data. + + // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN + // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT. + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. + + // x0-x3 + __ldrd r8, r9, sp, 32 + __ldrd r10, r11, sp, 40 + add X0, X0, r8 + add X1, X1, r9 + add X2, X2, r10 + add X3, X3, r11 + _le32_bswap_4x X0, X1, X2, X3, r8 + ldmia r12!, {r8-r11} + eor X0, X0, r8 + eor X1, X1, r9 + eor X2, X2, r10 + eor X3, X3, r11 + stmia r14!, {X0-X3} + + // x4-x7 + __ldrd r8, r9, sp, 48 + __ldrd r10, r11, sp, 56 + add X4, r8, X4, ror #brot + add X5, r9, X5, ror #brot + ldmia r12!, {X0-X3} + add X6, r10, X6, ror #brot + add X7, r11, X7, ror #brot + _le32_bswap_4x X4, X5, X6, X7, r8 + eor X4, X4, X0 + eor X5, X5, X1 + eor X6, X6, X2 + eor X7, X7, X3 + stmia r14!, {X4-X7} + + // x8-x15 + pop {r0-r7} // (x8-x9,x12-x15,x10-x11) + __ldrd r8, r9, sp, 32 + __ldrd r10, r11, sp, 40 + add r0, r0, r8 // x8 + add r1, r1, r9 // x9 + add r6, r6, r10 // x10 + add r7, r7, r11 // x11 + _le32_bswap_4x r0, r1, r6, r7, r8 + ldmia r12!, {r8-r11} + eor r0, r0, r8 // x8 + eor r1, r1, r9 // x9 + eor r6, r6, r10 // x10 + eor r7, r7, r11 // x11 + stmia r14!, {r0,r1,r6,r7} + ldmia r12!, {r0,r1,r6,r7} + __ldrd r8, r9, sp, 48 + __ldrd r10, r11, sp, 56 + add r2, r8, r2, ror #drot // x12 + add r3, r9, r3, ror #drot // x13 + add r4, r10, r4, ror #drot // x14 + add r5, r11, r5, ror #drot // x15 + _le32_bswap_4x r2, r3, r4, r5, r9 + ldr r9, [sp, #72] // load LEN + eor r2, r2, r0 // x12 + eor r3, r3, r1 // x13 + eor r4, r4, r6 // x14 + eor r5, r5, r7 // x15 + subs r9, #64 // decrement and check LEN + stmia r14!, {r2-r5} + + beq .Ldone\@ + +.Lprepare_for_next_block\@: + + // Stack: x0-x15 OUT IN LEN + + // Increment block counter (x12) + add r8, #1 + + // Store updated (OUT, IN, LEN) + str r14, [sp, #64] + str r12, [sp, #68] + str r9, [sp, #72] + + mov r14, sp + + // Store updated block counter (x12) + str r8, [sp, #48] + + sub sp, #16 + + // Reload state and do next block + ldmia r14!, {r0-r11} // load x0-x11 + __strd r10, r11, sp, 8 // store x10-x11 before state + ldmia r14, {r10-r12,r14} // load x12-x15 + b .Lnext_block\@ + +.Lxor_slowpath\@: + // Slow path: < 64 bytes remaining, or unaligned input or output buffer. + // We handle it by storing the 64 bytes of keystream to the stack, then + // XOR-ing the needed portion with the data. + + // Allocate keystream buffer + sub sp, #64 + mov r14, sp + + // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN + // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0. + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'. + + // Save keystream for x0-x3 + __ldrd r8, r9, sp, 96 + __ldrd r10, r11, sp, 104 + add X0, X0, r8 + add X1, X1, r9 + add X2, X2, r10 + add X3, X3, r11 + _le32_bswap_4x X0, X1, X2, X3, r8 + stmia r14!, {X0-X3} + + // Save keystream for x4-x7 + __ldrd r8, r9, sp, 112 + __ldrd r10, r11, sp, 120 + add X4, r8, X4, ror #brot + add X5, r9, X5, ror #brot + add X6, r10, X6, ror #brot + add X7, r11, X7, ror #brot + _le32_bswap_4x X4, X5, X6, X7, r8 + add r8, sp, #64 + stmia r14!, {X4-X7} + + // Save keystream for x8-x15 + ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11) + __ldrd r8, r9, sp, 128 + __ldrd r10, r11, sp, 136 + add r0, r0, r8 // x8 + add r1, r1, r9 // x9 + add r6, r6, r10 // x10 + add r7, r7, r11 // x11 + _le32_bswap_4x r0, r1, r6, r7, r8 + stmia r14!, {r0,r1,r6,r7} + __ldrd r8, r9, sp, 144 + __ldrd r10, r11, sp, 152 + add r2, r8, r2, ror #drot // x12 + add r3, r9, r3, ror #drot // x13 + add r4, r10, r4, ror #drot // x14 + add r5, r11, r5, ror #drot // x15 + _le32_bswap_4x r2, r3, r4, r5, r9 + stmia r14, {r2-r5} + + // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN + // Registers: r8 is block counter, r12 is IN. + + ldr r9, [sp, #168] // LEN + ldr r14, [sp, #160] // OUT + cmp r9, #64 + mov r0, sp + movle r1, r9 + movgt r1, #64 + // r1 is number of bytes to XOR, in range [1, 64] + +.if __LINUX_ARM_ARCH__ < 6 + orr r2, r12, r14 + tst r2, #3 // IN or OUT misaligned? + bne .Lxor_next_byte\@ +.endif + + // XOR a word at a time +.rept 16 + subs r1, #4 + blt .Lxor_words_done\@ + ldr r2, [r12], #4 + ldr r3, [r0], #4 + eor r2, r2, r3 + str r2, [r14], #4 +.endr + b .Lxor_slowpath_done\@ +.Lxor_words_done\@: + ands r1, r1, #3 + beq .Lxor_slowpath_done\@ + + // XOR a byte at a time +.Lxor_next_byte\@: + ldrb r2, [r12], #1 + ldrb r3, [r0], #1 + eor r2, r2, r3 + strb r2, [r14], #1 + subs r1, #1 + bne .Lxor_next_byte\@ + +.Lxor_slowpath_done\@: + subs r9, #64 + add sp, #96 + bgt .Lprepare_for_next_block\@ + +.Ldone\@: +.endm // _chacha + +/* + * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes, + * const struct chacha_state *state, int nrounds); + */ +ENTRY(chacha_doarm) + cmp r2, #0 // len == 0? + reteq lr + + ldr ip, [sp] + cmp ip, #12 + + push {r0-r2,r4-r11,lr} + + // Push state x0-x15 onto stack. + // Also store an extra copy of x10-x11 just before the state. + + add X12, r3, #48 + ldm X12, {X12,X13,X14,X15} + push {X12,X13,X14,X15} + sub sp, sp, #64 + + __ldrd X8_X10, X9_X11, r3, 40 + __strd X8_X10, X9_X11, sp, 8 + __strd X8_X10, X9_X11, sp, 56 + ldm r3, {X0-X9_X11} + __strd X0, X1, sp, 16 + __strd X2, X3, sp, 24 + __strd X4, X5, sp, 32 + __strd X6, X7, sp, 40 + __strd X8_X10, X9_X11, sp, 48 + + beq 1f + _chacha 20 + +0: add sp, #76 + pop {r4-r11, pc} + +1: _chacha 12 + b 0b +ENDPROC(chacha_doarm) + +/* + * void hchacha_block_arm(const struct chacha_state *state, + * u32 out[HCHACHA_OUT_WORDS], int nrounds); + */ +ENTRY(hchacha_block_arm) + push {r1,r4-r11,lr} + + cmp r2, #12 // ChaCha12 ? + + mov r14, r0 + ldmia r14!, {r0-r11} // load x0-x11 + push {r10-r11} // store x10-x11 to stack + ldm r14, {r10-r12,r14} // load x12-x15 + sub sp, #8 + + beq 1f + _chacha_permute 20 + + // Skip over (unused0-unused1, x10-x11) +0: add sp, #16 + + // Fix up rotations of x12-x15 + ror X12, X12, #drot + ror X13, X13, #drot + pop {r4} // load 'out' + ror X14, X14, #drot + ror X15, X15, #drot + + // Store (x0-x3,x12-x15) to 'out' + stm r4, {X0,X1,X2,X3,X12,X13,X14,X15} + + pop {r4-r11,pc} + +1: _chacha_permute 12 + b 0b +ENDPROC(hchacha_block_arm) diff --git a/lib/crypto/arm/poly1305-armv4.pl b/lib/crypto/arm/poly1305-armv4.pl new file mode 100644 index 000000000000..dd7a996361a7 --- /dev/null +++ b/lib/crypto/arm/poly1305-armv4.pl @@ -0,0 +1,1236 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# IALU(*)/gcc-4.4 NEON +# +# ARM11xx(ARMv6) 7.78/+100% - +# Cortex-A5 6.35/+130% 3.00 +# Cortex-A8 6.25/+115% 2.36 +# Cortex-A9 5.10/+95% 2.55 +# Cortex-A15 3.85/+85% 1.25(**) +# Snapdragon S4 5.70/+100% 1.48(**) +# +# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data; +# (**) these are trade-off results, they can be improved by ~8% but at +# the cost of 15/12% regression on Cortex-A5/A7, it's even possible +# to improve Cortex-A9 result, but then A5/A7 loose more than 20%; + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($ctx,$inp,$len,$padbit)=map("r$_",(0..3)); + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ +# define poly1305_init poly1305_block_init_arch +# define poly1305_blocks poly1305_blocks_arm +# define poly1305_emit poly1305_emit_arch +#endif + +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.text + +.globl poly1305_emit +.globl poly1305_blocks +.globl poly1305_init +.type poly1305_init,%function +.align 5 +poly1305_init: +.Lpoly1305_init: + stmdb sp!,{r4-r11} + + eor r3,r3,r3 + cmp $inp,#0 + str r3,[$ctx,#0] @ zero hash value + str r3,[$ctx,#4] + str r3,[$ctx,#8] + str r3,[$ctx,#12] + str r3,[$ctx,#16] + str r3,[$ctx,#36] @ clear is_base2_26 + add $ctx,$ctx,#20 + +#ifdef __thumb2__ + it eq +#endif + moveq r0,#0 + beq .Lno_key + +#if __ARM_MAX_ARCH__>=7 + mov r3,#-1 + str r3,[$ctx,#28] @ impossible key power value +# ifndef __KERNEL__ + adr r11,.Lpoly1305_init + ldr r12,.LOPENSSL_armcap +# endif +#endif + ldrb r4,[$inp,#0] + mov r10,#0x0fffffff + ldrb r5,[$inp,#1] + and r3,r10,#-4 @ 0x0ffffffc + ldrb r6,[$inp,#2] + ldrb r7,[$inp,#3] + orr r4,r4,r5,lsl#8 + ldrb r5,[$inp,#4] + orr r4,r4,r6,lsl#16 + ldrb r6,[$inp,#5] + orr r4,r4,r7,lsl#24 + ldrb r7,[$inp,#6] + and r4,r4,r10 + +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +# if !defined(_WIN32) + ldr r12,[r11,r12] @ OPENSSL_armcap_P +# endif +# if defined(__APPLE__) || defined(_WIN32) + ldr r12,[r12] +# endif +#endif + ldrb r8,[$inp,#7] + orr r5,r5,r6,lsl#8 + ldrb r6,[$inp,#8] + orr r5,r5,r7,lsl#16 + ldrb r7,[$inp,#9] + orr r5,r5,r8,lsl#24 + ldrb r8,[$inp,#10] + and r5,r5,r3 + +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + tst r12,#ARMV7_NEON @ check for NEON +# ifdef __thumb2__ + adr r9,.Lpoly1305_blocks_neon + adr r11,.Lpoly1305_blocks + it ne + movne r11,r9 + adr r12,.Lpoly1305_emit + orr r11,r11,#1 @ thumb-ify addresses + orr r12,r12,#1 +# else + add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init) + ite eq + addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init) + addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init) +# endif +#endif + ldrb r9,[$inp,#11] + orr r6,r6,r7,lsl#8 + ldrb r7,[$inp,#12] + orr r6,r6,r8,lsl#16 + ldrb r8,[$inp,#13] + orr r6,r6,r9,lsl#24 + ldrb r9,[$inp,#14] + and r6,r6,r3 + + ldrb r10,[$inp,#15] + orr r7,r7,r8,lsl#8 + str r4,[$ctx,#0] + orr r7,r7,r9,lsl#16 + str r5,[$ctx,#4] + orr r7,r7,r10,lsl#24 + str r6,[$ctx,#8] + and r7,r7,r3 + str r7,[$ctx,#12] +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + stmia r2,{r11,r12} @ fill functions table + mov r0,#1 +#else + mov r0,#0 +#endif +.Lno_key: + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_init,.-poly1305_init +___ +{ +my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12)); +my ($s1,$s2,$s3)=($r1,$r2,$r3); + +$code.=<<___; +.type poly1305_blocks,%function +.align 5 +poly1305_blocks: +.Lpoly1305_blocks: + stmdb sp!,{r3-r11,lr} + + ands $len,$len,#-16 + beq .Lno_data + + add $len,$len,$inp @ end pointer + sub sp,sp,#32 + +#if __ARM_ARCH__<7 + ldmia $ctx,{$h0-$r3} @ load context + add $ctx,$ctx,#20 + str $len,[sp,#16] @ offload stuff + str $ctx,[sp,#12] +#else + ldr lr,[$ctx,#36] @ is_base2_26 + ldmia $ctx!,{$h0-$h4} @ load hash value + str $len,[sp,#16] @ offload stuff + str $ctx,[sp,#12] + + adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 + mov $r1,$h1,lsr#6 + adcs $r1,$r1,$h2,lsl#20 + mov $r2,$h2,lsr#12 + adcs $r2,$r2,$h3,lsl#14 + mov $r3,$h3,lsr#18 + adcs $r3,$r3,$h4,lsl#8 + mov $len,#0 + teq lr,#0 + str $len,[$ctx,#16] @ clear is_base2_26 + adc $len,$len,$h4,lsr#24 + + itttt ne + movne $h0,$r0 @ choose between radixes + movne $h1,$r1 + movne $h2,$r2 + movne $h3,$r3 + ldmia $ctx,{$r0-$r3} @ load key + it ne + movne $h4,$len +#endif + + mov lr,$inp + cmp $padbit,#0 + str $r1,[sp,#20] + str $r2,[sp,#24] + str $r3,[sp,#28] + b .Loop + +.align 4 +.Loop: +#if __ARM_ARCH__<7 + ldrb r0,[lr],#16 @ load input +# ifdef __thumb2__ + it hi +# endif + addhi $h4,$h4,#1 @ 1<<128 + ldrb r1,[lr,#-15] + ldrb r2,[lr,#-14] + ldrb r3,[lr,#-13] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-12] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-11] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-10] + adds $h0,$h0,r3 @ accumulate input + + ldrb r3,[lr,#-9] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-8] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-7] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-6] + adcs $h1,$h1,r3 + + ldrb r3,[lr,#-5] + orr r1,r0,r1,lsl#8 + ldrb r0,[lr,#-4] + orr r2,r1,r2,lsl#16 + ldrb r1,[lr,#-3] + orr r3,r2,r3,lsl#24 + ldrb r2,[lr,#-2] + adcs $h2,$h2,r3 + + ldrb r3,[lr,#-1] + orr r1,r0,r1,lsl#8 + str lr,[sp,#8] @ offload input pointer + orr r2,r1,r2,lsl#16 + add $s1,$r1,$r1,lsr#2 + orr r3,r2,r3,lsl#24 +#else + ldr r0,[lr],#16 @ load input + it hi + addhi $h4,$h4,#1 @ padbit + ldr r1,[lr,#-12] + ldr r2,[lr,#-8] + ldr r3,[lr,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + adds $h0,$h0,r0 @ accumulate input + str lr,[sp,#8] @ offload input pointer + adcs $h1,$h1,r1 + add $s1,$r1,$r1,lsr#2 + adcs $h2,$h2,r2 +#endif + add $s2,$r2,$r2,lsr#2 + adcs $h3,$h3,r3 + add $s3,$r3,$r3,lsr#2 + + umull r2,r3,$h1,$r0 + adc $h4,$h4,#0 + umull r0,r1,$h0,$r0 + umlal r2,r3,$h4,$s1 + umlal r0,r1,$h3,$s1 + ldr $r1,[sp,#20] @ reload $r1 + umlal r2,r3,$h2,$s3 + umlal r0,r1,$h1,$s3 + umlal r2,r3,$h3,$s2 + umlal r0,r1,$h2,$s2 + umlal r2,r3,$h0,$r1 + str r0,[sp,#0] @ future $h0 + mul r0,$s2,$h4 + ldr $r2,[sp,#24] @ reload $r2 + adds r2,r2,r1 @ d1+=d0>>32 + eor r1,r1,r1 + adc lr,r3,#0 @ future $h2 + str r2,[sp,#4] @ future $h1 + + mul r2,$s3,$h4 + eor r3,r3,r3 + umlal r0,r1,$h3,$s3 + ldr $r3,[sp,#28] @ reload $r3 + umlal r2,r3,$h3,$r0 + umlal r0,r1,$h2,$r0 + umlal r2,r3,$h2,$r1 + umlal r0,r1,$h1,$r1 + umlal r2,r3,$h1,$r2 + umlal r0,r1,$h0,$r2 + umlal r2,r3,$h0,$r3 + ldr $h0,[sp,#0] + mul $h4,$r0,$h4 + ldr $h1,[sp,#4] + + adds $h2,lr,r0 @ d2+=d1>>32 + ldr lr,[sp,#8] @ reload input pointer + adc r1,r1,#0 + adds $h3,r2,r1 @ d3+=d2>>32 + ldr r0,[sp,#16] @ reload end pointer + adc r3,r3,#0 + add $h4,$h4,r3 @ h4+=d3>>32 + + and r1,$h4,#-4 + and $h4,$h4,#3 + add r1,r1,r1,lsr#2 @ *=5 + adds $h0,$h0,r1 + adcs $h1,$h1,#0 + adcs $h2,$h2,#0 + adcs $h3,$h3,#0 + adc $h4,$h4,#0 + + cmp r0,lr @ done yet? + bhi .Loop + + ldr $ctx,[sp,#12] + add sp,sp,#32 + stmdb $ctx,{$h0-$h4} @ store the result + +.Lno_data: +#if __ARM_ARCH__>=5 + ldmia sp!,{r3-r11,pc} +#else + ldmia sp!,{r3-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_blocks,.-poly1305_blocks +___ +} +{ +my ($ctx,$mac,$nonce)=map("r$_",(0..2)); +my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11)); +my $g4=$ctx; + +$code.=<<___; +.type poly1305_emit,%function +.align 5 +poly1305_emit: +.Lpoly1305_emit: + stmdb sp!,{r4-r11} + + ldmia $ctx,{$h0-$h4} + +#if __ARM_ARCH__>=7 + ldr ip,[$ctx,#36] @ is_base2_26 + + adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 + mov $g1,$h1,lsr#6 + adcs $g1,$g1,$h2,lsl#20 + mov $g2,$h2,lsr#12 + adcs $g2,$g2,$h3,lsl#14 + mov $g3,$h3,lsr#18 + adcs $g3,$g3,$h4,lsl#8 + mov $g4,#0 + adc $g4,$g4,$h4,lsr#24 + + tst ip,ip + itttt ne + movne $h0,$g0 + movne $h1,$g1 + movne $h2,$g2 + movne $h3,$g3 + it ne + movne $h4,$g4 +#endif + + adds $g0,$h0,#5 @ compare to modulus + adcs $g1,$h1,#0 + adcs $g2,$h2,#0 + adcs $g3,$h3,#0 + adc $g4,$h4,#0 + tst $g4,#4 @ did it carry/borrow? + +#ifdef __thumb2__ + it ne +#endif + movne $h0,$g0 + ldr $g0,[$nonce,#0] +#ifdef __thumb2__ + it ne +#endif + movne $h1,$g1 + ldr $g1,[$nonce,#4] +#ifdef __thumb2__ + it ne +#endif + movne $h2,$g2 + ldr $g2,[$nonce,#8] +#ifdef __thumb2__ + it ne +#endif + movne $h3,$g3 + ldr $g3,[$nonce,#12] + + adds $h0,$h0,$g0 + adcs $h1,$h1,$g1 + adcs $h2,$h2,$g2 + adc $h3,$h3,$g3 + +#if __ARM_ARCH__>=7 +# ifdef __ARMEB__ + rev $h0,$h0 + rev $h1,$h1 + rev $h2,$h2 + rev $h3,$h3 +# endif + str $h0,[$mac,#0] + str $h1,[$mac,#4] + str $h2,[$mac,#8] + str $h3,[$mac,#12] +#else + strb $h0,[$mac,#0] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#4] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#8] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#12] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#1] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#5] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#9] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#13] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#2] + mov $h0,$h0,lsr#8 + strb $h1,[$mac,#6] + mov $h1,$h1,lsr#8 + strb $h2,[$mac,#10] + mov $h2,$h2,lsr#8 + strb $h3,[$mac,#14] + mov $h3,$h3,lsr#8 + + strb $h0,[$mac,#3] + strb $h1,[$mac,#7] + strb $h2,[$mac,#11] + strb $h3,[$mac,#15] +#endif + ldmia sp!,{r4-r11} +#if __ARM_ARCH__>=5 + ret @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size poly1305_emit,.-poly1305_emit +___ +{ +my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9)); +my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14)); +my ($T0,$T1,$MASK) = map("q$_",(15,4,0)); + +my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7)); + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.fpu neon + +.type poly1305_init_neon,%function +.align 5 +poly1305_init_neon: +.Lpoly1305_init_neon: + ldr r3,[$ctx,#48] @ first table element + cmp r3,#-1 @ is value impossible? + bne .Lno_init_neon + + ldr r4,[$ctx,#20] @ load key base 2^32 + ldr r5,[$ctx,#24] + ldr r6,[$ctx,#28] + ldr r7,[$ctx,#32] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + and r3,r3,#0x03ffffff + and r4,r4,#0x03ffffff + and r5,r5,#0x03ffffff + + vdup.32 $R0,r2 @ r^1 in both lanes + add r2,r3,r3,lsl#2 @ *5 + vdup.32 $R1,r3 + add r3,r4,r4,lsl#2 + vdup.32 $S1,r2 + vdup.32 $R2,r4 + add r4,r5,r5,lsl#2 + vdup.32 $S2,r3 + vdup.32 $R3,r5 + add r5,r6,r6,lsl#2 + vdup.32 $S3,r4 + vdup.32 $R4,r6 + vdup.32 $S4,r5 + + mov $zeros,#2 @ counter + +.Lsquare_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + + vmull.u32 $D0,$R0,${R0}[1] + vmull.u32 $D1,$R1,${R0}[1] + vmull.u32 $D2,$R2,${R0}[1] + vmull.u32 $D3,$R3,${R0}[1] + vmull.u32 $D4,$R4,${R0}[1] + + vmlal.u32 $D0,$R4,${S1}[1] + vmlal.u32 $D1,$R0,${R1}[1] + vmlal.u32 $D2,$R1,${R1}[1] + vmlal.u32 $D3,$R2,${R1}[1] + vmlal.u32 $D4,$R3,${R1}[1] + + vmlal.u32 $D0,$R3,${S2}[1] + vmlal.u32 $D1,$R4,${S2}[1] + vmlal.u32 $D3,$R1,${R2}[1] + vmlal.u32 $D2,$R0,${R2}[1] + vmlal.u32 $D4,$R2,${R2}[1] + + vmlal.u32 $D0,$R2,${S3}[1] + vmlal.u32 $D3,$R0,${R3}[1] + vmlal.u32 $D1,$R3,${S3}[1] + vmlal.u32 $D2,$R4,${S3}[1] + vmlal.u32 $D4,$R1,${R3}[1] + + vmlal.u32 $D3,$R4,${S4}[1] + vmlal.u32 $D0,$R1,${S4}[1] + vmlal.u32 $D1,$R2,${S4}[1] + vmlal.u32 $D2,$R3,${S4}[1] + vmlal.u32 $D4,$R0,${R4}[1] + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + @ and P. Schwabe + @ + @ H0>>+H1>>+H2>>+H3>>+H4 + @ H3>>+H4>>*5+H0>>+H1 + @ + @ Trivia. + @ + @ Result of multiplication of n-bit number by m-bit number is + @ n+m bits wide. However! Even though 2^n is a n+1-bit number, + @ m-bit number multiplied by 2^n is still n+m bits wide. + @ + @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, + @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit + @ one is n+1 bits wide. + @ + @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that + @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 + @ can be 27. However! In cases when their width exceeds 26 bits + @ they are limited by 2^26+2^6. This in turn means that *sum* + @ of the products with these values can still be viewed as sum + @ of 52-bit numbers as long as the amount of addends is not a + @ power of 2. For example, + @ + @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, + @ + @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or + @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than + @ 8 * (2^52) or 2^55. However, the value is then multiplied by + @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), + @ which is less than 32 * (2^52) or 2^57. And when processing + @ data we are looking at triple as many addends... + @ + @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and + @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the + @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while + @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 + @ instruction accepts 2x32-bit input and writes 2x64-bit result. + @ This means that result of reduction have to be compressed upon + @ loop wrap-around. This can be done in the process of reduction + @ to minimize amount of instructions [as well as amount of + @ 128-bit instructions, which benefits low-end processors], but + @ one has to watch for H2 (which is narrower than H0) and 5*H4 + @ not being wider than 58 bits, so that result of right shift + @ by 26 bits fits in 32 bits. This is also useful on x86, + @ because it allows to use paddd in place for paddq, which + @ benefits Atom, where paddq is ridiculously slow. + + vshr.u64 $T0,$D3,#26 + vmovn.i64 $D3#lo,$D3 + vshr.u64 $T1,$D0,#26 + vmovn.i64 $D0#lo,$D0 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + vbic.i32 $D0#lo,#0xfc000000 + + vshrn.u64 $T0#lo,$D4,#26 + vmovn.i64 $D4#lo,$D4 + vshr.u64 $T1,$D1,#26 + vmovn.i64 $D1#lo,$D1 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + vbic.i32 $D4#lo,#0xfc000000 + vbic.i32 $D1#lo,#0xfc000000 + + vadd.i32 $D0#lo,$D0#lo,$T0#lo + vshl.u32 $T0#lo,$T0#lo,#2 + vshrn.u64 $T1#lo,$D2,#26 + vmovn.i64 $D2#lo,$D2 + vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 + vbic.i32 $D2#lo,#0xfc000000 + + vshr.u32 $T0#lo,$D0#lo,#26 + vbic.i32 $D0#lo,#0xfc000000 + vshr.u32 $T1#lo,$D3#lo,#26 + vbic.i32 $D3#lo,#0xfc000000 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 + + subs $zeros,$zeros,#1 + beq .Lsquare_break_neon + + add $tbl0,$ctx,#(48+0*9*4) + add $tbl1,$ctx,#(48+1*9*4) + + vtrn.32 $R0,$D0#lo @ r^2:r^1 + vtrn.32 $R2,$D2#lo + vtrn.32 $R3,$D3#lo + vtrn.32 $R1,$D1#lo + vtrn.32 $R4,$D4#lo + + vshl.u32 $S2,$R2,#2 @ *5 + vshl.u32 $S3,$R3,#2 + vshl.u32 $S1,$R1,#2 + vshl.u32 $S4,$R4,#2 + vadd.i32 $S2,$S2,$R2 + vadd.i32 $S1,$S1,$R1 + vadd.i32 $S3,$S3,$R3 + vadd.i32 $S4,$S4,$R4 + + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vst1.32 {${S4}[0]},[$tbl0,:32] + vst1.32 {${S4}[1]},[$tbl1,:32] + + b .Lsquare_neon + +.align 4 +.Lsquare_break_neon: + add $tbl0,$ctx,#(48+2*4*9) + add $tbl1,$ctx,#(48+3*4*9) + + vmov $R0,$D0#lo @ r^4:r^3 + vshl.u32 $S1,$D1#lo,#2 @ *5 + vmov $R1,$D1#lo + vshl.u32 $S2,$D2#lo,#2 + vmov $R2,$D2#lo + vshl.u32 $S3,$D3#lo,#2 + vmov $R3,$D3#lo + vshl.u32 $S4,$D4#lo,#2 + vmov $R4,$D4#lo + vadd.i32 $S1,$S1,$D1#lo + vadd.i32 $S2,$S2,$D2#lo + vadd.i32 $S3,$S3,$D3#lo + vadd.i32 $S4,$S4,$D4#lo + + vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! + vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! + vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vst1.32 {${S4}[0]},[$tbl0] + vst1.32 {${S4}[1]},[$tbl1] + +.Lno_init_neon: + ret @ bx lr +.size poly1305_init_neon,.-poly1305_init_neon + +.globl poly1305_blocks_neon +.type poly1305_blocks_neon,%function +.align 5 +poly1305_blocks_neon: +.Lpoly1305_blocks_neon: + ldr ip,[$ctx,#36] @ is_base2_26 + + cmp $len,#64 + blo .Lpoly1305_blocks + + stmdb sp!,{r4-r7} + vstmdb sp!,{d8-d15} @ ABI specification says so + + tst ip,ip @ is_base2_26? + bne .Lbase2_26_neon + + stmdb sp!,{r1-r3,lr} + bl .Lpoly1305_init_neon + + ldr r4,[$ctx,#0] @ load hash value base 2^32 + ldr r5,[$ctx,#4] + ldr r6,[$ctx,#8] + ldr r7,[$ctx,#12] + ldr ip,[$ctx,#16] + + and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 + mov r3,r4,lsr#26 + veor $D0#lo,$D0#lo,$D0#lo + mov r4,r5,lsr#20 + orr r3,r3,r5,lsl#6 + veor $D1#lo,$D1#lo,$D1#lo + mov r5,r6,lsr#14 + orr r4,r4,r6,lsl#12 + veor $D2#lo,$D2#lo,$D2#lo + mov r6,r7,lsr#8 + orr r5,r5,r7,lsl#18 + veor $D3#lo,$D3#lo,$D3#lo + and r3,r3,#0x03ffffff + orr r6,r6,ip,lsl#24 + veor $D4#lo,$D4#lo,$D4#lo + and r4,r4,#0x03ffffff + mov r1,#1 + and r5,r5,#0x03ffffff + str r1,[$ctx,#36] @ set is_base2_26 + + vmov.32 $D0#lo[0],r2 + vmov.32 $D1#lo[0],r3 + vmov.32 $D2#lo[0],r4 + vmov.32 $D3#lo[0],r5 + vmov.32 $D4#lo[0],r6 + adr $zeros,.Lzeros + + ldmia sp!,{r1-r3,lr} + b .Lhash_loaded + +.align 4 +.Lbase2_26_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ load hash value + + veor $D0#lo,$D0#lo,$D0#lo + veor $D1#lo,$D1#lo,$D1#lo + veor $D2#lo,$D2#lo,$D2#lo + veor $D3#lo,$D3#lo,$D3#lo + veor $D4#lo,$D4#lo,$D4#lo + vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! + adr $zeros,.Lzeros + vld1.32 {$D4#lo[0]},[$ctx] + sub $ctx,$ctx,#16 @ rewind + +.Lhash_loaded: + add $in2,$inp,#32 + mov $padbit,$padbit,lsl#24 + tst $len,#31 + beq .Leven + + vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]! + vmov.32 $H4#lo[0],$padbit + sub $len,$len,#16 + add $in2,$inp,#32 + +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H3,$H3 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 +# endif + vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26 + vshl.u32 $H3#lo,$H3#lo,#18 + + vsri.u32 $H3#lo,$H2#lo,#14 + vshl.u32 $H2#lo,$H2#lo,#12 + vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi + + vbic.i32 $H3#lo,#0xfc000000 + vsri.u32 $H2#lo,$H1#lo,#20 + vshl.u32 $H1#lo,$H1#lo,#6 + + vbic.i32 $H2#lo,#0xfc000000 + vsri.u32 $H1#lo,$H0#lo,#26 + vadd.i32 $H3#hi,$H3#lo,$D3#lo + + vbic.i32 $H0#lo,#0xfc000000 + vbic.i32 $H1#lo,#0xfc000000 + vadd.i32 $H2#hi,$H2#lo,$D2#lo + + vadd.i32 $H0#hi,$H0#lo,$D0#lo + vadd.i32 $H1#hi,$H1#lo,$D1#lo + + mov $tbl1,$zeros + add $tbl0,$ctx,#48 + + cmp $len,$len + b .Long_tail + +.align 4 +.Leven: + subs $len,$len,#64 + it lo + movlo $in2,$zeros + + vmov.i32 $H4,#1<<24 @ padbit, yes, always + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] + add $inp,$inp,#64 + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) + add $in2,$in2,#64 + itt hi + addhi $tbl1,$ctx,#(48+1*9*4) + addhi $tbl0,$ctx,#(48+3*9*4) + +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H3,$H3 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 +# endif + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 + vshl.u32 $H3,$H3,#18 + + vsri.u32 $H3,$H2,#14 + vshl.u32 $H2,$H2,#12 + + vbic.i32 $H3,#0xfc000000 + vsri.u32 $H2,$H1,#20 + vshl.u32 $H1,$H1,#6 + + vbic.i32 $H2,#0xfc000000 + vsri.u32 $H1,$H0,#26 + + vbic.i32 $H0,#0xfc000000 + vbic.i32 $H1,#0xfc000000 + + bls .Lskip_loop + + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + b .Loop_neon + +.align 5 +.Loop_neon: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + @ \___________________/ + @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + @ \___________________/ \____________________/ + @ + @ Note that we start with inp[2:3]*r^2. This is because it + @ doesn't depend on reduction in previous iteration. + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ inp[2:3]*r^2 + + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1] + vmull.u32 $D2,$H2#hi,${R0}[1] + vadd.i32 $H0#lo,$H0#lo,$D0#lo + vmull.u32 $D0,$H0#hi,${R0}[1] + vadd.i32 $H3#lo,$H3#lo,$D3#lo + vmull.u32 $D3,$H3#hi,${R0}[1] + vmlal.u32 $D2,$H1#hi,${R1}[1] + vadd.i32 $H1#lo,$H1#lo,$D1#lo + vmull.u32 $D1,$H1#hi,${R0}[1] + + vadd.i32 $H4#lo,$H4#lo,$D4#lo + vmull.u32 $D4,$H4#hi,${R0}[1] + subs $len,$len,#64 + vmlal.u32 $D0,$H4#hi,${S1}[1] + it lo + movlo $in2,$zeros + vmlal.u32 $D3,$H2#hi,${R1}[1] + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D1,$H0#hi,${R1}[1] + vmlal.u32 $D4,$H3#hi,${R1}[1] + + vmlal.u32 $D0,$H3#hi,${S2}[1] + vmlal.u32 $D3,$H1#hi,${R2}[1] + vmlal.u32 $D4,$H2#hi,${R2}[1] + vmlal.u32 $D1,$H4#hi,${S2}[1] + vmlal.u32 $D2,$H0#hi,${R2}[1] + + vmlal.u32 $D3,$H0#hi,${R3}[1] + vmlal.u32 $D0,$H2#hi,${S3}[1] + vmlal.u32 $D4,$H1#hi,${R3}[1] + vmlal.u32 $D1,$H3#hi,${S3}[1] + vmlal.u32 $D2,$H4#hi,${S3}[1] + + vmlal.u32 $D3,$H4#hi,${S4}[1] + vmlal.u32 $D0,$H1#hi,${S4}[1] + vmlal.u32 $D4,$H0#hi,${R4}[1] + vmlal.u32 $D1,$H2#hi,${S4}[1] + vmlal.u32 $D2,$H3#hi,${S4}[1] + + vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) + add $in2,$in2,#64 + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4 and accumulate + + vmlal.u32 $D3,$H3#lo,${R0}[0] + vmlal.u32 $D0,$H0#lo,${R0}[0] + vmlal.u32 $D4,$H4#lo,${R0}[0] + vmlal.u32 $D1,$H1#lo,${R0}[0] + vmlal.u32 $D2,$H2#lo,${R0}[0] + vld1.32 ${S4}[0],[$tbl0,:32] + + vmlal.u32 $D3,$H2#lo,${R1}[0] + vmlal.u32 $D0,$H4#lo,${S1}[0] + vmlal.u32 $D4,$H3#lo,${R1}[0] + vmlal.u32 $D1,$H0#lo,${R1}[0] + vmlal.u32 $D2,$H1#lo,${R1}[0] + + vmlal.u32 $D3,$H1#lo,${R2}[0] + vmlal.u32 $D0,$H3#lo,${S2}[0] + vmlal.u32 $D4,$H2#lo,${R2}[0] + vmlal.u32 $D1,$H4#lo,${S2}[0] + vmlal.u32 $D2,$H0#lo,${R2}[0] + + vmlal.u32 $D3,$H0#lo,${R3}[0] + vmlal.u32 $D0,$H2#lo,${S3}[0] + vmlal.u32 $D4,$H1#lo,${R3}[0] + vmlal.u32 $D1,$H3#lo,${S3}[0] + vmlal.u32 $D3,$H4#lo,${S4}[0] + + vmlal.u32 $D2,$H4#lo,${S3}[0] + vmlal.u32 $D0,$H1#lo,${S4}[0] + vmlal.u32 $D4,$H0#lo,${R4}[0] + vmov.i32 $H4,#1<<24 @ padbit, yes, always + vmlal.u32 $D1,$H2#lo,${S4}[0] + vmlal.u32 $D2,$H3#lo,${S4}[0] + + vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] + add $inp,$inp,#64 +# ifdef __ARMEB__ + vrev32.8 $H0,$H0 + vrev32.8 $H1,$H1 + vrev32.8 $H2,$H2 + vrev32.8 $H3,$H3 +# endif + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction interleaved with base 2^32 -> base 2^26 of + @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4. + + vshr.u64 $T0,$D3,#26 + vmovn.i64 $D3#lo,$D3 + vshr.u64 $T1,$D0,#26 + vmovn.i64 $D0#lo,$D0 + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vbic.i32 $D3#lo,#0xfc000000 + vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + vshl.u32 $H3,$H3,#18 + vbic.i32 $D0#lo,#0xfc000000 + + vshrn.u64 $T0#lo,$D4,#26 + vmovn.i64 $D4#lo,$D4 + vshr.u64 $T1,$D1,#26 + vmovn.i64 $D1#lo,$D1 + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + vsri.u32 $H3,$H2,#14 + vbic.i32 $D4#lo,#0xfc000000 + vshl.u32 $H2,$H2,#12 + vbic.i32 $D1#lo,#0xfc000000 + + vadd.i32 $D0#lo,$D0#lo,$T0#lo + vshl.u32 $T0#lo,$T0#lo,#2 + vbic.i32 $H3,#0xfc000000 + vshrn.u64 $T1#lo,$D2,#26 + vmovn.i64 $D2#lo,$D2 + vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec] + vsri.u32 $H2,$H1,#20 + vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 + vshl.u32 $H1,$H1,#6 + vbic.i32 $D2#lo,#0xfc000000 + vbic.i32 $H2,#0xfc000000 + + vshrn.u64 $T0#lo,$D0,#26 @ re-narrow + vmovn.i64 $D0#lo,$D0 + vsri.u32 $H1,$H0,#26 + vbic.i32 $H0,#0xfc000000 + vshr.u32 $T1#lo,$D3#lo,#26 + vbic.i32 $D3#lo,#0xfc000000 + vbic.i32 $D0#lo,#0xfc000000 + vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 + vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 + vbic.i32 $H1,#0xfc000000 + + bhi .Loop_neon + +.Lskip_loop: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + add $tbl1,$ctx,#(48+0*9*4) + add $tbl0,$ctx,#(48+1*9*4) + adds $len,$len,#32 + it ne + movne $len,#0 + bne .Long_tail + + vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi + vadd.i32 $H0#hi,$H0#lo,$D0#lo + vadd.i32 $H3#hi,$H3#lo,$D3#lo + vadd.i32 $H1#hi,$H1#lo,$D1#lo + vadd.i32 $H4#hi,$H4#lo,$D4#lo + +.Long_tail: + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2 + + vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant + vmull.u32 $D2,$H2#hi,$R0 + vadd.i32 $H0#lo,$H0#lo,$D0#lo + vmull.u32 $D0,$H0#hi,$R0 + vadd.i32 $H3#lo,$H3#lo,$D3#lo + vmull.u32 $D3,$H3#hi,$R0 + vadd.i32 $H1#lo,$H1#lo,$D1#lo + vmull.u32 $D1,$H1#hi,$R0 + vadd.i32 $H4#lo,$H4#lo,$D4#lo + vmull.u32 $D4,$H4#hi,$R0 + + vmlal.u32 $D0,$H4#hi,$S1 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vmlal.u32 $D3,$H2#hi,$R1 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vmlal.u32 $D1,$H0#hi,$R1 + vmlal.u32 $D4,$H3#hi,$R1 + vmlal.u32 $D2,$H1#hi,$R1 + + vmlal.u32 $D3,$H1#hi,$R2 + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D0,$H3#hi,$S2 + vld1.32 ${S4}[0],[$tbl0,:32] + vmlal.u32 $D4,$H2#hi,$R2 + vmlal.u32 $D1,$H4#hi,$S2 + vmlal.u32 $D2,$H0#hi,$R2 + + vmlal.u32 $D3,$H0#hi,$R3 + it ne + addne $tbl1,$ctx,#(48+2*9*4) + vmlal.u32 $D0,$H2#hi,$S3 + it ne + addne $tbl0,$ctx,#(48+3*9*4) + vmlal.u32 $D4,$H1#hi,$R3 + vmlal.u32 $D1,$H3#hi,$S3 + vmlal.u32 $D2,$H4#hi,$S3 + + vmlal.u32 $D3,$H4#hi,$S4 + vorn $MASK,$MASK,$MASK @ all-ones, can be redundant + vmlal.u32 $D0,$H1#hi,$S4 + vshr.u64 $MASK,$MASK,#38 + vmlal.u32 $D4,$H0#hi,$R4 + vmlal.u32 $D1,$H2#hi,$S4 + vmlal.u32 $D2,$H3#hi,$S4 + + beq .Lshort_tail + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ (hash+inp[0:1])*r^4:r^3 and accumulate + + vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3 + vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 + + vmlal.u32 $D2,$H2#lo,$R0 + vmlal.u32 $D0,$H0#lo,$R0 + vmlal.u32 $D3,$H3#lo,$R0 + vmlal.u32 $D1,$H1#lo,$R0 + vmlal.u32 $D4,$H4#lo,$R0 + + vmlal.u32 $D0,$H4#lo,$S1 + vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! + vmlal.u32 $D3,$H2#lo,$R1 + vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! + vmlal.u32 $D1,$H0#lo,$R1 + vmlal.u32 $D4,$H3#lo,$R1 + vmlal.u32 $D2,$H1#lo,$R1 + + vmlal.u32 $D3,$H1#lo,$R2 + vld1.32 ${S4}[1],[$tbl1,:32] + vmlal.u32 $D0,$H3#lo,$S2 + vld1.32 ${S4}[0],[$tbl0,:32] + vmlal.u32 $D4,$H2#lo,$R2 + vmlal.u32 $D1,$H4#lo,$S2 + vmlal.u32 $D2,$H0#lo,$R2 + + vmlal.u32 $D3,$H0#lo,$R3 + vmlal.u32 $D0,$H2#lo,$S3 + vmlal.u32 $D4,$H1#lo,$R3 + vmlal.u32 $D1,$H3#lo,$S3 + vmlal.u32 $D2,$H4#lo,$S3 + + vmlal.u32 $D3,$H4#lo,$S4 + vorn $MASK,$MASK,$MASK @ all-ones + vmlal.u32 $D0,$H1#lo,$S4 + vshr.u64 $MASK,$MASK,#38 + vmlal.u32 $D4,$H0#lo,$R4 + vmlal.u32 $D1,$H2#lo,$S4 + vmlal.u32 $D2,$H3#lo,$S4 + +.Lshort_tail: + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ horizontal addition + + vadd.i64 $D3#lo,$D3#lo,$D3#hi + vadd.i64 $D0#lo,$D0#lo,$D0#hi + vadd.i64 $D4#lo,$D4#lo,$D4#hi + vadd.i64 $D1#lo,$D1#lo,$D1#hi + vadd.i64 $D2#lo,$D2#lo,$D2#hi + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ lazy reduction, but without narrowing + + vshr.u64 $T0,$D3,#26 + vand.i64 $D3,$D3,$MASK + vshr.u64 $T1,$D0,#26 + vand.i64 $D0,$D0,$MASK + vadd.i64 $D4,$D4,$T0 @ h3 -> h4 + vadd.i64 $D1,$D1,$T1 @ h0 -> h1 + + vshr.u64 $T0,$D4,#26 + vand.i64 $D4,$D4,$MASK + vshr.u64 $T1,$D1,#26 + vand.i64 $D1,$D1,$MASK + vadd.i64 $D2,$D2,$T1 @ h1 -> h2 + + vadd.i64 $D0,$D0,$T0 + vshl.u64 $T0,$T0,#2 + vshr.u64 $T1,$D2,#26 + vand.i64 $D2,$D2,$MASK + vadd.i64 $D0,$D0,$T0 @ h4 -> h0 + vadd.i64 $D3,$D3,$T1 @ h2 -> h3 + + vshr.u64 $T0,$D0,#26 + vand.i64 $D0,$D0,$MASK + vshr.u64 $T1,$D3,#26 + vand.i64 $D3,$D3,$MASK + vadd.i64 $D1,$D1,$T0 @ h0 -> h1 + vadd.i64 $D4,$D4,$T1 @ h3 -> h4 + + cmp $len,#0 + bne .Leven + + @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + @ store hash value + + vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! + vst1.32 {$D4#lo[0]},[$ctx] + + vldmia sp!,{d8-d15} @ epilogue + ldmia sp!,{r4-r7} + ret @ bx lr +.size poly1305_blocks_neon,.-poly1305_blocks_neon + +.align 5 +.Lzeros: +.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +#ifndef __KERNEL__ +.LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else +.word OPENSSL_armcap_P-.Lpoly1305_init +# endif +.comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P +#endif +#endif +___ +} } +$code.=<<___; +.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm" +.align 2 +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} +close STDOUT; # enforce flush diff --git a/lib/crypto/arm/poly1305-glue.c b/lib/crypto/arm/poly1305-glue.c new file mode 100644 index 000000000000..2d86c78af883 --- /dev/null +++ b/lib/crypto/arm/poly1305-glue.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM + * + * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org> + */ + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> +#include <crypto/internal/poly1305.h> +#include <linux/cpufeature.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/unaligned.h> + +asmlinkage void poly1305_block_init_arch( + struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); +asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) +{ + len = round_down(len, POLY1305_BLOCK_SIZE); + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && likely(may_use_simd())) { + do { + unsigned int todo = min_t(unsigned int, len, SZ_4K); + + kernel_neon_begin(); + poly1305_blocks_neon(state, src, todo, padbit); + kernel_neon_end(); + + len -= todo; + src += todo; + } while (len); + } else + poly1305_blocks_arm(state, src, len, padbit); +} +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); + +bool poly1305_is_arch_optimized(void) +{ + /* We always can use at least the ARM scalar implementation. */ + return true; +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +static int __init arm_poly1305_mod_init(void) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + (elf_hwcap & HWCAP_NEON)) + static_branch_enable(&have_neon); + return 0; +} +subsys_initcall(arm_poly1305_mod_init); + +static void __exit arm_poly1305_mod_exit(void) +{ +} +module_exit(arm_poly1305_mod_exit); + +MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/arm/sha1-armv4-large.S b/lib/crypto/arm/sha1-armv4-large.S new file mode 100644 index 000000000000..1c8b685149f2 --- /dev/null +++ b/lib/crypto/arm/sha1-armv4-large.S @@ -0,0 +1,507 @@ +#define __ARM_ARCH__ __LINUX_ARM_ARCH__ +@ SPDX-License-Identifier: GPL-2.0 + +@ This code is taken from the OpenSSL project but the author (Andy Polyakov) +@ has relicensed it under the GPLv2. Therefore this program is free software; +@ you can redistribute it and/or modify it under the terms of the GNU General +@ Public License version 2 as published by the Free Software Foundation. +@ +@ The original headers, including the original license headers, are +@ included below for completeness. + +@ ==================================================================== +@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see https://www.openssl.org/~appro/cryptogams/. +@ ==================================================================== + +@ sha1_block procedure for ARMv4. +@ +@ January 2007. + +@ Size/performance trade-off +@ ==================================================================== +@ impl size in bytes comp cycles[*] measured performance +@ ==================================================================== +@ thumb 304 3212 4420 +@ armv4-small 392/+29% 1958/+64% 2250/+96% +@ armv4-compact 740/+89% 1552/+26% 1840/+22% +@ armv4-large 1420/+92% 1307/+19% 1370/+34%[***] +@ full unroll ~5100/+260% ~1260/+4% ~1300/+5% +@ ==================================================================== +@ thumb = same as 'small' but in Thumb instructions[**] and +@ with recurring code in two private functions; +@ small = detached Xload/update, loops are folded; +@ compact = detached Xload/update, 5x unroll; +@ large = interleaved Xload/update, 5x unroll; +@ full unroll = interleaved Xload/update, full unroll, estimated[!]; +@ +@ [*] Manually counted instructions in "grand" loop body. Measured +@ performance is affected by prologue and epilogue overhead, +@ i-cache availability, branch penalties, etc. +@ [**] While each Thumb instruction is twice smaller, they are not as +@ diverse as ARM ones: e.g., there are only two arithmetic +@ instructions with 3 arguments, no [fixed] rotate, addressing +@ modes are limited. As result it takes more instructions to do +@ the same job in Thumb, therefore the code is never twice as +@ small and always slower. +@ [***] which is also ~35% better than compiler generated code. Dual- +@ issue Cortex A8 core was measured to process input block in +@ ~990 cycles. + +@ August 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 13% improvement on +@ Cortex A8 core and in absolute terms ~870 cycles per input block +@ [or 13.6 cycles per byte]. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 10% +@ improvement on Cortex A8 core and 12.2 cycles per byte. + +#include <linux/linkage.h> + +.text + +.align 2 +ENTRY(sha1_block_data_order) + stmdb sp!,{r4-r12,lr} + add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 + ldmia r0,{r3,r4,r5,r6,r7} +.Lloop: + ldr r8,.LK_00_19 + mov r14,sp + sub sp,sp,#15*4 + mov r5,r5,ror#30 + mov r6,r6,ror#30 + mov r7,r7,ror#30 @ [6] +.L_00_15: +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r6,r8,r6,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r4,r5 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r6,r8,r6,ror#2 @ E+=K_00_19 + eor r10,r4,r5 @ F_xx_xx + add r6,r6,r7,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r3,r10,ror#2 + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r6,r6,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r5,r8,r5,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r3,r4 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r5,r8,r5,ror#2 @ E+=K_00_19 + eor r10,r3,r4 @ F_xx_xx + add r5,r5,r6,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r7,r10,ror#2 + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r5,r5,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r4,r8,r4,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r7,r3 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r4,r8,r4,ror#2 @ E+=K_00_19 + eor r10,r7,r3 @ F_xx_xx + add r4,r4,r5,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r6,r10,ror#2 + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r4,r4,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r3,r8,r3,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r6,r7 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r3,r8,r3,ror#2 @ E+=K_00_19 + eor r10,r6,r7 @ F_xx_xx + add r3,r3,r4,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r5,r10,ror#2 + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r3,r3,r10 @ E+=F_00_19(B,C,D) + cmp r14,sp + bne .L_00_15 @ [((11+4)*5+2)*3] + sub sp,sp,#25*4 +#if __ARM_ARCH__<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + add r6,r6,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + add r5,r5,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + add r4,r4,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + add r3,r3,r10 @ E+=F_00_19(B,C,D) + + ldr r8,.LK_20_39 @ [+15+16*4] + cmn sp,#0 @ [+3], clear carry to denote 20_39 +.L_20_39_or_60_79: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r4,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_20_39(B,C,D) + ARM( teq r14,sp ) @ preserve carry + THUMB( mov r11,sp ) + THUMB( teq r14,r11 ) @ preserve carry + bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] + bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes + + ldr r8,.LK_40_59 + sub sp,sp,#20*4 @ [+2] +.L_40_59: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r4,r10,ror#2 @ F_xx_xx + and r11,r5,r6 @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_40_59(B,C,D) + add r7,r7,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + and r11,r4,r5 @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_40_59(B,C,D) + add r6,r6,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r7,r10,ror#2 @ F_xx_xx + and r11,r3,r4 @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_40_59(B,C,D) + add r5,r5,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r6,r10,ror#2 @ F_xx_xx + and r11,r7,r3 @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_40_59(B,C,D) + add r4,r4,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r5,r10,ror#2 @ F_xx_xx + and r11,r6,r7 @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_40_59(B,C,D) + add r3,r3,r11,ror#2 + cmp r14,sp + bne .L_40_59 @ [+((12+5)*5+2)*4] + + ldr r8,.LK_60_79 + sub sp,sp,#20*4 + cmp sp,#0 @ set carry to denote 60_79 + b .L_20_39_or_60_79 @ [+4], spare 300 bytes +.L_done: + add sp,sp,#80*4 @ "deallocate" stack frame + ldmia r0,{r8,r9,r10,r11,r12} + add r3,r8,r3 + add r4,r9,r4 + add r5,r10,r5,ror#2 + add r6,r11,r6,ror#2 + add r7,r12,r7,ror#2 + stmia r0,{r3,r4,r5,r6,r7} + teq r1,r2 + bne .Lloop @ [+18], total 1307 + + ldmia sp!,{r4-r12,pc} +.align 2 +.LK_00_19: .word 0x5a827999 +.LK_20_39: .word 0x6ed9eba1 +.LK_40_59: .word 0x8f1bbcdc +.LK_60_79: .word 0xca62c1d6 +ENDPROC(sha1_block_data_order) +.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>" +.align 2 diff --git a/lib/crypto/arm/sha1-armv7-neon.S b/lib/crypto/arm/sha1-armv7-neon.S new file mode 100644 index 000000000000..6edba3ab62e8 --- /dev/null +++ b/lib/crypto/arm/sha1-armv7-neon.S @@ -0,0 +1,633 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function + * + * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +.syntax unified +.fpu neon + +.text + + +/* Context structure */ + +#define state_h0 0 +#define state_h1 4 +#define state_h2 8 +#define state_h3 12 +#define state_h4 16 + + +/* Constants */ + +#define K1 0x5A827999 +#define K2 0x6ED9EBA1 +#define K3 0x8F1BBCDC +#define K4 0xCA62C1D6 +.align 4 +.LK_VEC: +.LK1: .long K1, K1, K1, K1 +.LK2: .long K2, K2, K2, K2 +.LK3: .long K3, K3, K3, K3 +.LK4: .long K4, K4, K4, K4 + + +/* Register macros */ + +#define RSTATE r0 +#define RDATA r1 +#define RNBLKS r2 +#define ROLDSTACK r3 +#define RWK lr + +#define _a r4 +#define _b r5 +#define _c r6 +#define _d r7 +#define _e r8 + +#define RT0 r9 +#define RT1 r10 +#define RT2 r11 +#define RT3 r12 + +#define W0 q0 +#define W1 q7 +#define W2 q2 +#define W3 q3 +#define W4 q4 +#define W5 q6 +#define W6 q5 +#define W7 q1 + +#define tmp0 q8 +#define tmp1 q9 +#define tmp2 q10 +#define tmp3 q11 + +#define qK1 q12 +#define qK2 q13 +#define qK3 q14 +#define qK4 q15 + +#ifdef CONFIG_CPU_BIG_ENDIAN +#define ARM_LE(code...) +#else +#define ARM_LE(code...) code +#endif + +/* Round function macros. */ + +#define WK_offs(i) (((i) & 15) * 4) + +#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ldr RT3, [sp, WK_offs(i)]; \ + pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + bic RT0, d, b; \ + add e, e, a, ror #(32 - 5); \ + and RT1, c, b; \ + pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add RT0, RT0, RT3; \ + add e, e, RT1; \ + ror b, #(32 - 30); \ + pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, e, RT0; + +#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ldr RT3, [sp, WK_offs(i)]; \ + pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + eor RT0, d, b; \ + add e, e, a, ror #(32 - 5); \ + eor RT0, RT0, c; \ + pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, e, RT3; \ + ror b, #(32 - 30); \ + pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, e, RT0; \ + +#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ldr RT3, [sp, WK_offs(i)]; \ + pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + eor RT0, b, c; \ + and RT1, b, c; \ + add e, e, a, ror #(32 - 5); \ + pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + and RT0, RT0, d; \ + add RT1, RT1, RT3; \ + add e, e, RT0; \ + ror b, #(32 - 30); \ + pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, e, RT1; + +#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) + +#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) + +#define R(a,b,c,d,e,f,i) \ + _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\ + W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) + +#define dummy(...) + + +/* Input expansion macros. */ + +/********* Precalc macros for rounds 0-15 *************************************/ + +#define W_PRECALC_00_15() \ + add RWK, sp, #(WK_offs(0)); \ + \ + vld1.32 {W0, W7}, [RDATA]!; \ + ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \ + vld1.32 {W6, W5}, [RDATA]!; \ + vadd.u32 tmp0, W0, curK; \ + ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \ + ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \ + vadd.u32 tmp1, W7, curK; \ + ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \ + vadd.u32 tmp2, W6, curK; \ + vst1.32 {tmp0, tmp1}, [RWK]!; \ + vadd.u32 tmp3, W5, curK; \ + vst1.32 {tmp2, tmp3}, [RWK]; \ + +#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vld1.32 {W0, W7}, [RDATA]!; \ + +#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + add RWK, sp, #(WK_offs(0)); \ + +#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \ + +#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vld1.32 {W6, W5}, [RDATA]!; \ + +#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp0, W0, curK; \ + +#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \ + +#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \ + +#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp1, W7, curK; \ + +#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \ + +#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp2, W6, curK; \ + +#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vst1.32 {tmp0, tmp1}, [RWK]!; \ + +#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp3, W5, curK; \ + +#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vst1.32 {tmp2, tmp3}, [RWK]; \ + + +/********* Precalc macros for rounds 16-31 ************************************/ + +#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor tmp0, tmp0; \ + vext.8 W, W_m16, W_m12, #8; \ + +#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + add RWK, sp, #(WK_offs(i)); \ + vext.8 tmp0, W_m04, tmp0, #4; \ + +#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor tmp0, tmp0, W_m16; \ + veor.32 W, W, W_m08; \ + +#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor tmp1, tmp1; \ + veor W, W, tmp0; \ + +#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vshl.u32 tmp0, W, #1; \ + +#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vext.8 tmp1, tmp1, W, #(16-12); \ + vshr.u32 W, W, #31; \ + +#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vorr tmp0, tmp0, W; \ + vshr.u32 W, tmp1, #30; \ + +#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vshl.u32 tmp1, tmp1, #2; \ + +#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor tmp0, tmp0, W; \ + +#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor W, tmp0, tmp1; \ + +#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp0, W, curK; \ + +#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vst1.32 {tmp0}, [RWK]; + + +/********* Precalc macros for rounds 32-79 ************************************/ + +#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor W, W_m28; \ + +#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vext.8 tmp0, W_m08, W_m04, #8; \ + +#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor W, W_m16; \ + +#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + veor W, tmp0; \ + +#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + add RWK, sp, #(WK_offs(i&~3)); \ + +#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vshl.u32 tmp1, W, #2; \ + +#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vshr.u32 tmp0, W, #30; \ + +#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vorr W, tmp0, tmp1; \ + +#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vadd.u32 tmp0, W, curK; \ + +#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + vst1.32 {tmp0}, [RWK]; + + +/* + * Transform nblocks*64 bytes (nblocks*16 32-bit words) at DATA. + * + * void sha1_transform_neon(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ +.align 3 +ENTRY(sha1_transform_neon) + /* input: + * r0: state + * r1: data (64*nblocks bytes) + * r2: nblocks + */ + + cmp RNBLKS, #0; + beq .Ldo_nothing; + + push {r4-r12, lr}; + /*vpush {q4-q7};*/ + + adr RT3, .LK_VEC; + + mov ROLDSTACK, sp; + + /* Align stack. */ + sub RT0, sp, #(16*4); + and RT0, #(~(16-1)); + mov sp, RT0; + + vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */ + + /* Get the values of the chaining variables. */ + ldm RSTATE, {_a-_e}; + + vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */ + +#undef curK +#define curK qK1 + /* Precalc 0-15. */ + W_PRECALC_00_15(); + +.Loop: + /* Transform 0-15 + Precalc 16-31. */ + _R( _a, _b, _c, _d, _e, F1, 0, + WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16, + W4, W5, W6, W7, W0, _, _, _ ); + _R( _e, _a, _b, _c, _d, F1, 1, + WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16, + W4, W5, W6, W7, W0, _, _, _ ); + _R( _d, _e, _a, _b, _c, F1, 2, + WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16, + W4, W5, W6, W7, W0, _, _, _ ); + _R( _c, _d, _e, _a, _b, F1, 3, + WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16, + W4, W5, W6, W7, W0, _, _, _ ); + +#undef curK +#define curK qK2 + _R( _b, _c, _d, _e, _a, F1, 4, + WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20, + W3, W4, W5, W6, W7, _, _, _ ); + _R( _a, _b, _c, _d, _e, F1, 5, + WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20, + W3, W4, W5, W6, W7, _, _, _ ); + _R( _e, _a, _b, _c, _d, F1, 6, + WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20, + W3, W4, W5, W6, W7, _, _, _ ); + _R( _d, _e, _a, _b, _c, F1, 7, + WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20, + W3, W4, W5, W6, W7, _, _, _ ); + + _R( _c, _d, _e, _a, _b, F1, 8, + WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24, + W2, W3, W4, W5, W6, _, _, _ ); + _R( _b, _c, _d, _e, _a, F1, 9, + WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24, + W2, W3, W4, W5, W6, _, _, _ ); + _R( _a, _b, _c, _d, _e, F1, 10, + WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24, + W2, W3, W4, W5, W6, _, _, _ ); + _R( _e, _a, _b, _c, _d, F1, 11, + WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24, + W2, W3, W4, W5, W6, _, _, _ ); + + _R( _d, _e, _a, _b, _c, F1, 12, + WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28, + W1, W2, W3, W4, W5, _, _, _ ); + _R( _c, _d, _e, _a, _b, F1, 13, + WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28, + W1, W2, W3, W4, W5, _, _, _ ); + _R( _b, _c, _d, _e, _a, F1, 14, + WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28, + W1, W2, W3, W4, W5, _, _, _ ); + _R( _a, _b, _c, _d, _e, F1, 15, + WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28, + W1, W2, W3, W4, W5, _, _, _ ); + + /* Transform 16-63 + Precalc 32-79. */ + _R( _e, _a, _b, _c, _d, F1, 16, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _d, _e, _a, _b, _c, F1, 17, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _c, _d, _e, _a, _b, F1, 18, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _b, _c, _d, _e, _a, F1, 19, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32, + W0, W1, W2, W3, W4, W5, W6, W7); + + _R( _a, _b, _c, _d, _e, F2, 20, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _e, _a, _b, _c, _d, F2, 21, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _d, _e, _a, _b, _c, F2, 22, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _c, _d, _e, _a, _b, F2, 23, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36, + W7, W0, W1, W2, W3, W4, W5, W6); + +#undef curK +#define curK qK3 + _R( _b, _c, _d, _e, _a, F2, 24, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _a, _b, _c, _d, _e, F2, 25, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _e, _a, _b, _c, _d, F2, 26, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _d, _e, _a, _b, _c, F2, 27, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40, + W6, W7, W0, W1, W2, W3, W4, W5); + + _R( _c, _d, _e, _a, _b, F2, 28, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _b, _c, _d, _e, _a, F2, 29, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _a, _b, _c, _d, _e, F2, 30, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _e, _a, _b, _c, _d, F2, 31, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44, + W5, W6, W7, W0, W1, W2, W3, W4); + + _R( _d, _e, _a, _b, _c, F2, 32, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48, + W4, W5, W6, W7, W0, W1, W2, W3); + _R( _c, _d, _e, _a, _b, F2, 33, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48, + W4, W5, W6, W7, W0, W1, W2, W3); + _R( _b, _c, _d, _e, _a, F2, 34, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48, + W4, W5, W6, W7, W0, W1, W2, W3); + _R( _a, _b, _c, _d, _e, F2, 35, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48, + W4, W5, W6, W7, W0, W1, W2, W3); + + _R( _e, _a, _b, _c, _d, F2, 36, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52, + W3, W4, W5, W6, W7, W0, W1, W2); + _R( _d, _e, _a, _b, _c, F2, 37, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52, + W3, W4, W5, W6, W7, W0, W1, W2); + _R( _c, _d, _e, _a, _b, F2, 38, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52, + W3, W4, W5, W6, W7, W0, W1, W2); + _R( _b, _c, _d, _e, _a, F2, 39, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52, + W3, W4, W5, W6, W7, W0, W1, W2); + + _R( _a, _b, _c, _d, _e, F3, 40, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56, + W2, W3, W4, W5, W6, W7, W0, W1); + _R( _e, _a, _b, _c, _d, F3, 41, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56, + W2, W3, W4, W5, W6, W7, W0, W1); + _R( _d, _e, _a, _b, _c, F3, 42, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56, + W2, W3, W4, W5, W6, W7, W0, W1); + _R( _c, _d, _e, _a, _b, F3, 43, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56, + W2, W3, W4, W5, W6, W7, W0, W1); + +#undef curK +#define curK qK4 + _R( _b, _c, _d, _e, _a, F3, 44, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60, + W1, W2, W3, W4, W5, W6, W7, W0); + _R( _a, _b, _c, _d, _e, F3, 45, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60, + W1, W2, W3, W4, W5, W6, W7, W0); + _R( _e, _a, _b, _c, _d, F3, 46, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60, + W1, W2, W3, W4, W5, W6, W7, W0); + _R( _d, _e, _a, _b, _c, F3, 47, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60, + W1, W2, W3, W4, W5, W6, W7, W0); + + _R( _c, _d, _e, _a, _b, F3, 48, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _b, _c, _d, _e, _a, F3, 49, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _a, _b, _c, _d, _e, F3, 50, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64, + W0, W1, W2, W3, W4, W5, W6, W7); + _R( _e, _a, _b, _c, _d, F3, 51, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64, + W0, W1, W2, W3, W4, W5, W6, W7); + + _R( _d, _e, _a, _b, _c, F3, 52, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _c, _d, _e, _a, _b, F3, 53, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _b, _c, _d, _e, _a, F3, 54, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68, + W7, W0, W1, W2, W3, W4, W5, W6); + _R( _a, _b, _c, _d, _e, F3, 55, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68, + W7, W0, W1, W2, W3, W4, W5, W6); + + _R( _e, _a, _b, _c, _d, F3, 56, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _d, _e, _a, _b, _c, F3, 57, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _c, _d, _e, _a, _b, F3, 58, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72, + W6, W7, W0, W1, W2, W3, W4, W5); + _R( _b, _c, _d, _e, _a, F3, 59, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72, + W6, W7, W0, W1, W2, W3, W4, W5); + + subs RNBLKS, #1; + + _R( _a, _b, _c, _d, _e, F4, 60, + WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _e, _a, _b, _c, _d, F4, 61, + WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _d, _e, _a, _b, _c, F4, 62, + WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76, + W5, W6, W7, W0, W1, W2, W3, W4); + _R( _c, _d, _e, _a, _b, F4, 63, + WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76, + W5, W6, W7, W0, W1, W2, W3, W4); + + beq .Lend; + + /* Transform 64-79 + Precalc 0-15 of next block. */ +#undef curK +#define curK qK1 + _R( _b, _c, _d, _e, _a, F4, 64, + WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _a, _b, _c, _d, _e, F4, 65, + WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _e, _a, _b, _c, _d, F4, 66, + WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _d, _e, _a, _b, _c, F4, 67, + WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + + _R( _c, _d, _e, _a, _b, F4, 68, + dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _b, _c, _d, _e, _a, F4, 69, + dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _a, _b, _c, _d, _e, F4, 70, + WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _e, _a, _b, _c, _d, F4, 71, + WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + + _R( _d, _e, _a, _b, _c, F4, 72, + dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _c, _d, _e, _a, _b, F4, 73, + dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _b, _c, _d, _e, _a, F4, 74, + WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _a, _b, _c, _d, _e, F4, 75, + WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + + _R( _e, _a, _b, _c, _d, F4, 76, + WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _d, _e, _a, _b, _c, F4, 77, + WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _c, _d, _e, _a, _b, F4, 78, + WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _b, _c, _d, _e, _a, F4, 79, + WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ ); + + /* Update the chaining variables. */ + ldm RSTATE, {RT0-RT3}; + add _a, RT0; + ldr RT0, [RSTATE, #state_h4]; + add _b, RT1; + add _c, RT2; + add _d, RT3; + add _e, RT0; + stm RSTATE, {_a-_e}; + + b .Loop; + +.Lend: + /* Transform 64-79 */ + R( _b, _c, _d, _e, _a, F4, 64 ); + R( _a, _b, _c, _d, _e, F4, 65 ); + R( _e, _a, _b, _c, _d, F4, 66 ); + R( _d, _e, _a, _b, _c, F4, 67 ); + R( _c, _d, _e, _a, _b, F4, 68 ); + R( _b, _c, _d, _e, _a, F4, 69 ); + R( _a, _b, _c, _d, _e, F4, 70 ); + R( _e, _a, _b, _c, _d, F4, 71 ); + R( _d, _e, _a, _b, _c, F4, 72 ); + R( _c, _d, _e, _a, _b, F4, 73 ); + R( _b, _c, _d, _e, _a, F4, 74 ); + R( _a, _b, _c, _d, _e, F4, 75 ); + R( _e, _a, _b, _c, _d, F4, 76 ); + R( _d, _e, _a, _b, _c, F4, 77 ); + R( _c, _d, _e, _a, _b, F4, 78 ); + R( _b, _c, _d, _e, _a, F4, 79 ); + + mov sp, ROLDSTACK; + + /* Update the chaining variables. */ + ldm RSTATE, {RT0-RT3}; + add _a, RT0; + ldr RT0, [RSTATE, #state_h4]; + add _b, RT1; + add _c, RT2; + add _d, RT3; + /*vpop {q4-q7};*/ + add _e, RT0; + stm RSTATE, {_a-_e}; + + pop {r4-r12, pc}; + +.Ldo_nothing: + bx lr +ENDPROC(sha1_transform_neon) diff --git a/lib/crypto/arm/sha1-ce-core.S b/lib/crypto/arm/sha1-ce-core.S new file mode 100644 index 000000000000..2de40dd25e47 --- /dev/null +++ b/lib/crypto/arm/sha1-ce-core.S @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2015 Linaro Ltd. + * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .arch armv8-a + .fpu crypto-neon-fp-armv8 + + k0 .req q0 + k1 .req q1 + k2 .req q2 + k3 .req q3 + + ta0 .req q4 + ta1 .req q5 + tb0 .req q5 + tb1 .req q4 + + dga .req q6 + dgb .req q7 + dgbs .req s28 + + dg0 .req q12 + dg1a0 .req q13 + dg1a1 .req q14 + dg1b0 .req q14 + dg1b1 .req q13 + + .macro add_only, op, ev, rc, s0, dg1 + .ifnb \s0 + vadd.u32 tb\ev, q\s0, \rc + .endif + sha1h.32 dg1b\ev, dg0 + .ifb \dg1 + sha1\op\().32 dg0, dg1a\ev, ta\ev + .else + sha1\op\().32 dg0, \dg1, ta\ev + .endif + .endm + + .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 + sha1su0.32 q\s0, q\s1, q\s2 + add_only \op, \ev, \rc, \s1, \dg1 + sha1su1.32 q\s0, q\s3 + .endm + + .align 6 +.Lsha1_rcon: + .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999 + .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1 + .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc + .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6 + + /* + * void sha1_ce_transform(struct sha1_block_state *state, + * const u8 *data, size_t nblocks); + */ +ENTRY(sha1_ce_transform) + /* load round constants */ + adr ip, .Lsha1_rcon + vld1.32 {k0-k1}, [ip, :128]! + vld1.32 {k2-k3}, [ip, :128] + + /* load state */ + vld1.32 {dga}, [r0] + vldr dgbs, [r0, #16] + + /* load input */ +0: vld1.32 {q8-q9}, [r1]! + vld1.32 {q10-q11}, [r1]! + subs r2, r2, #1 + +#ifndef CONFIG_CPU_BIG_ENDIAN + vrev32.8 q8, q8 + vrev32.8 q9, q9 + vrev32.8 q10, q10 + vrev32.8 q11, q11 +#endif + + vadd.u32 ta0, q8, k0 + vmov dg0, dga + + add_update c, 0, k0, 8, 9, 10, 11, dgb + add_update c, 1, k0, 9, 10, 11, 8 + add_update c, 0, k0, 10, 11, 8, 9 + add_update c, 1, k0, 11, 8, 9, 10 + add_update c, 0, k1, 8, 9, 10, 11 + + add_update p, 1, k1, 9, 10, 11, 8 + add_update p, 0, k1, 10, 11, 8, 9 + add_update p, 1, k1, 11, 8, 9, 10 + add_update p, 0, k1, 8, 9, 10, 11 + add_update p, 1, k2, 9, 10, 11, 8 + + add_update m, 0, k2, 10, 11, 8, 9 + add_update m, 1, k2, 11, 8, 9, 10 + add_update m, 0, k2, 8, 9, 10, 11 + add_update m, 1, k2, 9, 10, 11, 8 + add_update m, 0, k3, 10, 11, 8, 9 + + add_update p, 1, k3, 11, 8, 9, 10 + add_only p, 0, k3, 9 + add_only p, 1, k3, 10 + add_only p, 0, k3, 11 + add_only p, 1 + + /* update state */ + vadd.u32 dga, dga, dg0 + vadd.u32 dgb, dgb, dg1a0 + bne 0b + + /* store new state */ + vst1.32 {dga}, [r0] + vstr dgbs, [r0, #16] + bx lr +ENDPROC(sha1_ce_transform) diff --git a/lib/crypto/arm/sha1.h b/lib/crypto/arm/sha1.h new file mode 100644 index 000000000000..fa1e92419000 --- /dev/null +++ b/lib/crypto/arm/sha1.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-1 optimized for ARM + * + * Copyright 2025 Google LLC + */ +#include <asm/neon.h> +#include <asm/simd.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + +asmlinkage void sha1_block_data_order(struct sha1_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha1_transform_neon(struct sha1_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha1_ce_transform(struct sha1_block_state *state, + const u8 *data, size_t nblocks); + +static void sha1_blocks(struct sha1_block_state *state, + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && likely(may_use_simd())) { + kernel_neon_begin(); + if (static_branch_likely(&have_ce)) + sha1_ce_transform(state, data, nblocks); + else + sha1_transform_neon(state, data, nblocks); + kernel_neon_end(); + } else { + sha1_block_data_order(state, data, nblocks); + } +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define sha1_mod_init_arch sha1_mod_init_arch +static inline void sha1_mod_init_arch(void) +{ + if (elf_hwcap & HWCAP_NEON) { + static_branch_enable(&have_neon); + if (elf_hwcap2 & HWCAP2_SHA1) + static_branch_enable(&have_ce); + } +} +#endif /* CONFIG_KERNEL_MODE_NEON */ diff --git a/lib/crypto/arm/sha256-armv4.pl b/lib/crypto/arm/sha256-armv4.pl new file mode 100644 index 000000000000..f3a2b54efd4e --- /dev/null +++ b/lib/crypto/arm/sha256-armv4.pl @@ -0,0 +1,724 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 + +# This code is taken from the OpenSSL project but the author (Andy Polyakov) +# has relicensed it under the GPLv2. Therefore this program is free software; +# you can redistribute it and/or modify it under the terms of the GNU General +# Public License version 2 as published by the Free Software Foundation. +# +# The original headers, including the original license headers, are +# included below for completeness. + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see https://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA256 block procedure for ARMv4. May 2007. + +# Performance is ~2x better than gcc 3.4 generated code and in "abso- +# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per +# byte [on single-issue Xscale PXA250 core]. + +# July 2010. +# +# Rescheduling for dual-issue pipeline resulted in 22% improvement on +# Cortex A8 core and ~20 cycles per processed byte. + +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 16% +# improvement on Cortex A8 core and ~15.4 cycles per processed byte. + +# September 2013. +# +# Add NEON implementation. On Cortex A8 it was measured to process one +# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon +# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only +# code (meaning that latter performs sub-optimally, nothing was done +# about it). + +# May 2014. +# +# Add ARMv8 code path performing at 2.0 cpb on Apple A7. + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$ctx="r0"; $t0="r0"; +$inp="r1"; $t4="r1"; +$len="r2"; $t1="r2"; +$T1="r3"; $t3="r3"; +$A="r4"; +$B="r5"; +$C="r6"; +$D="r7"; +$E="r8"; +$F="r9"; +$G="r10"; +$H="r11"; +@V=($A,$B,$C,$D,$E,$F,$G,$H); +$t2="r12"; +$Ktbl="r14"; + +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); + +sub BODY_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___ if ($i<16); +#if __ARM_ARCH__>=7 + @ ldr $t1,[$inp],#4 @ $i +# if $i==15 + str $inp,[sp,#17*4] @ make room for $t4 +# endif + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) +# ifndef __ARMEB__ + rev $t1,$t1 +# endif +#else + @ ldrb $t1,[$inp,#3] @ $i + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + ldrb $t2,[$inp,#2] + ldrb $t0,[$inp,#1] + orr $t1,$t1,$t2,lsl#8 + ldrb $t2,[$inp],#4 + orr $t1,$t1,$t0,lsl#16 +# if $i==15 + str $inp,[sp,#17*4] @ make room for $t4 +# endif + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` + orr $t1,$t1,$t2,lsl#24 + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) +#endif +___ +$code.=<<___; + ldr $t2,[$Ktbl],#4 @ *K256++ + add $h,$h,$t1 @ h+=X[i] + str $t1,[sp,#`$i%16`*4] + eor $t1,$f,$g + add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e) + and $t1,$t1,$e + add $h,$h,$t2 @ h+=K256[i] + eor $t1,$t1,$g @ Ch(e,f,g) + eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]` + add $h,$h,$t1 @ h+=Ch(e,f,g) +#if $i==31 + and $t2,$t2,#0xff + cmp $t2,#0xf2 @ done? +#endif +#if $i<15 +# if __ARM_ARCH__>=7 + ldr $t1,[$inp],#4 @ prefetch +# else + ldrb $t1,[$inp,#3] +# endif + eor $t2,$a,$b @ a^b, b^c in next round +#else + ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx + eor $t2,$a,$b @ a^b, b^c in next round + ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx +#endif + eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a) + and $t3,$t3,$t2 @ (b^c)&=(a^b) + add $d,$d,$h @ d+=h + eor $t3,$t3,$b @ Maj(a,b,c) + add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a) + @ add $h,$h,$t3 @ h+=Maj(a,b,c) +___ + ($t2,$t3)=($t3,$t2); +} + +sub BODY_16_XX { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___; + @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i + @ ldr $t4,[sp,#`($i+14)%16`*4] + mov $t0,$t1,ror#$sigma0[0] + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + mov $t2,$t4,ror#$sigma1[0] + eor $t0,$t0,$t1,ror#$sigma0[1] + eor $t2,$t2,$t4,ror#$sigma1[1] + eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) + ldr $t1,[sp,#`($i+0)%16`*4] + eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14]) + ldr $t4,[sp,#`($i+9)%16`*4] + + add $t2,$t2,$t0 + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15 + add $t1,$t1,$t2 + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) + add $t1,$t1,$t4 @ X[i] +___ + &BODY_00_15(@_); +} + +$code=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif + +.text +#if __ARM_ARCH__<7 +.code 32 +#else +.syntax unified +# ifdef __thumb2__ +.thumb +# else +.code 32 +# endif +#endif + +.type K256,%object +.align 5 +K256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.size K256,.-K256 +.word 0 @ terminator +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha256_block_data_order +#endif +.align 5 + +.global sha256_block_data_order +.type sha256_block_data_order,%function +sha256_block_data_order: +.Lsha256_block_data_order: +#if __ARM_ARCH__<7 + sub r3,pc,#8 @ sha256_block_data_order +#else + adr r3,.Lsha256_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#ARMV8_SHA256 + bne .LARMv8 + tst r12,#ARMV7_NEON + bne .LNEON +#endif + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} + ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} + sub $Ktbl,r3,#256+32 @ K256 + sub sp,sp,#16*4 @ alloca(X[16]) +.Loop: +# if __ARM_ARCH__>=7 + ldr $t1,[$inp],#4 +# else + ldrb $t1,[$inp,#3] +# endif + eor $t3,$B,$C @ magic + eor $t2,$t2,$t2 +___ +for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=".Lrounds_16_xx:\n"; +for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; +#if __ARM_ARCH__>=7 + ite eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq $t3,[sp,#16*4] @ pull ctx + bne .Lrounds_16_xx + + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past + ldr $t0,[$t3,#0] + ldr $t1,[$t3,#4] + ldr $t2,[$t3,#8] + add $A,$A,$t0 + ldr $t0,[$t3,#12] + add $B,$B,$t1 + ldr $t1,[$t3,#16] + add $C,$C,$t2 + ldr $t2,[$t3,#20] + add $D,$D,$t0 + ldr $t0,[$t3,#24] + add $E,$E,$t1 + ldr $t1,[$t3,#28] + add $F,$F,$t2 + ldr $inp,[sp,#17*4] @ pull inp + ldr $t2,[sp,#18*4] @ pull inp+len + add $G,$G,$t0 + add $H,$H,$t1 + stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H} + cmp $inp,$t2 + sub $Ktbl,$Ktbl,#256 @ rewind Ktbl + bne .Loop + + add sp,sp,#`16+3`*4 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size sha256_block_data_order,.-sha256_block_data_order +___ +###################################################################### +# NEON stuff +# +{{{ +my @X=map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25"); +my $Xfer=$t4; +my $j=0; + +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &vext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vext_8 ($T1,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T1,$T0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 ($T0,$T0,@X[0]); + while($#insns>=2) { eval(shift(@insns)); } + &vst1_32 ("{$T0}","[$Xfer,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vrev32_8 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &vst1_32 ("{$T0}","[$Xfer,:128]!"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. + '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&eor ($t1,$f,$g)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&and ($t1,$t1,$e)', + '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&eor ($t1,$t1,$g)', # Ch(e,f,g) + '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e) + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. + '&ldr ($t1,"[sp,#64]") if ($j==31)', + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&add ($d,$d,$h)', # d+=h + '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a) + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.global sha256_block_data_order_neon +.type sha256_block_data_order_neon,%function +.align 4 +sha256_block_data_order_neon: +.LNEON: + stmdb sp!,{r4-r12,lr} + + sub $H,sp,#16*4+16 + adr $Ktbl,.Lsha256_block_data_order + sub $Ktbl,$Ktbl,#.Lsha256_block_data_order-K256 + bic $H,$H,#15 @ align for 128-bit stores + mov $t2,sp + mov sp,$H @ alloca + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + + vld1.8 {@X[0]},[$inp]! + vld1.8 {@X[1]},[$inp]! + vld1.8 {@X[2]},[$inp]! + vld1.8 {@X[3]},[$inp]! + vld1.32 {$T0},[$Ktbl,:128]! + vld1.32 {$T1},[$Ktbl,:128]! + vld1.32 {$T2},[$Ktbl,:128]! + vld1.32 {$T3},[$Ktbl,:128]! + vrev32.8 @X[0],@X[0] @ yes, even on + str $ctx,[sp,#64] + vrev32.8 @X[1],@X[1] @ big-endian + str $inp,[sp,#68] + mov $Xfer,sp + vrev32.8 @X[2],@X[2] + str $len,[sp,#72] + vrev32.8 @X[3],@X[3] + str $t2,[sp,#76] @ save original sp + vadd.i32 $T0,$T0,@X[0] + vadd.i32 $T1,$T1,@X[1] + vst1.32 {$T0},[$Xfer,:128]! + vadd.i32 $T2,$T2,@X[2] + vst1.32 {$T1},[$Xfer,:128]! + vadd.i32 $T3,$T3,@X[3] + vst1.32 {$T2},[$Xfer,:128]! + vst1.32 {$T3},[$Xfer,:128]! + + ldmia $ctx,{$A-$H} + sub $Xfer,$Xfer,#64 + ldr $t1,[sp,#0] + eor $t2,$t2,$t2 + eor $t3,$B,$C + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + teq $t1,#0 @ check for K256 terminator + ldr $t1,[sp,#0] + sub $Xfer,$Xfer,#64 + bne .L_00_48 + + ldr $inp,[sp,#68] + ldr $t0,[sp,#72] + sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl + teq $inp,$t0 + it eq + subeq $inp,$inp,#64 @ avoid SEGV + vld1.8 {@X[0]},[$inp]! @ load next input block + vld1.8 {@X[1]},[$inp]! + vld1.8 {@X[2]},[$inp]! + vld1.8 {@X[3]},[$inp]! + it ne + strne $inp,[sp,#68] + mov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + ldr $t0,[$t1,#0] + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past + ldr $t2,[$t1,#4] + ldr $t3,[$t1,#8] + ldr $t4,[$t1,#12] + add $A,$A,$t0 @ accumulate + ldr $t0,[$t1,#16] + add $B,$B,$t2 + ldr $t2,[$t1,#20] + add $C,$C,$t3 + ldr $t3,[$t1,#24] + add $D,$D,$t4 + ldr $t4,[$t1,#28] + add $E,$E,$t0 + str $A,[$t1],#4 + add $F,$F,$t2 + str $B,[$t1],#4 + add $G,$G,$t3 + str $C,[$t1],#4 + add $H,$H,$t4 + str $D,[$t1],#4 + stmia $t1,{$E-$H} + + ittte ne + movne $Xfer,sp + ldrne $t1,[sp,#0] + eorne $t2,$t2,$t2 + ldreq sp,[sp,#76] @ restore original sp + itt ne + eorne $t3,$B,$C + bne .L_00_48 + + ldmia sp!,{r4-r12,pc} +.size sha256_block_data_order_neon,.-sha256_block_data_order_neon +#endif +___ +}}} +###################################################################### +# ARMv8 stuff +# +{{{ +my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2)); +my @MSG=map("q$_",(8..11)); +my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15)); +my $Ktbl="r3"; + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + +# ifdef __thumb2__ +# define INST(a,b,c,d) .byte c,d|0xc,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d +# endif + +.type sha256_block_data_order_armv8,%function +.align 5 +sha256_block_data_order_armv8: +.LARMv8: + vld1.32 {$ABCD,$EFGH},[$ctx] +# ifdef __thumb2__ + adr $Ktbl,.LARMv8 + sub $Ktbl,$Ktbl,#.LARMv8-K256 +# else + adrl $Ktbl,K256 +# endif + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + +.Loop_v8: + vld1.8 {@MSG[0]-@MSG[1]},[$inp]! + vld1.8 {@MSG[2]-@MSG[3]},[$inp]! + vld1.32 {$W0},[$Ktbl]! + vrev32.8 @MSG[0],@MSG[0] + vrev32.8 @MSG[1],@MSG[1] + vrev32.8 @MSG[2],@MSG[2] + vrev32.8 @MSG[3],@MSG[3] + vmov $ABCD_SAVE,$ABCD @ offload + vmov $EFGH_SAVE,$EFGH + teq $inp,$len +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + vld1.32 {$W1},[$Ktbl]! + vadd.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + vld1.32 {$W1},[$Ktbl]! + vadd.i32 $W0,$W0,@MSG[0] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + vld1.32 {$W0},[$Ktbl]! + vadd.i32 $W1,$W1,@MSG[1] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + vld1.32 {$W1},[$Ktbl] + vadd.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#256-16 @ rewind + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + vadd.i32 $W1,$W1,@MSG[3] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + vadd.i32 $ABCD,$ABCD,$ABCD_SAVE + vadd.i32 $EFGH,$EFGH,$EFGH_SAVE + it ne + bne .Loop_v8 + + vst1.32 {$ABCD,$EFGH},[$ctx] + + ret @ bx lr +.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 +#endif +___ +}}} +$code.=<<___; +.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +{ my %opcode = ( + "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40, + "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) { + my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<17)|(($2&8)<<4) + |(($3&7)<<1) |(($3&8)<<2); + # since ARMv7 instructions are always encoded little-endian. + # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + } +} + +foreach (split($/,$code)) { + + s/\`([^\`]*)\`/eval $1/geo; + + s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo; + + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} + +close STDOUT; # enforce flush diff --git a/lib/crypto/arm/sha256-ce.S b/lib/crypto/arm/sha256-ce.S new file mode 100644 index 000000000000..7481ac8e6c0d --- /dev/null +++ b/lib/crypto/arm/sha256-ce.S @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sha256-ce.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2015 Linaro Ltd. + * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .arch armv8-a + .fpu crypto-neon-fp-armv8 + + k0 .req q7 + k1 .req q8 + rk .req r3 + + ta0 .req q9 + ta1 .req q10 + tb0 .req q10 + tb1 .req q9 + + dga .req q11 + dgb .req q12 + + dg0 .req q13 + dg1 .req q14 + dg2 .req q15 + + .macro add_only, ev, s0 + vmov dg2, dg0 + .ifnb \s0 + vld1.32 {k\ev}, [rk, :128]! + .endif + sha256h.32 dg0, dg1, tb\ev + sha256h2.32 dg1, dg2, tb\ev + .ifnb \s0 + vadd.u32 ta\ev, q\s0, k\ev + .endif + .endm + + .macro add_update, ev, s0, s1, s2, s3 + sha256su0.32 q\s0, q\s1 + add_only \ev, \s1 + sha256su1.32 q\s0, q\s2, q\s3 + .endm + + .align 6 +.Lsha256_rcon: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + + /* + * void sha256_ce_transform(struct sha256_block_state *state, + * const u8 *data, size_t nblocks); + */ +ENTRY(sha256_ce_transform) + /* load state */ + vld1.32 {dga-dgb}, [r0] + + /* load input */ +0: vld1.32 {q0-q1}, [r1]! + vld1.32 {q2-q3}, [r1]! + subs r2, r2, #1 + +#ifndef CONFIG_CPU_BIG_ENDIAN + vrev32.8 q0, q0 + vrev32.8 q1, q1 + vrev32.8 q2, q2 + vrev32.8 q3, q3 +#endif + + /* load first round constant */ + adr rk, .Lsha256_rcon + vld1.32 {k0}, [rk, :128]! + + vadd.u32 ta0, q0, k0 + vmov dg0, dga + vmov dg1, dgb + + add_update 1, 0, 1, 2, 3 + add_update 0, 1, 2, 3, 0 + add_update 1, 2, 3, 0, 1 + add_update 0, 3, 0, 1, 2 + add_update 1, 0, 1, 2, 3 + add_update 0, 1, 2, 3, 0 + add_update 1, 2, 3, 0, 1 + add_update 0, 3, 0, 1, 2 + add_update 1, 0, 1, 2, 3 + add_update 0, 1, 2, 3, 0 + add_update 1, 2, 3, 0, 1 + add_update 0, 3, 0, 1, 2 + + add_only 1, 1 + add_only 0, 2 + add_only 1, 3 + add_only 0 + + /* update state */ + vadd.u32 dga, dga, dg0 + vadd.u32 dgb, dgb, dg1 + bne 0b + + /* store new state */ + vst1.32 {dga-dgb}, [r0] + bx lr +ENDPROC(sha256_ce_transform) diff --git a/lib/crypto/arm/sha256.h b/lib/crypto/arm/sha256.h new file mode 100644 index 000000000000..da75cbdc51d4 --- /dev/null +++ b/lib/crypto/arm/sha256.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-256 optimized for ARM + * + * Copyright 2025 Google LLC + */ +#include <asm/neon.h> +#include <crypto/internal/simd.h> + +asmlinkage void sha256_block_data_order(struct sha256_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha256_block_data_order_neon(struct sha256_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha256_ce_transform(struct sha256_block_state *state, + const u8 *data, size_t nblocks); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce); + +static void sha256_blocks(struct sha256_block_state *state, + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && crypto_simd_usable()) { + kernel_neon_begin(); + if (static_branch_likely(&have_ce)) + sha256_ce_transform(state, data, nblocks); + else + sha256_block_data_order_neon(state, data, nblocks); + kernel_neon_end(); + } else { + sha256_block_data_order(state, data, nblocks); + } +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define sha256_mod_init_arch sha256_mod_init_arch +static inline void sha256_mod_init_arch(void) +{ + if (elf_hwcap & HWCAP_NEON) { + static_branch_enable(&have_neon); + if (elf_hwcap2 & HWCAP2_SHA2) + static_branch_enable(&have_ce); + } +} +#endif /* CONFIG_KERNEL_MODE_NEON */ diff --git a/lib/crypto/arm/sha512-armv4.pl b/lib/crypto/arm/sha512-armv4.pl new file mode 100644 index 000000000000..2fc3516912fa --- /dev/null +++ b/lib/crypto/arm/sha512-armv4.pl @@ -0,0 +1,657 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 + +# This code is taken from the OpenSSL project but the author (Andy Polyakov) +# has relicensed it under the GPLv2. Therefore this program is free software; +# you can redistribute it and/or modify it under the terms of the GNU General +# Public License version 2 as published by the Free Software Foundation. +# +# The original headers, including the original license headers, are +# included below for completeness. + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see https://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA512 block procedure for ARMv4. September 2007. + +# This code is ~4.5 (four and a half) times faster than code generated +# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue +# Xscale PXA250 core]. +# +# July 2010. +# +# Rescheduling for dual-issue pipeline resulted in 6% improvement on +# Cortex A8 core and ~40 cycles per processed byte. + +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 7% +# improvement on Coxtex A8 core and ~38 cycles per byte. + +# March 2011. +# +# Add NEON implementation. On Cortex A8 it was measured to process +# one byte in 23.3 cycles or ~60% faster than integer-only code. + +# August 2012. +# +# Improve NEON performance by 12% on Snapdragon S4. In absolute +# terms it's 22.6 cycles per byte, which is disappointing result. +# Technical writers asserted that 3-way S4 pipeline can sustain +# multiple NEON instructions per cycle, but dual NEON issue could +# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html +# for further details. On side note Cortex-A15 processes one byte in +# 16 cycles. + +# Byte order [in]dependence. ========================================= +# +# Originally caller was expected to maintain specific *dword* order in +# h[0-7], namely with most significant dword at *lower* address, which +# was reflected in below two parameters as 0 and 4. Now caller is +# expected to maintain native byte order for whole 64-bit values. +$hi="HI"; +$lo="LO"; +# ==================================================================== + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$ctx="r0"; # parameter block +$inp="r1"; +$len="r2"; + +$Tlo="r3"; +$Thi="r4"; +$Alo="r5"; +$Ahi="r6"; +$Elo="r7"; +$Ehi="r8"; +$t0="r9"; +$t1="r10"; +$t2="r11"; +$t3="r12"; +############ r13 is stack pointer +$Ktbl="r14"; +############ r15 is program counter + +$Aoff=8*0; +$Boff=8*1; +$Coff=8*2; +$Doff=8*3; +$Eoff=8*4; +$Foff=8*5; +$Goff=8*6; +$Hoff=8*7; +$Xoff=8*8; + +sub BODY_00_15() { +my $magic = shift; +$code.=<<___; + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov $t0,$Elo,lsr#14 + str $Tlo,[sp,#$Xoff+0] + mov $t1,$Ehi,lsr#14 + str $Thi,[sp,#$Xoff+4] + eor $t0,$t0,$Ehi,lsl#18 + ldr $t2,[sp,#$Hoff+0] @ h.lo + eor $t1,$t1,$Elo,lsl#18 + ldr $t3,[sp,#$Hoff+4] @ h.hi + eor $t0,$t0,$Elo,lsr#18 + eor $t1,$t1,$Ehi,lsr#18 + eor $t0,$t0,$Ehi,lsl#14 + eor $t1,$t1,$Elo,lsl#14 + eor $t0,$t0,$Ehi,lsr#9 + eor $t1,$t1,$Elo,lsr#9 + eor $t0,$t0,$Elo,lsl#23 + eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e) + adds $Tlo,$Tlo,$t0 + ldr $t0,[sp,#$Foff+0] @ f.lo + adc $Thi,$Thi,$t1 @ T += Sigma1(e) + ldr $t1,[sp,#$Foff+4] @ f.hi + adds $Tlo,$Tlo,$t2 + ldr $t2,[sp,#$Goff+0] @ g.lo + adc $Thi,$Thi,$t3 @ T += h + ldr $t3,[sp,#$Goff+4] @ g.hi + + eor $t0,$t0,$t2 + str $Elo,[sp,#$Eoff+0] + eor $t1,$t1,$t3 + str $Ehi,[sp,#$Eoff+4] + and $t0,$t0,$Elo + str $Alo,[sp,#$Aoff+0] + and $t1,$t1,$Ehi + str $Ahi,[sp,#$Aoff+4] + eor $t0,$t0,$t2 + ldr $t2,[$Ktbl,#$lo] @ K[i].lo + eor $t1,$t1,$t3 @ Ch(e,f,g) + ldr $t3,[$Ktbl,#$hi] @ K[i].hi + + adds $Tlo,$Tlo,$t0 + ldr $Elo,[sp,#$Doff+0] @ d.lo + adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) + ldr $Ehi,[sp,#$Doff+4] @ d.hi + adds $Tlo,$Tlo,$t2 + and $t0,$t2,#0xff + adc $Thi,$Thi,$t3 @ T += K[i] + adds $Elo,$Elo,$Tlo + ldr $t2,[sp,#$Boff+0] @ b.lo + adc $Ehi,$Ehi,$Thi @ d += T + teq $t0,#$magic + + ldr $t3,[sp,#$Coff+0] @ c.lo +#if __ARM_ARCH__>=7 + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq $Ktbl,$Ktbl,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov $t0,$Alo,lsr#28 + mov $t1,$Ahi,lsr#28 + eor $t0,$t0,$Ahi,lsl#4 + eor $t1,$t1,$Alo,lsl#4 + eor $t0,$t0,$Ahi,lsr#2 + eor $t1,$t1,$Alo,lsr#2 + eor $t0,$t0,$Alo,lsl#30 + eor $t1,$t1,$Ahi,lsl#30 + eor $t0,$t0,$Ahi,lsr#7 + eor $t1,$t1,$Alo,lsr#7 + eor $t0,$t0,$Alo,lsl#25 + eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) + adds $Tlo,$Tlo,$t0 + and $t0,$Alo,$t2 + adc $Thi,$Thi,$t1 @ T += Sigma0(a) + + ldr $t1,[sp,#$Boff+4] @ b.hi + orr $Alo,$Alo,$t2 + ldr $t2,[sp,#$Coff+4] @ c.hi + and $Alo,$Alo,$t3 + and $t3,$Ahi,$t1 + orr $Ahi,$Ahi,$t1 + orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo + and $Ahi,$Ahi,$t2 + adds $Alo,$Alo,$Tlo + orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc $Ahi,$Ahi,$Thi @ h += T + tst $Ktbl,#1 + add $Ktbl,$Ktbl,#8 +___ +} +$code=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +# define VFP_ABI_PUSH +# define VFP_ABI_POP +#endif + +#ifdef __ARMEL__ +# define LO 0 +# define HI 4 +# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 +#else +# define HI 0 +# define LO 4 +# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 +#endif + +.text +#if __ARM_ARCH__<7 +.code 32 +#else +.syntax unified +# ifdef __thumb2__ +.thumb +# else +.code 32 +# endif +#endif + +.type K512,%object +.align 5 +K512: +WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) +WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) +WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) +WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) +WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) +WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) +WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) +WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) +WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) +WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) +WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) +WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) +WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) +WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) +WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) +WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) +WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) +WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) +WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) +WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) +WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) +WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) +WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) +WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) +WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) +WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) +WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) +WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) +WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) +WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) +WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) +WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) +WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) +WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) +WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) +WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) +WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) +WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) +WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) +WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) +.size K512,.-K512 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha512_block_data_order +.skip 32-4 +#else +.skip 32 +#endif + +.global sha512_block_data_order +.type sha512_block_data_order,%function +sha512_block_data_order: +.Lsha512_block_data_order: +#if __ARM_ARCH__<7 + sub r3,pc,#8 @ sha512_block_data_order +#else + adr r3,.Lsha512_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#1 + bne .LNEON +#endif + add $len,$inp,$len,lsl#7 @ len to point at the end of inp + stmdb sp!,{r4-r12,lr} + sub $Ktbl,r3,#672 @ K512 + sub sp,sp,#9*8 + + ldr $Elo,[$ctx,#$Eoff+$lo] + ldr $Ehi,[$ctx,#$Eoff+$hi] + ldr $t0, [$ctx,#$Goff+$lo] + ldr $t1, [$ctx,#$Goff+$hi] + ldr $t2, [$ctx,#$Hoff+$lo] + ldr $t3, [$ctx,#$Hoff+$hi] +.Loop: + str $t0, [sp,#$Goff+0] + str $t1, [sp,#$Goff+4] + str $t2, [sp,#$Hoff+0] + str $t3, [sp,#$Hoff+4] + ldr $Alo,[$ctx,#$Aoff+$lo] + ldr $Ahi,[$ctx,#$Aoff+$hi] + ldr $Tlo,[$ctx,#$Boff+$lo] + ldr $Thi,[$ctx,#$Boff+$hi] + ldr $t0, [$ctx,#$Coff+$lo] + ldr $t1, [$ctx,#$Coff+$hi] + ldr $t2, [$ctx,#$Doff+$lo] + ldr $t3, [$ctx,#$Doff+$hi] + str $Tlo,[sp,#$Boff+0] + str $Thi,[sp,#$Boff+4] + str $t0, [sp,#$Coff+0] + str $t1, [sp,#$Coff+4] + str $t2, [sp,#$Doff+0] + str $t3, [sp,#$Doff+4] + ldr $Tlo,[$ctx,#$Foff+$lo] + ldr $Thi,[$ctx,#$Foff+$hi] + str $Tlo,[sp,#$Foff+0] + str $Thi,[sp,#$Foff+4] + +.L00_15: +#if __ARM_ARCH__<7 + ldrb $Tlo,[$inp,#7] + ldrb $t0, [$inp,#6] + ldrb $t1, [$inp,#5] + ldrb $t2, [$inp,#4] + ldrb $Thi,[$inp,#3] + ldrb $t3, [$inp,#2] + orr $Tlo,$Tlo,$t0,lsl#8 + ldrb $t0, [$inp,#1] + orr $Tlo,$Tlo,$t1,lsl#16 + ldrb $t1, [$inp],#8 + orr $Tlo,$Tlo,$t2,lsl#24 + orr $Thi,$Thi,$t3,lsl#8 + orr $Thi,$Thi,$t0,lsl#16 + orr $Thi,$Thi,$t1,lsl#24 +#else + ldr $Tlo,[$inp,#4] + ldr $Thi,[$inp],#8 +#ifdef __ARMEL__ + rev $Tlo,$Tlo + rev $Thi,$Thi +#endif +#endif +___ + &BODY_00_15(0x94); +$code.=<<___; + tst $Ktbl,#1 + beq .L00_15 + ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] + ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] + bic $Ktbl,$Ktbl,#1 +.L16_79: + @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) + @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 + @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 + mov $Tlo,$t0,lsr#1 + ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] + mov $Thi,$t1,lsr#1 + ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] + eor $Tlo,$Tlo,$t1,lsl#31 + eor $Thi,$Thi,$t0,lsl#31 + eor $Tlo,$Tlo,$t0,lsr#8 + eor $Thi,$Thi,$t1,lsr#8 + eor $Tlo,$Tlo,$t1,lsl#24 + eor $Thi,$Thi,$t0,lsl#24 + eor $Tlo,$Tlo,$t0,lsr#7 + eor $Thi,$Thi,$t1,lsr#7 + eor $Tlo,$Tlo,$t1,lsl#25 + + @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) + @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 + @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 + mov $t0,$t2,lsr#19 + mov $t1,$t3,lsr#19 + eor $t0,$t0,$t3,lsl#13 + eor $t1,$t1,$t2,lsl#13 + eor $t0,$t0,$t3,lsr#29 + eor $t1,$t1,$t2,lsr#29 + eor $t0,$t0,$t2,lsl#3 + eor $t1,$t1,$t3,lsl#3 + eor $t0,$t0,$t2,lsr#6 + eor $t1,$t1,$t3,lsr#6 + ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] + eor $t0,$t0,$t3,lsl#26 + + ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] + adds $Tlo,$Tlo,$t0 + ldr $t0,[sp,#`$Xoff+8*16`+0] + adc $Thi,$Thi,$t1 + + ldr $t1,[sp,#`$Xoff+8*16`+4] + adds $Tlo,$Tlo,$t2 + adc $Thi,$Thi,$t3 + adds $Tlo,$Tlo,$t0 + adc $Thi,$Thi,$t1 +___ + &BODY_00_15(0x17); +$code.=<<___; +#if __ARM_ARCH__>=7 + ittt eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0] + ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4] + beq .L16_79 + bic $Ktbl,$Ktbl,#1 + + ldr $Tlo,[sp,#$Boff+0] + ldr $Thi,[sp,#$Boff+4] + ldr $t0, [$ctx,#$Aoff+$lo] + ldr $t1, [$ctx,#$Aoff+$hi] + ldr $t2, [$ctx,#$Boff+$lo] + ldr $t3, [$ctx,#$Boff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Aoff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Aoff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Boff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Boff+$hi] + + ldr $Alo,[sp,#$Coff+0] + ldr $Ahi,[sp,#$Coff+4] + ldr $Tlo,[sp,#$Doff+0] + ldr $Thi,[sp,#$Doff+4] + ldr $t0, [$ctx,#$Coff+$lo] + ldr $t1, [$ctx,#$Coff+$hi] + ldr $t2, [$ctx,#$Doff+$lo] + ldr $t3, [$ctx,#$Doff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Coff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Coff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Doff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Doff+$hi] + + ldr $Tlo,[sp,#$Foff+0] + ldr $Thi,[sp,#$Foff+4] + ldr $t0, [$ctx,#$Eoff+$lo] + ldr $t1, [$ctx,#$Eoff+$hi] + ldr $t2, [$ctx,#$Foff+$lo] + ldr $t3, [$ctx,#$Foff+$hi] + adds $Elo,$Elo,$t0 + str $Elo,[$ctx,#$Eoff+$lo] + adc $Ehi,$Ehi,$t1 + str $Ehi,[$ctx,#$Eoff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Foff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Foff+$hi] + + ldr $Alo,[sp,#$Goff+0] + ldr $Ahi,[sp,#$Goff+4] + ldr $Tlo,[sp,#$Hoff+0] + ldr $Thi,[sp,#$Hoff+4] + ldr $t0, [$ctx,#$Goff+$lo] + ldr $t1, [$ctx,#$Goff+$hi] + ldr $t2, [$ctx,#$Hoff+$lo] + ldr $t3, [$ctx,#$Hoff+$hi] + adds $t0,$Alo,$t0 + str $t0, [$ctx,#$Goff+$lo] + adc $t1,$Ahi,$t1 + str $t1, [$ctx,#$Goff+$hi] + adds $t2,$Tlo,$t2 + str $t2, [$ctx,#$Hoff+$lo] + adc $t3,$Thi,$t3 + str $t3, [$ctx,#$Hoff+$hi] + + add sp,sp,#640 + sub $Ktbl,$Ktbl,#640 + + teq $inp,$len + bne .Loop + + add sp,sp,#8*9 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else + ldmia sp!,{r4-r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size sha512_block_data_order,.-sha512_block_data_order +___ + +{ +my @Sigma0=(28,34,39); +my @Sigma1=(14,18,41); +my @sigma0=(1, 8, 7); +my @sigma1=(19,61,6); + +my $Ktbl="r3"; +my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch + +my @X=map("d$_",(0..15)); +my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23)); + +sub NEON_00_15() { +my $i=shift; +my ($a,$b,$c,$d,$e,$f,$g,$h)=@_; +my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps + +$code.=<<___ if ($i<16 || $i&1); + vshr.u64 $t0,$e,#@Sigma1[0] @ $i +#if $i<16 + vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned +#endif + vshr.u64 $t1,$e,#@Sigma1[1] +#if $i>0 + vadd.i64 $a,$Maj @ h+=Maj from the past +#endif + vshr.u64 $t2,$e,#@Sigma1[2] +___ +$code.=<<___; + vld1.64 {$K},[$Ktbl,:64]! @ K[i++] + vsli.64 $t0,$e,#`64-@Sigma1[0]` + vsli.64 $t1,$e,#`64-@Sigma1[1]` + vmov $Ch,$e + vsli.64 $t2,$e,#`64-@Sigma1[2]` +#if $i<16 && defined(__ARMEL__) + vrev64.8 @X[$i],@X[$i] +#endif + veor $t1,$t0 + vbsl $Ch,$f,$g @ Ch(e,f,g) + vshr.u64 $t0,$a,#@Sigma0[0] + veor $t2,$t1 @ Sigma1(e) + vadd.i64 $T1,$Ch,$h + vshr.u64 $t1,$a,#@Sigma0[1] + vsli.64 $t0,$a,#`64-@Sigma0[0]` + vadd.i64 $T1,$t2 + vshr.u64 $t2,$a,#@Sigma0[2] + vadd.i64 $K,@X[$i%16] + vsli.64 $t1,$a,#`64-@Sigma0[1]` + veor $Maj,$a,$b + vsli.64 $t2,$a,#`64-@Sigma0[2]` + veor $h,$t0,$t1 + vadd.i64 $T1,$K + vbsl $Maj,$c,$b @ Maj(a,b,c) + veor $h,$t2 @ Sigma0(a) + vadd.i64 $d,$T1 + vadd.i64 $Maj,$T1 + @ vadd.i64 $h,$Maj +___ +} + +sub NEON_16_79() { +my $i=shift; + +if ($i&1) { &NEON_00_15($i,@_); return; } + +# 2x-vectorized, therefore runs every 2nd round +my @X=map("q$_",(0..7)); # view @X as 128-bit vector +my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps +my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15 +my $e=@_[4]; # $e from NEON_00_15 +$i /= 2; +$code.=<<___; + vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0] + vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1] + vadd.i64 @_[0],d30 @ h+=Maj from the past + vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2] + vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]` + vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1] + vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]` + veor $s1,$t0 + vshr.u64 $t0,$s0,#@sigma0[0] + veor $s1,$t1 @ sigma1(X[i+14]) + vshr.u64 $t1,$s0,#@sigma0[1] + vadd.i64 @X[$i%8],$s1 + vshr.u64 $s1,$s0,#@sigma0[2] + vsli.64 $t0,$s0,#`64-@sigma0[0]` + vsli.64 $t1,$s0,#`64-@sigma0[1]` + vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9] + veor $s1,$t0 + vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15 + vadd.i64 @X[$i%8],$s0 + vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15 + veor $s1,$t1 @ sigma0(X[i+1]) + vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15 + vadd.i64 @X[$i%8],$s1 +___ + &NEON_00_15(2*$i,@_); +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.global sha512_block_data_order_neon +.type sha512_block_data_order_neon,%function +.align 4 +sha512_block_data_order_neon: +.LNEON: + dmb @ errata #451034 on early Cortex A8 + add $len,$inp,$len,lsl#7 @ len to point at the end of inp + VFP_ABI_PUSH + adr $Ktbl,.Lsha512_block_data_order + sub $Ktbl,$Ktbl,.Lsha512_block_data_order-K512 + vldmia $ctx,{$A-$H} @ load context +.Loop_neon: +___ +for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + mov $cnt,#4 +.L16_79_neon: + subs $cnt,#1 +___ +for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + bne .L16_79_neon + + vadd.i64 $A,d30 @ h+=Maj from the past + vldmia $ctx,{d24-d31} @ load context to temp + vadd.i64 q8,q12 @ vectorized accumulate + vadd.i64 q9,q13 + vadd.i64 q10,q14 + vadd.i64 q11,q15 + vstmia $ctx,{$A-$H} @ save context + teq $inp,$len + sub $Ktbl,#640 @ rewind K512 + bne .Loop_neon + + VFP_ABI_POP + ret @ bx lr +.size sha512_block_data_order_neon,.-sha512_block_data_order_neon +#endif +___ +} +$code.=<<___; +.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +$code =~ s/\bret\b/bx lr/gm; + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +print $code; +close STDOUT; # enforce flush diff --git a/lib/crypto/arm/sha512.h b/lib/crypto/arm/sha512.h new file mode 100644 index 000000000000..f147b6490d6c --- /dev/null +++ b/lib/crypto/arm/sha512.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * arm32-optimized SHA-512 block function + * + * Copyright 2025 Google LLC + */ + +#include <asm/neon.h> +#include <crypto/internal/simd.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +asmlinkage void sha512_block_data_order(struct sha512_block_state *state, + const u8 *data, size_t nblocks); +asmlinkage void sha512_block_data_order_neon(struct sha512_block_state *state, + const u8 *data, size_t nblocks); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && + static_branch_likely(&have_neon) && likely(crypto_simd_usable())) { + kernel_neon_begin(); + sha512_block_data_order_neon(state, data, nblocks); + kernel_neon_end(); + } else { + sha512_block_data_order(state, data, nblocks); + } +} + +#ifdef CONFIG_KERNEL_MODE_NEON +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + if (cpu_has_neon()) + static_branch_enable(&have_neon); +} +#endif /* CONFIG_KERNEL_MODE_NEON */ |