diff options
Diffstat (limited to 'arch/x86/lib/crypto')
-rw-r--r-- | arch/x86/lib/crypto/.gitignore | 2 | ||||
-rw-r--r-- | arch/x86/lib/crypto/Kconfig | 34 | ||||
-rw-r--r-- | arch/x86/lib/crypto/Makefile | 20 | ||||
-rw-r--r-- | arch/x86/lib/crypto/blake2s-core.S | 252 | ||||
-rw-r--r-- | arch/x86/lib/crypto/blake2s-glue.c | 70 | ||||
-rw-r--r-- | arch/x86/lib/crypto/chacha-avx2-x86_64.S | 1021 | ||||
-rw-r--r-- | arch/x86/lib/crypto/chacha-avx512vl-x86_64.S | 836 | ||||
-rw-r--r-- | arch/x86/lib/crypto/chacha-ssse3-x86_64.S | 791 | ||||
-rw-r--r-- | arch/x86/lib/crypto/chacha_glue.c | 196 | ||||
-rw-r--r-- | arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl | 4253 | ||||
-rw-r--r-- | arch/x86/lib/crypto/poly1305_glue.c | 129 | ||||
-rw-r--r-- | arch/x86/lib/crypto/sha256-avx-asm.S | 499 | ||||
-rw-r--r-- | arch/x86/lib/crypto/sha256-avx2-asm.S | 774 | ||||
-rw-r--r-- | arch/x86/lib/crypto/sha256-ni-asm.S | 196 | ||||
-rw-r--r-- | arch/x86/lib/crypto/sha256-ssse3-asm.S | 511 | ||||
-rw-r--r-- | arch/x86/lib/crypto/sha256.c | 80 |
16 files changed, 0 insertions, 9664 deletions
diff --git a/arch/x86/lib/crypto/.gitignore b/arch/x86/lib/crypto/.gitignore deleted file mode 100644 index 580c839bb177..000000000000 --- a/arch/x86/lib/crypto/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -poly1305-x86_64-cryptogams.S diff --git a/arch/x86/lib/crypto/Kconfig b/arch/x86/lib/crypto/Kconfig deleted file mode 100644 index 5e94cdee492c..000000000000 --- a/arch/x86/lib/crypto/Kconfig +++ /dev/null @@ -1,34 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_BLAKE2S_X86 - bool "Hash functions: BLAKE2s (SSSE3/AVX-512)" - depends on 64BIT - select CRYPTO_LIB_BLAKE2S_GENERIC - select CRYPTO_ARCH_HAVE_LIB_BLAKE2S - help - BLAKE2s cryptographic hash function (RFC 7693) - - Architecture: x86_64 using: - - SSSE3 (Supplemental SSE3) - - AVX-512 (Advanced Vector Extensions-512) - -config CRYPTO_CHACHA20_X86_64 - tristate - depends on 64BIT - default CRYPTO_LIB_CHACHA - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_X86_64 - tristate - depends on 64BIT - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 - -config CRYPTO_SHA256_X86_64 - tristate - depends on 64BIT - default CRYPTO_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256 - select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD - select CRYPTO_LIB_SHA256_GENERIC diff --git a/arch/x86/lib/crypto/Makefile b/arch/x86/lib/crypto/Makefile deleted file mode 100644 index abceca3d31c0..000000000000 --- a/arch/x86/lib/crypto/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o -libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o - -obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o -chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha-avx512vl-x86_64.o chacha_glue.o - -obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o -poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o -targets += 
poly1305-x86_64-cryptogams.S - -obj-$(CONFIG_CRYPTO_SHA256_X86_64) += sha256-x86_64.o -sha256-x86_64-y := sha256.o sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256-ni-asm.o - -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $< > $@ - -$(obj)/%.S: $(src)/%.pl FORCE - $(call if_changed,perlasm) diff --git a/arch/x86/lib/crypto/blake2s-core.S b/arch/x86/lib/crypto/blake2s-core.S deleted file mode 100644 index ac1c845445a4..000000000000 --- a/arch/x86/lib/crypto/blake2s-core.S +++ /dev/null @@ -1,252 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. - */ - -#include <linux/linkage.h> - -.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32 -.align 32 -IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 - .octa 0x5BE0CD191F83D9AB9B05688C510E527F -.section .rodata.cst16.ROT16, "aM", @progbits, 16 -.align 16 -ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 -.section .rodata.cst16.ROR328, "aM", @progbits, 16 -.align 16 -ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 -.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160 -.align 64 -SIGMA: -.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 -.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7 -.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1 -.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0 -.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8 -.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14 -.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2 -.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6 -.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4 -.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12 -.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640 -.align 64 -SIGMA2: -.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 -.long 8, 2, 
13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 -.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 -.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5 -.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12 -.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9 -.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0 -.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10 -.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 -.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 - -.text -SYM_FUNC_START(blake2s_compress_ssse3) - testq %rdx,%rdx - je .Lendofloop - movdqu (%rdi),%xmm0 - movdqu 0x10(%rdi),%xmm1 - movdqa ROT16(%rip),%xmm12 - movdqa ROR328(%rip),%xmm13 - movdqu 0x20(%rdi),%xmm14 - movq %rcx,%xmm15 - leaq SIGMA+0xa0(%rip),%r8 - jmp .Lbeginofloop - .align 32 -.Lbeginofloop: - movdqa %xmm0,%xmm10 - movdqa %xmm1,%xmm11 - paddq %xmm15,%xmm14 - movdqa IV(%rip),%xmm2 - movdqa %xmm14,%xmm3 - pxor IV+0x10(%rip),%xmm3 - leaq SIGMA(%rip),%rcx -.Lroundloop: - movzbl (%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0x1(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0x2(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x3(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - punpckldq %xmm5,%xmm4 - punpckldq %xmm7,%xmm6 - punpcklqdq %xmm6,%xmm4 - paddd %xmm4,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm12,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 - psrld $0xc,%xmm1 - pslld $0x14,%xmm8 - por %xmm8,%xmm1 - movzbl 0x4(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0x5(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x6(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0x7(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - punpckldq %xmm6,%xmm5 - punpckldq %xmm4,%xmm7 - punpcklqdq %xmm7,%xmm5 - paddd %xmm5,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm13,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 - psrld $0x7,%xmm1 - pslld $0x19,%xmm8 - por %xmm8,%xmm1 - pshufd $0x93,%xmm0,%xmm0 - pshufd 
$0x4e,%xmm3,%xmm3 - pshufd $0x39,%xmm2,%xmm2 - movzbl 0x8(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x9(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0xa(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0xb(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - punpckldq %xmm7,%xmm6 - punpckldq %xmm5,%xmm4 - punpcklqdq %xmm4,%xmm6 - paddd %xmm6,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm12,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 - psrld $0xc,%xmm1 - pslld $0x14,%xmm8 - por %xmm8,%xmm1 - movzbl 0xc(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0xd(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0xe(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0xf(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - punpckldq %xmm4,%xmm7 - punpckldq %xmm6,%xmm5 - punpcklqdq %xmm5,%xmm7 - paddd %xmm7,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm13,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 - psrld $0x7,%xmm1 - pslld $0x19,%xmm8 - por %xmm8,%xmm1 - pshufd $0x39,%xmm0,%xmm0 - pshufd $0x4e,%xmm3,%xmm3 - pshufd $0x93,%xmm2,%xmm2 - addq $0x10,%rcx - cmpq %r8,%rcx - jnz .Lroundloop - pxor %xmm2,%xmm0 - pxor %xmm3,%xmm1 - pxor %xmm10,%xmm0 - pxor %xmm11,%xmm1 - addq $0x40,%rsi - decq %rdx - jnz .Lbeginofloop - movdqu %xmm0,(%rdi) - movdqu %xmm1,0x10(%rdi) - movdqu %xmm14,0x20(%rdi) -.Lendofloop: - RET -SYM_FUNC_END(blake2s_compress_ssse3) - -SYM_FUNC_START(blake2s_compress_avx512) - vmovdqu (%rdi),%xmm0 - vmovdqu 0x10(%rdi),%xmm1 - vmovdqu 0x20(%rdi),%xmm4 - vmovq %rcx,%xmm5 - vmovdqa IV(%rip),%xmm14 - vmovdqa IV+16(%rip),%xmm15 - jmp .Lblake2s_compress_avx512_mainloop -.align 32 -.Lblake2s_compress_avx512_mainloop: - vmovdqa %xmm0,%xmm10 - vmovdqa %xmm1,%xmm11 - vpaddq %xmm5,%xmm4,%xmm4 - vmovdqa %xmm14,%xmm2 - vpxor %xmm15,%xmm4,%xmm3 - vmovdqu (%rsi),%ymm6 - vmovdqu 0x20(%rsi),%ymm7 - addq $0x40,%rsi - leaq SIGMA2(%rip),%rax - movb $0xa,%cl -.Lblake2s_compress_avx512_roundloop: - addq $0x40,%rax - vmovdqa -0x40(%rax),%ymm8 - vmovdqa 
-0x20(%rax),%ymm9 - vpermi2d %ymm7,%ymm6,%ymm8 - vpermi2d %ymm7,%ymm6,%ymm9 - vmovdqa %ymm8,%ymm6 - vmovdqa %ymm9,%ymm7 - vpaddd %xmm8,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x10,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - vprord $0xc,%xmm1,%xmm1 - vextracti128 $0x1,%ymm8,%xmm8 - vpaddd %xmm8,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x8,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - vprord $0x7,%xmm1,%xmm1 - vpshufd $0x93,%xmm0,%xmm0 - vpshufd $0x4e,%xmm3,%xmm3 - vpshufd $0x39,%xmm2,%xmm2 - vpaddd %xmm9,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x10,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - vprord $0xc,%xmm1,%xmm1 - vextracti128 $0x1,%ymm9,%xmm9 - vpaddd %xmm9,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x8,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - vprord $0x7,%xmm1,%xmm1 - vpshufd $0x39,%xmm0,%xmm0 - vpshufd $0x4e,%xmm3,%xmm3 - vpshufd $0x93,%xmm2,%xmm2 - decb %cl - jne .Lblake2s_compress_avx512_roundloop - vpxor %xmm10,%xmm0,%xmm0 - vpxor %xmm11,%xmm1,%xmm1 - vpxor %xmm2,%xmm0,%xmm0 - vpxor %xmm3,%xmm1,%xmm1 - decq %rdx - jne .Lblake2s_compress_avx512_mainloop - vmovdqu %xmm0,(%rdi) - vmovdqu %xmm1,0x10(%rdi) - vmovdqu %xmm4,0x20(%rdi) - vzeroupper - RET -SYM_FUNC_END(blake2s_compress_avx512) diff --git a/arch/x86/lib/crypto/blake2s-glue.c b/arch/x86/lib/crypto/blake2s-glue.c deleted file mode 100644 index adc296cd17c9..000000000000 --- a/arch/x86/lib/crypto/blake2s-glue.c +++ /dev/null @@ -1,70 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 
- */ - -#include <asm/cpufeature.h> -#include <asm/fpu/api.h> -#include <asm/processor.h> -#include <asm/simd.h> -#include <crypto/internal/blake2s.h> -#include <linux/init.h> -#include <linux/jump_label.h> -#include <linux/kernel.h> -#include <linux/sizes.h> - -asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, - const u8 *block, const size_t nblocks, - const u32 inc); -asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, - const u8 *block, const size_t nblocks, - const u32 inc); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); - -void blake2s_compress(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc) -{ - /* SIMD disables preemption, so relax after processing each page. */ - BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); - - if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { - blake2s_compress_generic(state, block, nblocks, inc); - return; - } - - do { - const size_t blocks = min_t(size_t, nblocks, - SZ_4K / BLAKE2S_BLOCK_SIZE); - - kernel_fpu_begin(); - if (static_branch_likely(&blake2s_use_avx512)) - blake2s_compress_avx512(state, block, blocks, inc); - else - blake2s_compress_ssse3(state, block, blocks, inc); - kernel_fpu_end(); - - nblocks -= blocks; - block += blocks * BLAKE2S_BLOCK_SIZE; - } while (nblocks); -} -EXPORT_SYMBOL(blake2s_compress); - -static int __init blake2s_mod_init(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - static_branch_enable(&blake2s_use_ssse3); - - if (boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_AVX512F) && - boot_cpu_has(X86_FEATURE_AVX512VL) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | - XFEATURE_MASK_AVX512, NULL)) - static_branch_enable(&blake2s_use_avx512); - - return 0; -} - -subsys_initcall(blake2s_mod_init); diff --git a/arch/x86/lib/crypto/chacha-avx2-x86_64.S 
b/arch/x86/lib/crypto/chacha-avx2-x86_64.S deleted file mode 100644 index f3d8fc018249..000000000000 --- a/arch/x86/lib/crypto/chacha-avx2-x86_64.S +++ /dev/null @@ -1,1021 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * ChaCha 256-bit cipher algorithm, x64 AVX2 functions - * - * Copyright (C) 2015 Martin Willi - */ - -#include <linux/linkage.h> - -.section .rodata.cst32.ROT8, "aM", @progbits, 32 -.align 32 -ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 - .octa 0x0e0d0c0f0a09080b0605040702010003 - -.section .rodata.cst32.ROT16, "aM", @progbits, 32 -.align 32 -ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 - .octa 0x0d0c0f0e09080b0a0504070601000302 - -.section .rodata.cst32.CTRINC, "aM", @progbits, 32 -.align 32 -CTRINC: .octa 0x00000003000000020000000100000000 - .octa 0x00000007000000060000000500000004 - -.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 -.align 32 -CTR2BL: .octa 0x00000000000000000000000000000000 - .octa 0x00000000000000000000000000000001 - -.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 -.align 32 -CTR4BL: .octa 0x00000000000000000000000000000002 - .octa 0x00000000000000000000000000000003 - -.text - -SYM_FUNC_START(chacha_2block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 2 data blocks output, o - # %rdx: up to 2 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts two ChaCha blocks by loading the state - # matrix twice across four AVX registers. It performs matrix operations - # on four words in each matrix in parallel, but requires shuffling to - # rearrange the words after each round. 
- - vzeroupper - - # x0..3[0-2] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - - vmovdqa %ymm0,%ymm8 - vmovdqa %ymm1,%ymm9 - vmovdqa %ymm2,%ymm10 - vmovdqa %ymm3,%ymm11 - - vmovdqa ROT8(%rip),%ymm4 - vmovdqa ROT16(%rip),%ymm5 - - mov %rcx,%rax - -.Ldoubleround: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm5,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm6 - vpslld $12,%ymm6,%ymm6 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm6,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm4,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm7 - vpslld $7,%ymm7,%ymm7 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm5,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm6 - vpslld $12,%ymm6,%ymm6 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm6,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm4,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm7 - vpslld $7,%ymm7,%ymm7 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 
= shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - - sub $2,%r8d - jnz .Ldoubleround - - # o0 = i0 ^ (x0 + s0) - vpaddd %ymm8,%ymm0,%ymm7 - cmp $0x10,%rax - jl .Lxorpart2 - vpxor 0x00(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x00(%rsi) - vextracti128 $1,%ymm7,%xmm0 - # o1 = i1 ^ (x1 + s1) - vpaddd %ymm9,%ymm1,%ymm7 - cmp $0x20,%rax - jl .Lxorpart2 - vpxor 0x10(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x10(%rsi) - vextracti128 $1,%ymm7,%xmm1 - # o2 = i2 ^ (x2 + s2) - vpaddd %ymm10,%ymm2,%ymm7 - cmp $0x30,%rax - jl .Lxorpart2 - vpxor 0x20(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x20(%rsi) - vextracti128 $1,%ymm7,%xmm2 - # o3 = i3 ^ (x3 + s3) - vpaddd %ymm11,%ymm3,%ymm7 - cmp $0x40,%rax - jl .Lxorpart2 - vpxor 0x30(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x30(%rsi) - vextracti128 $1,%ymm7,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm7 - cmp $0x50,%rax - jl .Lxorpart2 - vpxor 0x40(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x40(%rsi) - - vmovdqa %xmm1,%xmm7 - cmp $0x60,%rax - jl .Lxorpart2 - vpxor 0x50(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x50(%rsi) - - vmovdqa %xmm2,%xmm7 - cmp $0x70,%rax - jl .Lxorpart2 - vpxor 0x60(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x60(%rsi) - - vmovdqa %xmm3,%xmm7 - cmp $0x80,%rax - jl .Lxorpart2 - vpxor 0x70(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x70(%rsi) - -.Ldone2: - vzeroupper - RET - -.Lxorpart2: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone2 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%xmm7,%xmm7 - vmovdqa %xmm7,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone2 - -SYM_FUNC_END(chacha_2block_xor_avx2) - -SYM_FUNC_START(chacha_4block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: 
nrounds - - # This function encrypts four ChaCha blocks by loading the state - # matrix four times across eight AVX registers. It performs matrix - # operations on four words in two matrices in parallel, sequentially - # to the operations on the four words of the other two matrices. The - # required word shuffling has a rather high latency, we can do the - # arithmetic on two matrix-pairs without much slowdown. - - vzeroupper - - # x0..3[0-4] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vmovdqa %ymm0,%ymm4 - vmovdqa %ymm1,%ymm5 - vmovdqa %ymm2,%ymm6 - vmovdqa %ymm3,%ymm7 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - vpaddd CTR4BL(%rip),%ymm7,%ymm7 - - vmovdqa %ymm0,%ymm11 - vmovdqa %ymm1,%ymm12 - vmovdqa %ymm2,%ymm13 - vmovdqa %ymm3,%ymm14 - vmovdqa %ymm7,%ymm15 - - vmovdqa ROT8(%rip),%ymm8 - vmovdqa ROT16(%rip),%ymm9 - - mov %rcx,%rax - -.Ldoubleround4: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm9,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm9,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - 
vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - vpshufd $0x39,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - vpshufd $0x93,%ymm7,%ymm7 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm9,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm9,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - vpshufd $0x93,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - vpshufd $0x39,%ymm7,%ymm7 - - sub $2,%r8d - jnz .Ldoubleround4 - - # o0 = i0 ^ (x0 + s0), first block - vpaddd %ymm11,%ymm0,%ymm10 - cmp $0x10,%rax - jl .Lxorpart4 - vpxor 0x00(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x00(%rsi) - vextracti128 $1,%ymm10,%xmm0 - 
# o1 = i1 ^ (x1 + s1), first block - vpaddd %ymm12,%ymm1,%ymm10 - cmp $0x20,%rax - jl .Lxorpart4 - vpxor 0x10(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x10(%rsi) - vextracti128 $1,%ymm10,%xmm1 - # o2 = i2 ^ (x2 + s2), first block - vpaddd %ymm13,%ymm2,%ymm10 - cmp $0x30,%rax - jl .Lxorpart4 - vpxor 0x20(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x20(%rsi) - vextracti128 $1,%ymm10,%xmm2 - # o3 = i3 ^ (x3 + s3), first block - vpaddd %ymm14,%ymm3,%ymm10 - cmp $0x40,%rax - jl .Lxorpart4 - vpxor 0x30(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x30(%rsi) - vextracti128 $1,%ymm10,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm10 - cmp $0x50,%rax - jl .Lxorpart4 - vpxor 0x40(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x40(%rsi) - - vmovdqa %xmm1,%xmm10 - cmp $0x60,%rax - jl .Lxorpart4 - vpxor 0x50(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x50(%rsi) - - vmovdqa %xmm2,%xmm10 - cmp $0x70,%rax - jl .Lxorpart4 - vpxor 0x60(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x60(%rsi) - - vmovdqa %xmm3,%xmm10 - cmp $0x80,%rax - jl .Lxorpart4 - vpxor 0x70(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x70(%rsi) - - # o0 = i0 ^ (x0 + s0), third block - vpaddd %ymm11,%ymm4,%ymm10 - cmp $0x90,%rax - jl .Lxorpart4 - vpxor 0x80(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x80(%rsi) - vextracti128 $1,%ymm10,%xmm4 - # o1 = i1 ^ (x1 + s1), third block - vpaddd %ymm12,%ymm5,%ymm10 - cmp $0xa0,%rax - jl .Lxorpart4 - vpxor 0x90(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x90(%rsi) - vextracti128 $1,%ymm10,%xmm5 - # o2 = i2 ^ (x2 + s2), third block - vpaddd %ymm13,%ymm6,%ymm10 - cmp $0xb0,%rax - jl .Lxorpart4 - vpxor 0xa0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xa0(%rsi) - vextracti128 $1,%ymm10,%xmm6 - # o3 = i3 ^ (x3 + s3), third block - vpaddd %ymm15,%ymm7,%ymm10 - cmp $0xc0,%rax - jl .Lxorpart4 - vpxor 0xb0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xb0(%rsi) - vextracti128 $1,%ymm10,%xmm7 - - # xor and write fourth block - vmovdqa %xmm4,%xmm10 - cmp $0xd0,%rax - jl .Lxorpart4 - vpxor 0xc0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xc0(%rsi) - - vmovdqa %xmm5,%xmm10 - 
cmp $0xe0,%rax - jl .Lxorpart4 - vpxor 0xd0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xd0(%rsi) - - vmovdqa %xmm6,%xmm10 - cmp $0xf0,%rax - jl .Lxorpart4 - vpxor 0xe0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xe0(%rsi) - - vmovdqa %xmm7,%xmm10 - cmp $0x100,%rax - jl .Lxorpart4 - vpxor 0xf0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xf0(%rsi) - -.Ldone4: - vzeroupper - RET - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone4 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%xmm10,%xmm10 - vmovdqa %xmm10,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone4 - -SYM_FUNC_END(chacha_4block_xor_avx2) - -SYM_FUNC_START(chacha_8block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 8 data blocks output, o - # %rdx: up to 8 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts eight consecutive ChaCha blocks by loading - # the state matrix in AVX registers eight times. As we need some - # scratch registers, we save the first four registers on the stack. The - # algorithm performs each operation on the corresponding word of each - # state matrix, hence requires no word shuffling. For final XORing step - # we transpose the matrix by interleaving 32-, 64- and then 128-bit - # words, which allows us to do XOR in AVX registers. 8/16-bit word - # rotation is done with the slightly better performing byte shuffling, - # 7/12-bit word rotation uses traditional shift+OR. 
- - vzeroupper - # 4 * 32 byte stack, 32-byte aligned - lea 8(%rsp),%r10 - and $~31, %rsp - sub $0x80, %rsp - mov %rcx,%rax - - # x0..15[0-7] = s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpbroadcastd 0x04(%rdi),%ymm1 - vpbroadcastd 0x08(%rdi),%ymm2 - vpbroadcastd 0x0c(%rdi),%ymm3 - vpbroadcastd 0x10(%rdi),%ymm4 - vpbroadcastd 0x14(%rdi),%ymm5 - vpbroadcastd 0x18(%rdi),%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm7 - vpbroadcastd 0x20(%rdi),%ymm8 - vpbroadcastd 0x24(%rdi),%ymm9 - vpbroadcastd 0x28(%rdi),%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm11 - vpbroadcastd 0x30(%rdi),%ymm12 - vpbroadcastd 0x34(%rdi),%ymm13 - vpbroadcastd 0x38(%rdi),%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm15 - # x0..3 on stack - vmovdqa %ymm0,0x00(%rsp) - vmovdqa %ymm1,0x20(%rsp) - vmovdqa %ymm2,0x40(%rsp) - vmovdqa %ymm3,0x60(%rsp) - - vmovdqa CTRINC(%rip),%ymm1 - vmovdqa ROT8(%rip),%ymm2 - vmovdqa ROT16(%rip),%ymm3 - - # x12 += counter values 0-3 - vpaddd %ymm1,%ymm12,%ymm12 - -.Ldoubleround8: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - vpaddd 0x00(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm3,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - vpaddd 0x20(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm3,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - vpaddd 0x40(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm3,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vpaddd 0x60(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm3,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $12,%ymm4,%ymm0 - vpsrld $20,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $12,%ymm5,%ymm0 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - vpaddd 
%ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $12,%ymm6,%ymm0 - vpsrld $20,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm11,%ymm7,%ymm7 - vpslld $12,%ymm7,%ymm0 - vpsrld $20,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - vpaddd 0x00(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm2,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - vpaddd 0x20(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm2,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - vpaddd 0x40(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm2,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vpaddd 0x60(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm2,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm0 - vpsrld $25,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm0 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm0 - vpsrld $25,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm11,%ymm7,%ymm7 - vpslld $7,%ymm7,%ymm0 - vpsrld $25,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - vpaddd 0x00(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm3,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0 - vpaddd 0x20(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm3,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - vpaddd 
0x40(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm3,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - vpaddd 0x60(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm3,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - vpaddd %ymm15,%ymm10,%ymm10 - vpxor %ymm10,%ymm5,%ymm5 - vpslld $12,%ymm5,%ymm0 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - vpaddd %ymm12,%ymm11,%ymm11 - vpxor %ymm11,%ymm6,%ymm6 - vpslld $12,%ymm6,%ymm0 - vpsrld $20,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - vpaddd %ymm13,%ymm8,%ymm8 - vpxor %ymm8,%ymm7,%ymm7 - vpslld $12,%ymm7,%ymm0 - vpsrld $20,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm9,%ymm4,%ymm4 - vpslld $12,%ymm4,%ymm0 - vpsrld $20,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - vpaddd 0x00(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm2,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - vpaddd 0x20(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm2,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - vpaddd 0x40(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm2,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - vpaddd 0x60(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm2,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - vpaddd %ymm15,%ymm10,%ymm10 - vpxor %ymm10,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm0 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - vpaddd %ymm12,%ymm11,%ymm11 - vpxor %ymm11,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm0 - vpsrld $25,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - vpaddd %ymm13,%ymm8,%ymm8 - 
vpxor %ymm8,%ymm7,%ymm7 - vpslld $7,%ymm7,%ymm0 - vpsrld $25,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm9,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm0 - vpsrld $25,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - - sub $2,%r8d - jnz .Ldoubleround8 - - # x0..15[0-3] += s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpaddd 0x00(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpbroadcastd 0x04(%rdi),%ymm0 - vpaddd 0x20(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpbroadcastd 0x08(%rdi),%ymm0 - vpaddd 0x40(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpbroadcastd 0x0c(%rdi),%ymm0 - vpaddd 0x60(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpbroadcastd 0x10(%rdi),%ymm0 - vpaddd %ymm0,%ymm4,%ymm4 - vpbroadcastd 0x14(%rdi),%ymm0 - vpaddd %ymm0,%ymm5,%ymm5 - vpbroadcastd 0x18(%rdi),%ymm0 - vpaddd %ymm0,%ymm6,%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm0 - vpaddd %ymm0,%ymm7,%ymm7 - vpbroadcastd 0x20(%rdi),%ymm0 - vpaddd %ymm0,%ymm8,%ymm8 - vpbroadcastd 0x24(%rdi),%ymm0 - vpaddd %ymm0,%ymm9,%ymm9 - vpbroadcastd 0x28(%rdi),%ymm0 - vpaddd %ymm0,%ymm10,%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm0 - vpaddd %ymm0,%ymm11,%ymm11 - vpbroadcastd 0x30(%rdi),%ymm0 - vpaddd %ymm0,%ymm12,%ymm12 - vpbroadcastd 0x34(%rdi),%ymm0 - vpaddd %ymm0,%ymm13,%ymm13 - vpbroadcastd 0x38(%rdi),%ymm0 - vpaddd %ymm0,%ymm14,%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm0 - vpaddd %ymm0,%ymm15,%ymm15 - - # x12 += counter values 0-3 - vpaddd %ymm1,%ymm12,%ymm12 - - # interleave 32-bit words in state n, n+1 - vmovdqa 0x00(%rsp),%ymm0 - vmovdqa 0x20(%rsp),%ymm1 - vpunpckldq %ymm1,%ymm0,%ymm2 - vpunpckhdq %ymm1,%ymm0,%ymm1 - vmovdqa %ymm2,0x00(%rsp) - vmovdqa %ymm1,0x20(%rsp) - vmovdqa 0x40(%rsp),%ymm0 - vmovdqa 0x60(%rsp),%ymm1 - vpunpckldq %ymm1,%ymm0,%ymm2 - vpunpckhdq %ymm1,%ymm0,%ymm1 - vmovdqa %ymm2,0x40(%rsp) - vmovdqa %ymm1,0x60(%rsp) - vmovdqa %ymm4,%ymm0 - vpunpckldq %ymm5,%ymm0,%ymm4 - vpunpckhdq %ymm5,%ymm0,%ymm5 - vmovdqa %ymm6,%ymm0 - vpunpckldq %ymm7,%ymm0,%ymm6 
- vpunpckhdq %ymm7,%ymm0,%ymm7 - vmovdqa %ymm8,%ymm0 - vpunpckldq %ymm9,%ymm0,%ymm8 - vpunpckhdq %ymm9,%ymm0,%ymm9 - vmovdqa %ymm10,%ymm0 - vpunpckldq %ymm11,%ymm0,%ymm10 - vpunpckhdq %ymm11,%ymm0,%ymm11 - vmovdqa %ymm12,%ymm0 - vpunpckldq %ymm13,%ymm0,%ymm12 - vpunpckhdq %ymm13,%ymm0,%ymm13 - vmovdqa %ymm14,%ymm0 - vpunpckldq %ymm15,%ymm0,%ymm14 - vpunpckhdq %ymm15,%ymm0,%ymm15 - - # interleave 64-bit words in state n, n+2 - vmovdqa 0x00(%rsp),%ymm0 - vmovdqa 0x40(%rsp),%ymm2 - vpunpcklqdq %ymm2,%ymm0,%ymm1 - vpunpckhqdq %ymm2,%ymm0,%ymm2 - vmovdqa %ymm1,0x00(%rsp) - vmovdqa %ymm2,0x40(%rsp) - vmovdqa 0x20(%rsp),%ymm0 - vmovdqa 0x60(%rsp),%ymm2 - vpunpcklqdq %ymm2,%ymm0,%ymm1 - vpunpckhqdq %ymm2,%ymm0,%ymm2 - vmovdqa %ymm1,0x20(%rsp) - vmovdqa %ymm2,0x60(%rsp) - vmovdqa %ymm4,%ymm0 - vpunpcklqdq %ymm6,%ymm0,%ymm4 - vpunpckhqdq %ymm6,%ymm0,%ymm6 - vmovdqa %ymm5,%ymm0 - vpunpcklqdq %ymm7,%ymm0,%ymm5 - vpunpckhqdq %ymm7,%ymm0,%ymm7 - vmovdqa %ymm8,%ymm0 - vpunpcklqdq %ymm10,%ymm0,%ymm8 - vpunpckhqdq %ymm10,%ymm0,%ymm10 - vmovdqa %ymm9,%ymm0 - vpunpcklqdq %ymm11,%ymm0,%ymm9 - vpunpckhqdq %ymm11,%ymm0,%ymm11 - vmovdqa %ymm12,%ymm0 - vpunpcklqdq %ymm14,%ymm0,%ymm12 - vpunpckhqdq %ymm14,%ymm0,%ymm14 - vmovdqa %ymm13,%ymm0 - vpunpcklqdq %ymm15,%ymm0,%ymm13 - vpunpckhqdq %ymm15,%ymm0,%ymm15 - - # interleave 128-bit words in state n, n+4 - # xor/write first four blocks - vmovdqa 0x00(%rsp),%ymm1 - vperm2i128 $0x20,%ymm4,%ymm1,%ymm0 - cmp $0x0020,%rax - jl .Lxorpart8 - vpxor 0x0000(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0000(%rsi) - vperm2i128 $0x31,%ymm4,%ymm1,%ymm4 - - vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 - cmp $0x0040,%rax - jl .Lxorpart8 - vpxor 0x0020(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0020(%rsi) - vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 - - vmovdqa 0x40(%rsp),%ymm1 - vperm2i128 $0x20,%ymm6,%ymm1,%ymm0 - cmp $0x0060,%rax - jl .Lxorpart8 - vpxor 0x0040(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0040(%rsi) - vperm2i128 $0x31,%ymm6,%ymm1,%ymm6 - - vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 - 
cmp $0x0080,%rax - jl .Lxorpart8 - vpxor 0x0060(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0060(%rsi) - vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 - - vmovdqa 0x20(%rsp),%ymm1 - vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 - cmp $0x00a0,%rax - jl .Lxorpart8 - vpxor 0x0080(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0080(%rsi) - vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 - - vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 - cmp $0x00c0,%rax - jl .Lxorpart8 - vpxor 0x00a0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00a0(%rsi) - vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 - - vmovdqa 0x60(%rsp),%ymm1 - vperm2i128 $0x20,%ymm7,%ymm1,%ymm0 - cmp $0x00e0,%rax - jl .Lxorpart8 - vpxor 0x00c0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00c0(%rsi) - vperm2i128 $0x31,%ymm7,%ymm1,%ymm7 - - vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 - cmp $0x0100,%rax - jl .Lxorpart8 - vpxor 0x00e0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00e0(%rsi) - vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 - - # xor remaining blocks, write to output - vmovdqa %ymm4,%ymm0 - cmp $0x0120,%rax - jl .Lxorpart8 - vpxor 0x0100(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0100(%rsi) - - vmovdqa %ymm12,%ymm0 - cmp $0x0140,%rax - jl .Lxorpart8 - vpxor 0x0120(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0120(%rsi) - - vmovdqa %ymm6,%ymm0 - cmp $0x0160,%rax - jl .Lxorpart8 - vpxor 0x0140(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0140(%rsi) - - vmovdqa %ymm14,%ymm0 - cmp $0x0180,%rax - jl .Lxorpart8 - vpxor 0x0160(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0160(%rsi) - - vmovdqa %ymm5,%ymm0 - cmp $0x01a0,%rax - jl .Lxorpart8 - vpxor 0x0180(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0180(%rsi) - - vmovdqa %ymm13,%ymm0 - cmp $0x01c0,%rax - jl .Lxorpart8 - vpxor 0x01a0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01a0(%rsi) - - vmovdqa %ymm7,%ymm0 - cmp $0x01e0,%rax - jl .Lxorpart8 - vpxor 0x01c0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01c0(%rsi) - - vmovdqa %ymm15,%ymm0 - cmp $0x0200,%rax - jl .Lxorpart8 - vpxor 0x01e0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01e0(%rsi) - -.Ldone8: - vzeroupper - lea -8(%r10),%rsp - RET - -.Lxorpart8: - # xor remaining bytes from 
partial register into output - mov %rax,%r9 - and $0x1f,%r9 - jz .Ldone8 - and $~0x1f,%rax - - mov %rsi,%r11 - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - jmp .Ldone8 - -SYM_FUNC_END(chacha_8block_xor_avx2) diff --git a/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S b/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S deleted file mode 100644 index 259383e1ad44..000000000000 --- a/arch/x86/lib/crypto/chacha-avx512vl-x86_64.S +++ /dev/null @@ -1,836 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions - * - * Copyright (C) 2018 Martin Willi - */ - -#include <linux/linkage.h> - -.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 -.align 32 -CTR2BL: .octa 0x00000000000000000000000000000000 - .octa 0x00000000000000000000000000000001 - -.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 -.align 32 -CTR4BL: .octa 0x00000000000000000000000000000002 - .octa 0x00000000000000000000000000000003 - -.section .rodata.cst32.CTR8BL, "aM", @progbits, 32 -.align 32 -CTR8BL: .octa 0x00000003000000020000000100000000 - .octa 0x00000007000000060000000500000004 - -.text - -SYM_FUNC_START(chacha_2block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 2 data blocks output, o - # %rdx: up to 2 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts two ChaCha blocks by loading the state - # matrix twice across four AVX registers. It performs matrix operations - # on four words in each matrix in parallel, but requires shuffling to - # rearrange the words after each round. 
- - vzeroupper - - # x0..3[0-2] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - - vmovdqa %ymm0,%ymm8 - vmovdqa %ymm1,%ymm9 - vmovdqa %ymm2,%ymm10 - vmovdqa %ymm3,%ymm11 - -.Ldoubleround: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - - sub $2,%r8d - jnz .Ldoubleround - - # o0 = i0 ^ (x0 + s0) - vpaddd %ymm8,%ymm0,%ymm7 - cmp $0x10,%rcx - jl .Lxorpart2 - vpxord 0x00(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x00(%rsi) - vextracti128 $1,%ymm7,%xmm0 - # o1 = i1 ^ (x1 + s1) - vpaddd %ymm9,%ymm1,%ymm7 - cmp $0x20,%rcx - jl .Lxorpart2 - vpxord 
0x10(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x10(%rsi) - vextracti128 $1,%ymm7,%xmm1 - # o2 = i2 ^ (x2 + s2) - vpaddd %ymm10,%ymm2,%ymm7 - cmp $0x30,%rcx - jl .Lxorpart2 - vpxord 0x20(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x20(%rsi) - vextracti128 $1,%ymm7,%xmm2 - # o3 = i3 ^ (x3 + s3) - vpaddd %ymm11,%ymm3,%ymm7 - cmp $0x40,%rcx - jl .Lxorpart2 - vpxord 0x30(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x30(%rsi) - vextracti128 $1,%ymm7,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm7 - cmp $0x50,%rcx - jl .Lxorpart2 - vpxord 0x40(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x40(%rsi) - - vmovdqa %xmm1,%xmm7 - cmp $0x60,%rcx - jl .Lxorpart2 - vpxord 0x50(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x50(%rsi) - - vmovdqa %xmm2,%xmm7 - cmp $0x70,%rcx - jl .Lxorpart2 - vpxord 0x60(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x60(%rsi) - - vmovdqa %xmm3,%xmm7 - cmp $0x80,%rcx - jl .Lxorpart2 - vpxord 0x70(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x70(%rsi) - -.Ldone2: - vzeroupper - RET - -.Lxorpart2: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0xf,%rcx - jz .Ldone2 - mov %rax,%r9 - and $~0xf,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} - vpxord %xmm7,%xmm1,%xmm1 - vmovdqu8 %xmm1,(%rsi,%r9){%k1} - - jmp .Ldone2 - -SYM_FUNC_END(chacha_2block_xor_avx512vl) - -SYM_FUNC_START(chacha_4block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts four ChaCha blocks by loading the state - # matrix four times across eight AVX registers. It performs matrix - # operations on four words in two matrices in parallel, sequentially - # to the operations on the four words of the other two matrices. The - # required word shuffling has a rather high latency, we can do the - # arithmetic on two matrix-pairs without much slowdown. 
- - vzeroupper - - # x0..3[0-4] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vmovdqa %ymm0,%ymm4 - vmovdqa %ymm1,%ymm5 - vmovdqa %ymm2,%ymm6 - vmovdqa %ymm3,%ymm7 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - vpaddd CTR4BL(%rip),%ymm7,%ymm7 - - vmovdqa %ymm0,%ymm11 - vmovdqa %ymm1,%ymm12 - vmovdqa %ymm2,%ymm13 - vmovdqa %ymm3,%ymm14 - vmovdqa %ymm7,%ymm15 - -.Ldoubleround4: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $16,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - vpshufd $0x39,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - vpshufd $0x93,%ymm7,%ymm7 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $16,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ 
x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - vpshufd $0x93,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - vpshufd $0x39,%ymm7,%ymm7 - - sub $2,%r8d - jnz .Ldoubleround4 - - # o0 = i0 ^ (x0 + s0), first block - vpaddd %ymm11,%ymm0,%ymm10 - cmp $0x10,%rcx - jl .Lxorpart4 - vpxord 0x00(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x00(%rsi) - vextracti128 $1,%ymm10,%xmm0 - # o1 = i1 ^ (x1 + s1), first block - vpaddd %ymm12,%ymm1,%ymm10 - cmp $0x20,%rcx - jl .Lxorpart4 - vpxord 0x10(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x10(%rsi) - vextracti128 $1,%ymm10,%xmm1 - # o2 = i2 ^ (x2 + s2), first block - vpaddd %ymm13,%ymm2,%ymm10 - cmp $0x30,%rcx - jl .Lxorpart4 - vpxord 0x20(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x20(%rsi) - vextracti128 $1,%ymm10,%xmm2 - # o3 = i3 ^ (x3 + s3), first block - vpaddd %ymm14,%ymm3,%ymm10 - cmp $0x40,%rcx - jl .Lxorpart4 - vpxord 0x30(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x30(%rsi) - vextracti128 $1,%ymm10,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm10 - cmp $0x50,%rcx - jl .Lxorpart4 - vpxord 0x40(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x40(%rsi) - - vmovdqa %xmm1,%xmm10 - cmp $0x60,%rcx - jl .Lxorpart4 - vpxord 0x50(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x50(%rsi) - - vmovdqa %xmm2,%xmm10 - cmp $0x70,%rcx - jl .Lxorpart4 - vpxord 0x60(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x60(%rsi) - - vmovdqa %xmm3,%xmm10 - cmp $0x80,%rcx - jl .Lxorpart4 - vpxord 0x70(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x70(%rsi) - - # o0 = i0 ^ (x0 + s0), third block - 
vpaddd %ymm11,%ymm4,%ymm10 - cmp $0x90,%rcx - jl .Lxorpart4 - vpxord 0x80(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x80(%rsi) - vextracti128 $1,%ymm10,%xmm4 - # o1 = i1 ^ (x1 + s1), third block - vpaddd %ymm12,%ymm5,%ymm10 - cmp $0xa0,%rcx - jl .Lxorpart4 - vpxord 0x90(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x90(%rsi) - vextracti128 $1,%ymm10,%xmm5 - # o2 = i2 ^ (x2 + s2), third block - vpaddd %ymm13,%ymm6,%ymm10 - cmp $0xb0,%rcx - jl .Lxorpart4 - vpxord 0xa0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xa0(%rsi) - vextracti128 $1,%ymm10,%xmm6 - # o3 = i3 ^ (x3 + s3), third block - vpaddd %ymm15,%ymm7,%ymm10 - cmp $0xc0,%rcx - jl .Lxorpart4 - vpxord 0xb0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xb0(%rsi) - vextracti128 $1,%ymm10,%xmm7 - - # xor and write fourth block - vmovdqa %xmm4,%xmm10 - cmp $0xd0,%rcx - jl .Lxorpart4 - vpxord 0xc0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xc0(%rsi) - - vmovdqa %xmm5,%xmm10 - cmp $0xe0,%rcx - jl .Lxorpart4 - vpxord 0xd0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xd0(%rsi) - - vmovdqa %xmm6,%xmm10 - cmp $0xf0,%rcx - jl .Lxorpart4 - vpxord 0xe0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xe0(%rsi) - - vmovdqa %xmm7,%xmm10 - cmp $0x100,%rcx - jl .Lxorpart4 - vpxord 0xf0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xf0(%rsi) - -.Ldone4: - vzeroupper - RET - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0xf,%rcx - jz .Ldone4 - mov %rax,%r9 - and $~0xf,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} - vpxord %xmm10,%xmm1,%xmm1 - vmovdqu8 %xmm1,(%rsi,%r9){%k1} - - jmp .Ldone4 - -SYM_FUNC_END(chacha_4block_xor_avx512vl) - -SYM_FUNC_START(chacha_8block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 8 data blocks output, o - # %rdx: up to 8 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts eight consecutive ChaCha blocks by loading - # the state matrix in AVX registers eight times. 
Compared to AVX2, this - # mostly benefits from the new rotate instructions in VL and the - # additional registers. - - vzeroupper - - # x0..15[0-7] = s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpbroadcastd 0x04(%rdi),%ymm1 - vpbroadcastd 0x08(%rdi),%ymm2 - vpbroadcastd 0x0c(%rdi),%ymm3 - vpbroadcastd 0x10(%rdi),%ymm4 - vpbroadcastd 0x14(%rdi),%ymm5 - vpbroadcastd 0x18(%rdi),%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm7 - vpbroadcastd 0x20(%rdi),%ymm8 - vpbroadcastd 0x24(%rdi),%ymm9 - vpbroadcastd 0x28(%rdi),%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm11 - vpbroadcastd 0x30(%rdi),%ymm12 - vpbroadcastd 0x34(%rdi),%ymm13 - vpbroadcastd 0x38(%rdi),%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm15 - - # x12 += counter values 0-3 - vpaddd CTR8BL(%rip),%ymm12,%ymm12 - - vmovdqa64 %ymm0,%ymm16 - vmovdqa64 %ymm1,%ymm17 - vmovdqa64 %ymm2,%ymm18 - vmovdqa64 %ymm3,%ymm19 - vmovdqa64 %ymm4,%ymm20 - vmovdqa64 %ymm5,%ymm21 - vmovdqa64 %ymm6,%ymm22 - vmovdqa64 %ymm7,%ymm23 - vmovdqa64 %ymm8,%ymm24 - vmovdqa64 %ymm9,%ymm25 - vmovdqa64 %ymm10,%ymm26 - vmovdqa64 %ymm11,%ymm27 - vmovdqa64 %ymm12,%ymm28 - vmovdqa64 %ymm13,%ymm29 - vmovdqa64 %ymm14,%ymm30 - vmovdqa64 %ymm15,%ymm31 - -.Ldoubleround8: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - vpaddd %ymm0,%ymm4,%ymm0 - vpxord %ymm0,%ymm12,%ymm12 - vprold $16,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - vpaddd %ymm1,%ymm5,%ymm1 - vpxord %ymm1,%ymm13,%ymm13 - vprold $16,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - vpaddd %ymm2,%ymm6,%ymm2 - vpxord %ymm2,%ymm14,%ymm14 - vprold $16,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vpaddd %ymm3,%ymm7,%ymm3 - vpxord %ymm3,%ymm15,%ymm15 - vprold $16,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - vpaddd %ymm12,%ymm8,%ymm8 - vpxord %ymm8,%ymm4,%ymm4 - vprold $12,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - vpaddd %ymm13,%ymm9,%ymm9 - vpxord %ymm9,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - vpaddd %ymm14,%ymm10,%ymm10 - vpxord 
%ymm10,%ymm6,%ymm6 - vprold $12,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vpaddd %ymm15,%ymm11,%ymm11 - vpxord %ymm11,%ymm7,%ymm7 - vprold $12,%ymm7,%ymm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - vpaddd %ymm0,%ymm4,%ymm0 - vpxord %ymm0,%ymm12,%ymm12 - vprold $8,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - vpaddd %ymm1,%ymm5,%ymm1 - vpxord %ymm1,%ymm13,%ymm13 - vprold $8,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - vpaddd %ymm2,%ymm6,%ymm2 - vpxord %ymm2,%ymm14,%ymm14 - vprold $8,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vpaddd %ymm3,%ymm7,%ymm3 - vpxord %ymm3,%ymm15,%ymm15 - vprold $8,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - vpaddd %ymm12,%ymm8,%ymm8 - vpxord %ymm8,%ymm4,%ymm4 - vprold $7,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - vpaddd %ymm13,%ymm9,%ymm9 - vpxord %ymm9,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - vpaddd %ymm14,%ymm10,%ymm10 - vpxord %ymm10,%ymm6,%ymm6 - vprold $7,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vpaddd %ymm15,%ymm11,%ymm11 - vpxord %ymm11,%ymm7,%ymm7 - vprold $7,%ymm7,%ymm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - vpaddd %ymm0,%ymm5,%ymm0 - vpxord %ymm0,%ymm15,%ymm15 - vprold $16,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16) - vpaddd %ymm1,%ymm6,%ymm1 - vpxord %ymm1,%ymm12,%ymm12 - vprold $16,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - vpaddd %ymm2,%ymm7,%ymm2 - vpxord %ymm2,%ymm13,%ymm13 - vprold $16,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - vpaddd %ymm3,%ymm4,%ymm3 - vpxord %ymm3,%ymm14,%ymm14 - vprold $16,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - vpaddd %ymm15,%ymm10,%ymm10 - vpxord %ymm10,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - vpaddd %ymm12,%ymm11,%ymm11 - vpxord %ymm11,%ymm6,%ymm6 - vprold $12,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - vpaddd %ymm13,%ymm8,%ymm8 - vpxord %ymm8,%ymm7,%ymm7 - vprold 
$12,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vpaddd %ymm14,%ymm9,%ymm9 - vpxord %ymm9,%ymm4,%ymm4 - vprold $12,%ymm4,%ymm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - vpaddd %ymm0,%ymm5,%ymm0 - vpxord %ymm0,%ymm15,%ymm15 - vprold $8,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - vpaddd %ymm1,%ymm6,%ymm1 - vpxord %ymm1,%ymm12,%ymm12 - vprold $8,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - vpaddd %ymm2,%ymm7,%ymm2 - vpxord %ymm2,%ymm13,%ymm13 - vprold $8,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - vpaddd %ymm3,%ymm4,%ymm3 - vpxord %ymm3,%ymm14,%ymm14 - vprold $8,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - vpaddd %ymm15,%ymm10,%ymm10 - vpxord %ymm10,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - vpaddd %ymm12,%ymm11,%ymm11 - vpxord %ymm11,%ymm6,%ymm6 - vprold $7,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - vpaddd %ymm13,%ymm8,%ymm8 - vpxord %ymm8,%ymm7,%ymm7 - vprold $7,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vpaddd %ymm14,%ymm9,%ymm9 - vpxord %ymm9,%ymm4,%ymm4 - vprold $7,%ymm4,%ymm4 - - sub $2,%r8d - jnz .Ldoubleround8 - - # x0..15[0-3] += s[0..15] - vpaddd %ymm16,%ymm0,%ymm0 - vpaddd %ymm17,%ymm1,%ymm1 - vpaddd %ymm18,%ymm2,%ymm2 - vpaddd %ymm19,%ymm3,%ymm3 - vpaddd %ymm20,%ymm4,%ymm4 - vpaddd %ymm21,%ymm5,%ymm5 - vpaddd %ymm22,%ymm6,%ymm6 - vpaddd %ymm23,%ymm7,%ymm7 - vpaddd %ymm24,%ymm8,%ymm8 - vpaddd %ymm25,%ymm9,%ymm9 - vpaddd %ymm26,%ymm10,%ymm10 - vpaddd %ymm27,%ymm11,%ymm11 - vpaddd %ymm28,%ymm12,%ymm12 - vpaddd %ymm29,%ymm13,%ymm13 - vpaddd %ymm30,%ymm14,%ymm14 - vpaddd %ymm31,%ymm15,%ymm15 - - # interleave 32-bit words in state n, n+1 - vpunpckldq %ymm1,%ymm0,%ymm16 - vpunpckhdq %ymm1,%ymm0,%ymm17 - vpunpckldq %ymm3,%ymm2,%ymm18 - vpunpckhdq %ymm3,%ymm2,%ymm19 - vpunpckldq %ymm5,%ymm4,%ymm20 - vpunpckhdq %ymm5,%ymm4,%ymm21 - vpunpckldq %ymm7,%ymm6,%ymm22 - vpunpckhdq %ymm7,%ymm6,%ymm23 - vpunpckldq %ymm9,%ymm8,%ymm24 - vpunpckhdq 
%ymm9,%ymm8,%ymm25 - vpunpckldq %ymm11,%ymm10,%ymm26 - vpunpckhdq %ymm11,%ymm10,%ymm27 - vpunpckldq %ymm13,%ymm12,%ymm28 - vpunpckhdq %ymm13,%ymm12,%ymm29 - vpunpckldq %ymm15,%ymm14,%ymm30 - vpunpckhdq %ymm15,%ymm14,%ymm31 - - # interleave 64-bit words in state n, n+2 - vpunpcklqdq %ymm18,%ymm16,%ymm0 - vpunpcklqdq %ymm19,%ymm17,%ymm1 - vpunpckhqdq %ymm18,%ymm16,%ymm2 - vpunpckhqdq %ymm19,%ymm17,%ymm3 - vpunpcklqdq %ymm22,%ymm20,%ymm4 - vpunpcklqdq %ymm23,%ymm21,%ymm5 - vpunpckhqdq %ymm22,%ymm20,%ymm6 - vpunpckhqdq %ymm23,%ymm21,%ymm7 - vpunpcklqdq %ymm26,%ymm24,%ymm8 - vpunpcklqdq %ymm27,%ymm25,%ymm9 - vpunpckhqdq %ymm26,%ymm24,%ymm10 - vpunpckhqdq %ymm27,%ymm25,%ymm11 - vpunpcklqdq %ymm30,%ymm28,%ymm12 - vpunpcklqdq %ymm31,%ymm29,%ymm13 - vpunpckhqdq %ymm30,%ymm28,%ymm14 - vpunpckhqdq %ymm31,%ymm29,%ymm15 - - # interleave 128-bit words in state n, n+4 - # xor/write first four blocks - vmovdqa64 %ymm0,%ymm16 - vperm2i128 $0x20,%ymm4,%ymm0,%ymm0 - cmp $0x0020,%rcx - jl .Lxorpart8 - vpxord 0x0000(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0000(%rsi) - vmovdqa64 %ymm16,%ymm0 - vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 - - vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 - cmp $0x0040,%rcx - jl .Lxorpart8 - vpxord 0x0020(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0020(%rsi) - vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 - - vperm2i128 $0x20,%ymm6,%ymm2,%ymm0 - cmp $0x0060,%rcx - jl .Lxorpart8 - vpxord 0x0040(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0040(%rsi) - vperm2i128 $0x31,%ymm6,%ymm2,%ymm6 - - vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 - cmp $0x0080,%rcx - jl .Lxorpart8 - vpxord 0x0060(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0060(%rsi) - vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 - - vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 - cmp $0x00a0,%rcx - jl .Lxorpart8 - vpxord 0x0080(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0080(%rsi) - vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 - - vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 - cmp $0x00c0,%rcx - jl .Lxorpart8 - vpxord 0x00a0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00a0(%rsi) - vperm2i128 
$0x31,%ymm13,%ymm9,%ymm13 - - vperm2i128 $0x20,%ymm7,%ymm3,%ymm0 - cmp $0x00e0,%rcx - jl .Lxorpart8 - vpxord 0x00c0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00c0(%rsi) - vperm2i128 $0x31,%ymm7,%ymm3,%ymm7 - - vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 - cmp $0x0100,%rcx - jl .Lxorpart8 - vpxord 0x00e0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00e0(%rsi) - vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 - - # xor remaining blocks, write to output - vmovdqa64 %ymm4,%ymm0 - cmp $0x0120,%rcx - jl .Lxorpart8 - vpxord 0x0100(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0100(%rsi) - - vmovdqa64 %ymm12,%ymm0 - cmp $0x0140,%rcx - jl .Lxorpart8 - vpxord 0x0120(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0120(%rsi) - - vmovdqa64 %ymm6,%ymm0 - cmp $0x0160,%rcx - jl .Lxorpart8 - vpxord 0x0140(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0140(%rsi) - - vmovdqa64 %ymm14,%ymm0 - cmp $0x0180,%rcx - jl .Lxorpart8 - vpxord 0x0160(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0160(%rsi) - - vmovdqa64 %ymm5,%ymm0 - cmp $0x01a0,%rcx - jl .Lxorpart8 - vpxord 0x0180(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0180(%rsi) - - vmovdqa64 %ymm13,%ymm0 - cmp $0x01c0,%rcx - jl .Lxorpart8 - vpxord 0x01a0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01a0(%rsi) - - vmovdqa64 %ymm7,%ymm0 - cmp $0x01e0,%rcx - jl .Lxorpart8 - vpxord 0x01c0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01c0(%rsi) - - vmovdqa64 %ymm15,%ymm0 - cmp $0x0200,%rcx - jl .Lxorpart8 - vpxord 0x01e0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01e0(%rsi) - -.Ldone8: - vzeroupper - RET - -.Lxorpart8: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0x1f,%rcx - jz .Ldone8 - mov %rax,%r9 - and $~0x1f,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z} - vpxord %ymm0,%ymm1,%ymm1 - vmovdqu8 %ymm1,(%rsi,%r9){%k1} - - jmp .Ldone8 - -SYM_FUNC_END(chacha_8block_xor_avx512vl) diff --git a/arch/x86/lib/crypto/chacha-ssse3-x86_64.S b/arch/x86/lib/crypto/chacha-ssse3-x86_64.S deleted file mode 100644 index 
7111949cd5b9..000000000000 --- a/arch/x86/lib/crypto/chacha-ssse3-x86_64.S +++ /dev/null @@ -1,791 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions - * - * Copyright (C) 2015 Martin Willi - */ - -#include <linux/linkage.h> -#include <asm/frame.h> - -.section .rodata.cst16.ROT8, "aM", @progbits, 16 -.align 16 -ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 -.section .rodata.cst16.ROT16, "aM", @progbits, 16 -.align 16 -ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 -.section .rodata.cst16.CTRINC, "aM", @progbits, 16 -.align 16 -CTRINC: .octa 0x00000003000000020000000100000000 - -.text - -/* - * chacha_permute - permute one block - * - * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This - * function performs matrix operations on four words in parallel, but requires - * shuffling to rearrange the words after each round. 8/16-bit word rotation is - * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word - * rotation uses traditional shift+OR. - * - * The round count is given in %r8d. 
- * - * Clobbers: %r8d, %xmm4-%xmm7 - */ -SYM_FUNC_START_LOCAL(chacha_permute) - - movdqa ROT8(%rip),%xmm4 - movdqa ROT16(%rip),%xmm5 - -.Ldoubleround: - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm5,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm6 - pslld $12,%xmm6 - psrld $20,%xmm1 - por %xmm6,%xmm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm4,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm7 - pslld $7,%xmm7 - psrld $25,%xmm1 - por %xmm7,%xmm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - pshufd $0x39,%xmm1,%xmm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - pshufd $0x4e,%xmm2,%xmm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - pshufd $0x93,%xmm3,%xmm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm5,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm6 - pslld $12,%xmm6 - psrld $20,%xmm1 - por %xmm6,%xmm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm4,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm7 - pslld $7,%xmm7 - psrld $25,%xmm1 - por %xmm7,%xmm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - pshufd $0x93,%xmm1,%xmm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - pshufd $0x4e,%xmm2,%xmm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - pshufd $0x39,%xmm3,%xmm3 - - sub $2,%r8d - jnz .Ldoubleround - - RET -SYM_FUNC_END(chacha_permute) - -SYM_FUNC_START(chacha_block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: up to 1 data block output, o - # %rdx: up to 1 data block input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - FRAME_BEGIN - - # x0..3 = s0..3 - movdqu 0x00(%rdi),%xmm0 - movdqu 0x10(%rdi),%xmm1 - movdqu 0x20(%rdi),%xmm2 - movdqu 0x30(%rdi),%xmm3 
- movdqa %xmm0,%xmm8 - movdqa %xmm1,%xmm9 - movdqa %xmm2,%xmm10 - movdqa %xmm3,%xmm11 - - mov %rcx,%rax - call chacha_permute - - # o0 = i0 ^ (x0 + s0) - paddd %xmm8,%xmm0 - cmp $0x10,%rax - jl .Lxorpart - movdqu 0x00(%rdx),%xmm4 - pxor %xmm4,%xmm0 - movdqu %xmm0,0x00(%rsi) - # o1 = i1 ^ (x1 + s1) - paddd %xmm9,%xmm1 - movdqa %xmm1,%xmm0 - cmp $0x20,%rax - jl .Lxorpart - movdqu 0x10(%rdx),%xmm0 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x10(%rsi) - # o2 = i2 ^ (x2 + s2) - paddd %xmm10,%xmm2 - movdqa %xmm2,%xmm0 - cmp $0x30,%rax - jl .Lxorpart - movdqu 0x20(%rdx),%xmm0 - pxor %xmm2,%xmm0 - movdqu %xmm0,0x20(%rsi) - # o3 = i3 ^ (x3 + s3) - paddd %xmm11,%xmm3 - movdqa %xmm3,%xmm0 - cmp $0x40,%rax - jl .Lxorpart - movdqu 0x30(%rdx),%xmm0 - pxor %xmm3,%xmm0 - movdqu %xmm0,0x30(%rsi) - -.Ldone: - FRAME_END - RET - -.Lxorpart: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - pxor 0x00(%rsp),%xmm0 - movdqa %xmm0,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone - -SYM_FUNC_END(chacha_block_xor_ssse3) - -SYM_FUNC_START(hchacha_block_ssse3) - # %rdi: Input state matrix, s - # %rsi: output (8 32-bit words) - # %edx: nrounds - FRAME_BEGIN - - movdqu 0x00(%rdi),%xmm0 - movdqu 0x10(%rdi),%xmm1 - movdqu 0x20(%rdi),%xmm2 - movdqu 0x30(%rdi),%xmm3 - - mov %edx,%r8d - call chacha_permute - - movdqu %xmm0,0x00(%rsi) - movdqu %xmm3,0x10(%rsi) - - FRAME_END - RET -SYM_FUNC_END(hchacha_block_ssse3) - -SYM_FUNC_START(chacha_4block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts four consecutive ChaCha blocks by loading the - # the state matrix in SSE registers four 
times. As we need some scratch - # registers, we save the first four registers on the stack. The - # algorithm performs each operation on the corresponding word of each - # state matrix, hence requires no word shuffling. For final XORing step - # we transpose the matrix by interleaving 32- and then 64-bit words, - # which allows us to do XOR in SSE registers. 8/16-bit word rotation is - # done with the slightly better performing SSSE3 byte shuffling, - # 7/12-bit word rotation uses traditional shift+OR. - - lea 8(%rsp),%r10 - sub $0x80,%rsp - and $~63,%rsp - mov %rcx,%rax - - # x0..15[0-3] = s0..3[0..3] - movq 0x00(%rdi),%xmm1 - pshufd $0x00,%xmm1,%xmm0 - pshufd $0x55,%xmm1,%xmm1 - movq 0x08(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - movq 0x10(%rdi),%xmm5 - pshufd $0x00,%xmm5,%xmm4 - pshufd $0x55,%xmm5,%xmm5 - movq 0x18(%rdi),%xmm7 - pshufd $0x00,%xmm7,%xmm6 - pshufd $0x55,%xmm7,%xmm7 - movq 0x20(%rdi),%xmm9 - pshufd $0x00,%xmm9,%xmm8 - pshufd $0x55,%xmm9,%xmm9 - movq 0x28(%rdi),%xmm11 - pshufd $0x00,%xmm11,%xmm10 - pshufd $0x55,%xmm11,%xmm11 - movq 0x30(%rdi),%xmm13 - pshufd $0x00,%xmm13,%xmm12 - pshufd $0x55,%xmm13,%xmm13 - movq 0x38(%rdi),%xmm15 - pshufd $0x00,%xmm15,%xmm14 - pshufd $0x55,%xmm15,%xmm15 - # x0..3 on stack - movdqa %xmm0,0x00(%rsp) - movdqa %xmm1,0x10(%rsp) - movdqa %xmm2,0x20(%rsp) - movdqa %xmm3,0x30(%rsp) - - movdqa CTRINC(%rip),%xmm1 - movdqa ROT8(%rip),%xmm2 - movdqa ROT16(%rip),%xmm3 - - # x12 += counter values 0-3 - paddd %xmm1,%xmm12 - -.Ldoubleround4: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm3,%xmm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm3,%xmm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm3,%xmm14 - # x3 
+= x7, x15 = rotl32(x15 ^ x3, 16) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm3,%xmm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm4 - por %xmm0,%xmm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm5 - por %xmm0,%xmm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm6 - por %xmm0,%xmm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm7 - por %xmm0,%xmm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm2,%xmm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm2,%xmm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm2,%xmm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm2,%xmm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm4 - por %xmm0,%xmm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm5 - por %xmm0,%xmm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm6 - por %xmm0,%xmm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm7 
- por %xmm0,%xmm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm3,%xmm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm3,%xmm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm3,%xmm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm3,%xmm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - paddd %xmm15,%xmm10 - pxor %xmm10,%xmm5 - movdqa %xmm5,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm5 - por %xmm0,%xmm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - paddd %xmm12,%xmm11 - pxor %xmm11,%xmm6 - movdqa %xmm6,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm6 - por %xmm0,%xmm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - paddd %xmm13,%xmm8 - pxor %xmm8,%xmm7 - movdqa %xmm7,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm7 - por %xmm0,%xmm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - paddd %xmm14,%xmm9 - pxor %xmm9,%xmm4 - movdqa %xmm4,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm4 - por %xmm0,%xmm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm2,%xmm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm2,%xmm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm2,%xmm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm2,%xmm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - paddd %xmm15,%xmm10 - pxor %xmm10,%xmm5 - movdqa %xmm5,%xmm0 - 
pslld $7,%xmm0 - psrld $25,%xmm5 - por %xmm0,%xmm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - paddd %xmm12,%xmm11 - pxor %xmm11,%xmm6 - movdqa %xmm6,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm6 - por %xmm0,%xmm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - paddd %xmm13,%xmm8 - pxor %xmm8,%xmm7 - movdqa %xmm7,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm7 - por %xmm0,%xmm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - paddd %xmm14,%xmm9 - pxor %xmm9,%xmm4 - movdqa %xmm4,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm4 - por %xmm0,%xmm4 - - sub $2,%r8d - jnz .Ldoubleround4 - - # x0[0-3] += s0[0] - # x1[0-3] += s0[1] - movq 0x00(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd 0x00(%rsp),%xmm2 - movdqa %xmm2,0x00(%rsp) - paddd 0x10(%rsp),%xmm3 - movdqa %xmm3,0x10(%rsp) - # x2[0-3] += s0[2] - # x3[0-3] += s0[3] - movq 0x08(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd 0x20(%rsp),%xmm2 - movdqa %xmm2,0x20(%rsp) - paddd 0x30(%rsp),%xmm3 - movdqa %xmm3,0x30(%rsp) - - # x4[0-3] += s1[0] - # x5[0-3] += s1[1] - movq 0x10(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm4 - paddd %xmm3,%xmm5 - # x6[0-3] += s1[2] - # x7[0-3] += s1[3] - movq 0x18(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm6 - paddd %xmm3,%xmm7 - - # x8[0-3] += s2[0] - # x9[0-3] += s2[1] - movq 0x20(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm8 - paddd %xmm3,%xmm9 - # x10[0-3] += s2[2] - # x11[0-3] += s2[3] - movq 0x28(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm10 - paddd %xmm3,%xmm11 - - # x12[0-3] += s3[0] - # x13[0-3] += s3[1] - movq 0x30(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm12 - paddd %xmm3,%xmm13 - # x14[0-3] += s3[2] - # x15[0-3] += s3[3] - movq 0x38(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm14 - paddd %xmm3,%xmm15 - - # x12 += counter 
values 0-3 - paddd %xmm1,%xmm12 - - # interleave 32-bit words in state n, n+1 - movdqa 0x00(%rsp),%xmm0 - movdqa 0x10(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpckldq %xmm1,%xmm2 - punpckhdq %xmm1,%xmm0 - movdqa %xmm2,0x00(%rsp) - movdqa %xmm0,0x10(%rsp) - movdqa 0x20(%rsp),%xmm0 - movdqa 0x30(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpckldq %xmm1,%xmm2 - punpckhdq %xmm1,%xmm0 - movdqa %xmm2,0x20(%rsp) - movdqa %xmm0,0x30(%rsp) - movdqa %xmm4,%xmm0 - punpckldq %xmm5,%xmm4 - punpckhdq %xmm5,%xmm0 - movdqa %xmm0,%xmm5 - movdqa %xmm6,%xmm0 - punpckldq %xmm7,%xmm6 - punpckhdq %xmm7,%xmm0 - movdqa %xmm0,%xmm7 - movdqa %xmm8,%xmm0 - punpckldq %xmm9,%xmm8 - punpckhdq %xmm9,%xmm0 - movdqa %xmm0,%xmm9 - movdqa %xmm10,%xmm0 - punpckldq %xmm11,%xmm10 - punpckhdq %xmm11,%xmm0 - movdqa %xmm0,%xmm11 - movdqa %xmm12,%xmm0 - punpckldq %xmm13,%xmm12 - punpckhdq %xmm13,%xmm0 - movdqa %xmm0,%xmm13 - movdqa %xmm14,%xmm0 - punpckldq %xmm15,%xmm14 - punpckhdq %xmm15,%xmm0 - movdqa %xmm0,%xmm15 - - # interleave 64-bit words in state n, n+2 - movdqa 0x00(%rsp),%xmm0 - movdqa 0x20(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpcklqdq %xmm1,%xmm2 - punpckhqdq %xmm1,%xmm0 - movdqa %xmm2,0x00(%rsp) - movdqa %xmm0,0x20(%rsp) - movdqa 0x10(%rsp),%xmm0 - movdqa 0x30(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpcklqdq %xmm1,%xmm2 - punpckhqdq %xmm1,%xmm0 - movdqa %xmm2,0x10(%rsp) - movdqa %xmm0,0x30(%rsp) - movdqa %xmm4,%xmm0 - punpcklqdq %xmm6,%xmm4 - punpckhqdq %xmm6,%xmm0 - movdqa %xmm0,%xmm6 - movdqa %xmm5,%xmm0 - punpcklqdq %xmm7,%xmm5 - punpckhqdq %xmm7,%xmm0 - movdqa %xmm0,%xmm7 - movdqa %xmm8,%xmm0 - punpcklqdq %xmm10,%xmm8 - punpckhqdq %xmm10,%xmm0 - movdqa %xmm0,%xmm10 - movdqa %xmm9,%xmm0 - punpcklqdq %xmm11,%xmm9 - punpckhqdq %xmm11,%xmm0 - movdqa %xmm0,%xmm11 - movdqa %xmm12,%xmm0 - punpcklqdq %xmm14,%xmm12 - punpckhqdq %xmm14,%xmm0 - movdqa %xmm0,%xmm14 - movdqa %xmm13,%xmm0 - punpcklqdq %xmm15,%xmm13 - punpckhqdq %xmm15,%xmm0 - movdqa %xmm0,%xmm15 - - # xor with corresponding input, write to output - 
movdqa 0x00(%rsp),%xmm0 - cmp $0x10,%rax - jl .Lxorpart4 - movdqu 0x00(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x00(%rsi) - - movdqu %xmm4,%xmm0 - cmp $0x20,%rax - jl .Lxorpart4 - movdqu 0x10(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x10(%rsi) - - movdqu %xmm8,%xmm0 - cmp $0x30,%rax - jl .Lxorpart4 - movdqu 0x20(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x20(%rsi) - - movdqu %xmm12,%xmm0 - cmp $0x40,%rax - jl .Lxorpart4 - movdqu 0x30(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x30(%rsi) - - movdqa 0x20(%rsp),%xmm0 - cmp $0x50,%rax - jl .Lxorpart4 - movdqu 0x40(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x40(%rsi) - - movdqu %xmm6,%xmm0 - cmp $0x60,%rax - jl .Lxorpart4 - movdqu 0x50(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x50(%rsi) - - movdqu %xmm10,%xmm0 - cmp $0x70,%rax - jl .Lxorpart4 - movdqu 0x60(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x60(%rsi) - - movdqu %xmm14,%xmm0 - cmp $0x80,%rax - jl .Lxorpart4 - movdqu 0x70(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x70(%rsi) - - movdqa 0x10(%rsp),%xmm0 - cmp $0x90,%rax - jl .Lxorpart4 - movdqu 0x80(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x80(%rsi) - - movdqu %xmm5,%xmm0 - cmp $0xa0,%rax - jl .Lxorpart4 - movdqu 0x90(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x90(%rsi) - - movdqu %xmm9,%xmm0 - cmp $0xb0,%rax - jl .Lxorpart4 - movdqu 0xa0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xa0(%rsi) - - movdqu %xmm13,%xmm0 - cmp $0xc0,%rax - jl .Lxorpart4 - movdqu 0xb0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xb0(%rsi) - - movdqa 0x30(%rsp),%xmm0 - cmp $0xd0,%rax - jl .Lxorpart4 - movdqu 0xc0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xc0(%rsi) - - movdqu %xmm7,%xmm0 - cmp $0xe0,%rax - jl .Lxorpart4 - movdqu 0xd0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xd0(%rsi) - - movdqu %xmm11,%xmm0 - cmp $0xf0,%rax - jl .Lxorpart4 - movdqu 0xe0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xe0(%rsi) - - movdqu %xmm15,%xmm0 - cmp $0x100,%rax - jl .Lxorpart4 - movdqu 
0xf0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xf0(%rsi) - -.Ldone4: - lea -8(%r10),%rsp - RET - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone4 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - pxor 0x00(%rsp),%xmm0 - movdqa %xmm0,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - jmp .Ldone4 - -SYM_FUNC_END(chacha_4block_xor_ssse3) diff --git a/arch/x86/lib/crypto/chacha_glue.c b/arch/x86/lib/crypto/chacha_glue.c deleted file mode 100644 index 10b2c945f541..000000000000 --- a/arch/x86/lib/crypto/chacha_glue.c +++ /dev/null @@ -1,196 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * ChaCha and HChaCha functions (x86_64 optimized) - * - * Copyright (C) 2015 Martin Willi - */ - -#include <asm/simd.h> -#include <crypto/chacha.h> -#include <linux/jump_label.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/sizes.h> - -asmlinkage void chacha_block_xor_ssse3(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_ssse3(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void hchacha_block_ssse3(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); - -asmlinkage void chacha_2block_xor_avx2(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_avx2(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_8block_xor_avx2(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); - -asmlinkage void chacha_2block_xor_avx512vl(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void 
chacha_4block_xor_avx512vl(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_8block_xor_avx512vl(const struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int len, int nrounds); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); - -static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) -{ - len = min(len, maxblocks * CHACHA_BLOCK_SIZE); - return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; -} - -static void chacha_dosimd(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (static_branch_likely(&chacha_use_avx512vl)) { - while (bytes >= CHACHA_BLOCK_SIZE * 8) { - chacha_8block_xor_avx512vl(state, dst, src, bytes, - nrounds); - bytes -= CHACHA_BLOCK_SIZE * 8; - src += CHACHA_BLOCK_SIZE * 8; - dst += CHACHA_BLOCK_SIZE * 8; - state->x[12] += 8; - } - if (bytes > CHACHA_BLOCK_SIZE * 4) { - chacha_8block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state->x[12] += chacha_advance(bytes, 8); - return; - } - if (bytes > CHACHA_BLOCK_SIZE * 2) { - chacha_4block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state->x[12] += chacha_advance(bytes, 4); - return; - } - if (bytes) { - chacha_2block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state->x[12] += chacha_advance(bytes, 2); - return; - } - } - - if (static_branch_likely(&chacha_use_avx2)) { - while (bytes >= CHACHA_BLOCK_SIZE * 8) { - chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); - bytes -= CHACHA_BLOCK_SIZE * 8; - src += CHACHA_BLOCK_SIZE * 8; - dst += CHACHA_BLOCK_SIZE * 8; - state->x[12] += 8; - } - if (bytes > CHACHA_BLOCK_SIZE * 4) { - chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); - state->x[12] += chacha_advance(bytes, 8); - return; - } - if (bytes > CHACHA_BLOCK_SIZE * 2) { 
- chacha_4block_xor_avx2(state, dst, src, bytes, nrounds); - state->x[12] += chacha_advance(bytes, 4); - return; - } - if (bytes > CHACHA_BLOCK_SIZE) { - chacha_2block_xor_avx2(state, dst, src, bytes, nrounds); - state->x[12] += chacha_advance(bytes, 2); - return; - } - } - - while (bytes >= CHACHA_BLOCK_SIZE * 4) { - chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); - bytes -= CHACHA_BLOCK_SIZE * 4; - src += CHACHA_BLOCK_SIZE * 4; - dst += CHACHA_BLOCK_SIZE * 4; - state->x[12] += 4; - } - if (bytes > CHACHA_BLOCK_SIZE) { - chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); - state->x[12] += chacha_advance(bytes, 4); - return; - } - if (bytes) { - chacha_block_xor_ssse3(state, dst, src, bytes, nrounds); - state->x[12]++; - } -} - -void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds) -{ - if (!static_branch_likely(&chacha_use_simd)) { - hchacha_block_generic(state, out, nrounds); - } else { - kernel_fpu_begin(); - hchacha_block_ssse3(state, out, nrounds); - kernel_fpu_end(); - } -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (!static_branch_likely(&chacha_use_simd) || - bytes <= CHACHA_BLOCK_SIZE) - return chacha_crypt_generic(state, dst, src, bytes, nrounds); - - do { - unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - - kernel_fpu_begin(); - chacha_dosimd(state, dst, src, todo, nrounds); - kernel_fpu_end(); - - bytes -= todo; - src += todo; - dst += todo; - } while (bytes); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -bool chacha_is_arch_optimized(void) -{ - return static_key_enabled(&chacha_use_simd); -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - -static int __init chacha_simd_mod_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_SSSE3)) - return 0; - - static_branch_enable(&chacha_use_simd); - - if (boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - 
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - static_branch_enable(&chacha_use_avx2); - - if (boot_cpu_has(X86_FEATURE_AVX512VL) && - boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ - static_branch_enable(&chacha_use_avx512vl); - } - return 0; -} -subsys_initcall(chacha_simd_mod_init); - -static void __exit chacha_simd_mod_exit(void) -{ -} -module_exit(chacha_simd_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); -MODULE_DESCRIPTION("ChaCha and HChaCha functions (x86_64 optimized)"); diff --git a/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl deleted file mode 100644 index 501827254fed..000000000000 --- a/arch/x86/lib/crypto/poly1305-x86_64-cryptogams.pl +++ /dev/null @@ -1,4253 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause -# -# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. -# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. -# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. -# -# This code is taken from the OpenSSL project but the author, Andy Polyakov, -# has relicensed it under the licenses specified in the SPDX header above. -# The original headers, including the original license headers, are -# included below for completeness. -# -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# This module implements Poly1305 hash for x86_64. -# -# March 2015 -# -# Initial release. -# -# December 2016 -# -# Add AVX512F+VL+BW code path. 
-# -# November 2017 -# -# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be -# executed even on Knights Landing. Trigger for modification was -# observation that AVX512 code paths can negatively affect overall -# Skylake-X system performance. Since we are likely to suppress -# AVX512F capability flag [at least on Skylake-X], conversion serves -# as kind of "investment protection". Note that next *lake processor, -# Cannonlake, has AVX512IFMA code path to execute... -# -# Numbers are cycles per processed byte with poly1305_blocks alone, -# measured with rdtsc at fixed clock frequency. -# -# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512 -# P4 4.46/+120% - -# Core 2 2.41/+90% - -# Westmere 1.88/+120% - -# Sandy Bridge 1.39/+140% 1.10 -# Haswell 1.14/+175% 1.11 0.65 -# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35] -# Silvermont 2.83/+95% - -# Knights L 3.60/? 1.65 1.10 0.41(***) -# Goldmont 1.70/+180% - -# VIA Nano 1.82/+150% - -# Sledgehammer 1.38/+160% - -# Bulldozer 2.30/+130% 0.97 -# Ryzen 1.15/+200% 1.08 1.18 -# -# (*) improvement coefficients relative to clang are more modest and -# are ~50% on most processors, in both cases we are comparing to -# __int128 code; -# (**) SSE2 implementation was attempted, but among non-AVX processors -# it was faster than integer-only code only on older Intel P4 and -# Core processors, 50-30%, less newer processor is, but slower on -# contemporary ones, for example almost 2x slower on Atom, and as -# former are naturally disappearing, SSE2 is deemed unnecessary; -# (***) strangely enough performance seems to vary from core to core, -# listed result is best case; - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); -$kernel=0; $kernel=1 if (!$flavour && !$output); - -if (!$kernel) { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or - ( 
$xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or - die "can't locate x86_64-xlate.pl"; - - open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; - *STDOUT=*OUT; - - if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` - =~ /GNU assembler version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); - } - - if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { - $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); - $avx += 1 if ($1==2.11 && $2>=8); - } - - if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && - `ml64 2>&1` =~ /Version ([0-9]+)\./) { - $avx = ($1>=10) + ($1>=11); - } - - if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { - $avx = ($2>=3.0) + ($2>3.0); - } -} else { - $avx = 4; # The kernel uses ifdefs for this. -} - -sub declare_function() { - my ($name, $align, $nargs) = @_; - if($kernel) { - $code .= "SYM_FUNC_START($name)\n"; - $code .= ".L$name:\n"; - } else { - $code .= ".globl $name\n"; - $code .= ".type $name,\@function,$nargs\n"; - $code .= ".align $align\n"; - $code .= "$name:\n"; - } -} - -sub declare_typed_function() { - my ($name, $align, $nargs) = @_; - if($kernel) { - $code .= "SYM_TYPED_FUNC_START($name)\n"; - $code .= ".L$name:\n"; - } else { - $code .= ".globl $name\n"; - $code .= ".type $name,\@function,$nargs\n"; - $code .= ".align $align\n"; - $code .= "$name:\n"; - } -} - -sub end_function() { - my ($name) = @_; - if($kernel) { - $code .= "SYM_FUNC_END($name)\n"; - } else { - $code .= ".size $name,.-$name\n"; - } -} - -$code.=<<___ if $kernel; -#include <linux/cfi_types.h> -___ - -if ($avx) { -$code.=<<___ if $kernel; -.section .rodata -___ -$code.=<<___; -.align 64 -.Lconst: -.Lmask24: -.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 -.L129: -.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 -.Lmask26: -.long 
0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 -.Lpermd_avx2: -.long 2,2,2,3,2,0,2,1 -.Lpermd_avx512: -.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 - -.L2_44_inp_permd: -.long 0,1,1,2,2,3,7,7 -.L2_44_inp_shift: -.quad 0,12,24,64 -.L2_44_mask: -.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff -.L2_44_shift_rgt: -.quad 44,44,42,64 -.L2_44_shift_lft: -.quad 8,8,10,64 - -.align 64 -.Lx_mask44: -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff -.Lx_mask42: -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff -___ -} -$code.=<<___ if (!$kernel); -.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" -.align 16 -___ - -my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); -my ($mac,$nonce)=($inp,$len); # *_emit arguments -my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13"); -my ($h0,$h1,$h2)=("%r14","%rbx","%r10"); - -sub poly1305_iteration { -# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 -# output: $h0-$h2 *= $r0-$r1 -$code.=<<___; - mulq $h0 # h0*r1 - mov %rax,$d2 - mov $r0,%rax - mov %rdx,$d3 - - mulq $h0 # h0*r0 - mov %rax,$h0 # future $h0 - mov $r0,%rax - mov %rdx,$d1 - - mulq $h1 # h1*r0 - add %rax,$d2 - mov $s1,%rax - adc %rdx,$d3 - - mulq $h1 # h1*s1 - mov $h2,$h1 # borrow $h1 - add %rax,$h0 - adc %rdx,$d1 - - imulq $s1,$h1 # h2*s1 - add $h1,$d2 - mov $d1,$h1 - adc \$0,$d3 - - imulq $r0,$h2 # h2*r0 - add $d2,$h1 - mov \$-4,%rax # mask value - adc $h2,$d3 - - and $d3,%rax # last reduction step - mov $d3,$h2 - shr \$2,$d3 - and \$3,$h2 - add $d3,%rax - add %rax,$h0 - adc \$0,$h1 - adc \$0,$h2 -___ -} - -######################################################################## -# Layout of opaque area is following. 
-# -# unsigned __int64 h[3]; # current hash value base 2^64 -# unsigned __int64 r[2]; # key value base 2^64 - -$code.=<<___; -.text -___ -$code.=<<___ if (!$kernel); -.extern OPENSSL_ia32cap_P - -.globl poly1305_block_init_arch -.hidden poly1305_block_init_arch -.globl poly1305_blocks_x86_64 -.hidden poly1305_blocks_x86_64 -.globl poly1305_emit_x86_64 -.hidden poly1305_emit_x86_64 -___ -&declare_typed_function("poly1305_block_init_arch", 32, 3); -$code.=<<___; - xor %eax,%eax - mov %rax,0($ctx) # initialize hash value - mov %rax,8($ctx) - mov %rax,16($ctx) - - test $inp,$inp - je .Lno_key -___ -$code.=<<___ if (!$kernel); - lea poly1305_blocks_x86_64(%rip),%r10 - lea poly1305_emit_x86_64(%rip),%r11 -___ -$code.=<<___ if (!$kernel && $avx); - mov OPENSSL_ia32cap_P+4(%rip),%r9 - lea poly1305_blocks_avx(%rip),%rax - lea poly1305_emit_avx(%rip),%rcx - bt \$`60-32`,%r9 # AVX? - cmovc %rax,%r10 - cmovc %rcx,%r11 -___ -$code.=<<___ if (!$kernel && $avx>1); - lea poly1305_blocks_avx2(%rip),%rax - bt \$`5+32`,%r9 # AVX2? 
- cmovc %rax,%r10 -___ -$code.=<<___ if (!$kernel && $avx>3); - mov \$`(1<<31|1<<21|1<<16)`,%rax - shr \$32,%r9 - and %rax,%r9 - cmp %rax,%r9 - je .Linit_base2_44 -___ -$code.=<<___; - mov \$0x0ffffffc0fffffff,%rax - mov \$0x0ffffffc0ffffffc,%rcx - and 0($inp),%rax - and 8($inp),%rcx - mov %rax,24($ctx) - mov %rcx,32($ctx) -___ -$code.=<<___ if (!$kernel && $flavour !~ /elf32/); - mov %r10,0(%rdx) - mov %r11,8(%rdx) -___ -$code.=<<___ if (!$kernel && $flavour =~ /elf32/); - mov %r10d,0(%rdx) - mov %r11d,4(%rdx) -___ -$code.=<<___; - mov \$1,%eax -.Lno_key: - RET -___ -&end_function("poly1305_block_init_arch"); - -&declare_function("poly1305_blocks_x86_64", 32, 4); -$code.=<<___; -.cfi_startproc -.Lblocks: - shr \$4,$len - jz .Lno_data # too short - - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $ctx -.cfi_push $ctx -.Lblocks_body: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - mov 16($ctx),$h2 - - mov $s1,$r1 - shr \$2,$s1 - mov $r1,%rax - add $r1,$s1 # s1 = r1 + (r1 >> 2) - jmp .Loop - -.align 32 -.Loop: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 -___ - - &poly1305_iteration(); - -$code.=<<___; - mov $r1,%rax - dec %r15 # len-=16 - jnz .Loop - - mov 0(%rsp),$ctx -.cfi_restore $ctx - - mov $h0,0($ctx) # store hash value - mov $h1,8($ctx) - mov $h2,16($ctx) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - lea 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lno_data: -.Lblocks_epilogue: - RET -.cfi_endproc -___ -&end_function("poly1305_blocks_x86_64"); - -&declare_function("poly1305_emit_x86_64", 32, 3); -$code.=<<___; -.Lemit: - mov 0($ctx),%r8 # load hash value - mov 8($ctx),%r9 - 
mov 16($ctx),%r10 - - mov %r8,%rax - add \$5,%r8 # compare to modulus - mov %r9,%rcx - adc \$0,%r9 - adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overflow? - cmovnz %r8,%rax - cmovnz %r9,%rcx - - add 0($nonce),%rax # accumulate nonce - adc 8($nonce),%rcx - mov %rax,0($mac) # write result - mov %rcx,8($mac) - - RET -___ -&end_function("poly1305_emit_x86_64"); -if ($avx) { - -######################################################################## -# Layout of opaque area is following. -# -# unsigned __int32 h[5]; # current hash value base 2^26 -# unsigned __int32 is_base2_26; -# unsigned __int64 r[2]; # key value base 2^64 -# unsigned __int64 pad; -# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; -# -# where r^n are base 2^26 digits of degrees of multiplier key. There are -# 5 digits, but last four are interleaved with multiples of 5, totalling -# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. - -my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = - map("%xmm$_",(0..15)); - -$code.=<<___; -.type __poly1305_block,\@abi-omnipotent -.align 32 -__poly1305_block: - push $ctx -___ - &poly1305_iteration(); -$code.=<<___; - pop $ctx - RET -.size __poly1305_block,.-__poly1305_block - -.type __poly1305_init_avx,\@abi-omnipotent -.align 32 -__poly1305_init_avx: - push %rbp - mov %rsp,%rbp - mov $r0,$h0 - mov $r1,$h1 - xor $h2,$h2 - - lea 48+64($ctx),$ctx # size optimization - - mov $r1,%rax - call __poly1305_block # r^2 - - mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 - mov \$0x3ffffff,%edx - mov $h0,$d1 - and $h0#d,%eax - mov $r0,$d2 - and $r0#d,%edx - mov %eax,`16*0+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*0+4-64`($ctx) - shr \$26,$d2 - - mov \$0x3ffffff,%eax - mov \$0x3ffffff,%edx - and $d1#d,%eax - and $d2#d,%edx - mov %eax,`16*1+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*1+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*2+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*2+4-64`($ctx) - shr 
\$26,$d2 - - mov $h1,%rax - mov $r1,%rdx - shl \$12,%rax - shl \$12,%rdx - or $d1,%rax - or $d2,%rdx - and \$0x3ffffff,%eax - and \$0x3ffffff,%edx - mov %eax,`16*3+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*3+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*4+0-64`($ctx) - mov $h1,$d1 - mov %edx,`16*4+4-64`($ctx) - mov $r1,$d2 - - mov \$0x3ffffff,%eax - mov \$0x3ffffff,%edx - shr \$14,$d1 - shr \$14,$d2 - and $d1#d,%eax - and $d2#d,%edx - mov %eax,`16*5+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*5+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*6+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*6+4-64`($ctx) - shr \$26,$d2 - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+0-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d2#d,`16*7+4-64`($ctx) - lea ($d2,$d2,4),$d2 # *5 - mov $d1#d,`16*8+0-64`($ctx) - mov $d2#d,`16*8+4-64`($ctx) - - mov $r1,%rax - call __poly1305_block # r^3 - - mov \$0x3ffffff,%eax # save r^3 base 2^26 - mov $h0,$d1 - and $h0#d,%eax - shr \$26,$d1 - mov %eax,`16*0+12-64`($ctx) - - mov \$0x3ffffff,%edx - and $d1#d,%edx - mov %edx,`16*1+12-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*2+12-64`($ctx) - - mov $h1,%rax - shl \$12,%rax - or $d1,%rax - and \$0x3ffffff,%eax - mov %eax,`16*3+12-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov $h1,$d1 - mov %eax,`16*4+12-64`($ctx) - - mov \$0x3ffffff,%edx - shr \$14,$d1 - and $d1#d,%edx - mov %edx,`16*5+12-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*6+12-64`($ctx) - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+12-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d1#d,`16*8+12-64`($ctx) - - mov $r1,%rax - call __poly1305_block # r^4 - - mov \$0x3ffffff,%eax # save r^4 base 2^26 - mov $h0,$d1 - and $h0#d,%eax - shr \$26,$d1 - mov %eax,`16*0+8-64`($ctx) - - mov \$0x3ffffff,%edx - and $d1#d,%edx - mov %edx,`16*1+8-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*2+8-64`($ctx) - - mov 
$h1,%rax - shl \$12,%rax - or $d1,%rax - and \$0x3ffffff,%eax - mov %eax,`16*3+8-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov $h1,$d1 - mov %eax,`16*4+8-64`($ctx) - - mov \$0x3ffffff,%edx - shr \$14,$d1 - and $d1#d,%edx - mov %edx,`16*5+8-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*6+8-64`($ctx) - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+8-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d1#d,`16*8+8-64`($ctx) - - lea -48-64($ctx),$ctx # size [de-]optimization - pop %rbp - RET -.size __poly1305_init_avx,.-__poly1305_init_avx -___ - -&declare_function("poly1305_blocks_avx", 32, 4); -$code.=<<___; -.cfi_startproc - mov 20($ctx),%r8d # is_base2_26 - cmp \$128,$len - jae .Lblocks_avx - test %r8d,%r8d - jz .Lblocks - -.Lblocks_avx: - and \$-16,$len - jz .Lno_data_avx - - vzeroupper - - test %r8d,%r8d - jz .Lbase2_64_avx - - test \$31,$len - jz .Leven_avx - - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lblocks_avx_body: - - mov $len,%r15 # reassign $len - - mov 0($ctx),$d1 # load hash value - mov 8($ctx),$d2 - mov 16($ctx),$h2#d - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - ################################# base 2^26 -> base 2^64 - mov $d1#d,$h0#d - and \$`-1*(1<<31)`,$d1 - mov $d2,$r1 # borrow $r1 - mov $d2#d,$h1#d - and \$`-1*(1<<31)`,$d2 - - shr \$6,$d1 - shl \$52,$r1 - add $d1,$h0 - shr \$12,$h1 - shr \$18,$d2 - add $r1,$h0 - adc $d2,$h1 - - mov $h2,$d1 - shl \$40,$d1 - shr \$24,$h2 - add $d1,$h1 - adc \$0,$h2 # can be partially reduced... - - mov \$-4,$d2 # ... 
so reduce - mov $h2,$d1 - and $h2,$d2 - shr \$2,$d1 - and \$3,$h2 - add $d2,$d1 # =*5 - add $d1,$h0 - adc \$0,$h1 - adc \$0,$h2 - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - - call __poly1305_block - - test $padbit,$padbit # if $padbit is zero, - jz .Lstore_base2_64_avx # store hash in base 2^64 format - - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$r0 - mov $h1,$r1 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$r0 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $r0,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$r1 - and \$0x3ffffff,$h1 # h[3] - or $r1,$h2 # h[4] - - sub \$16,%r15 - jz .Lstore_base2_26_avx - - vmovd %rax#d,$H0 - vmovd %rdx#d,$H1 - vmovd $h0#d,$H2 - vmovd $h1#d,$H3 - vmovd $h2#d,$H4 - jmp .Lproceed_avx - -.align 32 -.Lstore_base2_64_avx: - mov $h0,0($ctx) - mov $h1,8($ctx) - mov $h2,16($ctx) # note that is_base2_26 is zeroed - jmp .Ldone_avx - -.align 16 -.Lstore_base2_26_avx: - mov %rax#d,0($ctx) # store hash value base 2^26 - mov %rdx#d,4($ctx) - mov $h0#d,8($ctx) - mov $h1#d,12($ctx) - mov $h2#d,16($ctx) -.align 16 -.Ldone_avx: - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lno_data_avx: -.Lblocks_avx_epilogue: - RET -.cfi_endproc - -.align 32 -.Lbase2_64_avx: -.cfi_startproc - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lbase2_64_avx_body: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - mov 16($ctx),$h2#d - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add 
$r1,$s1 # s1 = r1 + (r1 >> 2) - - test \$31,$len - jz .Linit_avx - - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - -.Linit_avx: - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$d1 - mov $h1,$d2 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$d1 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $d1,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$d2 - and \$0x3ffffff,$h1 # h[3] - or $d2,$h2 # h[4] - - vmovd %rax#d,$H0 - vmovd %rdx#d,$H1 - vmovd $h0#d,$H2 - vmovd $h1#d,$H3 - vmovd $h2#d,$H4 - movl \$1,20($ctx) # set is_base2_26 - - call __poly1305_init_avx - -.Lproceed_avx: - mov %r15,$len - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lbase2_64_avx_epilogue: - jmp .Ldo_avx -.cfi_endproc - -.align 32 -.Leven_avx: -.cfi_startproc - vmovd 4*0($ctx),$H0 # load hash value - vmovd 4*1($ctx),$H1 - vmovd 4*2($ctx),$H2 - vmovd 4*3($ctx),$H3 - vmovd 4*4($ctx),$H4 - -.Ldo_avx: -___ -$code.=<<___ if (!$win64); - lea 8(%rsp),%r10 -.cfi_def_cfa_register %r10 - and \$-32,%rsp - sub \$-8,%rsp - lea -0x58(%rsp),%r11 - sub \$0x178,%rsp -___ -$code.=<<___ if ($win64); - lea -0xf8(%rsp),%r11 - sub \$0x218,%rsp - vmovdqa %xmm6,0x50(%r11) - vmovdqa %xmm7,0x60(%r11) - vmovdqa %xmm8,0x70(%r11) - vmovdqa %xmm9,0x80(%r11) - vmovdqa %xmm10,0x90(%r11) - vmovdqa %xmm11,0xa0(%r11) - vmovdqa %xmm12,0xb0(%r11) - vmovdqa %xmm13,0xc0(%r11) - vmovdqa %xmm14,0xd0(%r11) - vmovdqa %xmm15,0xe0(%r11) -.Ldo_avx_body: -___ -$code.=<<___; - sub \$64,$len - lea -32($inp),%rax - cmovc %rax,$inp - - vmovdqu `16*3`($ctx),$D4 # preload r0^2 - lea `16*3+64`($ctx),$ctx # size optimization - lea .Lconst(%rip),%rcx - - ################################################################ - # load input - vmovdqu 
16*2($inp),$T0 - vmovdqu 16*3($inp),$T1 - vmovdqa 64(%rcx),$MASK # .Lmask26 - - vpsrldq \$6,$T0,$T2 # splat input - vpsrldq \$6,$T1,$T3 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - - vpsrlq \$40,$T4,$T4 # 4 - vpsrlq \$26,$T0,$T1 - vpand $MASK,$T0,$T0 # 0 - vpsrlq \$4,$T3,$T2 - vpand $MASK,$T1,$T1 # 1 - vpsrlq \$30,$T3,$T3 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - jbe .Lskip_loop_avx - - # expand and copy pre-calculated table to stack - vmovdqu `16*1-64`($ctx),$D1 - vmovdqu `16*2-64`($ctx),$D2 - vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 - vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 - vmovdqa $D3,-0x90(%r11) - vmovdqa $D0,0x00(%rsp) - vpshufd \$0xEE,$D1,$D4 - vmovdqu `16*3-64`($ctx),$D0 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D4,-0x80(%r11) - vmovdqa $D1,0x10(%rsp) - vpshufd \$0xEE,$D2,$D3 - vmovdqu `16*4-64`($ctx),$D1 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D3,-0x70(%r11) - vmovdqa $D2,0x20(%rsp) - vpshufd \$0xEE,$D0,$D4 - vmovdqu `16*5-64`($ctx),$D2 - vpshufd \$0x44,$D0,$D0 - vmovdqa $D4,-0x60(%r11) - vmovdqa $D0,0x30(%rsp) - vpshufd \$0xEE,$D1,$D3 - vmovdqu `16*6-64`($ctx),$D0 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D3,-0x50(%r11) - vmovdqa $D1,0x40(%rsp) - vpshufd \$0xEE,$D2,$D4 - vmovdqu `16*7-64`($ctx),$D1 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D4,-0x40(%r11) - vmovdqa $D2,0x50(%rsp) - vpshufd \$0xEE,$D0,$D3 - vmovdqu `16*8-64`($ctx),$D2 - vpshufd \$0x44,$D0,$D0 - vmovdqa $D3,-0x30(%r11) - vmovdqa $D0,0x60(%rsp) - vpshufd \$0xEE,$D1,$D4 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D4,-0x20(%r11) - vmovdqa $D1,0x70(%rsp) - vpshufd \$0xEE,$D2,$D3 - vmovdqa 0x00(%rsp),$D4 # preload r0^2 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D3,-0x10(%r11) - vmovdqa $D2,0x80(%rsp) - - jmp .Loop_avx - -.align 32 -.Loop_avx: - ################################################################ - # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - # 
\___________________/ - # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - # \___________________/ \____________________/ - # - # Note that we start with inp[2:3]*r^2. This is because it - # doesn't depend on reduction in previous iteration. - ################################################################ - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - # - # though note that $Tx and $Hx are "reversed" in this section, - # and $D4 is preloaded with r0^2... - - vpmuludq $T0,$D4,$D0 # d0 = h0*r0 - vpmuludq $T1,$D4,$D1 # d1 = h1*r0 - vmovdqa $H2,0x20(%r11) # offload hash - vpmuludq $T2,$D4,$D2 # d3 = h2*r0 - vmovdqa 0x10(%rsp),$H2 # r1^2 - vpmuludq $T3,$D4,$D3 # d3 = h3*r0 - vpmuludq $T4,$D4,$D4 # d4 = h4*r0 - - vmovdqa $H0,0x00(%r11) # - vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 - vmovdqa $H1,0x10(%r11) # - vpmuludq $T3,$H2,$H1 # h3*r1 - vpaddq $H0,$D0,$D0 # d0 += h4*s1 - vpaddq $H1,$D4,$D4 # d4 += h3*r1 - vmovdqa $H3,0x30(%r11) # - vpmuludq $T2,$H2,$H0 # h2*r1 - vpmuludq $T1,$H2,$H1 # h1*r1 - vpaddq $H0,$D3,$D3 # d3 += h2*r1 - vmovdqa 0x30(%rsp),$H3 # r2^2 - vpaddq $H1,$D2,$D2 # d2 += h1*r1 - vmovdqa $H4,0x40(%r11) # - vpmuludq $T0,$H2,$H2 # h0*r1 - vpmuludq $T2,$H3,$H0 # h2*r2 - vpaddq $H2,$D1,$D1 # d1 += h0*r1 - - vmovdqa 0x40(%rsp),$H4 # s2^2 - vpaddq $H0,$D4,$D4 # d4 += h2*r2 - vpmuludq $T1,$H3,$H1 # h1*r2 - vpmuludq $T0,$H3,$H3 # h0*r2 - vpaddq $H1,$D3,$D3 # d3 += h1*r2 - vmovdqa 0x50(%rsp),$H2 # r3^2 - vpaddq $H3,$D2,$D2 # d2 += h0*r2 - vpmuludq $T4,$H4,$H0 # h4*s2 - vpmuludq $T3,$H4,$H4 # h3*s2 - vpaddq $H0,$D1,$D1 # d1 += h4*s2 - vmovdqa 0x60(%rsp),$H3 # s3^2 - vpaddq $H4,$D0,$D0 # d0 += h3*s2 - - vmovdqa 0x80(%rsp),$H4 # s4^2 - vpmuludq $T1,$H2,$H1 # h1*r3 - vpmuludq $T0,$H2,$H2 # 
h0*r3 - vpaddq $H1,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $T4,$H3,$H0 # h4*s3 - vpmuludq $T3,$H3,$H1 # h3*s3 - vpaddq $H0,$D2,$D2 # d2 += h4*s3 - vmovdqu 16*0($inp),$H0 # load input - vpaddq $H1,$D1,$D1 # d1 += h3*s3 - vpmuludq $T2,$H3,$H3 # h2*s3 - vpmuludq $T2,$H4,$T2 # h2*s4 - vpaddq $H3,$D0,$D0 # d0 += h2*s3 - - vmovdqu 16*1($inp),$H1 # - vpaddq $T2,$D1,$D1 # d1 += h2*s4 - vpmuludq $T3,$H4,$T3 # h3*s4 - vpmuludq $T4,$H4,$T4 # h4*s4 - vpsrldq \$6,$H0,$H2 # splat input - vpaddq $T3,$D2,$D2 # d2 += h3*s4 - vpaddq $T4,$D3,$D3 # d3 += h4*s4 - vpsrldq \$6,$H1,$H3 # - vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 - vpmuludq $T1,$H4,$T0 # h1*s4 - vpunpckhqdq $H1,$H0,$H4 # 4 - vpaddq $T4,$D4,$D4 # d4 += h0*r4 - vmovdqa -0x90(%r11),$T4 # r0^4 - vpaddq $T0,$D0,$D0 # d0 += h1*s4 - - vpunpcklqdq $H1,$H0,$H0 # 0:1 - vpunpcklqdq $H3,$H2,$H3 # 2:3 - - #vpsrlq \$40,$H4,$H4 # 4 - vpsrldq \$`40/8`,$H4,$H4 # 4 - vpsrlq \$26,$H0,$H1 - vpand $MASK,$H0,$H0 # 0 - vpsrlq \$4,$H3,$H2 - vpand $MASK,$H1,$H1 # 1 - vpand 0(%rcx),$H4,$H4 # .Lmask24 - vpsrlq \$30,$H3,$H3 - vpand $MASK,$H2,$H2 # 2 - vpand $MASK,$H3,$H3 # 3 - vpor 32(%rcx),$H4,$H4 # padbit, yes, always - - vpaddq 0x00(%r11),$H0,$H0 # add hash value - vpaddq 0x10(%r11),$H1,$H1 - vpaddq 0x20(%r11),$H2,$H2 - vpaddq 0x30(%r11),$H3,$H3 - vpaddq 0x40(%r11),$H4,$H4 - - lea 16*2($inp),%rax - lea 16*4($inp),$inp - sub \$64,$len - cmovc %rax,$inp - - ################################################################ - # Now we accumulate (inp[0:1]+hash)*r^4 - ################################################################ - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - vpmuludq $H0,$T4,$T0 # h0*r0 - vpmuludq $H1,$T4,$T1 # h1*r0 - vpaddq $T0,$D0,$D0 - vpaddq $T1,$D1,$D1 - vmovdqa -0x80(%r11),$T2 # r1^4 - vpmuludq 
$H2,$T4,$T0 # h2*r0 - vpmuludq $H3,$T4,$T1 # h3*r0 - vpaddq $T0,$D2,$D2 - vpaddq $T1,$D3,$D3 - vpmuludq $H4,$T4,$T4 # h4*r0 - vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 - vpaddq $T4,$D4,$D4 - - vpaddq $T0,$D0,$D0 # d0 += h4*s1 - vpmuludq $H2,$T2,$T1 # h2*r1 - vpmuludq $H3,$T2,$T0 # h3*r1 - vpaddq $T1,$D3,$D3 # d3 += h2*r1 - vmovdqa -0x60(%r11),$T3 # r2^4 - vpaddq $T0,$D4,$D4 # d4 += h3*r1 - vpmuludq $H1,$T2,$T1 # h1*r1 - vpmuludq $H0,$T2,$T2 # h0*r1 - vpaddq $T1,$D2,$D2 # d2 += h1*r1 - vpaddq $T2,$D1,$D1 # d1 += h0*r1 - - vmovdqa -0x50(%r11),$T4 # s2^4 - vpmuludq $H2,$T3,$T0 # h2*r2 - vpmuludq $H1,$T3,$T1 # h1*r2 - vpaddq $T0,$D4,$D4 # d4 += h2*r2 - vpaddq $T1,$D3,$D3 # d3 += h1*r2 - vmovdqa -0x40(%r11),$T2 # r3^4 - vpmuludq $H0,$T3,$T3 # h0*r2 - vpmuludq $H4,$T4,$T0 # h4*s2 - vpaddq $T3,$D2,$D2 # d2 += h0*r2 - vpaddq $T0,$D1,$D1 # d1 += h4*s2 - vmovdqa -0x30(%r11),$T3 # s3^4 - vpmuludq $H3,$T4,$T4 # h3*s2 - vpmuludq $H1,$T2,$T1 # h1*r3 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - - vmovdqa -0x10(%r11),$T4 # s4^4 - vpaddq $T1,$D4,$D4 # d4 += h1*r3 - vpmuludq $H0,$T2,$T2 # h0*r3 - vpmuludq $H4,$T3,$T0 # h4*s3 - vpaddq $T2,$D3,$D3 # d3 += h0*r3 - vpaddq $T0,$D2,$D2 # d2 += h4*s3 - vmovdqu 16*2($inp),$T0 # load input - vpmuludq $H3,$T3,$T2 # h3*s3 - vpmuludq $H2,$T3,$T3 # h2*s3 - vpaddq $T2,$D1,$D1 # d1 += h3*s3 - vmovdqu 16*3($inp),$T1 # - vpaddq $T3,$D0,$D0 # d0 += h2*s3 - - vpmuludq $H2,$T4,$H2 # h2*s4 - vpmuludq $H3,$T4,$H3 # h3*s4 - vpsrldq \$6,$T0,$T2 # splat input - vpaddq $H2,$D1,$D1 # d1 += h2*s4 - vpmuludq $H4,$T4,$H4 # h4*s4 - vpsrldq \$6,$T1,$T3 # - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 - vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 - vpmuludq $H1,$T4,$H0 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - - #vpsrlq \$40,$T4,$T4 # 4 - vpsrldq \$`40/8`,$T4,$T4 # 4 - vpsrlq \$26,$T0,$T1 - vmovdqa 0x00(%rsp),$D4 # preload 
r0^2 - vpand $MASK,$T0,$T0 # 0 - vpsrlq \$4,$T3,$T2 - vpand $MASK,$T1,$T1 # 1 - vpand 0(%rcx),$T4,$T4 # .Lmask24 - vpsrlq \$30,$T3,$T3 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - ################################################################ - # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - # and P. Schwabe - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D0 - vpand $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D0,$H0,$H0 - vpsllq \$2,$D0,$D0 - vpaddq $D0,$H0,$H0 # h4 -> h0 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - ja .Loop_avx - -.Lskip_loop_avx: - ################################################################ - # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 - add \$32,$len - jnz .Long_tail_avx - - vpaddq $H2,$T2,$T2 - vpaddq $H0,$T0,$T0 - vpaddq $H1,$T1,$T1 - vpaddq $H3,$T3,$T3 - vpaddq $H4,$T4,$T4 - -.Long_tail_avx: - vmovdqa $H2,0x20(%r11) - vmovdqa $H0,0x00(%r11) - vmovdqa $H1,0x10(%r11) - vmovdqa $H3,0x30(%r11) - vmovdqa $H4,0x40(%r11) - - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - vpmuludq $T2,$D4,$D2 # d2 = h2*r0 - vpmuludq $T0,$D4,$D0 # d0 = h0*r0 - vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n - vpmuludq $T1,$D4,$D1 # d1 = h1*r0 - vpmuludq $T3,$D4,$D3 # d3 = h3*r0 - vpmuludq $T4,$D4,$D4 # d4 = h4*r0 - - vpmuludq $T3,$H2,$H0 # h3*r1 - vpaddq 
$H0,$D4,$D4 # d4 += h3*r1 - vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n - vpmuludq $T2,$H2,$H1 # h2*r1 - vpaddq $H1,$D3,$D3 # d3 += h2*r1 - vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n - vpmuludq $T1,$H2,$H0 # h1*r1 - vpaddq $H0,$D2,$D2 # d2 += h1*r1 - vpmuludq $T0,$H2,$H2 # h0*r1 - vpaddq $H2,$D1,$D1 # d1 += h0*r1 - vpmuludq $T4,$H3,$H3 # h4*s1 - vpaddq $H3,$D0,$D0 # d0 += h4*s1 - - vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n - vpmuludq $T2,$H4,$H1 # h2*r2 - vpaddq $H1,$D4,$D4 # d4 += h2*r2 - vpmuludq $T1,$H4,$H0 # h1*r2 - vpaddq $H0,$D3,$D3 # d3 += h1*r2 - vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n - vpmuludq $T0,$H4,$H4 # h0*r2 - vpaddq $H4,$D2,$D2 # d2 += h0*r2 - vpmuludq $T4,$H2,$H1 # h4*s2 - vpaddq $H1,$D1,$D1 # d1 += h4*s2 - vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n - vpmuludq $T3,$H2,$H2 # h3*s2 - vpaddq $H2,$D0,$D0 # d0 += h3*s2 - - vpmuludq $T1,$H3,$H0 # h1*r3 - vpaddq $H0,$D4,$D4 # d4 += h1*r3 - vpmuludq $T0,$H3,$H3 # h0*r3 - vpaddq $H3,$D3,$D3 # d3 += h0*r3 - vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n - vpmuludq $T4,$H4,$H1 # h4*s3 - vpaddq $H1,$D2,$D2 # d2 += h4*s3 - vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n - vpmuludq $T3,$H4,$H0 # h3*s3 - vpaddq $H0,$D1,$D1 # d1 += h3*s3 - vpmuludq $T2,$H4,$H4 # h2*s3 - vpaddq $H4,$D0,$D0 # d0 += h2*s3 - - vpmuludq $T0,$H2,$H2 # h0*r4 - vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 - vpmuludq $T4,$H3,$H1 # h4*s4 - vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 - vpmuludq $T3,$H3,$H0 # h3*s4 - vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 - vpmuludq $T2,$H3,$H1 # h2*s4 - vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 - vpmuludq $T1,$H3,$H3 # h1*s4 - vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 - - jz .Lshort_tail_avx - - vmovdqu 16*0($inp),$H0 # load input - vmovdqu 16*1($inp),$H1 - - vpsrldq \$6,$H0,$H2 # splat input - vpsrldq \$6,$H1,$H3 - vpunpckhqdq $H1,$H0,$H4 # 4 - vpunpcklqdq $H1,$H0,$H0 # 0:1 - vpunpcklqdq $H3,$H2,$H3 # 2:3 - - vpsrlq \$40,$H4,$H4 # 4 - vpsrlq \$26,$H0,$H1 - vpand $MASK,$H0,$H0 # 0 - vpsrlq \$4,$H3,$H2 - vpand $MASK,$H1,$H1 # 1 - vpsrlq 
\$30,$H3,$H3 - vpand $MASK,$H2,$H2 # 2 - vpand $MASK,$H3,$H3 # 3 - vpor 32(%rcx),$H4,$H4 # padbit, yes, always - - vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 - vpaddq 0x00(%r11),$H0,$H0 - vpaddq 0x10(%r11),$H1,$H1 - vpaddq 0x20(%r11),$H2,$H2 - vpaddq 0x30(%r11),$H3,$H3 - vpaddq 0x40(%r11),$H4,$H4 - - ################################################################ - # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate - - vpmuludq $H0,$T4,$T0 # h0*r0 - vpaddq $T0,$D0,$D0 # d0 += h0*r0 - vpmuludq $H1,$T4,$T1 # h1*r0 - vpaddq $T1,$D1,$D1 # d1 += h1*r0 - vpmuludq $H2,$T4,$T0 # h2*r0 - vpaddq $T0,$D2,$D2 # d2 += h2*r0 - vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n - vpmuludq $H3,$T4,$T1 # h3*r0 - vpaddq $T1,$D3,$D3 # d3 += h3*r0 - vpmuludq $H4,$T4,$T4 # h4*r0 - vpaddq $T4,$D4,$D4 # d4 += h4*r0 - - vpmuludq $H3,$T2,$T0 # h3*r1 - vpaddq $T0,$D4,$D4 # d4 += h3*r1 - vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 - vpmuludq $H2,$T2,$T1 # h2*r1 - vpaddq $T1,$D3,$D3 # d3 += h2*r1 - vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 - vpmuludq $H1,$T2,$T0 # h1*r1 - vpaddq $T0,$D2,$D2 # d2 += h1*r1 - vpmuludq $H0,$T2,$T2 # h0*r1 - vpaddq $T2,$D1,$D1 # d1 += h0*r1 - vpmuludq $H4,$T3,$T3 # h4*s1 - vpaddq $T3,$D0,$D0 # d0 += h4*s1 - - vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 - vpmuludq $H2,$T4,$T1 # h2*r2 - vpaddq $T1,$D4,$D4 # d4 += h2*r2 - vpmuludq $H1,$T4,$T0 # h1*r2 - vpaddq $T0,$D3,$D3 # d3 += h1*r2 - vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 - vpmuludq $H0,$T4,$T4 # h0*r2 - vpaddq $T4,$D2,$D2 # d2 += h0*r2 - vpmuludq $H4,$T2,$T1 # h4*s2 - vpaddq $T1,$D1,$D1 # d1 += h4*s2 - vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 - vpmuludq $H3,$T2,$T2 # h3*s2 - vpaddq $T2,$D0,$D0 # d0 += h3*s2 - - vpmuludq $H1,$T3,$T0 # h1*r3 - vpaddq $T0,$D4,$D4 # d4 += h1*r3 - vpmuludq $H0,$T3,$T3 # h0*r3 - vpaddq $T3,$D3,$D3 # d3 += h0*r3 - vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 - vpmuludq $H4,$T4,$T1 # h4*s3 - vpaddq $T1,$D2,$D2 # d2 += h4*s3 - vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 - vpmuludq $H3,$T4,$T0 # h3*s3 
- vpaddq $T0,$D1,$D1 # d1 += h3*s3 - vpmuludq $H2,$T4,$T4 # h2*s3 - vpaddq $T4,$D0,$D0 # d0 += h2*s3 - - vpmuludq $H0,$T2,$T2 # h0*r4 - vpaddq $T2,$D4,$D4 # d4 += h0*r4 - vpmuludq $H4,$T3,$T1 # h4*s4 - vpaddq $T1,$D3,$D3 # d3 += h4*s4 - vpmuludq $H3,$T3,$T0 # h3*s4 - vpaddq $T0,$D2,$D2 # d2 += h3*s4 - vpmuludq $H2,$T3,$T1 # h2*s4 - vpaddq $T1,$D1,$D1 # d1 += h2*s4 - vpmuludq $H1,$T3,$T3 # h1*s4 - vpaddq $T3,$D0,$D0 # d0 += h1*s4 - -.Lshort_tail_avx: - ################################################################ - # horizontal addition - - vpsrldq \$8,$D4,$T4 - vpsrldq \$8,$D3,$T3 - vpsrldq \$8,$D1,$T1 - vpsrldq \$8,$D0,$T0 - vpsrldq \$8,$D2,$T2 - vpaddq $T3,$D3,$D3 - vpaddq $T4,$D4,$D4 - vpaddq $T0,$D0,$D0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$D2,$D2 - - ################################################################ - # lazy reduction - - vpsrlq \$26,$D3,$H3 - vpand $MASK,$D3,$D3 - vpaddq $H3,$D4,$D4 # h3 -> h4 - - vpsrlq \$26,$D0,$H0 - vpand $MASK,$D0,$D0 - vpaddq $H0,$D1,$D1 # h0 -> h1 - - vpsrlq \$26,$D4,$H4 - vpand $MASK,$D4,$D4 - - vpsrlq \$26,$D1,$H1 - vpand $MASK,$D1,$D1 - vpaddq $H1,$D2,$D2 # h1 -> h2 - - vpaddq $H4,$D0,$D0 - vpsllq \$2,$H4,$H4 - vpaddq $H4,$D0,$D0 # h4 -> h0 - - vpsrlq \$26,$D2,$H2 - vpand $MASK,$D2,$D2 - vpaddq $H2,$D3,$D3 # h2 -> h3 - - vpsrlq \$26,$D0,$H0 - vpand $MASK,$D0,$D0 - vpaddq $H0,$D1,$D1 # h0 -> h1 - - vpsrlq \$26,$D3,$H3 - vpand $MASK,$D3,$D3 - vpaddq $H3,$D4,$D4 # h3 -> h4 - - vmovd $D0,`4*0-48-64`($ctx) # save partially reduced - vmovd $D1,`4*1-48-64`($ctx) - vmovd $D2,`4*2-48-64`($ctx) - vmovd $D3,`4*3-48-64`($ctx) - vmovd $D4,`4*4-48-64`($ctx) -___ -$code.=<<___ if ($win64); - vmovdqa 0x50(%r11),%xmm6 - vmovdqa 0x60(%r11),%xmm7 - vmovdqa 0x70(%r11),%xmm8 - vmovdqa 0x80(%r11),%xmm9 - vmovdqa 0x90(%r11),%xmm10 - vmovdqa 0xa0(%r11),%xmm11 - vmovdqa 0xb0(%r11),%xmm12 - vmovdqa 0xc0(%r11),%xmm13 - vmovdqa 0xd0(%r11),%xmm14 - vmovdqa 0xe0(%r11),%xmm15 - lea 0xf8(%r11),%rsp -.Ldo_avx_epilogue: -___ -$code.=<<___ if (!$win64); 
- lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -___ -$code.=<<___; - vzeroupper - RET -.cfi_endproc -___ -&end_function("poly1305_blocks_avx"); - -&declare_function("poly1305_emit_avx", 32, 3); -$code.=<<___; - cmpl \$0,20($ctx) # is_base2_26? - je .Lemit - - mov 0($ctx),%eax # load hash value base 2^26 - mov 4($ctx),%ecx - mov 8($ctx),%r8d - mov 12($ctx),%r11d - mov 16($ctx),%r10d - - shl \$26,%rcx # base 2^26 -> base 2^64 - mov %r8,%r9 - shl \$52,%r8 - add %rcx,%rax - shr \$12,%r9 - add %rax,%r8 # h0 - adc \$0,%r9 - - shl \$14,%r11 - mov %r10,%rax - shr \$24,%r10 - add %r11,%r9 - shl \$40,%rax - add %rax,%r9 # h1 - adc \$0,%r10 # h2 - - mov %r10,%rax # could be partially reduced, so reduce - mov %r10,%rcx - and \$3,%r10 - shr \$2,%rax - and \$-4,%rcx - add %rcx,%rax - add %rax,%r8 - adc \$0,%r9 - adc \$0,%r10 - - mov %r8,%rax - add \$5,%r8 # compare to modulus - mov %r9,%rcx - adc \$0,%r9 - adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overflow? - cmovnz %r8,%rax - cmovnz %r9,%rcx - - add 0($nonce),%rax # accumulate nonce - adc 8($nonce),%rcx - mov %rax,0($mac) # write result - mov %rcx,8($mac) - - RET -___ -&end_function("poly1305_emit_avx"); - -if ($avx>1) { - -my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = - map("%ymm$_",(0..15)); -my $S4=$MASK; - -sub poly1305_blocks_avxN { - my ($avx512) = @_; - my $suffix = $avx512 ? 
"_avx512" : ""; -$code.=<<___; -.cfi_startproc - mov 20($ctx),%r8d # is_base2_26 - cmp \$128,$len - jae .Lblocks_avx2$suffix - test %r8d,%r8d - jz .Lblocks - -.Lblocks_avx2$suffix: - and \$-16,$len - jz .Lno_data_avx2$suffix - - vzeroupper - - test %r8d,%r8d - jz .Lbase2_64_avx2$suffix - - test \$63,$len - jz .Leven_avx2$suffix - - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lblocks_avx2_body$suffix: - - mov $len,%r15 # reassign $len - - mov 0($ctx),$d1 # load hash value - mov 8($ctx),$d2 - mov 16($ctx),$h2#d - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - ################################# base 2^26 -> base 2^64 - mov $d1#d,$h0#d - and \$`-1*(1<<31)`,$d1 - mov $d2,$r1 # borrow $r1 - mov $d2#d,$h1#d - and \$`-1*(1<<31)`,$d2 - - shr \$6,$d1 - shl \$52,$r1 - add $d1,$h0 - shr \$12,$h1 - shr \$18,$d2 - add $r1,$h0 - adc $d2,$h1 - - mov $h2,$d1 - shl \$40,$d1 - shr \$24,$h2 - add $d1,$h1 - adc \$0,$h2 # can be partially reduced... - - mov \$-4,$d2 # ... 
so reduce - mov $h2,$d1 - and $h2,$d2 - shr \$2,$d1 - and \$3,$h2 - add $d2,$d1 # =*5 - add $d1,$h0 - adc \$0,$h1 - adc \$0,$h2 - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - -.Lbase2_26_pre_avx2$suffix: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - mov $r1,%rax - - test \$63,%r15 - jnz .Lbase2_26_pre_avx2$suffix - - test $padbit,$padbit # if $padbit is zero, - jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format - - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$r0 - mov $h1,$r1 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$r0 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $r0,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$r1 - and \$0x3ffffff,$h1 # h[3] - or $r1,$h2 # h[4] - - test %r15,%r15 - jz .Lstore_base2_26_avx2$suffix - - vmovd %rax#d,%x#$H0 - vmovd %rdx#d,%x#$H1 - vmovd $h0#d,%x#$H2 - vmovd $h1#d,%x#$H3 - vmovd $h2#d,%x#$H4 - jmp .Lproceed_avx2$suffix - -.align 32 -.Lstore_base2_64_avx2$suffix: - mov $h0,0($ctx) - mov $h1,8($ctx) - mov $h2,16($ctx) # note that is_base2_26 is zeroed - jmp .Ldone_avx2$suffix - -.align 16 -.Lstore_base2_26_avx2$suffix: - mov %rax#d,0($ctx) # store hash value base 2^26 - mov %rdx#d,4($ctx) - mov $h0#d,8($ctx) - mov $h1#d,12($ctx) - mov $h2#d,16($ctx) -.align 16 -.Ldone_avx2$suffix: - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lno_data_avx2$suffix: -.Lblocks_avx2_epilogue$suffix: - RET -.cfi_endproc - -.align 32 -.Lbase2_64_avx2$suffix: -.cfi_startproc - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lbase2_64_avx2_body$suffix: - 
- mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - mov 16($ctx),$h2#d - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - test \$63,$len - jz .Linit_avx2$suffix - -.Lbase2_64_pre_avx2$suffix: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - mov $r1,%rax - - test \$63,%r15 - jnz .Lbase2_64_pre_avx2$suffix - -.Linit_avx2$suffix: - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$d1 - mov $h1,$d2 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$d1 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $d1,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$d2 - and \$0x3ffffff,$h1 # h[3] - or $d2,$h2 # h[4] - - vmovd %rax#d,%x#$H0 - vmovd %rdx#d,%x#$H1 - vmovd $h0#d,%x#$H2 - vmovd $h1#d,%x#$H3 - vmovd $h2#d,%x#$H4 - movl \$1,20($ctx) # set is_base2_26 - - call __poly1305_init_avx - -.Lproceed_avx2$suffix: - mov %r15,$len # restore $len -___ -$code.=<<___ if (!$kernel); - mov OPENSSL_ia32cap_P+8(%rip),%r9d - mov \$`(1<<31|1<<30|1<<16)`,%r11d -___ -$code.=<<___; - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lbase2_64_avx2_epilogue$suffix: - jmp .Ldo_avx2$suffix -.cfi_endproc - -.align 32 -.Leven_avx2$suffix: -.cfi_startproc -___ -$code.=<<___ if (!$kernel); - mov OPENSSL_ia32cap_P+8(%rip),%r9d -___ -$code.=<<___; - vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 - vmovd 4*1($ctx),%x#$H1 - vmovd 4*2($ctx),%x#$H2 - vmovd 4*3($ctx),%x#$H3 - vmovd 4*4($ctx),%x#$H4 - -.Ldo_avx2$suffix: -___ -$code.=<<___ if (!$kernel && $avx>2); - cmp \$512,$len - jb .Lskip_avx512 - and %r11d,%r9d - test \$`1<<16`,%r9d # check for AVX512F - jnz .Lblocks_avx512 
-.Lskip_avx512$suffix: -___ -$code.=<<___ if ($avx > 2 && $avx512 && $kernel); - cmp \$512,$len - jae .Lblocks_avx512 -___ -$code.=<<___ if (!$win64); - lea 8(%rsp),%r10 -.cfi_def_cfa_register %r10 - sub \$0x128,%rsp -___ -$code.=<<___ if ($win64); - lea 8(%rsp),%r10 - sub \$0x1c8,%rsp - vmovdqa %xmm6,-0xb0(%r10) - vmovdqa %xmm7,-0xa0(%r10) - vmovdqa %xmm8,-0x90(%r10) - vmovdqa %xmm9,-0x80(%r10) - vmovdqa %xmm10,-0x70(%r10) - vmovdqa %xmm11,-0x60(%r10) - vmovdqa %xmm12,-0x50(%r10) - vmovdqa %xmm13,-0x40(%r10) - vmovdqa %xmm14,-0x30(%r10) - vmovdqa %xmm15,-0x20(%r10) -.Ldo_avx2_body$suffix: -___ -$code.=<<___; - lea .Lconst(%rip),%rcx - lea 48+64($ctx),$ctx # size optimization - vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 - - # expand and copy pre-calculated table to stack - vmovdqu `16*0-64`($ctx),%x#$T2 - and \$-512,%rsp - vmovdqu `16*1-64`($ctx),%x#$T3 - vmovdqu `16*2-64`($ctx),%x#$T4 - vmovdqu `16*3-64`($ctx),%x#$D0 - vmovdqu `16*4-64`($ctx),%x#$D1 - vmovdqu `16*5-64`($ctx),%x#$D2 - lea 0x90(%rsp),%rax # size optimization - vmovdqu `16*6-64`($ctx),%x#$D3 - vpermd $T2,$T0,$T2 # 00003412 -> 14243444 - vmovdqu `16*7-64`($ctx),%x#$D4 - vpermd $T3,$T0,$T3 - vmovdqu `16*8-64`($ctx),%x#$MASK - vpermd $T4,$T0,$T4 - vmovdqa $T2,0x00(%rsp) - vpermd $D0,$T0,$D0 - vmovdqa $T3,0x20-0x90(%rax) - vpermd $D1,$T0,$D1 - vmovdqa $T4,0x40-0x90(%rax) - vpermd $D2,$T0,$D2 - vmovdqa $D0,0x60-0x90(%rax) - vpermd $D3,$T0,$D3 - vmovdqa $D1,0x80-0x90(%rax) - vpermd $D4,$T0,$D4 - vmovdqa $D2,0xa0-0x90(%rax) - vpermd $MASK,$T0,$MASK - vmovdqa $D3,0xc0-0x90(%rax) - vmovdqa $D4,0xe0-0x90(%rax) - vmovdqa $MASK,0x100-0x90(%rax) - vmovdqa 64(%rcx),$MASK # .Lmask26 - - ################################################################ - # load input - vmovdqu 16*0($inp),%x#$T0 - vmovdqu 16*1($inp),%x#$T1 - vinserti128 \$1,16*2($inp),$T0,$T0 - vinserti128 \$1,16*3($inp),$T1,$T1 - lea 16*4($inp),$inp - - vpsrldq \$6,$T0,$T2 # splat input - vpsrldq \$6,$T1,$T3 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpunpcklqdq 
$T3,$T2,$T2 # 2:3 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - - vpsrlq \$30,$T2,$T3 - vpsrlq \$4,$T2,$T2 - vpsrlq \$26,$T0,$T1 - vpsrlq \$40,$T4,$T4 # 4 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T0,$T0 # 0 - vpand $MASK,$T1,$T1 # 1 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - vpaddq $H2,$T2,$H2 # accumulate input - sub \$64,$len - jz .Ltail_avx2$suffix - jmp .Loop_avx2$suffix - -.align 32 -.Loop_avx2$suffix: - ################################################################ - # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 - # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 - # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 - # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 - # \________/\__________/ - ################################################################ - #vpaddq $H2,$T2,$H2 # accumulate input - vpaddq $H0,$T0,$H0 - vmovdqa `32*0`(%rsp),$T0 # r0^4 - vpaddq $H1,$T1,$H1 - vmovdqa `32*1`(%rsp),$T1 # r1^4 - vpaddq $H3,$T3,$H3 - vmovdqa `32*3`(%rsp),$T2 # r2^4 - vpaddq $H4,$T4,$H4 - vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 - vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 - - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - # - # however, as h2 is "chronologically" first one available pull - # corresponding operations up, so it's - # - # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 - # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 - # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 - - vpmuludq $H2,$T0,$D2 # d2 = h2*r0 - vpmuludq $H2,$T1,$D3 # d3 = h2*r1 - vpmuludq $H2,$T2,$D4 # d4 = h2*r2 - vpmuludq $H2,$T3,$D0 # d0 = h2*s3 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - - vpmuludq $H0,$T1,$T4 # h0*r1 - vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp - vpaddq $T4,$D1,$D1 # d1 += h0*r1 - 
vpaddq $H2,$D2,$D2 # d2 += h1*r1 - vpmuludq $H3,$T1,$T4 # h3*r1 - vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 - vpaddq $T4,$D4,$D4 # d4 += h3*r1 - vpaddq $H2,$D0,$D0 # d0 += h4*s1 - vmovdqa `32*4-0x90`(%rax),$T1 # s2 - - vpmuludq $H0,$T0,$T4 # h0*r0 - vpmuludq $H1,$T0,$H2 # h1*r0 - vpaddq $T4,$D0,$D0 # d0 += h0*r0 - vpaddq $H2,$D1,$D1 # d1 += h1*r0 - vpmuludq $H3,$T0,$T4 # h3*r0 - vpmuludq $H4,$T0,$H2 # h4*r0 - vmovdqu 16*0($inp),%x#$T0 # load input - vpaddq $T4,$D3,$D3 # d3 += h3*r0 - vpaddq $H2,$D4,$D4 # d4 += h4*r0 - vinserti128 \$1,16*2($inp),$T0,$T0 - - vpmuludq $H3,$T1,$T4 # h3*s2 - vpmuludq $H4,$T1,$H2 # h4*s2 - vmovdqu 16*1($inp),%x#$T1 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - vpaddq $H2,$D1,$D1 # d1 += h4*s2 - vmovdqa `32*5-0x90`(%rax),$H2 # r3 - vpmuludq $H1,$T2,$T4 # h1*r2 - vpmuludq $H0,$T2,$T2 # h0*r2 - vpaddq $T4,$D3,$D3 # d3 += h1*r2 - vpaddq $T2,$D2,$D2 # d2 += h0*r2 - vinserti128 \$1,16*3($inp),$T1,$T1 - lea 16*4($inp),$inp - - vpmuludq $H1,$H2,$T4 # h1*r3 - vpmuludq $H0,$H2,$H2 # h0*r3 - vpsrldq \$6,$T0,$T2 # splat input - vpaddq $T4,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $H3,$T3,$T4 # h3*s3 - vpmuludq $H4,$T3,$H2 # h4*s3 - vpsrldq \$6,$T1,$T3 - vpaddq $T4,$D1,$D1 # d1 += h3*s3 - vpaddq $H2,$D2,$D2 # d2 += h4*s3 - vpunpckhqdq $T1,$T0,$T4 # 4 - - vpmuludq $H3,$S4,$H3 # h3*s4 - vpmuludq $H4,$S4,$H4 # h4*s4 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 - vpmuludq $H1,$S4,$H0 # h1*s4 - vmovdqa 64(%rcx),$MASK # .Lmask26 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - ################################################################ - # lazy reduction (interleaved with tail of input splat) - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq 
\$26,$H4,$D4 - vpand $MASK,$H4,$H4 - - vpsrlq \$4,$T3,$T2 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpand $MASK,$T2,$T2 # 2 - vpsrlq \$26,$T0,$T1 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpaddq $T2,$H2,$H2 # modulo-scheduled - vpsrlq \$30,$T3,$T3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$40,$T4,$T4 # 4 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpand $MASK,$T0,$T0 # 0 - vpand $MASK,$T1,$T1 # 1 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - sub \$64,$len - jnz .Loop_avx2$suffix - - .byte 0x66,0x90 -.Ltail_avx2$suffix: - ################################################################ - # while above multiplications were by r^4 in all lanes, in last - # iteration we multiply least significant lane by r^4 and most - # significant one by r, so copy of above except that references - # to the precomputed table are displaced by 4... 
- - #vpaddq $H2,$T2,$H2 # accumulate input - vpaddq $H0,$T0,$H0 - vmovdqu `32*0+4`(%rsp),$T0 # r0^4 - vpaddq $H1,$T1,$H1 - vmovdqu `32*1+4`(%rsp),$T1 # r1^4 - vpaddq $H3,$T3,$H3 - vmovdqu `32*3+4`(%rsp),$T2 # r2^4 - vpaddq $H4,$T4,$H4 - vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 - vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 - - vpmuludq $H2,$T0,$D2 # d2 = h2*r0 - vpmuludq $H2,$T1,$D3 # d3 = h2*r1 - vpmuludq $H2,$T2,$D4 # d4 = h2*r2 - vpmuludq $H2,$T3,$D0 # d0 = h2*s3 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - - vpmuludq $H0,$T1,$T4 # h0*r1 - vpmuludq $H1,$T1,$H2 # h1*r1 - vpaddq $T4,$D1,$D1 # d1 += h0*r1 - vpaddq $H2,$D2,$D2 # d2 += h1*r1 - vpmuludq $H3,$T1,$T4 # h3*r1 - vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 - vpaddq $T4,$D4,$D4 # d4 += h3*r1 - vpaddq $H2,$D0,$D0 # d0 += h4*s1 - - vpmuludq $H0,$T0,$T4 # h0*r0 - vpmuludq $H1,$T0,$H2 # h1*r0 - vpaddq $T4,$D0,$D0 # d0 += h0*r0 - vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 - vpaddq $H2,$D1,$D1 # d1 += h1*r0 - vpmuludq $H3,$T0,$T4 # h3*r0 - vpmuludq $H4,$T0,$H2 # h4*r0 - vpaddq $T4,$D3,$D3 # d3 += h3*r0 - vpaddq $H2,$D4,$D4 # d4 += h4*r0 - - vpmuludq $H3,$T1,$T4 # h3*s2 - vpmuludq $H4,$T1,$H2 # h4*s2 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - vpaddq $H2,$D1,$D1 # d1 += h4*s2 - vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 - vpmuludq $H1,$T2,$T4 # h1*r2 - vpmuludq $H0,$T2,$T2 # h0*r2 - vpaddq $T4,$D3,$D3 # d3 += h1*r2 - vpaddq $T2,$D2,$D2 # d2 += h0*r2 - - vpmuludq $H1,$H2,$T4 # h1*r3 - vpmuludq $H0,$H2,$H2 # h0*r3 - vpaddq $T4,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $H3,$T3,$T4 # h3*s3 - vpmuludq $H4,$T3,$H2 # h4*s3 - vpaddq $T4,$D1,$D1 # d1 += h3*s3 - vpaddq $H2,$D2,$D2 # d2 += h4*s3 - - vpmuludq $H3,$S4,$H3 # h3*s4 - vpmuludq $H4,$S4,$H4 # h4*s4 - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 - vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 - vpmuludq $H1,$S4,$H0 # h1*s4 - vmovdqa 64(%rcx),$MASK # .Lmask26 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - 
################################################################ - # horizontal addition - - vpsrldq \$8,$D1,$T1 - vpsrldq \$8,$H2,$T2 - vpsrldq \$8,$H3,$T3 - vpsrldq \$8,$H4,$T4 - vpsrldq \$8,$H0,$T0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$H2,$H2 - vpaddq $T3,$H3,$H3 - vpaddq $T4,$H4,$H4 - vpaddq $T0,$H0,$H0 - - vpermq \$0x2,$H3,$T3 - vpermq \$0x2,$H4,$T4 - vpermq \$0x2,$H0,$T0 - vpermq \$0x2,$D1,$T1 - vpermq \$0x2,$H2,$T2 - vpaddq $T3,$H3,$H3 - vpaddq $T4,$H4,$H4 - vpaddq $T0,$H0,$H0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$H2,$H2 - - ################################################################ - # lazy reduction - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D4 - vpand $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced - vmovd %x#$H1,`4*1-48-64`($ctx) - vmovd %x#$H2,`4*2-48-64`($ctx) - vmovd %x#$H3,`4*3-48-64`($ctx) - vmovd %x#$H4,`4*4-48-64`($ctx) -___ -$code.=<<___ if ($win64); - vmovdqa -0xb0(%r10),%xmm6 - vmovdqa -0xa0(%r10),%xmm7 - vmovdqa -0x90(%r10),%xmm8 - vmovdqa -0x80(%r10),%xmm9 - vmovdqa -0x70(%r10),%xmm10 - vmovdqa -0x60(%r10),%xmm11 - vmovdqa -0x50(%r10),%xmm12 - vmovdqa -0x40(%r10),%xmm13 - vmovdqa -0x30(%r10),%xmm14 - vmovdqa -0x20(%r10),%xmm15 - lea -8(%r10),%rsp -.Ldo_avx2_epilogue$suffix: -___ -$code.=<<___ if (!$win64); - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -___ -$code.=<<___; - vzeroupper - RET -.cfi_endproc -___ -if($avx > 2 && $avx512) { -my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); 
-my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); -my $PADBIT="%zmm30"; - -map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain -map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); -map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); -map(s/%y/%z/,($MASK)); - -$code.=<<___; -.cfi_startproc -.Lblocks_avx512: - mov \$15,%eax - kmovw %eax,%k2 -___ -$code.=<<___ if (!$win64); - lea 8(%rsp),%r10 -.cfi_def_cfa_register %r10 - sub \$0x128,%rsp -___ -$code.=<<___ if ($win64); - lea 8(%rsp),%r10 - sub \$0x1c8,%rsp - vmovdqa %xmm6,-0xb0(%r10) - vmovdqa %xmm7,-0xa0(%r10) - vmovdqa %xmm8,-0x90(%r10) - vmovdqa %xmm9,-0x80(%r10) - vmovdqa %xmm10,-0x70(%r10) - vmovdqa %xmm11,-0x60(%r10) - vmovdqa %xmm12,-0x50(%r10) - vmovdqa %xmm13,-0x40(%r10) - vmovdqa %xmm14,-0x30(%r10) - vmovdqa %xmm15,-0x20(%r10) -.Ldo_avx512_body: -___ -$code.=<<___; - lea .Lconst(%rip),%rcx - lea 48+64($ctx),$ctx # size optimization - vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 - - # expand pre-calculated table - vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} - and \$-512,%rsp - vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} - mov \$0x20,%rax - vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} - vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} - vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} - vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} - vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} - vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} - vmovdqu `16*8-64`($ctx),%x#$T4 # ... 
${S4} - vpermd $D0,$T2,$R0 # 00003412 -> 14243444 - vpbroadcastq 64(%rcx),$MASK # .Lmask26 - vpermd $D1,$T2,$R1 - vpermd $T0,$T2,$S1 - vpermd $D2,$T2,$R2 - vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 - vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 - vpermd $T1,$T2,$S2 - vmovdqu64 $R1,0x00(%rsp,%rax){%k2} - vpsrlq \$32,$R1,$T1 - vpermd $D3,$T2,$R3 - vmovdqa64 $S1,0x40(%rsp){%k2} - vpermd $T3,$T2,$S3 - vpermd $D4,$T2,$R4 - vmovdqu64 $R2,0x40(%rsp,%rax){%k2} - vpermd $T4,$T2,$S4 - vmovdqa64 $S2,0x80(%rsp){%k2} - vmovdqu64 $R3,0x80(%rsp,%rax){%k2} - vmovdqa64 $S3,0xc0(%rsp){%k2} - vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} - vmovdqa64 $S4,0x100(%rsp){%k2} - - ################################################################ - # calculate 5th through 8th powers of the key - # - # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 - # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 - # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 - # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 - # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 - - vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 - vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 - vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 - vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 - vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 - vpsrlq \$32,$R2,$T2 - - vpmuludq $T1,$S4,$M0 - vpmuludq $T1,$R0,$M1 - vpmuludq $T1,$R1,$M2 - vpmuludq $T1,$R2,$M3 - vpmuludq $T1,$R3,$M4 - vpsrlq \$32,$R3,$T3 - vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 - vpaddq $M1,$D1,$D1 # d1 += r1'*r0 - vpaddq $M2,$D2,$D2 # d2 += r1'*r1 - vpaddq $M3,$D3,$D3 # d3 += r1'*r2 - vpaddq $M4,$D4,$D4 # d4 += r1'*r3 - - vpmuludq $T2,$S3,$M0 - vpmuludq $T2,$S4,$M1 - vpmuludq $T2,$R1,$M3 - vpmuludq $T2,$R2,$M4 - vpmuludq $T2,$R0,$M2 - vpsrlq \$32,$R4,$T4 - vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 - vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 - vpaddq $M3,$D3,$D3 # d3 += r2'*r1 - vpaddq $M4,$D4,$D4 # d4 += r2'*r2 - vpaddq $M2,$D2,$D2 # d2 += r2'*r0 - - vpmuludq $T3,$S2,$M0 - vpmuludq $T3,$R0,$M3 - vpmuludq $T3,$R1,$M4 - vpmuludq 
$T3,$S3,$M1 - vpmuludq $T3,$S4,$M2 - vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 - vpaddq $M3,$D3,$D3 # d3 += r3'*r0 - vpaddq $M4,$D4,$D4 # d4 += r3'*r1 - vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 - vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 - - vpmuludq $T4,$S4,$M3 - vpmuludq $T4,$R0,$M4 - vpmuludq $T4,$S1,$M0 - vpmuludq $T4,$S2,$M1 - vpmuludq $T4,$S3,$M2 - vpaddq $M3,$D3,$D3 # d3 += r4'*5*r4 - vpaddq $M4,$D4,$D4 # d4 += r4'*r0 - vpaddq $M0,$D0,$D0 # d0 += r4'*5*r1 - vpaddq $M1,$D1,$D1 # d1 += r4'*5*r2 - vpaddq $M2,$D2,$D2 # d2 += r4'*5*r3 - - ################################################################ - # load input - vmovdqu64 16*0($inp),%z#$T3 - vmovdqu64 16*4($inp),%z#$T4 - lea 16*8($inp),$inp - - ################################################################ - # lazy reduction - - vpsrlq \$26,$D3,$M3 - vpandq $MASK,$D3,$D3 - vpaddq $M3,$D4,$D4 # d3 -> d4 - - vpsrlq \$26,$D0,$M0 - vpandq $MASK,$D0,$D0 - vpaddq $M0,$D1,$D1 # d0 -> d1 - - vpsrlq \$26,$D4,$M4 - vpandq $MASK,$D4,$D4 - - vpsrlq \$26,$D1,$M1 - vpandq $MASK,$D1,$D1 - vpaddq $M1,$D2,$D2 # d1 -> d2 - - vpaddq $M4,$D0,$D0 - vpsllq \$2,$M4,$M4 - vpaddq $M4,$D0,$D0 # d4 -> d0 - - vpsrlq \$26,$D2,$M2 - vpandq $MASK,$D2,$D2 - vpaddq $M2,$D3,$D3 # d2 -> d3 - - vpsrlq \$26,$D0,$M0 - vpandq $MASK,$D0,$D0 - vpaddq $M0,$D1,$D1 # d0 -> d1 - - vpsrlq \$26,$D3,$M3 - vpandq $MASK,$D3,$D3 - vpaddq $M3,$D4,$D4 # d3 -> d4 - - ################################################################ - # at this point we have 14243444 in $R0-$S4 and 05060708 in - # $D0-$D4, ... - - vpunpcklqdq $T4,$T3,$T0 # transpose input - vpunpckhqdq $T4,$T3,$T4 - - # ... since input 64-bit lanes are ordered as 73625140, we could - # "vperm" it to 76543210 (here and in each loop iteration), *or* - # we could just flow along, hence the goal for $R0-$S4 is - # 1858286838784888 ... 
- - vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: - mov \$0x7777,%eax - kmovw %eax,%k1 - - vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- - vpermd $R1,$M0,$R1 - vpermd $R2,$M0,$R2 - vpermd $R3,$M0,$R3 - vpermd $R4,$M0,$R4 - - vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 - vpermd $D1,$M0,${R1}{%k1} - vpermd $D2,$M0,${R2}{%k1} - vpermd $D3,$M0,${R3}{%k1} - vpermd $D4,$M0,${R4}{%k1} - - vpslld \$2,$R1,$S1 # *5 - vpslld \$2,$R2,$S2 - vpslld \$2,$R3,$S3 - vpslld \$2,$R4,$S4 - vpaddd $R1,$S1,$S1 - vpaddd $R2,$S2,$S2 - vpaddd $R3,$S3,$S3 - vpaddd $R4,$S4,$S4 - - vpbroadcastq 32(%rcx),$PADBIT # .L129 - - vpsrlq \$52,$T0,$T2 # splat input - vpsllq \$12,$T4,$T3 - vporq $T3,$T2,$T2 - vpsrlq \$26,$T0,$T1 - vpsrlq \$14,$T4,$T3 - vpsrlq \$40,$T4,$T4 # 4 - vpandq $MASK,$T2,$T2 # 2 - vpandq $MASK,$T0,$T0 # 0 - #vpandq $MASK,$T1,$T1 # 1 - #vpandq $MASK,$T3,$T3 # 3 - #vporq $PADBIT,$T4,$T4 # padbit, yes, always - - vpaddq $H2,$T2,$H2 # accumulate input - sub \$192,$len - jbe .Ltail_avx512 - jmp .Loop_avx512 - -.align 32 -.Loop_avx512: - ################################################################ - # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 - # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 - # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 - # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 - # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 - # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 - # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 - # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 - # \________/\___________/ - ################################################################ - #vpaddq $H2,$T2,$H2 # accumulate input - - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - # - # however, as h2 is "chronologically" first one available pull - # corresponding operations up, so it's - # - # d3 = h2*r1 + h0*r3 + h1*r2 + 
h3*r0 + h4*5*r4 - # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 - # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 - # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 - # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 - - vpmuludq $H2,$R1,$D3 # d3 = h2*r1 - vpaddq $H0,$T0,$H0 - vpmuludq $H2,$R2,$D4 # d4 = h2*r2 - vpandq $MASK,$T1,$T1 # 1 - vpmuludq $H2,$S3,$D0 # d0 = h2*s3 - vpandq $MASK,$T3,$T3 # 3 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - vporq $PADBIT,$T4,$T4 # padbit, yes, always - vpmuludq $H2,$R0,$D2 # d2 = h2*r0 - vpaddq $H1,$T1,$H1 # accumulate input - vpaddq $H3,$T3,$H3 - vpaddq $H4,$T4,$H4 - - vmovdqu64 16*0($inp),$T3 # load input - vmovdqu64 16*4($inp),$T4 - lea 16*8($inp),$inp - vpmuludq $H0,$R3,$M3 - vpmuludq $H0,$R4,$M4 - vpmuludq $H0,$R0,$M0 - vpmuludq $H0,$R1,$M1 - vpaddq $M3,$D3,$D3 # d3 += h0*r3 - vpaddq $M4,$D4,$D4 # d4 += h0*r4 - vpaddq $M0,$D0,$D0 # d0 += h0*r0 - vpaddq $M1,$D1,$D1 # d1 += h0*r1 - - vpmuludq $H1,$R2,$M3 - vpmuludq $H1,$R3,$M4 - vpmuludq $H1,$S4,$M0 - vpmuludq $H0,$R2,$M2 - vpaddq $M3,$D3,$D3 # d3 += h1*r2 - vpaddq $M4,$D4,$D4 # d4 += h1*r3 - vpaddq $M0,$D0,$D0 # d0 += h1*s4 - vpaddq $M2,$D2,$D2 # d2 += h0*r2 - - vpunpcklqdq $T4,$T3,$T0 # transpose input - vpunpckhqdq $T4,$T3,$T4 - - vpmuludq $H3,$R0,$M3 - vpmuludq $H3,$R1,$M4 - vpmuludq $H1,$R0,$M1 - vpmuludq $H1,$R1,$M2 - vpaddq $M3,$D3,$D3 # d3 += h3*r0 - vpaddq $M4,$D4,$D4 # d4 += h3*r1 - vpaddq $M1,$D1,$D1 # d1 += h1*r0 - vpaddq $M2,$D2,$D2 # d2 += h1*r1 - - vpmuludq $H4,$S4,$M3 - vpmuludq $H4,$R0,$M4 - vpmuludq $H3,$S2,$M0 - vpmuludq $H3,$S3,$M1 - vpaddq $M3,$D3,$D3 # d3 += h4*s4 - vpmuludq $H3,$S4,$M2 - vpaddq $M4,$D4,$D4 # d4 += h4*r0 - vpaddq $M0,$D0,$D0 # d0 += h3*s2 - vpaddq $M1,$D1,$D1 # d1 += h3*s3 - vpaddq $M2,$D2,$D2 # d2 += h3*s4 - - vpmuludq $H4,$S1,$M0 - vpmuludq $H4,$S2,$M1 - vpmuludq $H4,$S3,$M2 - vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 - vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2 - vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3 - - 
################################################################ - # lazy reduction (interleaved with input splat) - - vpsrlq \$52,$T0,$T2 # splat input - vpsllq \$12,$T4,$T3 - - vpsrlq \$26,$D3,$H3 - vpandq $MASK,$D3,$D3 - vpaddq $H3,$D4,$H4 # h3 -> h4 - - vporq $T3,$T2,$T2 - - vpsrlq \$26,$H0,$D0 - vpandq $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpandq $MASK,$T2,$T2 # 2 - - vpsrlq \$26,$H4,$D4 - vpandq $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpandq $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpaddq $T2,$H2,$H2 # modulo-scheduled - vpsrlq \$26,$T0,$T1 - - vpsrlq \$26,$H2,$D2 - vpandq $MASK,$H2,$H2 - vpaddq $D2,$D3,$H3 # h2 -> h3 - - vpsrlq \$14,$T4,$T3 - - vpsrlq \$26,$H0,$D0 - vpandq $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$40,$T4,$T4 # 4 - - vpsrlq \$26,$H3,$D3 - vpandq $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpandq $MASK,$T0,$T0 # 0 - #vpandq $MASK,$T1,$T1 # 1 - #vpandq $MASK,$T3,$T3 # 3 - #vporq $PADBIT,$T4,$T4 # padbit, yes, always - - sub \$128,$len - ja .Loop_avx512 - -.Ltail_avx512: - ################################################################ - # while above multiplications were by r^8 in all lanes, in last - # iteration we multiply least significant lane by r^8 and most - # significant one by r, that's why table gets shifted... 
- - vpsrlq \$32,$R0,$R0 # 0105020603070408 - vpsrlq \$32,$R1,$R1 - vpsrlq \$32,$R2,$R2 - vpsrlq \$32,$S3,$S3 - vpsrlq \$32,$S4,$S4 - vpsrlq \$32,$R3,$R3 - vpsrlq \$32,$R4,$R4 - vpsrlq \$32,$S1,$S1 - vpsrlq \$32,$S2,$S2 - - ################################################################ - # load either next or last 64 byte of input - lea ($inp,$len),$inp - - #vpaddq $H2,$T2,$H2 # accumulate input - vpaddq $H0,$T0,$H0 - - vpmuludq $H2,$R1,$D3 # d3 = h2*r1 - vpmuludq $H2,$R2,$D4 # d4 = h2*r2 - vpmuludq $H2,$S3,$D0 # d0 = h2*s3 - vpandq $MASK,$T1,$T1 # 1 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - vpandq $MASK,$T3,$T3 # 3 - vpmuludq $H2,$R0,$D2 # d2 = h2*r0 - vporq $PADBIT,$T4,$T4 # padbit, yes, always - vpaddq $H1,$T1,$H1 # accumulate input - vpaddq $H3,$T3,$H3 - vpaddq $H4,$T4,$H4 - - vmovdqu 16*0($inp),%x#$T0 - vpmuludq $H0,$R3,$M3 - vpmuludq $H0,$R4,$M4 - vpmuludq $H0,$R0,$M0 - vpmuludq $H0,$R1,$M1 - vpaddq $M3,$D3,$D3 # d3 += h0*r3 - vpaddq $M4,$D4,$D4 # d4 += h0*r4 - vpaddq $M0,$D0,$D0 # d0 += h0*r0 - vpaddq $M1,$D1,$D1 # d1 += h0*r1 - - vmovdqu 16*1($inp),%x#$T1 - vpmuludq $H1,$R2,$M3 - vpmuludq $H1,$R3,$M4 - vpmuludq $H1,$S4,$M0 - vpmuludq $H0,$R2,$M2 - vpaddq $M3,$D3,$D3 # d3 += h1*r2 - vpaddq $M4,$D4,$D4 # d4 += h1*r3 - vpaddq $M0,$D0,$D0 # d0 += h1*s4 - vpaddq $M2,$D2,$D2 # d2 += h0*r2 - - vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 - vpmuludq $H3,$R0,$M3 - vpmuludq $H3,$R1,$M4 - vpmuludq $H1,$R0,$M1 - vpmuludq $H1,$R1,$M2 - vpaddq $M3,$D3,$D3 # d3 += h3*r0 - vpaddq $M4,$D4,$D4 # d4 += h3*r1 - vpaddq $M1,$D1,$D1 # d1 += h1*r0 - vpaddq $M2,$D2,$D2 # d2 += h1*r1 - - vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 - vpmuludq $H4,$S4,$M3 - vpmuludq $H4,$R0,$M4 - vpmuludq $H3,$S2,$M0 - vpmuludq $H3,$S3,$M1 - vpmuludq $H3,$S4,$M2 - vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 - vpaddq $M4,$D4,$D4 # d4 += h4*r0 - vpaddq $M0,$D0,$D0 # d0 += h3*s2 - vpaddq $M1,$D1,$D1 # d1 += h3*s3 - vpaddq $M2,$D2,$D2 # d2 += h3*s4 - - vpmuludq $H4,$S1,$M0 - vpmuludq $H4,$S2,$M1 - vpmuludq $H4,$S3,$M2 - 
vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 - vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2 - vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3 - - ################################################################ - # horizontal addition - - mov \$1,%eax - vpermq \$0xb1,$H3,$D3 - vpermq \$0xb1,$D4,$H4 - vpermq \$0xb1,$H0,$D0 - vpermq \$0xb1,$H1,$D1 - vpermq \$0xb1,$H2,$D2 - vpaddq $D3,$H3,$H3 - vpaddq $D4,$H4,$H4 - vpaddq $D0,$H0,$H0 - vpaddq $D1,$H1,$H1 - vpaddq $D2,$H2,$H2 - - kmovw %eax,%k3 - vpermq \$0x2,$H3,$D3 - vpermq \$0x2,$H4,$D4 - vpermq \$0x2,$H0,$D0 - vpermq \$0x2,$H1,$D1 - vpermq \$0x2,$H2,$D2 - vpaddq $D3,$H3,$H3 - vpaddq $D4,$H4,$H4 - vpaddq $D0,$H0,$H0 - vpaddq $D1,$H1,$H1 - vpaddq $D2,$H2,$H2 - - vextracti64x4 \$0x1,$H3,%y#$D3 - vextracti64x4 \$0x1,$H4,%y#$D4 - vextracti64x4 \$0x1,$H0,%y#$D0 - vextracti64x4 \$0x1,$H1,%y#$D1 - vextracti64x4 \$0x1,$H2,%y#$D2 - vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case - vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 - vpaddq $D0,$H0,${H0}{%k3}{z} - vpaddq $D1,$H1,${H1}{%k3}{z} - vpaddq $D2,$H2,${H2}{%k3}{z} -___ -map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); -map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); -$code.=<<___; - ################################################################ - # lazy reduction (interleaved with input splat) - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpsrldq \$6,$T0,$T2 # splat input - vpsrldq \$6,$T1,$T3 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpunpcklqdq $T3,$T2,$T2 # 2:3 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D4 - vpand $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpsrlq \$30,$T2,$T3 - vpsrlq \$4,$T2,$T2 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpsrlq \$26,$T0,$T1 - vpsrlq \$40,$T4,$T4 # 4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpand $MASK,$T2,$T2 # 2 - vpand 
$MASK,$T0,$T0 # 0 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 - vpand $MASK,$T1,$T1 # 1 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - vpaddq $D3,$H4,$H4 # h3 -> h4 - - lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 - add \$64,$len - jnz .Ltail_avx2$suffix - - vpsubq $T2,$H2,$H2 # undo input accumulation - vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced - vmovd %x#$H1,`4*1-48-64`($ctx) - vmovd %x#$H2,`4*2-48-64`($ctx) - vmovd %x#$H3,`4*3-48-64`($ctx) - vmovd %x#$H4,`4*4-48-64`($ctx) - vzeroall -___ -$code.=<<___ if ($win64); - movdqa -0xb0(%r10),%xmm6 - movdqa -0xa0(%r10),%xmm7 - movdqa -0x90(%r10),%xmm8 - movdqa -0x80(%r10),%xmm9 - movdqa -0x70(%r10),%xmm10 - movdqa -0x60(%r10),%xmm11 - movdqa -0x50(%r10),%xmm12 - movdqa -0x40(%r10),%xmm13 - movdqa -0x30(%r10),%xmm14 - movdqa -0x20(%r10),%xmm15 - lea -8(%r10),%rsp -.Ldo_avx512_epilogue: -___ -$code.=<<___ if (!$win64); - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -___ -$code.=<<___; - RET -.cfi_endproc -___ - -} - -} - -&declare_function("poly1305_blocks_avx2", 32, 4); -poly1305_blocks_avxN(0); -&end_function("poly1305_blocks_avx2"); - -####################################################################### -if ($avx>2) { -# On entry we have input length divisible by 64. But since inner loop -# processes 128 bytes per iteration, cases when length is not divisible -# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this -# reason stack layout is kept identical to poly1305_blocks_avx2. If not -# for this tail, we wouldn't have to even allocate stack frame... 
- -&declare_function("poly1305_blocks_avx512", 32, 4); -poly1305_blocks_avxN(1); -&end_function("poly1305_blocks_avx512"); - -if (!$kernel && $avx>3) { -######################################################################## -# VPMADD52 version using 2^44 radix. -# -# One can argue that base 2^52 would be more natural. Well, even though -# some operations would be more natural, one has to recognize couple of -# things. Base 2^52 doesn't provide advantage over base 2^44 if you look -# at amount of multiply-n-accumulate operations. Secondly, it makes it -# impossible to pre-compute multiples of 5 [referred to as s[]/sN in -# reference implementations], which means that more such operations -# would have to be performed in inner loop, which in turn makes critical -# path longer. In other words, even though base 2^44 reduction might -# look less elegant, overall critical path is actually shorter... - -######################################################################## -# Layout of opaque area is following. 
-# -# unsigned __int64 h[3]; # current hash value base 2^44 -# unsigned __int64 s[2]; # key value*20 base 2^44 -# unsigned __int64 r[3]; # key value base 2^44 -# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; -# # r^n positions reflect -# # placement in register, not -# # memory, R[3] is R[1]*20 - -# poly1305_init_base2_44 zeroes h[0..2], masks the 16-byte key at ($inp) -# and splits it into r0 (low 44 bits), r1 (next 44 bits) and r2 (remaining -# high bits); it also stores s1 = r1*20 and s2 = r2*20, writes -1 to -# 64($ctx) as the "key powers not computed yet" marker, stores the addresses -# of the blocks/emit routines through %rdx and returns 1 in %eax. -$code.=<<___; -.type poly1305_init_base2_44,\@function,3 -.align 32 -poly1305_init_base2_44: - xor %eax,%eax - mov %rax,0($ctx) # initialize hash value - mov %rax,8($ctx) - mov %rax,16($ctx) - -.Linit_base2_44: - lea poly1305_blocks_vpmadd52(%rip),%r10 - lea poly1305_emit_base2_44(%rip),%r11 - - mov \$0x0ffffffc0fffffff,%rax # mask for key bytes 0..7 - mov \$0x0ffffffc0ffffffc,%rcx # mask for key bytes 8..15 - and 0($inp),%rax - mov \$0x00000fffffffffff,%r8 # 44-bit limb mask - and 8($inp),%rcx - mov \$0x00000fffffffffff,%r9 - and %rax,%r8 - shrd \$44,%rcx,%rax # key bits 44 and up - mov %r8,40($ctx) # r0 - and %r9,%rax - shr \$24,%rcx # key bits 88 and up - mov %rax,48($ctx) # r1 - lea (%rax,%rax,4),%rax # *5 - mov %rcx,56($ctx) # r2 - shl \$2,%rax # magic <<2 - lea (%rcx,%rcx,4),%rcx # *5 - shl \$2,%rcx # magic <<2 - mov %rax,24($ctx) # s1 - mov %rcx,32($ctx) # s2 - movq \$-1,64($ctx) # write impossible value -___ -$code.=<<___ if ($flavour !~ /elf32/); - mov %r10,0(%rdx) - mov %r11,8(%rdx) -___ -$code.=<<___ if ($flavour =~ /elf32/); - mov %r10d,0(%rdx) - mov %r11d,4(%rdx) -___ -$code.=<<___; - mov \$1,%eax - RET -.size poly1305_init_base2_44,.-poly1305_init_base2_44 -___ -# poly1305_blocks_vpmadd52 consumes one 16-byte block per pass of -# .Loop_vpmadd52. It takes only the leading blocks selected by the length -# test below and jumps to .Lblocks_vpmadd52_4x for whatever remains. -{ -my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); -my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); -my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); # range yields four names, %ymm25 stays unbound - -$code.=<<___; -.type poly1305_blocks_vpmadd52,\@function,4 -.align 32 -poly1305_blocks_vpmadd52: - shr \$4,$len # byte count -> 16-byte block count - jz .Lno_data_vpmadd52 # too short - - shl \$40,$padbit # 2^128 is bit 40 of the third limb (44+44+40) - mov 64($ctx),%r8 # peek on power of the key - - # if powers of the key are not calculated yet, process up to 3 - # blocks with this single-block subroutine, otherwise ensure that - # length is divisible by 2 blocks and 
pass the rest down to next - # subroutine... - - mov \$3,%rax # mask 3: short input and no key powers yet - mov \$1,%r10 # mask 1: only an odd leading block - cmp \$4,$len # is input long - cmovae %r10,%rax - test %r8,%r8 # is power value impossible? - cmovns %r10,%rax - - and $len,%rax # is input of favourable length? - jz .Lblocks_vpmadd52_4x - - sub %rax,$len # %rax blocks are handled by the loop below - mov \$7,%r10d - mov \$1,%r11d - kmovw %r10d,%k7 # %k7 = 0b111, three low qwords - lea .L2_44_inp_permd(%rip),%r10 - kmovw %r11d,%k1 # %k1 = 0b001, lowest qword only - - vmovq $padbit,%x#$PAD - vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd - vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift - vpermq \$0xcf,$PAD,$PAD # padbit -> qword 2, other qwords zero - vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask - - vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value - vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys - vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} - vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} - - vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt - vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft - - jmp .Loop_vpmadd52 - -.align 32 -.Loop_vpmadd52: - vmovdqu32 0($inp),%x#$T0 # load input as ----3210 - lea 16($inp),$inp - - vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 - vpsrlvq $inp_shift,$T0,$T0 - vpandq $reduc_mask,$T0,$T0 - vporq $PAD,$T0,$T0 - - vpaddq $T0,$Dlo,$Dlo # accumulate input - - vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value - vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} # broadcast limb 1 - vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} # broadcast limb 2 - - vpxord $Dlo,$Dlo,$Dlo # clear low-half accumulator - vpxord $Dhi,$Dhi,$Dhi # clear high-half accumulator - - vpmadd52luq $r2r1r0,$H0,$Dlo - vpmadd52huq $r2r1r0,$H0,$Dhi - - vpmadd52luq $r1r0s2,$H1,$Dlo - vpmadd52huq $r1r0s2,$H1,$Dhi - - vpmadd52luq $r0s2s1,$H2,$Dlo - vpmadd52huq $r0s2s1,$H2,$Dhi - - vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword - vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword - vpandq $reduc_mask,$Dlo,$Dlo - - vpaddq $T0,$Dhi,$Dhi - - vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword - - vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) - - vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword - vpandq $reduc_mask,$Dlo,$Dlo - - vpermq \$0b10010011,$T0,$T0 - - vpaddq $T0,$Dlo,$Dlo - - vpermq 
\$0b10010011,$Dlo,${T0}{%k1}{z} - - vpaddq $T0,$Dlo,$Dlo - vpsllq \$2,$T0,$T0 - - vpaddq $T0,$Dlo,$Dlo - - dec %rax # len-=16 - jnz .Loop_vpmadd52 - - vmovdqu64 $Dlo,0($ctx){%k7} # store hash value - - test $len,$len # any blocks left for the 4x path? - jnz .Lblocks_vpmadd52_4x - -.Lno_data_vpmadd52: - RET -.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 -___ -} -{ -######################################################################## -# As implied by its name 4x subroutine processes 4 blocks in parallel -# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power -# and is handled in 256-bit %ymm registers. - -my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); -my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); -my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); - -$code.=<<___; -.type poly1305_blocks_vpmadd52_4x,\@function,4 -.align 32 -poly1305_blocks_vpmadd52_4x: - shr \$4,$len - jz .Lno_data_vpmadd52_4x # too short - - shl \$40,$padbit - mov 64($ctx),%r8 # peek on power of the key - -.Lblocks_vpmadd52_4x: - vpbroadcastq $padbit,$PAD - - vmovdqa64 .Lx_mask44(%rip),$mask44 - mov \$5,%eax - vmovdqa64 .Lx_mask42(%rip),$mask42 - kmovw %eax,%k1 # used in 2x path - - test %r8,%r8 # is power value impossible? - js .Linit_vpmadd52 # if it is, then init R[4] - - vmovq 0($ctx),%x#$H0 # load current hash value - vmovq 8($ctx),%x#$H1 - vmovq 16($ctx),%x#$H2 - - test \$3,$len # is length 4*n+2? - jnz .Lblocks_vpmadd52_2x_do - -.Lblocks_vpmadd52_4x_do: - vpbroadcastq 64($ctx),$R0 # load 4th power of the key - vpbroadcastq 96($ctx),$R1 - vpbroadcastq 128($ctx),$R2 - vpbroadcastq 160($ctx),$S1 - -.Lblocks_vpmadd52_4x_key_loaded: - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S2,$S2 - - test \$7,$len # is len 8*n? 
- jz .Lblocks_vpmadd52_8x - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*2($inp),$T3 - lea 16*4($inp),$inp - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - - # at this point 64-bit lanes are ordered as 3-1-2-0 - - vpsrlq \$24,$T3,$T2 # splat the data - vporq $PAD,$T2,$T2 - vpaddq $T2,$H2,$H2 # accumulate input - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - sub \$4,$len - jz .Ltail_vpmadd52_4x - jmp .Loop_vpmadd52_4x - ud2 - -.align 32 -.Linit_vpmadd52: - vmovq 24($ctx),%x#$S1 # load key - vmovq 56($ctx),%x#$H2 - vmovq 32($ctx),%x#$S2 - vmovq 40($ctx),%x#$R0 - vmovq 48($ctx),%x#$R1 - - vmovdqa $R0,$H0 - vmovdqa $R1,$H1 - vmovdqa $H2,$R2 - - mov \$2,%eax - -.Lmul_init_vpmadd52: - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$R0,$D2hi - - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - 
- vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - dec %eax - jz .Ldone_init_vpmadd52 - - vpunpcklqdq $R1,$H1,$R1 # 1,2 - vpbroadcastq %x#$H1,%x#$H1 # 2,2 - vpunpcklqdq $R2,$H2,$R2 - vpbroadcastq %x#$H2,%x#$H2 - vpunpcklqdq $R0,$H0,$R0 - vpbroadcastq %x#$H0,%x#$H0 - - vpsllq \$2,$R1,$S1 # S1 = R1*5*4 - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R1,$S1,$S1 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S1,$S1 - vpsllq \$2,$S2,$S2 - - jmp .Lmul_init_vpmadd52 - ud2 - -.align 32 -.Ldone_init_vpmadd52: - vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 - vinserti128 \$1,%x#$R2,$H2,$R2 - vinserti128 \$1,%x#$R0,$H0,$R0 - - vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 - vpermq \$0b11011000,$R2,$R2 - vpermq \$0b11011000,$R0,$R0 - - vpsllq \$2,$R1,$S1 # S1 = R1*5*4 - vpaddq $R1,$S1,$S1 - vpsllq \$2,$S1,$S1 - - vmovq 0($ctx),%x#$H0 # load current hash value - vmovq 8($ctx),%x#$H1 - vmovq 16($ctx),%x#$H2 - - test \$3,$len # is length 4*n+2? - jnz .Ldone_init_vpmadd52_2x - - vmovdqu64 $R0,64($ctx) # save key powers - vpbroadcastq %x#$R0,$R0 # broadcast 4th power - vmovdqu64 $R1,96($ctx) - vpbroadcastq %x#$R1,$R1 - vmovdqu64 $R2,128($ctx) - vpbroadcastq %x#$R2,$R2 - vmovdqu64 $S1,160($ctx) - vpbroadcastq %x#$S1,$S1 - - jmp .Lblocks_vpmadd52_4x_key_loaded - ud2 - -.align 32 -.Ldone_init_vpmadd52_2x: - vmovdqu64 $R0,64($ctx) # save key powers - vpsrldq \$8,$R0,$R0 # 0-1-0-2 - vmovdqu64 $R1,96($ctx) - vpsrldq \$8,$R1,$R1 - vmovdqu64 $R2,128($ctx) - vpsrldq \$8,$R2,$R2 - vmovdqu64 $S1,160($ctx) - vpsrldq \$8,$S1,$S1 - jmp .Lblocks_vpmadd52_2x_key_loaded - ud2 - -.align 32 -.Lblocks_vpmadd52_2x_do: - vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers - vmovdqu64 160+8($ctx),${S1}{%k1}{z} - vmovdqu64 64+8($ctx),${R0}{%k1}{z} - vmovdqu64 96+8($ctx),${R1}{%k1}{z} - -.Lblocks_vpmadd52_2x_key_loaded: - vmovdqu64 16*0($inp),$T2 # load data - vpxorq $T3,$T3,$T3 - lea 16*2($inp),$inp - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - - # at 
this point 64-bit lanes are ordered as x-1-x-0 - - vpsrlq \$24,$T3,$T2 # splat the data - vporq $PAD,$T2,$T2 - vpaddq $T2,$H2,$H2 # accumulate input - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - jmp .Ltail_vpmadd52_2x - ud2 - -.align 32 -.Loop_vpmadd52_4x: - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$R0,$D2hi - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*2($inp),$T3 - lea 16*4($inp),$inp - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # partial reduction (interleaved with data splat) - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpsrlq \$24,$T3,$T2 - vporq $PAD,$T2,$T2 - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vporq $T3,$T1,$T1 - vpandq 
$mask44,$T1,$T1 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - sub \$4,$len # len-=64 - jnz .Loop_vpmadd52_4x - -.Ltail_vpmadd52_4x: - vmovdqu64 128($ctx),$R2 # load all key powers - vmovdqu64 160($ctx),$S1 - vmovdqu64 64($ctx),$R0 - vmovdqu64 96($ctx),$R1 - -.Ltail_vpmadd52_2x: - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S2,$S2 - - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$R0,$D2hi - - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # horizontal addition - - mov \$1,%eax - kmovw %eax,%k1 - vpsrldq \$8,$D0lo,$T0 - vpsrldq \$8,$D0hi,$H0 - vpsrldq \$8,$D1lo,$T1 - vpsrldq \$8,$D1hi,$H1 - vpaddq $T0,$D0lo,$D0lo - vpaddq $H0,$D0hi,$D0hi - vpsrldq \$8,$D2lo,$T2 - vpsrldq \$8,$D2hi,$H2 - vpaddq $T1,$D1lo,$D1lo - vpaddq $H1,$D1hi,$D1hi - vpermq \$0x2,$D0lo,$T0 - vpermq \$0x2,$D0hi,$H0 - vpaddq $T2,$D2lo,$D2lo - vpaddq $H2,$D2hi,$D2hi - - vpermq \$0x2,$D1lo,$T1 - vpermq \$0x2,$D1hi,$H1 - vpaddq $T0,$D0lo,${D0lo}{%k1}{z} - vpaddq $H0,$D0hi,${D0hi}{%k1}{z} - vpermq \$0x2,$D2lo,$T2 - vpermq \$0x2,$D2hi,$H2 - vpaddq $T1,$D1lo,${D1lo}{%k1}{z} - vpaddq $H1,$D1hi,${D1hi}{%k1}{z} - vpaddq $T2,$D2lo,${D2lo}{%k1}{z} - vpaddq $H2,$D2hi,${D2hi}{%k1}{z} - - 
################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - # at this point $len is - # either 4*n+2 or 0... - sub \$2,$len # len-=32 - ja .Lblocks_vpmadd52_4x_do - - vmovq %x#$H0,0($ctx) - vmovq %x#$H1,8($ctx) - vmovq %x#$H2,16($ctx) - vzeroall - -.Lno_data_vpmadd52_4x: - RET -.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x -___ -} -{ -######################################################################## -# As implied by its name 8x subroutine processes 8 blocks in parallel... -# This is intermediate version, as it's used only in cases when input -# length is either 8*n, 8*n+1 or 8*n+2... - -my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); -my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); -my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); -my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); - -$code.=<<___; -.type poly1305_blocks_vpmadd52_8x,\@function,4 -.align 32 -poly1305_blocks_vpmadd52_8x: - shr \$4,$len - jz .Lno_data_vpmadd52_8x # too short - - shl \$40,$padbit - mov 64($ctx),%r8 # peek on power of the key - - vmovdqa64 .Lx_mask44(%rip),$mask44 - vmovdqa64 .Lx_mask42(%rip),$mask42 - - test %r8,%r8 # is power value impossible? 
- js .Linit_vpmadd52 # if it is, then init R[4] - - vmovq 0($ctx),%x#$H0 # load current hash value - vmovq 8($ctx),%x#$H1 - vmovq 16($ctx),%x#$H2 - -.Lblocks_vpmadd52_8x: - ################################################################ - # fist we calculate more key powers - - vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers - vmovdqu64 160($ctx),$S1 - vmovdqu64 64($ctx),$R0 - vmovdqu64 96($ctx),$R1 - - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S2,$S2 - - vpbroadcastq %x#$R2,$RR2 # broadcast 4th power - vpbroadcastq %x#$R0,$RR0 - vpbroadcastq %x#$R1,$RR1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $RR2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $RR2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $RR2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $RR2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $RR2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $RR2,$R0,$D2hi - - vpmadd52luq $RR0,$R0,$D0lo - vpmadd52huq $RR0,$R0,$D0hi - vpmadd52luq $RR0,$R1,$D1lo - vpmadd52huq $RR0,$R1,$D1hi - vpmadd52luq $RR0,$R2,$D2lo - vpmadd52huq $RR0,$R2,$D2hi - - vpmadd52luq $RR1,$S2,$D0lo - vpmadd52huq $RR1,$S2,$D0hi - vpmadd52luq $RR1,$R0,$D1lo - vpmadd52huq $RR1,$R0,$D1hi - vpmadd52luq $RR1,$R1,$D2lo - vpmadd52huq $RR1,$R1,$D2hi - - ################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$RR0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$RR1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$RR2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$RR0,$RR0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$RR0,$RR0 - - vpsrlq \$44,$RR0,$tmp # additional step - vpandq $mask44,$RR0,$RR0 - - vpaddq $tmp,$RR1,$RR1 - - ################################################################ - # At this 
point Rx holds 1324 powers, RRx - 5768, and the goal - # is 15263748, which reflects how data is loaded... - - vpunpcklqdq $R2,$RR2,$T2 # 3748 - vpunpckhqdq $R2,$RR2,$R2 # 1526 - vpunpcklqdq $R0,$RR0,$T0 - vpunpckhqdq $R0,$RR0,$R0 - vpunpcklqdq $R1,$RR1,$T1 - vpunpckhqdq $R1,$RR1,$R1 -___ -######## switch to %zmm -map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); -map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); -map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); -map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); - -$code.=<<___; - vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 - vshufi64x2 \$0x44,$R0,$T0,$RR0 - vshufi64x2 \$0x44,$R1,$T1,$RR1 - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*4($inp),$T3 - lea 16*8($inp),$inp - - vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 - vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 - vpaddq $RR2,$SS2,$SS2 - vpaddq $RR1,$SS1,$SS1 - vpsllq \$2,$SS2,$SS2 - vpsllq \$2,$SS1,$SS1 - - vpbroadcastq $padbit,$PAD - vpbroadcastq %x#$mask44,$mask44 - vpbroadcastq %x#$mask42,$mask42 - - vpbroadcastq %x#$SS1,$S1 # broadcast 8th power - vpbroadcastq %x#$SS2,$S2 - vpbroadcastq %x#$RR0,$R0 - vpbroadcastq %x#$RR1,$R1 - vpbroadcastq %x#$RR2,$R2 - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - - # at this point 64-bit lanes are ordered as 73625140 - - vpsrlq \$24,$T3,$T2 # splat the data - vporq $PAD,$T2,$T2 - vpaddq $T2,$H2,$H2 # accumulate input - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - sub \$8,$len - jz .Ltail_vpmadd52_8x - jmp .Loop_vpmadd52_8x - -.align 32 -.Loop_vpmadd52_8x: - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - 
vpmadd52huq $H2,$R0,$D2hi - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*4($inp),$T3 - lea 16*8($inp),$inp - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # partial reduction (interleaved with data splat) - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpsrlq \$24,$T3,$T2 - vporq $PAD,$T2,$T2 - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - sub \$8,$len # len-=128 - jnz .Loop_vpmadd52_8x - -.Ltail_vpmadd52_8x: - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$SS1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$SS1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$SS2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$SS2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$RR0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$RR0,$D2hi - - vpmadd52luq $H0,$RR0,$D0lo - vpmadd52huq $H0,$RR0,$D0hi - vpmadd52luq $H0,$RR1,$D1lo - vpmadd52huq $H0,$RR1,$D1hi - 
vpmadd52luq $H0,$RR2,$D2lo - vpmadd52huq $H0,$RR2,$D2hi - - vpmadd52luq $H1,$SS2,$D0lo - vpmadd52huq $H1,$SS2,$D0hi - vpmadd52luq $H1,$RR0,$D1lo - vpmadd52huq $H1,$RR0,$D1hi - vpmadd52luq $H1,$RR1,$D2lo - vpmadd52huq $H1,$RR1,$D2hi - - ################################################################ - # horizontal addition - - mov \$1,%eax - kmovw %eax,%k1 - vpsrldq \$8,$D0lo,$T0 - vpsrldq \$8,$D0hi,$H0 - vpsrldq \$8,$D1lo,$T1 - vpsrldq \$8,$D1hi,$H1 - vpaddq $T0,$D0lo,$D0lo - vpaddq $H0,$D0hi,$D0hi - vpsrldq \$8,$D2lo,$T2 - vpsrldq \$8,$D2hi,$H2 - vpaddq $T1,$D1lo,$D1lo - vpaddq $H1,$D1hi,$D1hi - vpermq \$0x2,$D0lo,$T0 - vpermq \$0x2,$D0hi,$H0 - vpaddq $T2,$D2lo,$D2lo - vpaddq $H2,$D2hi,$D2hi - - vpermq \$0x2,$D1lo,$T1 - vpermq \$0x2,$D1hi,$H1 - vpaddq $T0,$D0lo,$D0lo - vpaddq $H0,$D0hi,$D0hi - vpermq \$0x2,$D2lo,$T2 - vpermq \$0x2,$D2hi,$H2 - vpaddq $T1,$D1lo,$D1lo - vpaddq $H1,$D1hi,$D1hi - vextracti64x4 \$1,$D0lo,%y#$T0 - vextracti64x4 \$1,$D0hi,%y#$H0 - vpaddq $T2,$D2lo,$D2lo - vpaddq $H2,$D2hi,$D2hi - - vextracti64x4 \$1,$D1lo,%y#$T1 - vextracti64x4 \$1,$D1hi,%y#$H1 - vextracti64x4 \$1,$D2lo,%y#$T2 - vextracti64x4 \$1,$D2hi,%y#$H2 -___ -######## switch back to %ymm -map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); -map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); -map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); - -$code.=<<___; - vpaddq $T0,$D0lo,${D0lo}{%k1}{z} - vpaddq $H0,$D0hi,${D0hi}{%k1}{z} - vpaddq $T1,$D1lo,${D1lo}{%k1}{z} - vpaddq $H1,$D1hi,${D1hi}{%k1}{z} - vpaddq $T2,$D2lo,${D2lo}{%k1}{z} - vpaddq $H2,$D2hi,${D2hi}{%k1}{z} - - ################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq 
$mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - ################################################################ - - vmovq %x#$H0,0($ctx) - vmovq %x#$H1,8($ctx) - vmovq %x#$H2,16($ctx) - vzeroall - -.Lno_data_vpmadd52_8x: - RET -.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x -___ -} -$code.=<<___; -.type poly1305_emit_base2_44,\@function,3 -.align 32 -poly1305_emit_base2_44: - mov 0($ctx),%r8 # load hash value - mov 8($ctx),%r9 - mov 16($ctx),%r10 - - mov %r9,%rax - shr \$20,%r9 - shl \$44,%rax - mov %r10,%rcx - shr \$40,%r10 - shl \$24,%rcx - - add %rax,%r8 - adc %rcx,%r9 - adc \$0,%r10 - - mov %r8,%rax - add \$5,%r8 # compare to modulus - mov %r9,%rcx - adc \$0,%r9 - adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overflow? - cmovnz %r8,%rax - cmovnz %r9,%rcx - - add 0($nonce),%rax # accumulate nonce - adc 8($nonce),%rcx - mov %rax,0($mac) # write result - mov %rcx,8($mac) - - RET -.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 -___ -} } } -} - -if (!$kernel) -{ # chacha20-poly1305 helpers -my ($out,$inp,$otp,$len)=$win64 ? 
("%rcx","%rdx","%r8", "%r9") : # Win64 order - ("%rdi","%rsi","%rdx","%rcx"); # Unix order -$code.=<<___; -.globl xor128_encrypt_n_pad -.type xor128_encrypt_n_pad,\@abi-omnipotent -.align 16 -xor128_encrypt_n_pad: - sub $otp,$inp - sub $otp,$out - mov $len,%r10 # put len aside - shr \$4,$len # len / 16 - jz .Ltail_enc - nop -.Loop_enc_xmm: - movdqu ($inp,$otp),%xmm0 - pxor ($otp),%xmm0 - movdqu %xmm0,($out,$otp) - movdqa %xmm0,($otp) - lea 16($otp),$otp - dec $len - jnz .Loop_enc_xmm - - and \$15,%r10 # len % 16 - jz .Ldone_enc - -.Ltail_enc: - mov \$16,$len - sub %r10,$len - xor %eax,%eax -.Loop_enc_byte: - mov ($inp,$otp),%al - xor ($otp),%al - mov %al,($out,$otp) - mov %al,($otp) - lea 1($otp),$otp - dec %r10 - jnz .Loop_enc_byte - - xor %eax,%eax -.Loop_enc_pad: - mov %al,($otp) - lea 1($otp),$otp - dec $len - jnz .Loop_enc_pad - -.Ldone_enc: - mov $otp,%rax - RET -.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad - -.globl xor128_decrypt_n_pad -.type xor128_decrypt_n_pad,\@abi-omnipotent -.align 16 -xor128_decrypt_n_pad: - sub $otp,$inp - sub $otp,$out - mov $len,%r10 # put len aside - shr \$4,$len # len / 16 - jz .Ltail_dec - nop -.Loop_dec_xmm: - movdqu ($inp,$otp),%xmm0 - movdqa ($otp),%xmm1 - pxor %xmm0,%xmm1 - movdqu %xmm1,($out,$otp) - movdqa %xmm0,($otp) - lea 16($otp),$otp - dec $len - jnz .Loop_dec_xmm - - pxor %xmm1,%xmm1 - and \$15,%r10 # len % 16 - jz .Ldone_dec - -.Ltail_dec: - mov \$16,$len - sub %r10,$len - xor %eax,%eax - xor %r11d,%r11d -.Loop_dec_byte: - mov ($inp,$otp),%r11b - mov ($otp),%al - xor %r11b,%al - mov %al,($out,$otp) - mov %r11b,($otp) - lea 1($otp),$otp - dec %r10 - jnz .Loop_dec_byte - - xor %eax,%eax -.Loop_dec_pad: - mov %al,($otp) - lea 1($otp),$otp - dec $len - jnz .Loop_dec_pad - -.Ldone_dec: - mov $otp,%rax - RET -.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad -___ -} - -# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, -# CONTEXT *context,DISPATCHER_CONTEXT *disp) -if ($win64) { -$rec="%rcx"; 
-$frame="%rdx"; -$context="%r8"; -$disp="%r9"; - -$code.=<<___; -.extern __imp_RtlVirtualUnwind -.type se_handler,\@abi-omnipotent -.align 16 -se_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<.Lprologue - jb .Lcommon_seh_tail - - mov 152($context),%rax # pull context->Rsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=.Lepilogue - jae .Lcommon_seh_tail - - lea 48(%rax),%rax - - mov -8(%rax),%rbx - mov -16(%rax),%rbp - mov -24(%rax),%r12 - mov -32(%rax),%r13 - mov -40(%rax),%r14 - mov -48(%rax),%r15 - mov %rbx,144($context) # restore context->Rbx - mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore context->R12 - mov %r13,224($context) # restore context->R13 - mov %r14,232($context) # restore context->R14 - mov %r15,240($context) # restore context->R14 - - jmp .Lcommon_seh_tail -.size se_handler,.-se_handler - -.type avx_handler,\@abi-omnipotent -.align 16 -avx_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<prologue label - jb .Lcommon_seh_tail - - mov 152($context),%rax # pull context->Rsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=epilogue label - jae .Lcommon_seh_tail - - mov 
208($context),%rax # pull context->R11 - - lea 0x50(%rax),%rsi - lea 0xf8(%rax),%rax - lea 512($context),%rdi # &context.Xmm6 - mov \$20,%ecx - .long 0xa548f3fc # cld; rep movsq - -.Lcommon_seh_tail: - mov 8(%rax),%rdi - mov 16(%rax),%rsi - mov %rax,152($context) # restore context->Rsp - mov %rsi,168($context) # restore context->Rsi - mov %rdi,176($context) # restore context->Rdi - - mov 40($disp),%rdi # disp->ContextRecord - mov $context,%rsi # context - mov \$154,%ecx # sizeof(CONTEXT) - .long 0xa548f3fc # cld; rep movsq - - mov $disp,%rsi - xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER - mov 8(%rsi),%rdx # arg2, disp->ImageBase - mov 0(%rsi),%r8 # arg3, disp->ControlPc - mov 16(%rsi),%r9 # arg4, disp->FunctionEntry - mov 40(%rsi),%r10 # disp->ContextRecord - lea 56(%rsi),%r11 # &disp->HandlerData - lea 24(%rsi),%r12 # &disp->EstablisherFrame - mov %r10,32(%rsp) # arg5 - mov %r11,40(%rsp) # arg6 - mov %r12,48(%rsp) # arg7 - mov %rcx,56(%rsp) # arg8, (NULL) - call *__imp_RtlVirtualUnwind(%rip) - - mov \$1,%eax # ExceptionContinueSearch - add \$64,%rsp - popfq - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - pop %rdi - pop %rsi - RET -.size avx_handler,.-avx_handler - -.section .pdata -.align 4 - .rva .LSEH_begin_poly1305_block_init_arch - .rva .LSEH_end_poly1305_block_init_arch - .rva .LSEH_info_poly1305_block_init_arch - - .rva .LSEH_begin_poly1305_blocks_x86_64 - .rva .LSEH_end_poly1305_blocks_x86_64 - .rva .LSEH_info_poly1305_blocks_x86_64 - - .rva .LSEH_begin_poly1305_emit_x86_64 - .rva .LSEH_end_poly1305_emit_x86_64 - .rva .LSEH_info_poly1305_emit_x86_64 -___ -$code.=<<___ if ($avx); - .rva .LSEH_begin_poly1305_blocks_avx - .rva .Lbase2_64_avx - .rva .LSEH_info_poly1305_blocks_avx_1 - - .rva .Lbase2_64_avx - .rva .Leven_avx - .rva .LSEH_info_poly1305_blocks_avx_2 - - .rva .Leven_avx - .rva .LSEH_end_poly1305_blocks_avx - .rva .LSEH_info_poly1305_blocks_avx_3 - - .rva .LSEH_begin_poly1305_emit_avx - .rva .LSEH_end_poly1305_emit_avx - .rva 
.LSEH_info_poly1305_emit_avx -___ -$code.=<<___ if ($avx>1); - .rva .LSEH_begin_poly1305_blocks_avx2 - .rva .Lbase2_64_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_1 - - .rva .Lbase2_64_avx2 - .rva .Leven_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_2 - - .rva .Leven_avx2 - .rva .LSEH_end_poly1305_blocks_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_3 -___ -$code.=<<___ if ($avx>2); - .rva .LSEH_begin_poly1305_blocks_avx512 - .rva .LSEH_end_poly1305_blocks_avx512 - .rva .LSEH_info_poly1305_blocks_avx512 -___ -$code.=<<___; -.section .xdata -.align 8 -.LSEH_info_poly1305_block_init_arch: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_poly1305_block_init_arch,.LSEH_begin_poly1305_block_init_arch - -.LSEH_info_poly1305_blocks_x86_64: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_body,.Lblocks_epilogue - -.LSEH_info_poly1305_emit_x86_64: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64 -___ -$code.=<<___ if ($avx); -.LSEH_info_poly1305_blocks_avx_1: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx_2: - .byte 9,0,0,0 - .rva se_handler - .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx_3: - .byte 9,0,0,0 - .rva avx_handler - .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_emit_avx: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx -___ -$code.=<<___ if ($avx>1); -.LSEH_info_poly1305_blocks_avx2_1: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx2_2: - .byte 9,0,0,0 - .rva se_handler - .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx2_3: - .byte 9,0,0,0 - .rva avx_handler - .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] -___ -$code.=<<___ if ($avx>2); 
-.LSEH_info_poly1305_blocks_avx512: - .byte 9,0,0,0 - .rva avx_handler - .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] -___ -} - -open SELF,$0; -while(<SELF>) { - next if (/^#!/); - last if (!s/^#/\/\// and !/^$/); - print; -} -close SELF; - -foreach (split('\n',$code)) { - s/\`([^\`]*)\`/eval($1)/ge; - s/%r([a-z]+)#d/%e$1/g; - s/%r([0-9]+)#d/%r$1d/g; - s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; - - if ($kernel) { - s/(^\.type.*),[0-9]+$/\1/; - s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/; - next if /^\.cfi.*/; - } - - print $_,"\n"; -} -close STDOUT; diff --git a/arch/x86/lib/crypto/poly1305_glue.c b/arch/x86/lib/crypto/poly1305_glue.c deleted file mode 100644 index b7e78a583e07..000000000000 --- a/arch/x86/lib/crypto/poly1305_glue.c +++ /dev/null @@ -1,129 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - */ - -#include <asm/cpu_device_id.h> -#include <asm/fpu/api.h> -#include <crypto/internal/poly1305.h> -#include <linux/jump_label.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/sizes.h> -#include <linux/unaligned.h> - -struct poly1305_arch_internal { - union { - struct { - u32 h[5]; - u32 is_base2_26; - }; - u64 hs[3]; - }; - u64 r[2]; - u64 pad; - struct { u32 r2, r1, r4, r3; } rn[9]; -}; - -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -asmlinkage void poly1305_blocks_x86_64(struct poly1305_arch_internal *ctx, - const u8 *inp, - const size_t len, const u32 padbit); -asmlinkage void poly1305_emit_x86_64(const struct poly1305_state *ctx, - u8 mac[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -asmlinkage void poly1305_emit_avx(const struct poly1305_state *ctx, - u8 mac[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -asmlinkage void poly1305_blocks_avx(struct poly1305_arch_internal *ctx, - const u8 
*inp, const size_t len, - const u32 padbit); -asmlinkage void poly1305_blocks_avx2(struct poly1305_arch_internal *ctx, - const u8 *inp, const size_t len, - const u32 padbit); -asmlinkage void poly1305_blocks_avx512(struct poly1305_arch_internal *ctx, - const u8 *inp, - const size_t len, const u32 padbit); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); - -void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *inp, - unsigned int len, u32 padbit) -{ - struct poly1305_arch_internal *ctx = - container_of(&state->h.h, struct poly1305_arch_internal, h); - - /* SIMD disables preemption, so relax after processing each page. */ - BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || - SZ_4K % POLY1305_BLOCK_SIZE); - - if (!static_branch_likely(&poly1305_use_avx)) { - poly1305_blocks_x86_64(ctx, inp, len, padbit); - return; - } - - do { - const unsigned int bytes = min(len, SZ_4K); - - kernel_fpu_begin(); - if (static_branch_likely(&poly1305_use_avx512)) - poly1305_blocks_avx512(ctx, inp, bytes, padbit); - else if (static_branch_likely(&poly1305_use_avx2)) - poly1305_blocks_avx2(ctx, inp, bytes, padbit); - else - poly1305_blocks_avx(ctx, inp, bytes, padbit); - kernel_fpu_end(); - - len -= bytes; - inp += bytes; - } while (len); -} -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); - -void poly1305_emit_arch(const struct poly1305_state *ctx, - u8 mac[POLY1305_DIGEST_SIZE], const u32 nonce[4]) -{ - if (!static_branch_likely(&poly1305_use_avx)) - poly1305_emit_x86_64(ctx, mac, nonce); - else - poly1305_emit_avx(ctx, mac, nonce); -} -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -bool poly1305_is_arch_optimized(void) -{ - return static_key_enabled(&poly1305_use_avx); -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - -static int __init poly1305_simd_mod_init(void) -{ - if (boot_cpu_has(X86_FEATURE_AVX) && - 
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) - static_branch_enable(&poly1305_use_avx); - if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) - static_branch_enable(&poly1305_use_avx2); - if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_AVX512F) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && - /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ - boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X) - static_branch_enable(&poly1305_use_avx512); - return 0; -} -subsys_initcall(poly1305_simd_mod_init); - -static void __exit poly1305_simd_mod_exit(void) -{ -} -module_exit(poly1305_simd_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); -MODULE_DESCRIPTION("Poly1305 authenticator"); diff --git a/arch/x86/lib/crypto/sha256-avx-asm.S b/arch/x86/lib/crypto/sha256-avx-asm.S deleted file mode 100644 index 0d7b2c3e45d9..000000000000 --- a/arch/x86/lib/crypto/sha256-avx-asm.S +++ /dev/null @@ -1,499 +0,0 @@ -######################################################################## -# Implement fast SHA-256 with AVX1 instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. -# -# Authors: -# James Guilford <james.guilford@intel.com> -# Kirk Yap <kirk.s.yap@intel.com> -# Tim Chen <tim.c.chen@linux.intel.com> -# -# This software is available to you under a choice of one of two -# licenses. 
You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. -# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-256 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. 
-# -######################################################################## -# This code schedules 1 block at a time, with 4 lanes per block -######################################################################## - -#include <linux/linkage.h> -#include <linux/objtool.h> - -## assume buffers not aligned -#define VMOVDQ vmovdqu - -################################ Define Macros - -# addm [mem], reg -# Add reg to mem using reg-mem add and store -.macro addm p1 p2 - add \p1, \p2 - mov \p2, \p1 -.endm - - -.macro MY_ROR p1 p2 - shld $(32-(\p1)), \p2, \p2 -.endm - -################################ - -# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask -# Load xmm with mem and byte swap each dword -.macro COPY_XMM_AND_BSWAP p1 p2 p3 - VMOVDQ \p2, \p1 - vpshufb \p3, \p1, \p1 -.endm - -################################ - -X0 = %xmm4 -X1 = %xmm5 -X2 = %xmm6 -X3 = %xmm7 - -XTMP0 = %xmm0 -XTMP1 = %xmm1 -XTMP2 = %xmm2 -XTMP3 = %xmm3 -XTMP4 = %xmm8 -XFER = %xmm9 -XTMP5 = %xmm11 - -SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA -SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00 -BYTE_FLIP_MASK = %xmm13 - -NUM_BLKS = %rdx # 3rd arg -INP = %rsi # 2nd arg -CTX = %rdi # 1st arg - -SRND = %rsi # clobbers INP -c = %ecx -d = %r8d -e = %edx -TBL = %r12 -a = %eax -b = %ebx - -f = %r9d -g = %r10d -h = %r11d - -y0 = %r13d -y1 = %r14d -y2 = %r15d - - -_INP_END_SIZE = 8 -_INP_SIZE = 8 -_XFER_SIZE = 16 -_XMM_SAVE_SIZE = 0 - -_INP_END = 0 -_INP = _INP_END + _INP_END_SIZE -_XFER = _INP + _INP_SIZE -_XMM_SAVE = _XFER + _XFER_SIZE -STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE - -# rotate_Xs -# Rotate values of symbols X0...X3 -.macro rotate_Xs -X_ = X0 -X0 = X1 -X1 = X2 -X2 = X3 -X3 = X_ -.endm - -# ROTATE_ARGS -# Rotate values of symbols a...h -.macro ROTATE_ARGS -TMP_ = h -h = g -g = f -f = e -e = d -d = c -c = b -b = a -a = TMP_ -.endm - -.macro FOUR_ROUNDS_AND_SCHED - ## compute s0 four at a time and s1 two at a time - ## compute W[-16] + W[-7] 4 at a time - - mov e, y0 # y0 = e - MY_ROR (25-11), y0 # y0 = e >> 
(25-11) - mov a, y1 # y1 = a - vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] - MY_ROR (22-13), y1 # y1 = a >> (22-13) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 # y2 = f^g - vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16] - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - ## compute s0 - vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y0, y2 # y2 = S1 + CH - add _XFER(%rsp), y2 # y2 = k + w + S1 + CH - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpsrld $7, XTMP1, XTMP2 - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - vpslld $(32-7), XTMP1, XTMP3 - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - mov e, y0 # y0 = e - mov a, y1 # y1 = a - MY_ROR (25-11), y0 # y0 = e >> (25-11) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - MY_ROR (22-13), y1 # y1 = a >> (22-13) - vpsrld $18, XTMP1, XTMP2 # - xor a, y1 # y1 = a ^ (a >> (22-13) - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor g, y2 # y2 = f^g - vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - vpslld $(32-18), XTMP1, XTMP1 - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - vpxor XTMP1, XTMP3, XTMP3 # - add y0, y2 # y2 = 
S1 + CH - add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - vpxor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - ## compute low s1 - vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - mov e, y0 # y0 = e - mov a, y1 # y1 = a - MY_ROR (25-11), y0 # y0 = e >> (25-11) - xor e, y0 # y0 = e ^ (e >> (25-11)) - MY_ROR (22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} - xor g, y2 # y2 = f^g - vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA} - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - vpxor XTMP3, XTMP2, XTMP2 # - add y0, y2 # y2 = S1 + CH - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + 
S1 + CH + k + w + S0 - ## compute high s1 - vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - mov e, y0 # y0 = e - MY_ROR (25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - MY_ROR (22-13), y1 # y1 = a >> (22-13) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 # y2 = f^g - vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC} - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - vpxor XTMP3, XTMP2, XTMP2 - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y0, y2 # y2 = S1 + CH - add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - rotate_Xs -.endm - -## input is [rsp + _XFER + %1 * 4] -.macro DO_ROUND round - mov e, y0 # y0 = e - MY_ROR (25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - xor e, y0 # y0 = e ^ (e >> (25-11)) - MY_ROR (22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor g, y2 # y2 = 
f^g - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - and e, y2 # y2 = (f^g)&e - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - add y0, y2 # y2 = S1 + CH - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - offset = \round * 4 + _XFER # - add offset(%rsp), y2 # y2 = k + w + S1 + CH - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS -.endm - -######################################################################## -## void sha256_transform_avx(u32 state[SHA256_STATE_WORDS], -## const u8 *data, size_t nblocks); -######################################################################## -.text -SYM_FUNC_START(sha256_transform_avx) - ANNOTATE_NOENDBR # since this is called only via static_call - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbp - movq %rsp, %rbp - - subq $STACK_SIZE, %rsp # allocate stack space - and $~15, %rsp # align stack pointer - - shl $6, NUM_BLKS # convert to bytes - jz .Ldone_hash - add INP, NUM_BLKS # pointer to end of data - mov NUM_BLKS, _INP_END(%rsp) - - ## load initial digest - mov 4*0(CTX), a - mov 4*1(CTX), b - mov 4*2(CTX), c - mov 4*3(CTX), d - mov 4*4(CTX), e - mov 4*5(CTX), f - mov 4*6(CTX), g - mov 4*7(CTX), h - - vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - vmovdqa _SHUF_00BA(%rip), SHUF_00BA - vmovdqa _SHUF_DC00(%rip), SHUF_DC00 -.Lloop0: - lea K256(%rip), TBL - - ## byte swap first 16 dwords - COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK - 
COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK - - mov INP, _INP(%rsp) - - ## schedule 48 input dwords, by doing 3 rounds of 16 each - mov $3, SRND -.align 16 -.Lloop1: - vpaddd (TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddd 1*16(TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddd 2*16(TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddd 3*16(TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - add $4*16, TBL - FOUR_ROUNDS_AND_SCHED - - sub $1, SRND - jne .Lloop1 - - mov $2, SRND -.Lloop2: - vpaddd (TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - vpaddd 1*16(TBL), X1, XFER - vmovdqa XFER, _XFER(%rsp) - add $2*16, TBL - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - vmovdqa X2, X0 - vmovdqa X3, X1 - - sub $1, SRND - jne .Lloop2 - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm (4*6)(CTX),g - addm (4*7)(CTX),h - - mov _INP(%rsp), INP - add $64, INP - cmp _INP_END(%rsp), INP - jne .Lloop0 - -.Ldone_hash: - - mov %rbp, %rsp - popq %rbp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - RET -SYM_FUNC_END(sha256_transform_avx) - -.section .rodata.cst256.K256, "aM", @progbits, 256 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 
0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 - -.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 -.align 16 -# shuffle xBxA -> 00BA -_SHUF_00BA: - .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 - -.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 -.align 16 -# shuffle xDxC -> DC00 -_SHUF_DC00: - .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/arch/x86/lib/crypto/sha256-avx2-asm.S b/arch/x86/lib/crypto/sha256-avx2-asm.S deleted file mode 100644 index 25d3380321ec..000000000000 --- a/arch/x86/lib/crypto/sha256-avx2-asm.S +++ /dev/null @@ -1,774 +0,0 @@ -######################################################################## -# Implement fast SHA-256 with AVX2 instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. -# -# Authors: -# James Guilford <james.guilford@intel.com> -# Kirk Yap <kirk.s.yap@intel.com> -# Tim Chen <tim.c.chen@linux.intel.com> -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. 
-# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-256 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. -# -######################################################################## -# This code schedules 2 blocks at a time, with 4 lanes per block -######################################################################## - -#include <linux/linkage.h> -#include <linux/objtool.h> - -## assume buffers not aligned -#define VMOVDQ vmovdqu - -################################ Define Macros - -# addm [mem], reg -# Add reg to mem using reg-mem add and store -.macro addm p1 p2 - add \p1, \p2 - mov \p2, \p1 -.endm - -################################ - -X0 = %ymm4 -X1 = %ymm5 -X2 = %ymm6 -X3 = %ymm7 - -# XMM versions of above -XWORD0 = %xmm4 -XWORD1 = %xmm5 -XWORD2 = %xmm6 -XWORD3 = %xmm7 - -XTMP0 = %ymm0 -XTMP1 = %ymm1 -XTMP2 = %ymm2 -XTMP3 = %ymm3 -XTMP4 = %ymm8 -XFER = %ymm9 -XTMP5 = %ymm11 - -SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA -SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00 -BYTE_FLIP_MASK = %ymm13 - -X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK - -NUM_BLKS = %rdx # 3rd arg -INP = 
%rsi # 2nd arg -CTX = %rdi # 1st arg -c = %ecx -d = %r8d -e = %edx # clobbers NUM_BLKS -y3 = %esi # clobbers INP - -SRND = CTX # SRND is same register as CTX - -a = %eax -b = %ebx -f = %r9d -g = %r10d -h = %r11d -old_h = %r11d - -T1 = %r12d -y0 = %r13d -y1 = %r14d -y2 = %r15d - - -_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round -_XMM_SAVE_SIZE = 0 -_INP_END_SIZE = 8 -_INP_SIZE = 8 -_CTX_SIZE = 8 - -_XFER = 0 -_XMM_SAVE = _XFER + _XFER_SIZE -_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE -_INP = _INP_END + _INP_END_SIZE -_CTX = _INP + _INP_SIZE -STACK_SIZE = _CTX + _CTX_SIZE - -# rotate_Xs -# Rotate values of symbols X0...X3 -.macro rotate_Xs - X_ = X0 - X0 = X1 - X1 = X2 - X2 = X3 - X3 = X_ -.endm - -# ROTATE_ARGS -# Rotate values of symbols a...h -.macro ROTATE_ARGS - old_h = h - TMP_ = h - h = g - g = f - f = e - e = d - d = c - c = b - b = a - a = TMP_ -.endm - -.macro FOUR_ROUNDS_AND_SCHED disp -################################### RND N + 0 ############################ - - mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - - addl \disp(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] - mov f, y2 # y2 = f # CH - rorx $13, a, T1 # T1 = a >> 13 # S0B - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - xor g, y2 # y2 = f^g # CH - vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - - and e, y2 # y2 = (f^g)&e # CH - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $22, a, y1 # y1 = a >> 22 # S0A - add h, d # d = k + w + h + d # -- - - and b, y3 # y3 = (a|c)&b # MAJA - vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - vpsrld $7, XTMP1, XTMP2 - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - - add y0, y2 # y2 = S1 + CH 
# -- - vpslld $(32-7), XTMP1, XTMP3 - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 - - vpsrld $18, XTMP1, XTMP2 - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - - ROTATE_ARGS - -################################### RND N + 1 ############################ - - mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - offset = \disp + 1*4 - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - - vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 - mov f, y2 # y2 = f # CH - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - xor g, y2 # y2 = f^g # CH - - - rorx $6, e, y1 # y1 = (e >> 6) # S1 - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $22, a, y1 # y1 = a >> 22 # S0A - and e, y2 # y2 = (f^g)&e # CH - add h, d # d = k + w + h + d # -- - - vpslld $(32-18), XTMP1, XTMP1 - and b, y3 # y3 = (a|c)&b # MAJA - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - - vpxor XTMP1, XTMP3, XTMP3 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 - vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} - - - ROTATE_ARGS - -################################### RND N + 2 
############################ - - mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = e >> 25 # S1A - offset = \disp + 2*4 - addl offset(%rsp, SRND), h # h = k + w + h # -- - - vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} - rorx $11, e, y1 # y1 = e >> 11 # S1B - or c, y3 # y3 = a|c # MAJA - mov f, y2 # y2 = f # CH - xor g, y2 # y2 = f^g # CH - - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} - and e, y2 # y2 = (f^g)&e # CH - - rorx $6, e, y1 # y1 = (e >> 6) # S1 - vpxor XTMP3, XTMP2, XTMP2 - add h, d # d = k + w + h + d # -- - and b, y3 # y3 = (a|c)&b # MAJA - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $22, a, y1 # y1 = a >> 22 # S0A - vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a ,T1 # T1 = (a >> 2) # S0 - vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} - - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1,h # h = k + w + h + S0 # -- - add y2,d # d = k + w + h + d + S1 + CH = d + t1 # -- - add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - - add y3,h # h = t1 + S0 + MAJ # -- - - - ROTATE_ARGS - -################################### RND N + 3 ############################ - - mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - offset = \disp + 3*4 - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - - vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} - mov f, y2 # y2 = f # CH - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - xor g, y2 # y2 = f^g # CH - - - vpsrlq 
$19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add h, d # d = k + w + h + d # -- - and b, y3 # y3 = (a|c)&b # MAJA - - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - vpxor XTMP3, XTMP2, XTMP2 - rorx $22, a, y1 # y1 = a >> 22 # S0A - add y0, y2 # y2 = S1 + CH # -- - - vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - rorx $2, a, T1 # T1 = (a >> 2) # S0 - vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} - - vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - - add y1, h # h = k + w + h + S0 # -- - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - ROTATE_ARGS - rotate_Xs -.endm - -.macro DO_4ROUNDS disp -################################### RND N + 0 ########################### - - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - addl \disp(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # 
-- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - ROTATE_ARGS - -################################### RND N + 1 ########################### - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - offset = 4*1 + \disp - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - ROTATE_ARGS - -################################### RND N + 2 ############################## - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # 
y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - offset = 4*2 + \disp - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - ROTATE_ARGS - -################################### RND N + 3 ########################### - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - offset = 4*3 + \disp - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - - add y3, h # h = t1 + S0 + MAJ # -- - - ROTATE_ARGS - -.endm - -######################################################################## -## void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS], -## const u8 
*data, size_t nblocks); -######################################################################## -.text -SYM_FUNC_START(sha256_transform_rorx) - ANNOTATE_NOENDBR # since this is called only via static_call - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - push %rbp - mov %rsp, %rbp - - subq $STACK_SIZE, %rsp - and $-32, %rsp # align rsp to 32 byte boundary - - shl $6, NUM_BLKS # convert to bytes - jz .Ldone_hash - lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block - mov NUM_BLKS, _INP_END(%rsp) - - cmp NUM_BLKS, INP - je .Lonly_one_block - - ## load initial digest - mov (CTX), a - mov 4*1(CTX), b - mov 4*2(CTX), c - mov 4*3(CTX), d - mov 4*4(CTX), e - mov 4*5(CTX), f - mov 4*6(CTX), g - mov 4*7(CTX), h - - vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - vmovdqa _SHUF_00BA(%rip), SHUF_00BA - vmovdqa _SHUF_DC00(%rip), SHUF_DC00 - - mov CTX, _CTX(%rsp) - -.Lloop0: - ## Load first 16 dwords from two blocks - VMOVDQ 0*32(INP),XTMP0 - VMOVDQ 1*32(INP),XTMP1 - VMOVDQ 2*32(INP),XTMP2 - VMOVDQ 3*32(INP),XTMP3 - - ## byte swap data - vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0 - vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1 - vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2 - vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3 - - ## transpose data into high/low halves - vperm2i128 $0x20, XTMP2, XTMP0, X0 - vperm2i128 $0x31, XTMP2, XTMP0, X1 - vperm2i128 $0x20, XTMP3, XTMP1, X2 - vperm2i128 $0x31, XTMP3, XTMP1, X3 - -.Llast_block_enter: - add $64, INP - mov INP, _INP(%rsp) - - ## schedule 48 input dwords, by doing 3 rounds of 12 each - xor SRND, SRND - -.align 16 -.Lloop1: - leaq K256+0*32(%rip), INP ## reuse INP as scratch reg - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 0*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED (_XFER + 0*32) - - leaq K256+1*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 1*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED (_XFER + 1*32) - - leaq K256+2*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 2*32+_XFER(%rsp, SRND) - 
FOUR_ROUNDS_AND_SCHED (_XFER + 2*32) - - leaq K256+3*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 3*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED (_XFER + 3*32) - - add $4*32, SRND - cmp $3*4*32, SRND - jb .Lloop1 - -.Lloop2: - ## Do last 16 rounds with no scheduling - leaq K256+0*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 0*32+_XFER(%rsp, SRND) - DO_4ROUNDS (_XFER + 0*32) - - leaq K256+1*32(%rip), INP - vpaddd (INP, SRND), X1, XFER - vmovdqa XFER, 1*32+_XFER(%rsp, SRND) - DO_4ROUNDS (_XFER + 1*32) - add $2*32, SRND - - vmovdqa X2, X0 - vmovdqa X3, X1 - - cmp $4*4*32, SRND - jb .Lloop2 - - mov _CTX(%rsp), CTX - mov _INP(%rsp), INP - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm (4*6)(CTX),g - addm (4*7)(CTX),h - - cmp _INP_END(%rsp), INP - ja .Ldone_hash - - #### Do second block using previously scheduled results - xor SRND, SRND -.align 16 -.Lloop3: - DO_4ROUNDS (_XFER + 0*32 + 16) - DO_4ROUNDS (_XFER + 1*32 + 16) - add $2*32, SRND - cmp $4*4*32, SRND - jb .Lloop3 - - mov _CTX(%rsp), CTX - mov _INP(%rsp), INP - add $64, INP - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm (4*6)(CTX),g - addm (4*7)(CTX),h - - cmp _INP_END(%rsp), INP - jb .Lloop0 - ja .Ldone_hash - -.Ldo_last_block: - VMOVDQ 0*16(INP),XWORD0 - VMOVDQ 1*16(INP),XWORD1 - VMOVDQ 2*16(INP),XWORD2 - VMOVDQ 3*16(INP),XWORD3 - - vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0 - vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1 - vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2 - vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3 - - jmp .Llast_block_enter - -.Lonly_one_block: - - ## load initial digest - mov (4*0)(CTX),a - mov (4*1)(CTX),b - mov (4*2)(CTX),c - mov (4*3)(CTX),d - mov (4*4)(CTX),e - mov (4*5)(CTX),f - mov (4*6)(CTX),g - mov (4*7)(CTX),h - - vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - vmovdqa _SHUF_00BA(%rip), SHUF_00BA - 
vmovdqa _SHUF_DC00(%rip), SHUF_DC00 - - mov CTX, _CTX(%rsp) - jmp .Ldo_last_block - -.Ldone_hash: - - mov %rbp, %rsp - pop %rbp - - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - vzeroupper - RET -SYM_FUNC_END(sha256_transform_rorx) - -.section .rodata.cst512.K256, "aM", @progbits, 512 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section 
.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 -.align 32 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 - -# shuffle xBxA -> 00BA -.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32 -.align 32 -_SHUF_00BA: - .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 - -# shuffle xDxC -> DC00 -.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32 -.align 32 -_SHUF_DC00: - .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/arch/x86/lib/crypto/sha256-ni-asm.S b/arch/x86/lib/crypto/sha256-ni-asm.S deleted file mode 100644 index d3548206cf3d..000000000000 --- a/arch/x86/lib/crypto/sha256-ni-asm.S +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Intel SHA Extensions optimized implementation of a SHA-256 update function - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2015 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Contact Information: - * Sean Gulley <sean.m.gulley@intel.com> - * Tim Chen <tim.c.chen@linux.intel.com> - * - * BSD LICENSE - * - * Copyright(c) 2015 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - */ - -#include <linux/linkage.h> -#include <linux/objtool.h> - -#define STATE_PTR %rdi /* 1st arg */ -#define DATA_PTR %rsi /* 2nd arg */ -#define NUM_BLKS %rdx /* 3rd arg */ - -#define SHA256CONSTANTS %rax - -#define MSG %xmm0 /* sha256rnds2 implicit operand */ -#define STATE0 %xmm1 -#define STATE1 %xmm2 -#define MSG0 %xmm3 -#define MSG1 %xmm4 -#define MSG2 %xmm5 -#define MSG3 %xmm6 -#define TMP %xmm7 - -#define SHUF_MASK %xmm8 - -#define ABEF_SAVE %xmm9 -#define CDGH_SAVE %xmm10 - -.macro do_4rounds i, m0, m1, m2, m3 -.if \i < 16 - movdqu \i*4(DATA_PTR), \m0 - pshufb SHUF_MASK, \m0 -.endif - movdqa (\i-32)*4(SHA256CONSTANTS), MSG - paddd \m0, MSG - sha256rnds2 STATE0, STATE1 -.if \i >= 12 && \i < 60 - movdqa \m0, TMP - palignr $4, \m3, TMP - paddd TMP, \m1 - sha256msg2 \m0, \m1 -.endif - punpckhqdq MSG, MSG - sha256rnds2 STATE1, STATE0 -.if \i >= 4 && \i < 52 - sha256msg1 \m0, \m3 -.endif -.endm - -/* - * Intel SHA Extensions optimized implementation of a SHA-256 block function - * - * This function takes a pointer to the current SHA-256 state, a pointer to the - * input data, and the number of 64-byte blocks to process. Once all blocks - * have been processed, the state is updated with the new state. This function - * only processes complete blocks. State initialization, buffering of partial - * blocks, and digest finalization is expected to be handled elsewhere. 
- * - * void sha256_ni_transform(u32 state[SHA256_STATE_WORDS], - * const u8 *data, size_t nblocks); - */ -.text -SYM_FUNC_START(sha256_ni_transform) - ANNOTATE_NOENDBR # since this is called only via static_call - - shl $6, NUM_BLKS /* convert to bytes */ - jz .Ldone_hash - add DATA_PTR, NUM_BLKS /* pointer to end of data */ - - /* - * load initial hash values - * Need to reorder these appropriately - * DCBA, HGFE -> ABEF, CDGH - */ - movdqu 0*16(STATE_PTR), STATE0 /* DCBA */ - movdqu 1*16(STATE_PTR), STATE1 /* HGFE */ - - movdqa STATE0, TMP - punpcklqdq STATE1, STATE0 /* FEBA */ - punpckhqdq TMP, STATE1 /* DCHG */ - pshufd $0x1B, STATE0, STATE0 /* ABEF */ - pshufd $0xB1, STATE1, STATE1 /* CDGH */ - - movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK - lea K256+32*4(%rip), SHA256CONSTANTS - -.Lloop0: - /* Save hash values for addition after rounds */ - movdqa STATE0, ABEF_SAVE - movdqa STATE1, CDGH_SAVE - -.irp i, 0, 16, 32, 48 - do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3 - do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0 - do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1 - do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2 -.endr - - /* Add current hash values with previously saved */ - paddd ABEF_SAVE, STATE0 - paddd CDGH_SAVE, STATE1 - - /* Increment data pointer and loop if more to process */ - add $64, DATA_PTR - cmp NUM_BLKS, DATA_PTR - jne .Lloop0 - - /* Write hash values back in the correct order */ - movdqa STATE0, TMP - punpcklqdq STATE1, STATE0 /* GHEF */ - punpckhqdq TMP, STATE1 /* ABCD */ - pshufd $0xB1, STATE0, STATE0 /* HGFE */ - pshufd $0x1B, STATE1, STATE1 /* DCBA */ - - movdqu STATE1, 0*16(STATE_PTR) - movdqu STATE0, 1*16(STATE_PTR) - -.Ldone_hash: - - RET -SYM_FUNC_END(sha256_ni_transform) - -.section .rodata.cst256.K256, "aM", @progbits, 256 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 
0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 diff --git a/arch/x86/lib/crypto/sha256-ssse3-asm.S b/arch/x86/lib/crypto/sha256-ssse3-asm.S deleted file mode 100644 index 7f24a4cdcb25..000000000000 --- a/arch/x86/lib/crypto/sha256-ssse3-asm.S +++ /dev/null @@ -1,511 +0,0 @@ -######################################################################## -# Implement fast SHA-256 with SSSE3 instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. -# -# Authors: -# James Guilford <james.guilford@intel.com> -# Kirk Yap <kirk.s.yap@intel.com> -# Tim Chen <tim.c.chen@linux.intel.com> -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. 
-# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-256 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. -# -######################################################################## - -#include <linux/linkage.h> -#include <linux/objtool.h> - -## assume buffers not aligned -#define MOVDQ movdqu - -################################ Define Macros - -# addm [mem], reg -# Add reg to mem using reg-mem add and store -.macro addm p1 p2 - add \p1, \p2 - mov \p2, \p1 -.endm - -################################ - -# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask -# Load xmm with mem and byte swap each dword -.macro COPY_XMM_AND_BSWAP p1 p2 p3 - MOVDQ \p2, \p1 - pshufb \p3, \p1 -.endm - -################################ - -X0 = %xmm4 -X1 = %xmm5 -X2 = %xmm6 -X3 = %xmm7 - -XTMP0 = %xmm0 -XTMP1 = %xmm1 -XTMP2 = %xmm2 -XTMP3 = %xmm3 -XTMP4 = %xmm8 -XFER = %xmm9 - -SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA -SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00 -BYTE_FLIP_MASK = %xmm12 - -NUM_BLKS = %rdx # 3rd arg -INP = %rsi # 2nd arg -CTX = %rdi # 1st arg - -SRND = %rsi # clobbers INP -c = %ecx -d = %r8d -e = %edx 
-TBL = %r12 -a = %eax -b = %ebx - -f = %r9d -g = %r10d -h = %r11d - -y0 = %r13d -y1 = %r14d -y2 = %r15d - - - -_INP_END_SIZE = 8 -_INP_SIZE = 8 -_XFER_SIZE = 16 -_XMM_SAVE_SIZE = 0 - -_INP_END = 0 -_INP = _INP_END + _INP_END_SIZE -_XFER = _INP + _INP_SIZE -_XMM_SAVE = _XFER + _XFER_SIZE -STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE - -# rotate_Xs -# Rotate values of symbols X0...X3 -.macro rotate_Xs -X_ = X0 -X0 = X1 -X1 = X2 -X2 = X3 -X3 = X_ -.endm - -# ROTATE_ARGS -# Rotate values of symbols a...h -.macro ROTATE_ARGS -TMP_ = h -h = g -g = f -f = e -e = d -d = c -c = b -b = a -a = TMP_ -.endm - -.macro FOUR_ROUNDS_AND_SCHED - ## compute s0 four at a time and s1 two at a time - ## compute W[-16] + W[-7] 4 at a time - movdqa X3, XTMP0 - mov e, y0 # y0 = e - ror $(25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - palignr $4, X2, XTMP0 # XTMP0 = W[-7] - ror $(22-13), y1 # y1 = a >> (22-13) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - movdqa X1, XTMP1 - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 # y2 = f^g - paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16] - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - ## compute s0 - palignr $4, X0, XTMP1 # XTMP1 = W[-15] - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - movdqa XTMP1, XTMP2 # XTMP2 = W[-15] - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y0, y2 # y2 = S1 + CH - add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH - movdqa XTMP1, XTMP3 # XTMP3 = W[-15] - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - pslld $(32-7), XTMP1 # - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - psrld $7, XTMP2 # - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 - 
or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - # - ROTATE_ARGS # - movdqa XTMP3, XTMP2 # XTMP2 = W[-15] - mov e, y0 # y0 = e - mov a, y1 # y1 = a - movdqa XTMP3, XTMP4 # XTMP4 = W[-15] - ror $(25-11), y0 # y0 = e >> (25-11) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - ror $(22-13), y1 # y1 = a >> (22-13) - pslld $(32-18), XTMP3 # - xor a, y1 # y1 = a ^ (a >> (22-13) - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor g, y2 # y2 = f^g - psrld $18, XTMP2 # - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - pxor XTMP3, XTMP1 - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - psrld $3, XTMP4 # XTMP4 = W[-15] >> 3 - add y0, y2 # y2 = S1 + CH - add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - pxor XTMP4, XTMP1 # XTMP1 = s0 - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - ## compute low s1 - pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - - ROTATE_ARGS - movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA} - mov e, y0 # y0 = e - mov a, y1 # y1 = a - ror $(25-11), y0 # y0 = e >> (25-11) - movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA} - xor e, y0 # y0 = e ^ (e >> (25-11)) - ror $(22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} - xor g, y2 # y2 = f^g - psrlq $19, 
XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - pxor XTMP3, XTMP2 - add y0, y2 # y2 = S1 + CH - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - ## compute high s1 - pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA} - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - # - ROTATE_ARGS # - movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC} - mov e, y0 # y0 = e - ror $(25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - movdqa XTMP2, X0 # X0 = W[-2] {DDCC} - ror $(22-13), y1 # y1 = a >> (22-13) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 # y2 = f^g - psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25 - and e, y2 # y2 = (f^g)&e - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - psrld $10, X0 # X0 = W[-2] >> 10 {DDCC} - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22 - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>2 - xor g, y2 # y2 = CH = ((f^g)&e)^g - pxor XTMP3, XTMP2 # - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2 - add y0, y2 # y2 = S1 + CH - add (3*4 + _XFER)(%rsp), y2 # y2 = k 
+ w + S1 + CH - pxor XTMP2, X0 # X0 = s1 {xDxC} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - pshufb SHUF_DC00, X0 # X0 = s1 {DC00} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - - ROTATE_ARGS - rotate_Xs -.endm - -## input is [rsp + _XFER + %1 * 4] -.macro DO_ROUND round - mov e, y0 # y0 = e - ror $(25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - xor e, y0 # y0 = e ^ (e >> (25-11)) - ror $(22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor g, y2 # y2 = f^g - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - and e, y2 # y2 = (f^g)&e - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - add y0, y2 # y2 = S1 + CH - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - offset = \round * 4 + _XFER - add offset(%rsp), y2 # y2 = k + w + S1 + CH - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS -.endm - -######################################################################## -## void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS], -## const u8 *data, size_t nblocks); -######################################################################## -.text -SYM_FUNC_START(sha256_transform_ssse3) - ANNOTATE_NOENDBR # since this is called only via static_call - - 
pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbp - mov %rsp, %rbp - - subq $STACK_SIZE, %rsp - and $~15, %rsp - - shl $6, NUM_BLKS # convert to bytes - jz .Ldone_hash - add INP, NUM_BLKS - mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data - - ## load initial digest - mov 4*0(CTX), a - mov 4*1(CTX), b - mov 4*2(CTX), c - mov 4*3(CTX), d - mov 4*4(CTX), e - mov 4*5(CTX), f - mov 4*6(CTX), g - mov 4*7(CTX), h - - movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - movdqa _SHUF_00BA(%rip), SHUF_00BA - movdqa _SHUF_DC00(%rip), SHUF_DC00 - -.Lloop0: - lea K256(%rip), TBL - - ## byte swap first 16 dwords - COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK - - mov INP, _INP(%rsp) - - ## schedule 48 input dwords, by doing 3 rounds of 16 each - mov $3, SRND -.align 16 -.Lloop1: - movdqa (TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - movdqa 1*16(TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - movdqa 2*16(TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - movdqa 3*16(TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - add $4*16, TBL - FOUR_ROUNDS_AND_SCHED - - sub $1, SRND - jne .Lloop1 - - mov $2, SRND -.Lloop2: - paddd (TBL), X0 - movdqa X0, _XFER(%rsp) - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - paddd 1*16(TBL), X1 - movdqa X1, _XFER(%rsp) - add $2*16, TBL - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - movdqa X2, X0 - movdqa X3, X1 - - sub $1, SRND - jne .Lloop2 - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm (4*6)(CTX),g - addm (4*7)(CTX),h - - mov _INP(%rsp), INP - add $64, INP - cmp _INP_END(%rsp), INP - jne .Lloop0 - -.Ldone_hash: - - mov %rbp, %rsp - popq %rbp - popq %r15 - popq 
%r14 - popq %r13 - popq %r12 - popq %rbx - - RET -SYM_FUNC_END(sha256_transform_ssse3) - -.section .rodata.cst256.K256, "aM", @progbits, 256 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 - -.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 -.align 16 -# shuffle xBxA -> 00BA -_SHUF_00BA: - .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 - -.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 -.align 16 -# shuffle xDxC -> DC00 -_SHUF_DC00: - .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/arch/x86/lib/crypto/sha256.c b/arch/x86/lib/crypto/sha256.c deleted file mode 100644 index 80380f8fdcee..000000000000 --- a/arch/x86/lib/crypto/sha256.c +++ /dev/null @@ -1,80 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SHA-256 optimized for x86_64 - * - * Copyright 2025 Google LLC - */ -#include <asm/fpu/api.h> -#include <crypto/internal/sha2.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/static_call.h> - -asmlinkage void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS], - const u8 *data, 
size_t nblocks); -asmlinkage void sha256_transform_avx(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); -asmlinkage void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); -asmlinkage void sha256_ni_transform(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86); - -DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3); - -void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks) -{ - if (static_branch_likely(&have_sha256_x86)) { - kernel_fpu_begin(); - static_call(sha256_blocks_x86)(state, data, nblocks); - kernel_fpu_end(); - } else { - sha256_blocks_generic(state, data, nblocks); - } -} -EXPORT_SYMBOL_GPL(sha256_blocks_simd); - -void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], - const u8 *data, size_t nblocks) -{ - sha256_blocks_generic(state, data, nblocks); -} -EXPORT_SYMBOL_GPL(sha256_blocks_arch); - -bool sha256_is_arch_optimized(void) -{ - return static_key_enabled(&have_sha256_x86); -} -EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); - -static int __init sha256_x86_mod_init(void) -{ - if (boot_cpu_has(X86_FEATURE_SHA_NI)) { - static_call_update(sha256_blocks_x86, sha256_ni_transform); - } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | - XFEATURE_MASK_YMM, NULL) && - boot_cpu_has(X86_FEATURE_AVX)) { - if (boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_BMI2)) - static_call_update(sha256_blocks_x86, - sha256_transform_rorx); - else - static_call_update(sha256_blocks_x86, - sha256_transform_avx); - } else if (!boot_cpu_has(X86_FEATURE_SSSE3)) { - return 0; - } - static_branch_enable(&have_sha256_x86); - return 0; -} -subsys_initcall(sha256_x86_mod_init); - -static void __exit sha256_x86_mod_exit(void) -{ -} -module_exit(sha256_x86_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-256 optimized for x86_64"); |