Diffstat (limited to 'arch/x86/crypto'): 38 files changed, 5433 insertions, 8411 deletions
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index c9e59589a1ce..3d948f10c94c 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -3,10 +3,12 @@ menu "Accelerated Cryptographic Algorithms for CPU (x86)" config CRYPTO_CURVE25519_X86 - tristate "Public key crypto: Curve25519 (ADX)" + tristate depends on X86 && 64BIT + select CRYPTO_KPP select CRYPTO_LIB_CURVE25519_GENERIC select CRYPTO_ARCH_HAVE_LIB_CURVE25519 + default CRYPTO_LIB_CURVE25519_INTERNAL help Curve25519 algorithm @@ -14,20 +16,25 @@ config CRYPTO_CURVE25519_X86 - ADX (large integer arithmetic) config CRYPTO_AES_NI_INTEL - tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XTR, XTS, GCM (AES-NI)" + tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XCTR, XTS, GCM (AES-NI/VAES)" depends on X86 select CRYPTO_AEAD select CRYPTO_LIB_AES + select CRYPTO_LIB_GF128MUL select CRYPTO_ALGAPI select CRYPTO_SKCIPHER select CRYPTO_SIMD help Block cipher: AES cipher algorithms AEAD cipher: AES with GCM - Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XTR, XTS + Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XCTR, XTS Architecture: x86 (32-bit and 64-bit) using: - AES-NI (AES new instructions) + - VAES (Vector AES) + + Some algorithm implementations are supported only in 64-bit builds, + and some have additional prerequisites such as AVX2 or AVX512. config CRYPTO_BLOWFISH_X86_64 tristate "Ciphers: Blowfish, modes: ECB, CBC" @@ -343,11 +350,12 @@ config CRYPTO_ARIA_GFNI_AVX512_X86_64 Processes 64 blocks in parallel. config CRYPTO_CHACHA20_X86_64 - tristate "Ciphers: ChaCha20, XChaCha20, XChaCha12 (SSSE3/AVX2/AVX-512VL)" + tristate depends on X86 && 64BIT select CRYPTO_SKCIPHER select CRYPTO_LIB_CHACHA_GENERIC select CRYPTO_ARCH_HAVE_LIB_CHACHA + default CRYPTO_LIB_CHACHA_INTERNAL help Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12 stream cipher algorithms @@ -358,7 +366,7 @@ config CRYPTO_CHACHA20_X86_64 - AVX-512VL (Advanced Vector Extensions-512VL) config CRYPTO_AEGIS128_AESNI_SSE2 - tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE2)" + tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE4.1)" depends on X86 && 64BIT select CRYPTO_AEAD select CRYPTO_SIMD @@ -367,7 +375,7 @@ config CRYPTO_AEGIS128_AESNI_SSE2 Architecture: x86_64 using: - AES-NI (AES New Instructions) - - SSE2 (Streaming SIMD Extensions 2) + - SSE4.1 (Streaming SIMD Extensions 4.1) config CRYPTO_NHPOLY1305_SSE2 tristate "Hash functions: NHPoly1305 (SSE2)" @@ -412,10 +420,12 @@ config CRYPTO_POLYVAL_CLMUL_NI - CLMUL-NI (carry-less multiplication new instructions) config CRYPTO_POLY1305_X86_64 - tristate "Hash functions: Poly1305 (SSE2/AVX2)" + tristate depends on X86 && 64BIT + select CRYPTO_HASH select CRYPTO_LIB_POLY1305_GENERIC select CRYPTO_ARCH_HAVE_LIB_POLY1305 + default CRYPTO_LIB_POLY1305_INTERNAL help Poly1305 authenticator algorithm (RFC7539) @@ -487,36 +497,4 @@ config CRYPTO_GHASH_CLMUL_NI_INTEL Architecture: x86_64 using: - CLMUL-NI (carry-less multiplication new instructions) -config CRYPTO_CRC32C_INTEL - tristate "CRC32c (SSE4.2/PCLMULQDQ)" - depends on X86 - select CRYPTO_HASH - help - CRC32c CRC algorithm with the iSCSI polynomial (RFC 3385 and RFC 3720) - - Architecture: x86 (32-bit and 64-bit) using: - - SSE4.2 (Streaming SIMD Extensions 4.2) CRC32 instruction - - PCLMULQDQ (carry-less multiplication) - -config CRYPTO_CRC32_PCLMUL - tristate "CRC32 (PCLMULQDQ)" - depends on X86 - select CRYPTO_HASH - select CRC32 - help - CRC32 CRC algorithm (IEEE 802.3) - - Architecture: x86 (32-bit and 64-bit) using: 
- - PCLMULQDQ (carry-less multiplication) - -config CRYPTO_CRCT10DIF_PCLMUL - tristate "CRCT10DIF (PCLMULQDQ)" - depends on X86 && 64BIT && CRC_T10DIF - select CRYPTO_HASH - help - CRC16 CRC algorithm used for the T10 (SCSI) Data Integrity Field (DIF) - - Architecture: x86_64 using: - - PCLMULQDQ (carry-less multiplication) - endmenu diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 9aa46093c91b..5d19f41bde58 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -48,7 +48,12 @@ chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o -aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o +aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \ + aes-gcm-aesni-x86_64.o \ + aes-xts-avx-x86_64.o +ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy) +aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o +endif obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o @@ -70,16 +75,6 @@ ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o -obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o -crc32c-intel-y := crc32c-intel_glue.o -crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o - -obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o -crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o - -obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o -crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o - obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o targets += poly1305-x86_64-cryptogams.S diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index ad7f4c891625..7294dc0ee7ba 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -1,14 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * AES-NI + SSE2 implementation of AEGIS-128 + * AES-NI + SSE4.1 implementation of AEGIS-128 * * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. + * Copyright 2024 Google LLC */ #include <linux/linkage.h> -#include <linux/cfi_types.h> -#include <asm/frame.h> #define STATE0 %xmm0 #define STATE1 %xmm1 @@ -20,11 +19,6 @@ #define T0 %xmm6 #define T1 %xmm7 -#define STATEP %rdi -#define LEN %rsi -#define SRC %rdx -#define DST %rcx - .section .rodata.cst16.aegis128_const, "aM", @progbits, 32 .align 16 .Laegis128_const_0: @@ -34,11 +28,11 @@ .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd -.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16 -.align 16 -.Laegis128_counter: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +.section .rodata.cst32.zeropad_mask, "aM", @progbits, 32 +.align 32 +.Lzeropad_mask: + .octa 0xffffffffffffffffffffffffffffffff + .octa 0 .text @@ -61,140 +55,102 @@ .endm /* - * __load_partial: internal ABI - * input: - * LEN - bytes - * SRC - src - * output: - * MSG - message block - * changed: - * T0 - * %r8 - * %r9 + * Load 1 <= LEN (%ecx) <= 15 bytes from the pointer SRC into the xmm register + * MSG and zeroize any remaining bytes. 
Clobbers %rax, %rcx, and %r8. */ -SYM_FUNC_START_LOCAL(__load_partial) - xor %r9d, %r9d - pxor MSG, MSG - - mov LEN, %r8 - and $0x1, %r8 - jz .Lld_partial_1 - - mov LEN, %r8 - and $0x1E, %r8 - add SRC, %r8 - mov (%r8), %r9b - -.Lld_partial_1: - mov LEN, %r8 - and $0x2, %r8 - jz .Lld_partial_2 - - mov LEN, %r8 - and $0x1C, %r8 - add SRC, %r8 - shl $0x10, %r9 - mov (%r8), %r9w - -.Lld_partial_2: - mov LEN, %r8 - and $0x4, %r8 - jz .Lld_partial_4 - - mov LEN, %r8 - and $0x18, %r8 - add SRC, %r8 - shl $32, %r9 - mov (%r8), %r8d - xor %r8, %r9 - -.Lld_partial_4: - movq %r9, MSG - - mov LEN, %r8 - and $0x8, %r8 - jz .Lld_partial_8 - - mov LEN, %r8 - and $0x10, %r8 - add SRC, %r8 - pslldq $8, MSG - movq (%r8), T0 - pxor T0, MSG - -.Lld_partial_8: - RET -SYM_FUNC_END(__load_partial) +.macro load_partial + sub $8, %ecx /* LEN - 8 */ + jle .Lle8\@ + + /* Load 9 <= LEN <= 15 bytes: */ + movq (SRC), MSG /* Load first 8 bytes */ + mov (SRC, %rcx), %rax /* Load last 8 bytes */ + neg %ecx + shl $3, %ecx + shr %cl, %rax /* Discard overlapping bytes */ + pinsrq $1, %rax, MSG + jmp .Ldone\@ + +.Lle8\@: + add $4, %ecx /* LEN - 4 */ + jl .Llt4\@ + + /* Load 4 <= LEN <= 8 bytes: */ + mov (SRC), %eax /* Load first 4 bytes */ + mov (SRC, %rcx), %r8d /* Load last 4 bytes */ + jmp .Lcombine\@ + +.Llt4\@: + /* Load 1 <= LEN <= 3 bytes: */ + add $2, %ecx /* LEN - 2 */ + movzbl (SRC), %eax /* Load first byte */ + jl .Lmovq\@ + movzwl (SRC, %rcx), %r8d /* Load last 2 bytes */ +.Lcombine\@: + shl $3, %ecx + shl %cl, %r8 + or %r8, %rax /* Combine the two parts */ +.Lmovq\@: + movq %rax, MSG +.Ldone\@: +.endm /* - * __store_partial: internal ABI - * input: - * LEN - bytes - * DST - dst - * output: - * T0 - message block - * changed: - * %r8 - * %r9 - * %r10 + * Store 1 <= LEN (%ecx) <= 15 bytes from the xmm register \msg to the pointer + * DST. Clobbers %rax, %rcx, and %r8. */ -SYM_FUNC_START_LOCAL(__store_partial) - mov LEN, %r8 - mov DST, %r9 - - movq T0, %r10 - - cmp $8, %r8 - jl .Lst_partial_8 - - mov %r10, (%r9) - psrldq $8, T0 - movq T0, %r10 - - sub $8, %r8 - add $8, %r9 - -.Lst_partial_8: - cmp $4, %r8 - jl .Lst_partial_4 - - mov %r10d, (%r9) - shr $32, %r10 - - sub $4, %r8 - add $4, %r9 - -.Lst_partial_4: - cmp $2, %r8 - jl .Lst_partial_2 - - mov %r10w, (%r9) - shr $0x10, %r10 - - sub $2, %r8 - add $2, %r9 - -.Lst_partial_2: - cmp $1, %r8 - jl .Lst_partial_1 - - mov %r10b, (%r9) - -.Lst_partial_1: - RET -SYM_FUNC_END(__store_partial) +.macro store_partial msg + sub $8, %ecx /* LEN - 8 */ + jl .Llt8\@ + + /* Store 8 <= LEN <= 15 bytes: */ + pextrq $1, \msg, %rax + mov %ecx, %r8d + shl $3, %ecx + ror %cl, %rax + mov %rax, (DST, %r8) /* Store last LEN - 8 bytes */ + movq \msg, (DST) /* Store first 8 bytes */ + jmp .Ldone\@ + +.Llt8\@: + add $4, %ecx /* LEN - 4 */ + jl .Llt4\@ + + /* Store 4 <= LEN <= 7 bytes: */ + pextrd $1, \msg, %eax + mov %ecx, %r8d + shl $3, %ecx + ror %cl, %eax + mov %eax, (DST, %r8) /* Store last LEN - 4 bytes */ + movd \msg, (DST) /* Store first 4 bytes */ + jmp .Ldone\@ + +.Llt4\@: + /* Store 1 <= LEN <= 3 bytes: */ + pextrb $0, \msg, 0(DST) + cmp $-2, %ecx /* LEN - 4 == -2, i.e. LEN == 2? 
*/ + jl .Ldone\@ + pextrb $1, \msg, 1(DST) + je .Ldone\@ + pextrb $2, \msg, 2(DST) +.Ldone\@: +.endm /* - * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv); + * void aegis128_aesni_init(struct aegis_state *state, + * const struct aegis_block *key, + * const u8 iv[AEGIS128_NONCE_SIZE]); */ -SYM_FUNC_START(crypto_aegis128_aesni_init) - FRAME_BEGIN +SYM_FUNC_START(aegis128_aesni_init) + .set STATEP, %rdi + .set KEYP, %rsi + .set IVP, %rdx /* load IV: */ - movdqu (%rdx), T1 + movdqu (IVP), T1 /* load key: */ - movdqa (%rsi), KEY + movdqa (KEYP), KEY pxor KEY, T1 movdqa T1, STATE0 movdqa KEY, STATE3 @@ -224,20 +180,22 @@ SYM_FUNC_START(crypto_aegis128_aesni_init) movdqu STATE2, 0x20(STATEP) movdqu STATE3, 0x30(STATEP) movdqu STATE4, 0x40(STATEP) - - FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_init) +SYM_FUNC_END(aegis128_aesni_init) /* - * void crypto_aegis128_aesni_ad(void *state, unsigned int length, - * const void *data); + * void aegis128_aesni_ad(struct aegis_state *state, const u8 *data, + * unsigned int len); + * + * len must be a multiple of 16. */ -SYM_FUNC_START(crypto_aegis128_aesni_ad) - FRAME_BEGIN +SYM_FUNC_START(aegis128_aesni_ad) + .set STATEP, %rdi + .set SRC, %rsi + .set LEN, %edx - cmp $0x10, LEN - jb .Lad_out + test LEN, LEN + jz .Lad_out /* load the state: */ movdqu 0x00(STATEP), STATE0 @@ -246,89 +204,40 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu 0x30(STATEP), STATE3 movdqu 0x40(STATEP), STATE4 - mov SRC, %r8 - and $0xF, %r8 - jnz .Lad_u_loop - -.align 8 -.Lad_a_loop: - movdqa 0x00(SRC), MSG - aegis128_update - pxor MSG, STATE4 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_1 - - movdqa 0x10(SRC), MSG - aegis128_update - pxor MSG, STATE3 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_2 - - movdqa 0x20(SRC), MSG - aegis128_update - pxor MSG, STATE2 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_3 - - movdqa 0x30(SRC), MSG - aegis128_update - pxor MSG, STATE1 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_4 - - movdqa 0x40(SRC), MSG - aegis128_update - pxor MSG, STATE0 - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_0 - - add $0x50, SRC - jmp .Lad_a_loop - .align 8 -.Lad_u_loop: +.Lad_loop: movdqu 0x00(SRC), MSG aegis128_update pxor MSG, STATE4 sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_1 + jz .Lad_out_1 movdqu 0x10(SRC), MSG aegis128_update pxor MSG, STATE3 sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_2 + jz .Lad_out_2 movdqu 0x20(SRC), MSG aegis128_update pxor MSG, STATE2 sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_3 + jz .Lad_out_3 movdqu 0x30(SRC), MSG aegis128_update pxor MSG, STATE1 sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_4 + jz .Lad_out_4 movdqu 0x40(SRC), MSG aegis128_update pxor MSG, STATE0 sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_0 + jz .Lad_out_0 add $0x50, SRC - jmp .Lad_u_loop + jmp .Lad_loop /* store the state: */ .Lad_out_0: @@ -337,7 +246,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE2, 0x20(STATEP) movdqu STATE3, 0x30(STATEP) movdqu STATE4, 0x40(STATEP) - FRAME_END RET .Lad_out_1: @@ -346,7 +254,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE1, 0x20(STATEP) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) - FRAME_END RET .Lad_out_2: @@ -355,7 +262,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE0, 0x20(STATEP) movdqu STATE1, 0x30(STATEP) movdqu STATE2, 0x40(STATEP) - FRAME_END RET .Lad_out_3: @@ -364,7 +270,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE4, 0x20(STATEP) movdqu STATE0, 0x30(STATEP) movdqu STATE1, 0x40(STATEP) - FRAME_END RET 
.Lad_out_4: @@ -373,41 +278,38 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE3, 0x20(STATEP) movdqu STATE4, 0x30(STATEP) movdqu STATE0, 0x40(STATEP) - FRAME_END - RET - .Lad_out: - FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_ad) +SYM_FUNC_END(aegis128_aesni_ad) -.macro encrypt_block a s0 s1 s2 s3 s4 i - movdq\a (\i * 0x10)(SRC), MSG +.macro encrypt_block s0 s1 s2 s3 s4 i + movdqu (\i * 0x10)(SRC), MSG movdqa MSG, T0 pxor \s1, T0 pxor \s4, T0 movdqa \s2, T1 pand \s3, T1 pxor T1, T0 - movdq\a T0, (\i * 0x10)(DST) + movdqu T0, (\i * 0x10)(DST) aegis128_update pxor MSG, \s4 sub $0x10, LEN - cmp $0x10, LEN - jl .Lenc_out_\i + jz .Lenc_out_\i .endm /* - * void crypto_aegis128_aesni_enc(void *state, unsigned int length, - * const void *src, void *dst); + * void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, u8 *dst, + * unsigned int len); + * + * len must be nonzero and a multiple of 16. */ -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) - FRAME_BEGIN - - cmp $0x10, LEN - jb .Lenc_out +SYM_FUNC_START(aegis128_aesni_enc) + .set STATEP, %rdi + .set SRC, %rsi + .set DST, %rdx + .set LEN, %ecx /* load the state: */ movdqu 0x00(STATEP), STATE0 @@ -416,34 +318,17 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) movdqu 0x30(STATEP), STATE3 movdqu 0x40(STATEP), STATE4 - mov SRC, %r8 - or DST, %r8 - and $0xF, %r8 - jnz .Lenc_u_loop - .align 8 -.Lenc_a_loop: - encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 - encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 - encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 - encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 - encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 +.Lenc_loop: + encrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0 + encrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1 + encrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2 + encrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3 + encrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4 add $0x50, SRC add $0x50, DST - jmp .Lenc_a_loop - -.align 8 -.Lenc_u_loop: - encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 - encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 - encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 - encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 - encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4 - - add $0x50, SRC - add $0x50, DST - jmp .Lenc_u_loop + jmp .Lenc_loop /* store the state: */ .Lenc_out_0: @@ -452,7 +337,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE1, 0x20(STATEP) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) - FRAME_END RET .Lenc_out_1: @@ -461,7 +345,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE0, 0x20(STATEP) movdqu STATE1, 0x30(STATEP) movdqu STATE2, 0x40(STATEP) - FRAME_END RET .Lenc_out_2: @@ -470,7 +353,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE4, 0x20(STATEP) movdqu STATE0, 0x30(STATEP) movdqu STATE1, 0x40(STATEP) - FRAME_END RET .Lenc_out_3: @@ -479,7 +361,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE3, 0x20(STATEP) movdqu STATE4, 0x30(STATEP) movdqu STATE0, 0x40(STATEP) - FRAME_END RET .Lenc_out_4: @@ -488,20 +369,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE2, 0x20(STATEP) movdqu STATE3, 0x30(STATEP) movdqu STATE4, 0x40(STATEP) - FRAME_END - RET - .Lenc_out: - FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_enc) +SYM_FUNC_END(aegis128_aesni_enc) /* - * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length, - * const void *src, void *dst); + * void 
aegis128_aesni_enc_tail(struct aegis_state *state, const u8 *src, + * u8 *dst, unsigned int len); */ -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) - FRAME_BEGIN +SYM_FUNC_START(aegis128_aesni_enc_tail) + .set STATEP, %rdi + .set SRC, %rsi + .set DST, %rdx + .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */ /* load the state: */ movdqu 0x00(STATEP), STATE0 @@ -511,7 +391,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) movdqu 0x40(STATEP), STATE4 /* encrypt message: */ - call __load_partial + mov LEN, %r9d + load_partial movdqa MSG, T0 pxor STATE1, T0 @@ -520,7 +401,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) pand STATE3, T1 pxor T1, T0 - call __store_partial + mov %r9d, LEN + store_partial T0 aegis128_update pxor MSG, STATE4 @@ -531,37 +413,36 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) movdqu STATE1, 0x20(STATEP) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) - - FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_enc_tail) +SYM_FUNC_END(aegis128_aesni_enc_tail) -.macro decrypt_block a s0 s1 s2 s3 s4 i - movdq\a (\i * 0x10)(SRC), MSG +.macro decrypt_block s0 s1 s2 s3 s4 i + movdqu (\i * 0x10)(SRC), MSG pxor \s1, MSG pxor \s4, MSG movdqa \s2, T1 pand \s3, T1 pxor T1, MSG - movdq\a MSG, (\i * 0x10)(DST) + movdqu MSG, (\i * 0x10)(DST) aegis128_update pxor MSG, \s4 sub $0x10, LEN - cmp $0x10, LEN - jl .Ldec_out_\i + jz .Ldec_out_\i .endm /* - * void crypto_aegis128_aesni_dec(void *state, unsigned int length, - * const void *src, void *dst); + * void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, u8 *dst, + * unsigned int len); + * + * len must be nonzero and a multiple of 16. */ -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) - FRAME_BEGIN - - cmp $0x10, LEN - jb .Ldec_out +SYM_FUNC_START(aegis128_aesni_dec) + .set STATEP, %rdi + .set SRC, %rsi + .set DST, %rdx + .set LEN, %ecx /* load the state: */ movdqu 0x00(STATEP), STATE0 @@ -570,34 +451,17 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) movdqu 0x30(STATEP), STATE3 movdqu 0x40(STATEP), STATE4 - mov SRC, %r8 - or DST, %r8 - and $0xF, %r8 - jnz .Ldec_u_loop - .align 8 -.Ldec_a_loop: - decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 - decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 - decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 - decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 - decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 +.Ldec_loop: + decrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0 + decrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1 + decrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2 + decrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3 + decrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4 add $0x50, SRC add $0x50, DST - jmp .Ldec_a_loop - -.align 8 -.Ldec_u_loop: - decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 - decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 - decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 - decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 - decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4 - - add $0x50, SRC - add $0x50, DST - jmp .Ldec_u_loop + jmp .Ldec_loop /* store the state: */ .Ldec_out_0: @@ -606,7 +470,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE1, 0x20(STATEP) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) - FRAME_END RET .Ldec_out_1: @@ -615,7 +478,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE0, 0x20(STATEP) movdqu STATE1, 0x30(STATEP) movdqu STATE2, 0x40(STATEP) - FRAME_END RET .Ldec_out_2: @@ -624,7 +486,6 @@ 
SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE4, 0x20(STATEP) movdqu STATE0, 0x30(STATEP) movdqu STATE1, 0x40(STATEP) - FRAME_END RET .Ldec_out_3: @@ -633,7 +494,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE3, 0x20(STATEP) movdqu STATE4, 0x30(STATEP) movdqu STATE0, 0x40(STATEP) - FRAME_END RET .Ldec_out_4: @@ -642,20 +502,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE2, 0x20(STATEP) movdqu STATE3, 0x30(STATEP) movdqu STATE4, 0x40(STATEP) - FRAME_END - RET - .Ldec_out: - FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_dec) +SYM_FUNC_END(aegis128_aesni_dec) /* - * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length, - * const void *src, void *dst); + * void aegis128_aesni_dec_tail(struct aegis_state *state, const u8 *src, + * u8 *dst, unsigned int len); */ -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) - FRAME_BEGIN +SYM_FUNC_START(aegis128_aesni_dec_tail) + .set STATEP, %rdi + .set SRC, %rsi + .set DST, %rdx + .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */ /* load the state: */ movdqu 0x00(STATEP), STATE0 @@ -665,7 +524,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) movdqu 0x40(STATEP), STATE4 /* decrypt message: */ - call __load_partial + mov LEN, %r9d + load_partial pxor STATE1, MSG pxor STATE4, MSG @@ -673,17 +533,13 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) pand STATE3, T1 pxor T1, MSG - movdqa MSG, T0 - call __store_partial + mov %r9d, LEN + store_partial MSG /* mask with byte count: */ - movq LEN, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - movdqa .Laegis128_counter(%rip), T1 - pcmpgtb T1, T0 + lea .Lzeropad_mask+16(%rip), %rax + sub %r9, %rax + movdqu (%rax), T0 pand T0, MSG aegis128_update @@ -695,17 +551,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) movdqu STATE1, 0x20(STATEP) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) - - FRAME_END RET -SYM_FUNC_END(crypto_aegis128_aesni_dec_tail) +SYM_FUNC_END(aegis128_aesni_dec_tail) /* - * void crypto_aegis128_aesni_final(void *state, void *tag_xor, - * u64 assoclen, u64 cryptlen); + * void aegis128_aesni_final(struct aegis_state *state, + * struct aegis_block *tag_xor, + * unsigned int assoclen, unsigned int cryptlen); */ -SYM_FUNC_START(crypto_aegis128_aesni_final) - FRAME_BEGIN +SYM_FUNC_START(aegis128_aesni_final) + .set STATEP, %rdi + .set TAG_XOR, %rsi + .set ASSOCLEN, %edx + .set CRYPTLEN, %ecx /* load the state: */ movdqu 0x00(STATEP), STATE0 @@ -715,10 +573,8 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) movdqu 0x40(STATEP), STATE4 /* prepare length block: */ - movq %rdx, MSG - movq %rcx, T0 - pslldq $8, T0 - pxor T0, MSG + movd ASSOCLEN, MSG + pinsrd $2, CRYPTLEN, MSG psllq $3, MSG /* multiply by 8 (to get bit count) */ pxor STATE3, MSG @@ -733,7 +589,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) aegis128_update; pxor MSG, STATE3 /* xor tag: */ - movdqu (%rsi), MSG + movdqu (TAG_XOR), MSG pxor STATE0, MSG pxor STATE1, MSG @@ -741,8 +597,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) pxor STATE3, MSG pxor STATE4, MSG - movdqu MSG, (%rsi) - - FRAME_END + movdqu MSG, (TAG_XOR) RET -SYM_FUNC_END(crypto_aegis128_aesni_final) +SYM_FUNC_END(aegis128_aesni_final) diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index 4623189000d8..26786e15abac 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * The 
AEGIS-128 Authenticated-Encryption Algorithm - * Glue for AES-NI + SSE2 implementation + * Glue for AES-NI + SSE4.1 implementation * * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. @@ -23,27 +23,6 @@ #define AEGIS128_MIN_AUTH_SIZE 8 #define AEGIS128_MAX_AUTH_SIZE 16 -asmlinkage void crypto_aegis128_aesni_init(void *state, void *key, void *iv); - -asmlinkage void crypto_aegis128_aesni_ad( - void *state, unsigned int length, const void *data); - -asmlinkage void crypto_aegis128_aesni_enc( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128_aesni_dec( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128_aesni_enc_tail( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128_aesni_dec_tail( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128_aesni_final( - void *state, void *tag_xor, unsigned int cryptlen, - unsigned int assoclen); - struct aegis_block { u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN); }; @@ -56,15 +35,31 @@ struct aegis_ctx { struct aegis_block key; }; -struct aegis_crypt_ops { - int (*skcipher_walk_init)(struct skcipher_walk *walk, - struct aead_request *req, bool atomic); +asmlinkage void aegis128_aesni_init(struct aegis_state *state, + const struct aegis_block *key, + const u8 iv[AEGIS128_NONCE_SIZE]); - void (*crypt_blocks)(void *state, unsigned int length, const void *src, - void *dst); - void (*crypt_tail)(void *state, unsigned int length, const void *src, - void *dst); -}; +asmlinkage void aegis128_aesni_ad(struct aegis_state *state, const u8 *data, + unsigned int len); + +asmlinkage void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, + u8 *dst, unsigned int len); + +asmlinkage void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, + u8 *dst, unsigned int len); + +asmlinkage void aegis128_aesni_enc_tail(struct aegis_state *state, + const u8 *src, u8 *dst, + unsigned int len); + +asmlinkage void aegis128_aesni_dec_tail(struct aegis_state *state, + const u8 *src, u8 *dst, + unsigned int len); + +asmlinkage void aegis128_aesni_final(struct aegis_state *state, + struct aegis_block *tag_xor, + unsigned int assoclen, + unsigned int cryptlen); static void crypto_aegis128_aesni_process_ad( struct aegis_state *state, struct scatterlist *sg_src, @@ -76,25 +71,23 @@ static void crypto_aegis128_aesni_process_ad( scatterwalk_start(&walk, sg_src); while (assoclen != 0) { - unsigned int size = scatterwalk_clamp(&walk, assoclen); + unsigned int size = scatterwalk_next(&walk, assoclen); + const u8 *src = walk.addr; unsigned int left = size; - void *mapped = scatterwalk_map(&walk); - const u8 *src = (const u8 *)mapped; if (pos + size >= AEGIS128_BLOCK_SIZE) { if (pos > 0) { unsigned int fill = AEGIS128_BLOCK_SIZE - pos; memcpy(buf.bytes + pos, src, fill); - crypto_aegis128_aesni_ad(state, - AEGIS128_BLOCK_SIZE, - buf.bytes); + aegis128_aesni_ad(state, buf.bytes, + AEGIS128_BLOCK_SIZE); pos = 0; left -= fill; src += fill; } - crypto_aegis128_aesni_ad(state, left, src); - + aegis128_aesni_ad(state, src, + left & ~(AEGIS128_BLOCK_SIZE - 1)); src += left & ~(AEGIS128_BLOCK_SIZE - 1); left &= AEGIS128_BLOCK_SIZE - 1; } @@ -103,31 +96,42 @@ static void crypto_aegis128_aesni_process_ad( pos += left; assoclen -= size; - scatterwalk_unmap(mapped); - scatterwalk_advance(&walk, size); - 
scatterwalk_done(&walk, 0, assoclen); + scatterwalk_done_src(&walk, size); } if (pos > 0) { memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos); - crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes); + aegis128_aesni_ad(state, buf.bytes, AEGIS128_BLOCK_SIZE); } } -static void crypto_aegis128_aesni_process_crypt( - struct aegis_state *state, struct skcipher_walk *walk, - const struct aegis_crypt_ops *ops) +static __always_inline void +crypto_aegis128_aesni_process_crypt(struct aegis_state *state, + struct skcipher_walk *walk, bool enc) { while (walk->nbytes >= AEGIS128_BLOCK_SIZE) { - ops->crypt_blocks(state, - round_down(walk->nbytes, AEGIS128_BLOCK_SIZE), - walk->src.virt.addr, walk->dst.virt.addr); + if (enc) + aegis128_aesni_enc(state, walk->src.virt.addr, + walk->dst.virt.addr, + round_down(walk->nbytes, + AEGIS128_BLOCK_SIZE)); + else + aegis128_aesni_dec(state, walk->src.virt.addr, + walk->dst.virt.addr, + round_down(walk->nbytes, + AEGIS128_BLOCK_SIZE)); skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE); } if (walk->nbytes) { - ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr, - walk->dst.virt.addr); + if (enc) + aegis128_aesni_enc_tail(state, walk->src.virt.addr, + walk->dst.virt.addr, + walk->nbytes); + else + aegis128_aesni_dec_tail(state, walk->src.virt.addr, + walk->dst.virt.addr, + walk->nbytes); skcipher_walk_done(walk, 0); } } @@ -162,42 +166,39 @@ static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm, return 0; } -static void crypto_aegis128_aesni_crypt(struct aead_request *req, - struct aegis_block *tag_xor, - unsigned int cryptlen, - const struct aegis_crypt_ops *ops) +static __always_inline void +crypto_aegis128_aesni_crypt(struct aead_request *req, + struct aegis_block *tag_xor, + unsigned int cryptlen, bool enc) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm); struct skcipher_walk walk; struct aegis_state state; - ops->skcipher_walk_init(&walk, req, true); + if (enc) + skcipher_walk_aead_encrypt(&walk, req, true); + else + skcipher_walk_aead_decrypt(&walk, req, true); kernel_fpu_begin(); - crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv); + aegis128_aesni_init(&state, &ctx->key, req->iv); crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen); - crypto_aegis128_aesni_process_crypt(&state, &walk, ops); - crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); + crypto_aegis128_aesni_process_crypt(&state, &walk, enc); + aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); kernel_fpu_end(); } static int crypto_aegis128_aesni_encrypt(struct aead_request *req) { - static const struct aegis_crypt_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_encrypt, - .crypt_blocks = crypto_aegis128_aesni_enc, - .crypt_tail = crypto_aegis128_aesni_enc_tail, - }; - struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aegis_block tag = {}; unsigned int authsize = crypto_aead_authsize(tfm); unsigned int cryptlen = req->cryptlen; - crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); + crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true); scatterwalk_map_and_copy(tag.bytes, req->dst, req->assoclen + cryptlen, authsize, 1); @@ -208,12 +209,6 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req) { static const struct aegis_block zeros = {}; - static const struct aegis_crypt_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_decrypt, - .crypt_blocks = crypto_aegis128_aesni_dec, - .crypt_tail = 
crypto_aegis128_aesni_dec_tail, - }; - struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aegis_block tag; unsigned int authsize = crypto_aead_authsize(tfm); @@ -222,27 +217,16 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req) scatterwalk_map_and_copy(tag.bytes, req->src, req->assoclen + cryptlen, authsize, 0); - crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); + crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false); return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0; } -static int crypto_aegis128_aesni_init_tfm(struct crypto_aead *aead) -{ - return 0; -} - -static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead) -{ -} - static struct aead_alg crypto_aegis128_aesni_alg = { .setkey = crypto_aegis128_aesni_setkey, .setauthsize = crypto_aegis128_aesni_setauthsize, .encrypt = crypto_aegis128_aesni_encrypt, .decrypt = crypto_aegis128_aesni_decrypt, - .init = crypto_aegis128_aesni_init_tfm, - .exit = crypto_aegis128_aesni_exit_tfm, .ivsize = AEGIS128_NONCE_SIZE, .maxauthsize = AEGIS128_MAX_AUTH_SIZE, @@ -253,7 +237,6 @@ static struct aead_alg crypto_aegis128_aesni_alg = { .cra_blocksize = 1, .cra_ctxsize = sizeof(struct aegis_ctx) + __alignof__(struct aegis_ctx), - .cra_alignmask = 0, .cra_priority = 400, .cra_name = "__aegis128", @@ -267,7 +250,7 @@ static struct simd_aead_alg *simd_alg; static int __init crypto_aegis128_aesni_module_init(void) { - if (!boot_cpu_has(X86_FEATURE_XMM2) || + if (!boot_cpu_has(X86_FEATURE_XMM4_1) || !boot_cpu_has(X86_FEATURE_AES) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; @@ -286,6 +269,6 @@ module_exit(crypto_aegis128_aesni_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); -MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE2 implementation"); +MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE4.1 implementation"); MODULE_ALIAS_CRYPTO("aegis128"); MODULE_ALIAS_CRYPTO("aegis128-aesni"); diff --git a/arch/x86/crypto/aes-ctr-avx-x86_64.S b/arch/x86/crypto/aes-ctr-avx-x86_64.S new file mode 100644 index 000000000000..1685d8b24b2c --- /dev/null +++ b/arch/x86/crypto/aes-ctr-avx-x86_64.S @@ -0,0 +1,592 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// Copyright 2025 Google LLC +// +// Author: Eric Biggers <ebiggers@google.com> +// +// This file is dual-licensed, meaning that you can use it under your choice of +// either of the following two licenses: +// +// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// or +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +//------------------------------------------------------------------------------ +// +// This file contains x86_64 assembly implementations of AES-CTR and AES-XCTR +// using the following sets of CPU features: +// - AES-NI && AVX +// - VAES && AVX2 +// - VAES && (AVX10/256 || (AVX512BW && AVX512VL)) && BMI2 +// - VAES && (AVX10/512 || (AVX512BW && AVX512VL)) && BMI2 +// +// See the function definitions at the bottom of the file for more information. + +#include <linux/linkage.h> +#include <linux/cfi_types.h> + +.section .rodata +.p2align 4 + +.Lbswap_mask: + .octa 0x000102030405060708090a0b0c0d0e0f + +.Lctr_pattern: + .quad 0, 0 +.Lone: + .quad 1, 0 +.Ltwo: + .quad 2, 0 + .quad 3, 0 + +.Lfour: + .quad 4, 0 + +.text + +// Move a vector between memory and a register. +// The register operand must be in the first 16 vector registers. +.macro _vmovdqu src, dst +.if VL < 64 + vmovdqu \src, \dst +.else + vmovdqu8 \src, \dst +.endif +.endm + +// Move a vector between registers. +// The registers must be in the first 16 vector registers. +.macro _vmovdqa src, dst +.if VL < 64 + vmovdqa \src, \dst +.else + vmovdqa64 \src, \dst +.endif +.endm + +// Broadcast a 128-bit value from memory to all 128-bit lanes of a vector +// register. The register operand must be in the first 16 vector registers. +.macro _vbroadcast128 src, dst +.if VL == 16 + vmovdqu \src, \dst +.elseif VL == 32 + vbroadcasti128 \src, \dst +.else + vbroadcasti32x4 \src, \dst +.endif +.endm + +// XOR two vectors together. +// Any register operands must be in the first 16 vector registers. +.macro _vpxor src1, src2, dst +.if VL < 64 + vpxor \src1, \src2, \dst +.else + vpxord \src1, \src2, \dst +.endif +.endm + +// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst +// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}. +.macro _load_partial_block src, dst, tmp64, tmp32 + sub $8, %ecx // LEN - 8 + jle .Lle8\@ + + // Load 9 <= LEN <= 15 bytes. + vmovq (\src), \dst // Load first 8 bytes + mov (\src, %rcx), %rax // Load last 8 bytes + neg %ecx + shl $3, %ecx + shr %cl, %rax // Discard overlapping bytes + vpinsrq $1, %rax, \dst, \dst + jmp .Ldone\@ + +.Lle8\@: + add $4, %ecx // LEN - 4 + jl .Llt4\@ + + // Load 4 <= LEN <= 8 bytes. + mov (\src), %eax // Load first 4 bytes + mov (\src, %rcx), \tmp32 // Load last 4 bytes + jmp .Lcombine\@ + +.Llt4\@: + // Load 1 <= LEN <= 3 bytes. 
+ add $2, %ecx // LEN - 2 + movzbl (\src), %eax // Load first byte + jl .Lmovq\@ + movzwl (\src, %rcx), \tmp32 // Load last 2 bytes +.Lcombine\@: + shl $3, %ecx + shl %cl, \tmp64 + or \tmp64, %rax // Combine the two parts +.Lmovq\@: + vmovq %rax, \dst +.Ldone\@: +.endm + +// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst. +// Clobbers %rax, %rcx, and \tmp{64,32}. +.macro _store_partial_block src, dst, tmp64, tmp32 + sub $8, %ecx // LEN - 8 + jl .Llt8\@ + + // Store 8 <= LEN <= 15 bytes. + vpextrq $1, \src, %rax + mov %ecx, \tmp32 + shl $3, %ecx + ror %cl, %rax + mov %rax, (\dst, \tmp64) // Store last LEN - 8 bytes + vmovq \src, (\dst) // Store first 8 bytes + jmp .Ldone\@ + +.Llt8\@: + add $4, %ecx // LEN - 4 + jl .Llt4\@ + + // Store 4 <= LEN <= 7 bytes. + vpextrd $1, \src, %eax + mov %ecx, \tmp32 + shl $3, %ecx + ror %cl, %eax + mov %eax, (\dst, \tmp64) // Store last LEN - 4 bytes + vmovd \src, (\dst) // Store first 4 bytes + jmp .Ldone\@ + +.Llt4\@: + // Store 1 <= LEN <= 3 bytes. + vpextrb $0, \src, 0(\dst) + cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2? + jl .Ldone\@ + vpextrb $1, \src, 1(\dst) + je .Ldone\@ + vpextrb $2, \src, 2(\dst) +.Ldone\@: +.endm + +// Prepare the next two vectors of AES inputs in AESDATA\i0 and AESDATA\i1, and +// XOR each with the zero-th round key. Also update LE_CTR if !\final. +.macro _prepare_2_ctr_vecs is_xctr, i0, i1, final=0 +.if \is_xctr + .if USE_AVX10 + _vmovdqa LE_CTR, AESDATA\i0 + vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i0 + .else + vpxor XCTR_IV, LE_CTR, AESDATA\i0 + vpxor RNDKEY0, AESDATA\i0, AESDATA\i0 + .endif + vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1 + + .if USE_AVX10 + vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i1 + .else + vpxor XCTR_IV, AESDATA\i1, AESDATA\i1 + vpxor RNDKEY0, AESDATA\i1, AESDATA\i1 + .endif +.else + vpshufb BSWAP_MASK, LE_CTR, AESDATA\i0 + _vpxor RNDKEY0, AESDATA\i0, AESDATA\i0 + vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1 + vpshufb BSWAP_MASK, AESDATA\i1, AESDATA\i1 + _vpxor RNDKEY0, AESDATA\i1, AESDATA\i1 +.endif +.if !\final + vpaddq LE_CTR_INC2, LE_CTR, LE_CTR +.endif +.endm + +// Do all AES rounds on the data in the given AESDATA vectors, excluding the +// zero-th and last rounds. +.macro _aesenc_loop vecs:vararg + mov KEY, %rax +1: + _vbroadcast128 (%rax), RNDKEY +.irp i, \vecs + vaesenc RNDKEY, AESDATA\i, AESDATA\i +.endr + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne 1b +.endm + +// Finalize the keystream blocks in the given AESDATA vectors by doing the last +// AES round, then XOR those keystream blocks with the corresponding data. +// Reduce latency by doing the XOR before the vaesenclast, utilizing the +// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). +.macro _aesenclast_and_xor vecs:vararg +.irp i, \vecs + _vpxor \i*VL(SRC), RNDKEYLAST, RNDKEY + vaesenclast RNDKEY, AESDATA\i, AESDATA\i +.endr +.irp i, \vecs + _vmovdqu AESDATA\i, \i*VL(DST) +.endr +.endm + +// XOR the keystream blocks in the specified AESDATA vectors with the +// corresponding data. +.macro _xor_data vecs:vararg +.irp i, \vecs + _vpxor \i*VL(SRC), AESDATA\i, AESDATA\i +.endr +.irp i, \vecs + _vmovdqu AESDATA\i, \i*VL(DST) +.endr +.endm + +.macro _aes_ctr_crypt is_xctr + + // Define register aliases V0-V15 that map to the xmm, ymm, or zmm + // registers according to the selected Vector Length (VL). 
+.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + .if VL == 16 + .set V\i, %xmm\i + .elseif VL == 32 + .set V\i, %ymm\i + .elseif VL == 64 + .set V\i, %zmm\i + .else + .error "Unsupported Vector Length (VL)" + .endif +.endr + + // Function arguments + .set KEY, %rdi // Initially points to the start of the + // crypto_aes_ctx, then is advanced to + // point to the index 1 round key + .set KEY32, %edi // Available as temp register after all + // keystream blocks have been generated + .set SRC, %rsi // Pointer to next source data + .set DST, %rdx // Pointer to next destination data + .set LEN, %ecx // Remaining length in bytes. + // Note: _load_partial_block relies on + // this being in %ecx. + .set LEN64, %rcx // Zero-extend LEN before using! + .set LEN8, %cl +.if \is_xctr + .set XCTR_IV_PTR, %r8 // const u8 iv[AES_BLOCK_SIZE]; + .set XCTR_CTR, %r9 // u64 ctr; +.else + .set LE_CTR_PTR, %r8 // const u64 le_ctr[2]; +.endif + + // Additional local variables + .set RNDKEYLAST_PTR, %r10 + .set AESDATA0, V0 + .set AESDATA0_XMM, %xmm0 + .set AESDATA1, V1 + .set AESDATA1_XMM, %xmm1 + .set AESDATA2, V2 + .set AESDATA3, V3 + .set AESDATA4, V4 + .set AESDATA5, V5 + .set AESDATA6, V6 + .set AESDATA7, V7 +.if \is_xctr + .set XCTR_IV, V8 +.else + .set BSWAP_MASK, V8 +.endif + .set LE_CTR, V9 + .set LE_CTR_XMM, %xmm9 + .set LE_CTR_INC1, V10 + .set LE_CTR_INC2, V11 + .set RNDKEY0, V12 + .set RNDKEYLAST, V13 + .set RNDKEY, V14 + + // Create the first vector of counters. +.if \is_xctr + .if VL == 16 + vmovq XCTR_CTR, LE_CTR + .elseif VL == 32 + vmovq XCTR_CTR, LE_CTR_XMM + inc XCTR_CTR + vmovq XCTR_CTR, AESDATA0_XMM + vinserti128 $1, AESDATA0_XMM, LE_CTR, LE_CTR + .else + vpbroadcastq XCTR_CTR, LE_CTR + vpsrldq $8, LE_CTR, LE_CTR + vpaddq .Lctr_pattern(%rip), LE_CTR, LE_CTR + .endif + _vbroadcast128 (XCTR_IV_PTR), XCTR_IV +.else + _vbroadcast128 (LE_CTR_PTR), LE_CTR + .if VL > 16 + vpaddq .Lctr_pattern(%rip), LE_CTR, LE_CTR + .endif + _vbroadcast128 .Lbswap_mask(%rip), BSWAP_MASK +.endif + +.if VL == 16 + _vbroadcast128 .Lone(%rip), LE_CTR_INC1 +.elseif VL == 32 + _vbroadcast128 .Ltwo(%rip), LE_CTR_INC1 +.else + _vbroadcast128 .Lfour(%rip), LE_CTR_INC1 +.endif + vpsllq $1, LE_CTR_INC1, LE_CTR_INC2 + + // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256). + movl 480(KEY), %eax + + // Compute the pointer to the last round key. + lea 6*16(KEY, %rax, 4), RNDKEYLAST_PTR + + // Load the zero-th and last round keys. + _vbroadcast128 (KEY), RNDKEY0 + _vbroadcast128 (RNDKEYLAST_PTR), RNDKEYLAST + + // Make KEY point to the first round key. + add $16, KEY + + // This is the main loop, which encrypts 8 vectors of data at a time. + add $-8*VL, LEN + jl .Lloop_8x_done\@ +.Lloop_8x\@: + _prepare_2_ctr_vecs \is_xctr, 0, 1 + _prepare_2_ctr_vecs \is_xctr, 2, 3 + _prepare_2_ctr_vecs \is_xctr, 4, 5 + _prepare_2_ctr_vecs \is_xctr, 6, 7 + _aesenc_loop 0,1,2,3,4,5,6,7 + _aesenclast_and_xor 0,1,2,3,4,5,6,7 + sub $-8*VL, SRC + sub $-8*VL, DST + add $-8*VL, LEN + jge .Lloop_8x\@ +.Lloop_8x_done\@: + sub $-8*VL, LEN + jz .Ldone\@ + + // 1 <= LEN < 8*VL. Generate 2, 4, or 8 more vectors of keystream + // blocks, depending on the remaining LEN. + + _prepare_2_ctr_vecs \is_xctr, 0, 1 + _prepare_2_ctr_vecs \is_xctr, 2, 3 + cmp $4*VL, LEN + jle .Lenc_tail_atmost4vecs\@ + + // 4*VL < LEN < 8*VL. Generate 8 vectors of keystream blocks. Use the + // first 4 to XOR 4 full vectors of data. Then XOR the remaining data. 
+ _prepare_2_ctr_vecs \is_xctr, 4, 5 + _prepare_2_ctr_vecs \is_xctr, 6, 7, final=1 + _aesenc_loop 0,1,2,3,4,5,6,7 + _aesenclast_and_xor 0,1,2,3 + vaesenclast RNDKEYLAST, AESDATA4, AESDATA0 + vaesenclast RNDKEYLAST, AESDATA5, AESDATA1 + vaesenclast RNDKEYLAST, AESDATA6, AESDATA2 + vaesenclast RNDKEYLAST, AESDATA7, AESDATA3 + sub $-4*VL, SRC + sub $-4*VL, DST + add $-4*VL, LEN + cmp $1*VL-1, LEN + jle .Lxor_tail_partial_vec_0\@ + _xor_data 0 + cmp $2*VL-1, LEN + jle .Lxor_tail_partial_vec_1\@ + _xor_data 1 + cmp $3*VL-1, LEN + jle .Lxor_tail_partial_vec_2\@ + _xor_data 2 + cmp $4*VL-1, LEN + jle .Lxor_tail_partial_vec_3\@ + _xor_data 3 + jmp .Ldone\@ + +.Lenc_tail_atmost4vecs\@: + cmp $2*VL, LEN + jle .Lenc_tail_atmost2vecs\@ + + // 2*VL < LEN <= 4*VL. Generate 4 vectors of keystream blocks. Use the + // first 2 to XOR 2 full vectors of data. Then XOR the remaining data. + _aesenc_loop 0,1,2,3 + _aesenclast_and_xor 0,1 + vaesenclast RNDKEYLAST, AESDATA2, AESDATA0 + vaesenclast RNDKEYLAST, AESDATA3, AESDATA1 + sub $-2*VL, SRC + sub $-2*VL, DST + add $-2*VL, LEN + jmp .Lxor_tail_upto2vecs\@ + +.Lenc_tail_atmost2vecs\@: + // 1 <= LEN <= 2*VL. Generate 2 vectors of keystream blocks. Then XOR + // the remaining data. + _aesenc_loop 0,1 + vaesenclast RNDKEYLAST, AESDATA0, AESDATA0 + vaesenclast RNDKEYLAST, AESDATA1, AESDATA1 + +.Lxor_tail_upto2vecs\@: + cmp $1*VL-1, LEN + jle .Lxor_tail_partial_vec_0\@ + _xor_data 0 + cmp $2*VL-1, LEN + jle .Lxor_tail_partial_vec_1\@ + _xor_data 1 + jmp .Ldone\@ + +.Lxor_tail_partial_vec_1\@: + add $-1*VL, LEN + jz .Ldone\@ + sub $-1*VL, SRC + sub $-1*VL, DST + _vmovdqa AESDATA1, AESDATA0 + jmp .Lxor_tail_partial_vec_0\@ + +.Lxor_tail_partial_vec_2\@: + add $-2*VL, LEN + jz .Ldone\@ + sub $-2*VL, SRC + sub $-2*VL, DST + _vmovdqa AESDATA2, AESDATA0 + jmp .Lxor_tail_partial_vec_0\@ + +.Lxor_tail_partial_vec_3\@: + add $-3*VL, LEN + jz .Ldone\@ + sub $-3*VL, SRC + sub $-3*VL, DST + _vmovdqa AESDATA3, AESDATA0 + +.Lxor_tail_partial_vec_0\@: + // XOR the remaining 1 <= LEN < VL bytes. It's easy if masked + // loads/stores are available; otherwise it's a bit harder... +.if USE_AVX10 + .if VL <= 32 + mov $-1, %eax + bzhi LEN, %eax, %eax + kmovd %eax, %k1 + .else + mov $-1, %rax + bzhi LEN64, %rax, %rax + kmovq %rax, %k1 + .endif + vmovdqu8 (SRC), AESDATA1{%k1}{z} + _vpxor AESDATA1, AESDATA0, AESDATA0 + vmovdqu8 AESDATA0, (DST){%k1} +.else + .if VL == 32 + cmp $16, LEN + jl 1f + vpxor (SRC), AESDATA0_XMM, AESDATA1_XMM + vmovdqu AESDATA1_XMM, (DST) + add $16, SRC + add $16, DST + sub $16, LEN + jz .Ldone\@ + vextracti128 $1, AESDATA0, AESDATA0_XMM +1: + .endif + mov LEN, %r10d + _load_partial_block SRC, AESDATA1_XMM, KEY, KEY32 + vpxor AESDATA1_XMM, AESDATA0_XMM, AESDATA0_XMM + mov %r10d, %ecx + _store_partial_block AESDATA0_XMM, DST, KEY, KEY32 +.endif + +.Ldone\@: +.if VL > 16 + vzeroupper +.endif + RET +.endm + +// Below are the definitions of the functions generated by the above macro. +// They have the following prototypes: +// +// +// void aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key, +// const u8 *src, u8 *dst, int len, +// const u64 le_ctr[2]); +// +// void aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key, +// const u8 *src, u8 *dst, int len, +// const u8 iv[AES_BLOCK_SIZE], u64 ctr); +// +// Both functions generate |len| bytes of keystream, XOR it with the data from +// |src|, and write the result to |dst|. On non-final calls, |len| must be a +// multiple of 16. On the final call, |len| can be any value. 
+// +// aes_ctr64_crypt_* implement "regular" CTR, where the keystream is generated +// from a 128-bit big endian counter that increments by 1 for each AES block. +// HOWEVER, to keep the assembly code simple, some of the counter management is +// left to the caller. aes_ctr64_crypt_* take the counter in little endian +// form, only increment the low 64 bits internally, do the conversion to big +// endian internally, and don't write the updated counter back to memory. The +// caller is responsible for converting the starting IV to the little endian +// le_ctr, detecting the (very rare) case of a carry out of the low 64 bits +// being needed and splitting at that point with a carry done in between, and +// updating le_ctr after each part if the message is multi-part. +// +// aes_xctr_crypt_* implement XCTR as specified in "Length-preserving encryption +// with HCTR2" (https://eprint.iacr.org/2021/1441.pdf). XCTR is an +// easier-to-implement variant of CTR that uses little endian byte order and +// eliminates carries. |ctr| is the per-message block counter starting at 1. + +.set VL, 16 +.set USE_AVX10, 0 +SYM_TYPED_FUNC_START(aes_ctr64_crypt_aesni_avx) + _aes_ctr_crypt 0 +SYM_FUNC_END(aes_ctr64_crypt_aesni_avx) +SYM_TYPED_FUNC_START(aes_xctr_crypt_aesni_avx) + _aes_ctr_crypt 1 +SYM_FUNC_END(aes_xctr_crypt_aesni_avx) + +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) +.set VL, 32 +.set USE_AVX10, 0 +SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx2) + _aes_ctr_crypt 0 +SYM_FUNC_END(aes_ctr64_crypt_vaes_avx2) +SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx2) + _aes_ctr_crypt 1 +SYM_FUNC_END(aes_xctr_crypt_vaes_avx2) + +.set VL, 32 +.set USE_AVX10, 1 +SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_256) + _aes_ctr_crypt 0 +SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_256) +SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_256) + _aes_ctr_crypt 1 +SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_256) + +.set VL, 64 +.set USE_AVX10, 1 +SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_512) + _aes_ctr_crypt 0 +SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_512) +SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_512) + _aes_ctr_crypt 1 +SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_512) +#endif // CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ diff --git a/arch/x86/crypto/aes-gcm-aesni-x86_64.S b/arch/x86/crypto/aes-gcm-aesni-x86_64.S new file mode 100644 index 000000000000..45940e2883a0 --- /dev/null +++ b/arch/x86/crypto/aes-gcm-aesni-x86_64.S @@ -0,0 +1,1128 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// AES-NI optimized AES-GCM for x86_64 +// +// Copyright 2024 Google LLC +// +// Author: Eric Biggers <ebiggers@google.com> +// +//------------------------------------------------------------------------------ +// +// This file is dual-licensed, meaning that you can use it under your choice of +// either of the following two licenses: +// +// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// or +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. 
Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +//------------------------------------------------------------------------------ +// +// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that +// support the original set of AES instructions, i.e. AES-NI. Two +// implementations are provided, one that uses AVX and one that doesn't. They +// are very similar, being generated by the same macros. The only difference is +// that the AVX implementation takes advantage of VEX-coded instructions in some +// places to avoid some 'movdqu' and 'movdqa' instructions. The AVX +// implementation does *not* use 256-bit vectors, as AES is not supported on +// 256-bit vectors until the VAES feature (which this file doesn't target). +// +// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1 +// for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems +// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.) +// +// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is +// more thoroughly commented. This file has the following notable changes: +// +// - The vector length is fixed at 128-bit, i.e. xmm registers. This means +// there is only one AES block (and GHASH block) per register. +// +// - Without AVX512 / AVX10, only 16 SIMD registers are available instead of +// 32. We work around this by being much more careful about using +// registers, relying heavily on loads to load values as they are needed. +// +// - Masking is not available either. We work around this by implementing +// partial block loads and stores using overlapping scalar loads and stores +// combined with shifts and SSE4.1 insertion and extraction instructions. +// +// - The main loop is organized differently due to the different design +// constraints. First, with just one AES block per SIMD register, on some +// CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore +// do an 8-register wide loop. Considering that and the fact that we have +// just 16 SIMD registers to work with, it's not feasible to cache AES +// round keys and GHASH key powers in registers across loop iterations. +// That's not ideal, but also not actually that bad, since loads can run in +// parallel with other instructions. 
Significantly, this also makes it +// possible to roll up the inner loops, relying on hardware loop unrolling +// instead of software loop unrolling, greatly reducing code size. +// +// - We implement the GHASH multiplications in the main loop using Karatsuba +// multiplication instead of schoolbook multiplication. This saves one +// pclmulqdq instruction per block, at the cost of one 64-bit load, one +// pshufd, and 0.25 pxors per block. (This is without the three-argument +// XOR support that would be provided by AVX512 / AVX10, which would be +// more beneficial to schoolbook than Karatsuba.) +// +// As a rough approximation, we can assume that Karatsuba multiplication is +// faster than schoolbook multiplication in this context if one pshufd and +// 0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit +// load is "free" due to running in parallel with arithmetic instructions.) +// This is true on AMD CPUs, including all that support pclmulqdq up to at +// least Zen 3. It's also true on older Intel CPUs: Westmere through +// Haswell on the Core side, and Silvermont through Goldmont Plus on the +// low-power side. On some of these CPUs, pclmulqdq is quite slow, and the +// benefit of Karatsuba should be substantial. On newer Intel CPUs, +// schoolbook multiplication should be faster, but only marginally. +// +// Not all these CPUs were available to be tested. However, benchmarks on +// available CPUs suggest that this approximation is plausible. Switching +// to Karatsuba showed negligible change (< 1%) on Intel Broadwell, +// Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%. +// Considering that and the fact that Karatsuba should be even more +// beneficial on older Intel CPUs, it seems like the right choice here. +// +// An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be +// saved by using a multiplication-less reduction method. We don't do that +// because it would require a large number of shift and xor instructions, +// making it less worthwhile and likely harmful on newer CPUs. +// +// It does make sense to sometimes use a different reduction optimization +// that saves a pclmulqdq, though: precompute the hash key times x^64, and +// multiply the low half of the data block by the hash key with the extra +// factor of x^64. This eliminates one step of the reduction. However, +// this is incompatible with Karatsuba multiplication. Therefore, for +// multi-block processing we use Karatsuba multiplication with a regular +// reduction. For single-block processing, we use the x^64 optimization. + +#include <linux/linkage.h> + +.section .rodata +.p2align 4 +.Lbswap_mask: + .octa 0x000102030405060708090a0b0c0d0e0f +.Lgfpoly: + .quad 0xc200000000000000 +.Lone: + .quad 1 +.Lgfpoly_and_internal_carrybit: + .octa 0xc2000000000000010000000000000001 + // Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of + // 'len' 0xff bytes and the rest zeroes. +.Lzeropad_mask: + .octa 0xffffffffffffffffffffffffffffffff + .octa 0 + +// Offsets in struct aes_gcm_key_aesni +#define OFFSETOF_AESKEYLEN 480 +#define OFFSETOF_H_POWERS 496 +#define OFFSETOF_H_POWERS_XORED 624 +#define OFFSETOF_H_TIMES_X64 688 + +.text + +// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback +// assumes that all operands are distinct and that any mem operand is aligned. 
+.macro _vpclmulqdq imm, src1, src2, dst +.if USE_AVX + vpclmulqdq \imm, \src1, \src2, \dst +.else + movdqa \src2, \dst + pclmulqdq \imm, \src1, \dst +.endif +.endm + +// Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes +// that all operands are distinct and that any mem operand is aligned. +.macro _vpshufb src1, src2, dst +.if USE_AVX + vpshufb \src1, \src2, \dst +.else + movdqa \src2, \dst + pshufb \src1, \dst +.endif +.endm + +// Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that +// all operands are distinct. +.macro _vpand src1, src2, dst +.if USE_AVX + vpand \src1, \src2, \dst +.else + movdqu \src1, \dst + pand \src2, \dst +.endif +.endm + +// XOR the unaligned memory operand \mem into the xmm register \reg. \tmp must +// be a temporary xmm register. +.macro _xor_mem_to_reg mem, reg, tmp +.if USE_AVX + vpxor \mem, \reg, \reg +.else + movdqu \mem, \tmp + pxor \tmp, \reg +.endif +.endm + +// Test the unaligned memory operand \mem against the xmm register \reg. \tmp +// must be a temporary xmm register. +.macro _test_mem mem, reg, tmp +.if USE_AVX + vptest \mem, \reg +.else + movdqu \mem, \tmp + ptest \tmp, \reg +.endif +.endm + +// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst +// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}. +.macro _load_partial_block src, dst, tmp64, tmp32 + sub $8, %ecx // LEN - 8 + jle .Lle8\@ + + // Load 9 <= LEN <= 15 bytes. + movq (\src), \dst // Load first 8 bytes + mov (\src, %rcx), %rax // Load last 8 bytes + neg %ecx + shl $3, %ecx + shr %cl, %rax // Discard overlapping bytes + pinsrq $1, %rax, \dst + jmp .Ldone\@ + +.Lle8\@: + add $4, %ecx // LEN - 4 + jl .Llt4\@ + + // Load 4 <= LEN <= 8 bytes. + mov (\src), %eax // Load first 4 bytes + mov (\src, %rcx), \tmp32 // Load last 4 bytes + jmp .Lcombine\@ + +.Llt4\@: + // Load 1 <= LEN <= 3 bytes. + add $2, %ecx // LEN - 2 + movzbl (\src), %eax // Load first byte + jl .Lmovq\@ + movzwl (\src, %rcx), \tmp32 // Load last 2 bytes +.Lcombine\@: + shl $3, %ecx + shl %cl, \tmp64 + or \tmp64, %rax // Combine the two parts +.Lmovq\@: + movq %rax, \dst +.Ldone\@: +.endm + +// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst. +// Clobbers %rax, %rcx, and %rsi. +.macro _store_partial_block src, dst + sub $8, %ecx // LEN - 8 + jl .Llt8\@ + + // Store 8 <= LEN <= 15 bytes. + pextrq $1, \src, %rax + mov %ecx, %esi + shl $3, %ecx + ror %cl, %rax + mov %rax, (\dst, %rsi) // Store last LEN - 8 bytes + movq \src, (\dst) // Store first 8 bytes + jmp .Ldone\@ + +.Llt8\@: + add $4, %ecx // LEN - 4 + jl .Llt4\@ + + // Store 4 <= LEN <= 7 bytes. + pextrd $1, \src, %eax + mov %ecx, %esi + shl $3, %ecx + ror %cl, %eax + mov %eax, (\dst, %rsi) // Store last LEN - 4 bytes + movd \src, (\dst) // Store first 4 bytes + jmp .Ldone\@ + +.Llt4\@: + // Store 1 <= LEN <= 3 bytes. + pextrb $0, \src, 0(\dst) + cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2? + jl .Ldone\@ + pextrb $1, \src, 1(\dst) + je .Ldone\@ + pextrb $2, \src, 2(\dst) +.Ldone\@: +.endm + +// Do one step of GHASH-multiplying \a by \b and storing the reduced product in +// \b. To complete all steps, this must be invoked with \i=0 through \i=9. +// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the +// .Lgfpoly constant, and \t0-\t1 must be temporary registers. 
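The overlapping-load trick used by _load_partial_block above can be modeled concisely in Python. This is an illustrative sketch of the 9 <= LEN <= 15 path only (function name invented for the example), not a translation of the macro:

    def load_partial_block_9_to_15(src: bytes, length: int) -> bytes:
        """Load 'length' bytes and zero-pad to 16 bytes, using two overlapping
        8-byte reads instead of a byte-by-byte loop."""
        assert 9 <= length <= 15 and len(src) >= length
        first = int.from_bytes(src[0:8], 'little')                # first 8 bytes
        last = int.from_bytes(src[length - 8:length], 'little')   # last 8 bytes (overlaps 'first')
        last >>= 8 * (16 - length)        # discard the bytes already covered by 'first'
        return (first | (last << 64)).to_bytes(16, 'little')

    data = bytes(range(1, 16))
    for n in range(9, 16):
        assert load_partial_block_9_to_15(data, n) == data[:n] + bytes(16 - n)

The 4-8 and 1-3 byte paths of the macro use the same idea with narrower (4-, 2-, and 1-byte) accesses, and _store_partial_block applies it in reverse.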
+.macro _ghash_mul_step i, a, a_times_x64, b, gfpoly, t0, t1 + + // MI = (a_L * b_H) + ((a*x^64)_L * b_L) +.if \i == 0 + _vpclmulqdq $0x01, \a, \b, \t0 +.elseif \i == 1 + _vpclmulqdq $0x00, \a_times_x64, \b, \t1 +.elseif \i == 2 + pxor \t1, \t0 + + // HI = (a_H * b_H) + ((a*x^64)_H * b_L) +.elseif \i == 3 + _vpclmulqdq $0x11, \a, \b, \t1 +.elseif \i == 4 + pclmulqdq $0x10, \a_times_x64, \b +.elseif \i == 5 + pxor \t1, \b +.elseif \i == 6 + + // Fold MI into HI. + pshufd $0x4e, \t0, \t1 // Swap halves of MI +.elseif \i == 7 + pclmulqdq $0x00, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) +.elseif \i == 8 + pxor \t1, \b +.elseif \i == 9 + pxor \t0, \b +.endif +.endm + +// GHASH-multiply \a by \b and store the reduced product in \b. +// See _ghash_mul_step for details. +.macro _ghash_mul a, a_times_x64, b, gfpoly, t0, t1 +.irp i, 0,1,2,3,4,5,6,7,8,9 + _ghash_mul_step \i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1 +.endr +.endm + +// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi. +// This does Karatsuba multiplication and must be paired with _ghash_reduce. On +// the first call, \lo, \mi, and \hi must be zero. \a_xored must contain the +// two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered. +.macro _ghash_mul_noreduce a, a_xored, b, lo, mi, hi, t0 + + // LO += a_L * b_L + _vpclmulqdq $0x00, \a, \b, \t0 + pxor \t0, \lo + + // b_L + b_H + pshufd $0x4e, \b, \t0 + pxor \b, \t0 + + // HI += a_H * b_H + pclmulqdq $0x11, \a, \b + pxor \b, \hi + + // MI += (a_L + a_H) * (b_L + b_H) + pclmulqdq $0x00, \a_xored, \t0 + pxor \t0, \mi +.endm + +// Reduce the product from \lo, \mi, and \hi, and store the result in \dst. +// This assumes that _ghash_mul_noreduce was used. +.macro _ghash_reduce lo, mi, hi, dst, t0 + + movq .Lgfpoly(%rip), \t0 + + // MI += LO + HI (needed because we used Karatsuba multiplication) + pxor \lo, \mi + pxor \hi, \mi + + // Fold LO into MI. + pshufd $0x4e, \lo, \dst + pclmulqdq $0x00, \t0, \lo + pxor \dst, \mi + pxor \lo, \mi + + // Fold MI into HI. + pshufd $0x4e, \mi, \dst + pclmulqdq $0x00, \t0, \mi + pxor \hi, \dst + pxor \mi, \dst +.endm + +// Do the first step of the GHASH update of a set of 8 ciphertext blocks. +// +// The whole GHASH update does: +// +// GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 + +// blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1 +// +// This macro just does the first step: it does the unreduced multiplication +// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm +// registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the +// inner block counter in %rax, which is a value that counts up by 8 for each +// block in the set of 8 and is used later to index by 8*blknum and 16*blknum. +// +// To reduce the number of pclmulqdq instructions required, both this macro and +// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook +// multiplication. See the file comment for more details about this choice. +// +// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if +// encrypting, or SRC if decrypting. They also expect the precomputed hash key +// powers H^i and their XOR'd-together halves to be available in the struct +// pointed to by KEY. Both macros clobber TMP[0-2]. +.macro _ghash_update_begin_8x enc + + // Initialize the inner block counter. + xor %eax, %eax + + // Load the highest hash key power, H^8. + movdqa OFFSETOF_H_POWERS(KEY), TMP0 + + // Load the first ciphertext block and byte-reflect it. 
+.if \enc + movdqu (DST), TMP1 +.else + movdqu (SRC), TMP1 +.endif + pshufb BSWAP_MASK, TMP1 + + // Add the GHASH accumulator to the ciphertext block to get the block + // 'b' that needs to be multiplied with the hash key power 'a'. + pxor TMP1, GHASH_ACC + + // b_L + b_H + pshufd $0x4e, GHASH_ACC, MI + pxor GHASH_ACC, MI + + // LO = a_L * b_L + _vpclmulqdq $0x00, TMP0, GHASH_ACC, LO + + // HI = a_H * b_H + pclmulqdq $0x11, TMP0, GHASH_ACC + + // MI = (a_L + a_H) * (b_L + b_H) + pclmulqdq $0x00, OFFSETOF_H_POWERS_XORED(KEY), MI +.endm + +// Continue the GHASH update of 8 ciphertext blocks as described above by doing +// an unreduced multiplication of the next ciphertext block by the next lowest +// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI. +.macro _ghash_update_continue_8x enc + add $8, %eax + + // Load the next lowest key power. + movdqa OFFSETOF_H_POWERS(KEY,%rax,2), TMP0 + + // Load the next ciphertext block and byte-reflect it. +.if \enc + movdqu (DST,%rax,2), TMP1 +.else + movdqu (SRC,%rax,2), TMP1 +.endif + pshufb BSWAP_MASK, TMP1 + + // LO += a_L * b_L + _vpclmulqdq $0x00, TMP0, TMP1, TMP2 + pxor TMP2, LO + + // b_L + b_H + pshufd $0x4e, TMP1, TMP2 + pxor TMP1, TMP2 + + // HI += a_H * b_H + pclmulqdq $0x11, TMP0, TMP1 + pxor TMP1, GHASH_ACC + + // MI += (a_L + a_H) * (b_L + b_H) + movq OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1 + pclmulqdq $0x00, TMP1, TMP2 + pxor TMP2, MI +.endm + +// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to +// _ghash_reduce, but it's hardcoded to use the registers of the main loop and +// it uses the same register for HI and the destination. It's also divided into +// two steps. TMP1 must be preserved across steps. +// +// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of +// shuffling LO, XOR'ing LO into MI, and shuffling MI. However, this would +// increase the critical path length, and it seems to slightly hurt performance. +.macro _ghash_update_end_8x_step i +.if \i == 0 + movq .Lgfpoly(%rip), TMP1 + pxor LO, MI + pxor GHASH_ACC, MI + pshufd $0x4e, LO, TMP2 + pclmulqdq $0x00, TMP1, LO + pxor TMP2, MI + pxor LO, MI +.elseif \i == 1 + pshufd $0x4e, MI, TMP2 + pclmulqdq $0x00, TMP1, MI + pxor TMP2, GHASH_ACC + pxor MI, GHASH_ACC +.endif +.endm + +// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key); +// +// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH +// related fields in the key struct. +.macro _aes_gcm_precompute + + // Function arguments + .set KEY, %rdi + + // Additional local variables. + // %xmm0-%xmm1 and %rax are used as temporaries. + .set RNDKEYLAST_PTR, %rsi + .set H_CUR, %xmm2 + .set H_POW1, %xmm3 // H^1 + .set H_POW1_X64, %xmm4 // H^1 * x^64 + .set GFPOLY, %xmm5 + + // Encrypt an all-zeroes block to get the raw hash subkey. + movl OFFSETOF_AESKEYLEN(KEY), %eax + lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR + movdqa (KEY), H_POW1 // Zero-th round key XOR all-zeroes block + lea 16(KEY), %rax +1: + aesenc (%rax), H_POW1 + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne 1b + aesenclast (RNDKEYLAST_PTR), H_POW1 + + // Preprocess the raw hash subkey as needed to operate on GHASH's + // bit-reflected values directly: reflect its bytes, then multiply it by + // x^-1 (using the backwards interpretation of polynomial coefficients + // from the GCM spec) or equivalently x^1 (using the alternative, + // natural interpretation of polynomial coefficients). 
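After the byte reflection, the multiplication by x described above has a simple scalar interpretation: shift the 128-bit value left by one bit and, if a bit was carried out of the top, XOR in the low 128 bits of the reducing polynomial. A Python sketch of that net effect (a model, not the kernel code; the SSE sequence below gets the same result by doubling the two 64-bit halves and patching the inter-half carry via the 1<<64 bit of .Lgfpoly_and_internal_carrybit):

    GFPOLY_LOW128 = (0xc2 << 120) | 1   # x^127 + x^126 + x^121 + 1

    def h_times_x(h_reflected: int) -> int:
        """Multiply the byte-reflected raw hash subkey by x modulo
        x^128 + x^127 + x^126 + x^121 + 1 (natural bit order)."""
        carried_out = (h_reflected >> 127) & 1
        h = (h_reflected << 1) & ((1 << 128) - 1)
        if carried_out:
            h ^= GFPOLY_LOW128   # cancel the x^128 term with the reducing polynomial
        return h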
+ pshufb .Lbswap_mask(%rip), H_POW1 + movdqa H_POW1, %xmm0 + pshufd $0xd3, %xmm0, %xmm0 + psrad $31, %xmm0 + paddq H_POW1, H_POW1 + pand .Lgfpoly_and_internal_carrybit(%rip), %xmm0 + pxor %xmm0, H_POW1 + + // Store H^1. + movdqa H_POW1, OFFSETOF_H_POWERS+7*16(KEY) + + // Compute and store H^1 * x^64. + movq .Lgfpoly(%rip), GFPOLY + pshufd $0x4e, H_POW1, %xmm0 + _vpclmulqdq $0x00, H_POW1, GFPOLY, H_POW1_X64 + pxor %xmm0, H_POW1_X64 + movdqa H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY) + + // Compute and store the halves of H^1 XOR'd together. + pxor H_POW1, %xmm0 + movq %xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY) + + // Compute and store the remaining key powers H^2 through H^8. + movdqa H_POW1, H_CUR + mov $6*8, %eax +.Lprecompute_next\@: + // Compute H^i = H^{i-1} * H^1. + _ghash_mul H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1 + // Store H^i. + movdqa H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2) + // Compute and store the halves of H^i XOR'd together. + pshufd $0x4e, H_CUR, %xmm0 + pxor H_CUR, %xmm0 + movq %xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax) + sub $8, %eax + jge .Lprecompute_next\@ + + RET +.endm + +// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key, +// u8 ghash_acc[16], const u8 *aad, int aadlen); +// +// This function processes the AAD (Additional Authenticated Data) in GCM. +// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the +// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all +// zeroes. |aadlen| must be a multiple of 16, except on the last call where it +// can be any length. The caller must do any buffering needed to ensure this. +.macro _aes_gcm_aad_update + + // Function arguments + .set KEY, %rdi + .set GHASH_ACC_PTR, %rsi + .set AAD, %rdx + .set AADLEN, %ecx + // Note: _load_partial_block relies on AADLEN being in %ecx. + + // Additional local variables. + // %rax, %r10, and %xmm0-%xmm1 are used as temporary registers. + .set BSWAP_MASK, %xmm2 + .set GHASH_ACC, %xmm3 + .set H_POW1, %xmm4 // H^1 + .set H_POW1_X64, %xmm5 // H^1 * x^64 + .set GFPOLY, %xmm6 + + movdqa .Lbswap_mask(%rip), BSWAP_MASK + movdqu (GHASH_ACC_PTR), GHASH_ACC + movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1 + movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64 + movq .Lgfpoly(%rip), GFPOLY + + // Process the AAD one full block at a time. + sub $16, AADLEN + jl .Laad_loop_1x_done\@ +.Laad_loop_1x\@: + movdqu (AAD), %xmm0 + pshufb BSWAP_MASK, %xmm0 + pxor %xmm0, GHASH_ACC + _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1 + add $16, AAD + sub $16, AADLEN + jge .Laad_loop_1x\@ +.Laad_loop_1x_done\@: + // Check whether there is a partial block at the end. + add $16, AADLEN + jz .Laad_done\@ + + // Process a partial block of length 1 <= AADLEN <= 15. + // _load_partial_block assumes that %ecx contains AADLEN. + _load_partial_block AAD, %xmm0, %r10, %r10d + pshufb BSWAP_MASK, %xmm0 + pxor %xmm0, GHASH_ACC + _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1 + +.Laad_done\@: + movdqu GHASH_ACC, (GHASH_ACC_PTR) + RET +.endm + +// Increment LE_CTR eight times to generate eight little-endian counter blocks, +// swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with +// the zero-th AES round key. Clobbers TMP0 and TMP1. +.macro _ctr_begin_8x + movq .Lone(%rip), TMP0 + movdqa (KEY), TMP1 // zero-th round key +.irp i, 0,1,2,3,4,5,6,7 + _vpshufb BSWAP_MASK, LE_CTR, AESDATA\i + pxor TMP1, AESDATA\i + paddd TMP0, LE_CTR +.endr +.endm + +// Do a non-last round of AES on AESDATA[0-7] using \round_key. 
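Stepping back from the assembly for a moment: at the level of the GCM specification, the AAD update implemented above is plain GHASH, i.e. for each 16-byte block (the last one zero-padded) XOR it into the accumulator and multiply by H. A spec-level Python model for reference (hypothetical helper names; it uses the specification's own bit and byte order, so its values are byte-reflected relative to what this file operates on):

    R = 0xe1 << 120   # the GCM spec's reduction constant, in the spec's bit order

    def gf128_mul(x: int, y: int) -> int:
        """GF(2^128) multiplication as defined in the GCM specification."""
        z = 0
        for i in range(127, -1, -1):
            if (y >> i) & 1:
                z ^= x
            x = (x >> 1) ^ (R if x & 1 else 0)
        return z

    def ghash_update(h: int, acc: int, data: bytes) -> int:
        """Fold 'data' (e.g. the AAD) into the GHASH accumulator 'acc'."""
        for off in range(0, len(data), 16):
            block = data[off:off + 16].ljust(16, b'\x00')   # zero-pad a final partial block
            acc = gf128_mul(acc ^ int.from_bytes(block, 'big'), h)
        return acc

The assembly computes the same function, just on byte-reflected values and with the multiplication done as a carryless multiply followed by the folding reduction.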
+.macro _aesenc_8x round_key +.irp i, 0,1,2,3,4,5,6,7 + aesenc \round_key, AESDATA\i +.endr +.endm + +// Do the last round of AES on AESDATA[0-7] using \round_key. +.macro _aesenclast_8x round_key +.irp i, 0,1,2,3,4,5,6,7 + aesenclast \round_key, AESDATA\i +.endr +.endm + +// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and +// store the result to DST. Clobbers TMP0. +.macro _xor_data_8x +.irp i, 0,1,2,3,4,5,6,7 + _xor_mem_to_reg \i*16(SRC), AESDATA\i, tmp=TMP0 +.endr +.irp i, 0,1,2,3,4,5,6,7 + movdqu AESDATA\i, \i*16(DST) +.endr +.endm + +// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// const u8 *src, u8 *dst, int datalen); +// +// This macro generates a GCM encryption or decryption update function with the +// above prototype (with \enc selecting which one). +// +// This function computes the next portion of the CTR keystream, XOR's it with +// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted +// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the +// next |datalen| ciphertext bytes. +// +// |datalen| must be a multiple of 16, except on the last call where it can be +// any length. The caller must do any buffering needed to ensure this. Both +// in-place and out-of-place en/decryption are supported. +// +// |le_ctr| must give the current counter in little-endian format. For a new +// message, the low word of the counter must be 2. This function loads the +// counter from |le_ctr| and increments the loaded counter as needed, but it +// does *not* store the updated counter back to |le_ctr|. The caller must +// update |le_ctr| if any more data segments follow. Internally, only the low +// 32-bit word of the counter is incremented, following the GCM standard. +.macro _aes_gcm_update enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi // Note: overlaps with usage as temp reg + .set GHASH_ACC_PTR, %rdx + .set SRC, %rcx + .set DST, %r8 + .set DATALEN, %r9d + .set DATALEN64, %r9 // Zero-extend DATALEN before using! + // Note: the code setting up for _load_partial_block assumes that SRC is + // in %rcx (and that DATALEN is *not* in %rcx). + + // Additional local variables + + // %rax and %rsi are used as temporary registers. Note: %rsi overlaps + // with LE_CTR_PTR, which is used only at the beginning. + + .set AESKEYLEN, %r10d // AES key length in bytes + .set AESKEYLEN64, %r10 + .set RNDKEYLAST_PTR, %r11 // Pointer to last AES round key + + // Put the most frequently used values in %xmm0-%xmm7 to reduce code + // size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.) 
+ .set TMP0, %xmm0 + .set TMP1, %xmm1 + .set TMP2, %xmm2 + .set LO, %xmm3 // Low part of unreduced product + .set MI, %xmm4 // Middle part of unreduced product + .set GHASH_ACC, %xmm5 // GHASH accumulator; in main loop also + // the high part of unreduced product + .set BSWAP_MASK, %xmm6 // Shuffle mask for reflecting bytes + .set LE_CTR, %xmm7 // Little-endian counter value + .set AESDATA0, %xmm8 + .set AESDATA1, %xmm9 + .set AESDATA2, %xmm10 + .set AESDATA3, %xmm11 + .set AESDATA4, %xmm12 + .set AESDATA5, %xmm13 + .set AESDATA6, %xmm14 + .set AESDATA7, %xmm15 + + movdqa .Lbswap_mask(%rip), BSWAP_MASK + movdqu (GHASH_ACC_PTR), GHASH_ACC + movdqu (LE_CTR_PTR), LE_CTR + + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR + + // If there are at least 8*16 bytes of data, then continue into the main + // loop, which processes 8*16 bytes of data per iteration. + // + // The main loop interleaves AES and GHASH to improve performance on + // CPUs that can execute these instructions in parallel. When + // decrypting, the GHASH input (the ciphertext) is immediately + // available. When encrypting, we instead encrypt a set of 8 blocks + // first and then GHASH those blocks while encrypting the next set of 8, + // repeat that as needed, and finally GHASH the last set of 8 blocks. + // + // Code size optimization: Prefer adding or subtracting -8*16 over 8*16, + // as this makes the immediate fit in a signed byte, saving 3 bytes. + add $-8*16, DATALEN + jl .Lcrypt_loop_8x_done\@ +.if \enc + // Encrypt the first 8 plaintext blocks. + _ctr_begin_8x + lea 16(KEY), %rsi + .p2align 4 +1: + movdqa (%rsi), TMP0 + _aesenc_8x TMP0 + add $16, %rsi + cmp %rsi, RNDKEYLAST_PTR + jne 1b + movdqa (%rsi), TMP0 + _aesenclast_8x TMP0 + _xor_data_8x + // Don't increment DST until the ciphertext blocks have been hashed. + sub $-8*16, SRC + add $-8*16, DATALEN + jl .Lghash_last_ciphertext_8x\@ +.endif + + .p2align 4 +.Lcrypt_loop_8x\@: + + // Generate the next set of 8 counter blocks and start encrypting them. + _ctr_begin_8x + lea 16(KEY), %rsi + + // Do a round of AES, and start the GHASH update of 8 ciphertext blocks + // by doing the unreduced multiplication for the first ciphertext block. + movdqa (%rsi), TMP0 + add $16, %rsi + _aesenc_8x TMP0 + _ghash_update_begin_8x \enc + + // Do 7 more rounds of AES, and continue the GHASH update by doing the + // unreduced multiplication for the remaining ciphertext blocks. + .p2align 4 +1: + movdqa (%rsi), TMP0 + add $16, %rsi + _aesenc_8x TMP0 + _ghash_update_continue_8x \enc + cmp $7*8, %eax + jne 1b + + // Do the remaining AES rounds. + .p2align 4 +1: + movdqa (%rsi), TMP0 + add $16, %rsi + _aesenc_8x TMP0 + cmp %rsi, RNDKEYLAST_PTR + jne 1b + + // Do the GHASH reduction and the last round of AES. + movdqa (RNDKEYLAST_PTR), TMP0 + _ghash_update_end_8x_step 0 + _aesenclast_8x TMP0 + _ghash_update_end_8x_step 1 + + // XOR the data with the AES-CTR keystream blocks. +.if \enc + sub $-8*16, DST +.endif + _xor_data_8x + sub $-8*16, SRC +.if !\enc + sub $-8*16, DST +.endif + add $-8*16, DATALEN + jge .Lcrypt_loop_8x\@ + +.if \enc +.Lghash_last_ciphertext_8x\@: + // Update GHASH with the last set of 8 ciphertext blocks. + _ghash_update_begin_8x \enc + .p2align 4 +1: + _ghash_update_continue_8x \enc + cmp $7*8, %eax + jne 1b + _ghash_update_end_8x_step 0 + _ghash_update_end_8x_step 1 + sub $-8*16, DST +.endif + +.Lcrypt_loop_8x_done\@: + + sub $-8*16, DATALEN + jz .Ldone\@ + + // Handle the remainder of length 1 <= DATALEN < 8*16 bytes. 
We keep + // things simple and keep the code size down by just going one block at + // a time, again taking advantage of hardware loop unrolling. Since + // there are enough key powers available for all remaining data, we do + // the GHASH multiplications unreduced, and only reduce at the very end. + + .set HI, TMP2 + .set H_POW, AESDATA0 + .set H_POW_XORED, AESDATA1 + .set ONE, AESDATA2 + + movq .Lone(%rip), ONE + + // Start collecting the unreduced GHASH intermediate value LO, MI, HI. + pxor LO, LO + pxor MI, MI + pxor HI, HI + + // Set up a block counter %rax to contain 8*(8-n), where n is the number + // of blocks that remain, counting any partial block. This will be used + // to access the key powers H^n through H^1. + mov DATALEN, %eax + neg %eax + and $~15, %eax + sar $1, %eax + add $64, %eax + + sub $16, DATALEN + jl .Lcrypt_loop_1x_done\@ + + // Process the data one full block at a time. +.Lcrypt_loop_1x\@: + + // Encrypt the next counter block. + _vpshufb BSWAP_MASK, LE_CTR, TMP0 + paddd ONE, LE_CTR + pxor (KEY), TMP0 + lea -6*16(RNDKEYLAST_PTR), %rsi // Reduce code size + cmp $24, AESKEYLEN + jl 128f // AES-128? + je 192f // AES-192? + // AES-256 + aesenc -7*16(%rsi), TMP0 + aesenc -6*16(%rsi), TMP0 +192: + aesenc -5*16(%rsi), TMP0 + aesenc -4*16(%rsi), TMP0 +128: +.irp i, -3,-2,-1,0,1,2,3,4,5 + aesenc \i*16(%rsi), TMP0 +.endr + aesenclast (RNDKEYLAST_PTR), TMP0 + + // Load the next key power H^i. + movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW + movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED + + // XOR the keystream block that was just generated in TMP0 with the next + // source data block and store the resulting en/decrypted data to DST. +.if \enc + _xor_mem_to_reg (SRC), TMP0, tmp=TMP1 + movdqu TMP0, (DST) +.else + movdqu (SRC), TMP1 + pxor TMP1, TMP0 + movdqu TMP0, (DST) +.endif + + // Update GHASH with the ciphertext block. +.if \enc + pshufb BSWAP_MASK, TMP0 + pxor TMP0, GHASH_ACC +.else + pshufb BSWAP_MASK, TMP1 + pxor TMP1, GHASH_ACC +.endif + _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0 + pxor GHASH_ACC, GHASH_ACC + + add $8, %eax + add $16, SRC + add $16, DST + sub $16, DATALEN + jge .Lcrypt_loop_1x\@ +.Lcrypt_loop_1x_done\@: + // Check whether there is a partial block at the end. + add $16, DATALEN + jz .Lghash_reduce\@ + + // Process a partial block of length 1 <= DATALEN <= 15. + + // Encrypt a counter block for the last time. + pshufb BSWAP_MASK, LE_CTR + pxor (KEY), LE_CTR + lea 16(KEY), %rsi +1: + aesenc (%rsi), LE_CTR + add $16, %rsi + cmp %rsi, RNDKEYLAST_PTR + jne 1b + aesenclast (RNDKEYLAST_PTR), LE_CTR + + // Load the lowest key power, H^1. + movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW + movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED + + // Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is + // in %rcx, but _load_partial_block needs DATALEN in %rcx instead. + // RNDKEYLAST_PTR is no longer needed, so reuse it for SRC. + mov SRC, RNDKEYLAST_PTR + mov DATALEN, %ecx + _load_partial_block RNDKEYLAST_PTR, TMP0, %rsi, %esi + + // XOR the keystream block that was just generated in LE_CTR with the + // source data block and store the resulting en/decrypted data to DST. + pxor TMP0, LE_CTR + mov DATALEN, %ecx + _store_partial_block LE_CTR, DST + + // If encrypting, zero-pad the final ciphertext block for GHASH. (If + // decrypting, this was already done by _load_partial_block.) 
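The zero-padding done next in the encrypt case leans on the .Lzeropad_mask constant defined at the top of the file: 16 bytes of 0xff followed by 16 zero bytes, so that a single unaligned 16-byte load at offset 16 - LEN yields exactly LEN bytes of 0xff. A small Python illustration of that indexing trick (sketch only):

    ZEROPAD_MASK = b'\xff' * 16 + b'\x00' * 16   # layout of .Lzeropad_mask

    def partial_block_mask(length: int) -> bytes:
        """16-byte mask of 'length' 0xff bytes followed by zeroes, obtained by
        reading 16 bytes at offset 16 - length, as the assembly does."""
        assert 1 <= length <= 16
        return ZEROPAD_MASK[16 - length:32 - length]

    assert partial_block_mask(3) == b'\xff' * 3 + b'\x00' * 13

The same constant doubles as the TAGLEN mask in the constant-time tag check of the decryption finalization function.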
+.if \enc + lea .Lzeropad_mask+16(%rip), %rax + sub DATALEN64, %rax + _vpand (%rax), LE_CTR, TMP0 +.endif + + // Update GHASH with the final ciphertext block. + pshufb BSWAP_MASK, TMP0 + pxor TMP0, GHASH_ACC + _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0 + +.Lghash_reduce\@: + // Finally, do the GHASH reduction. + _ghash_reduce LO, MI, HI, GHASH_ACC, TMP0 + +.Ldone\@: + // Store the updated GHASH accumulator back to memory. + movdqu GHASH_ACC, (GHASH_ACC_PTR) + + RET +.endm + +// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen); +// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key, +// const u32 le_ctr[4], const u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen, +// const u8 tag[16], int taglen); +// +// This macro generates one of the above two functions (with \enc selecting +// which one). Both functions finish computing the GCM authentication tag by +// updating GHASH with the lengths block and encrypting the GHASH accumulator. +// |total_aadlen| and |total_datalen| must be the total length of the additional +// authenticated data and the en/decrypted data in bytes, respectively. +// +// The encryption function then stores the full-length (16-byte) computed +// authentication tag to |ghash_acc|. The decryption function instead loads the +// expected authentication tag (the one that was transmitted) from the 16-byte +// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the +// computed tag in constant time, and returns true if and only if they match. +.macro _aes_gcm_final enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi + .set GHASH_ACC_PTR, %rdx + .set TOTAL_AADLEN, %rcx + .set TOTAL_DATALEN, %r8 + .set TAG, %r9 + .set TAGLEN, %r10d // Originally at 8(%rsp) + .set TAGLEN64, %r10 + + // Additional local variables. + // %rax and %xmm0-%xmm2 are used as temporary registers. + .set AESKEYLEN, %r11d + .set AESKEYLEN64, %r11 + .set BSWAP_MASK, %xmm3 + .set GHASH_ACC, %xmm4 + .set H_POW1, %xmm5 // H^1 + .set H_POW1_X64, %xmm6 // H^1 * x^64 + .set GFPOLY, %xmm7 + + movdqa .Lbswap_mask(%rip), BSWAP_MASK + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + + // Set up a counter block with 1 in the low 32-bit word. This is the + // counter that produces the ciphertext needed to encrypt the auth tag. + movdqu (LE_CTR_PTR), %xmm0 + mov $1, %eax + pinsrd $0, %eax, %xmm0 + + // Build the lengths block and XOR it into the GHASH accumulator. + movq TOTAL_DATALEN, GHASH_ACC + pinsrq $1, TOTAL_AADLEN, GHASH_ACC + psllq $3, GHASH_ACC // Bytes to bits + _xor_mem_to_reg (GHASH_ACC_PTR), GHASH_ACC, %xmm1 + + movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1 + movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64 + movq .Lgfpoly(%rip), GFPOLY + + // Make %rax point to the 6th from last AES round key. (Using signed + // byte offsets -7*16 through 6*16 decreases code size.) + lea (KEY,AESKEYLEN64,4), %rax + + // AES-encrypt the counter block and also multiply GHASH_ACC by H^1. + // Interleave the AES and GHASH instructions to improve performance. + pshufb BSWAP_MASK, %xmm0 + pxor (KEY), %xmm0 + cmp $24, AESKEYLEN + jl 128f // AES-128? + je 192f // AES-192? 
+ // AES-256 + aesenc -7*16(%rax), %xmm0 + aesenc -6*16(%rax), %xmm0 +192: + aesenc -5*16(%rax), %xmm0 + aesenc -4*16(%rax), %xmm0 +128: +.irp i, 0,1,2,3,4,5,6,7,8 + aesenc (\i-3)*16(%rax), %xmm0 + _ghash_mul_step \i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2 +.endr + aesenclast 6*16(%rax), %xmm0 + _ghash_mul_step 9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2 + + // Undo the byte reflection of the GHASH accumulator. + pshufb BSWAP_MASK, GHASH_ACC + + // Encrypt the GHASH accumulator. + pxor %xmm0, GHASH_ACC + +.if \enc + // Return the computed auth tag. + movdqu GHASH_ACC, (GHASH_ACC_PTR) +.else + .set ZEROPAD_MASK_PTR, TOTAL_AADLEN // Reusing TOTAL_AADLEN! + + // Verify the auth tag in constant time by XOR'ing the transmitted and + // computed auth tags together and using the ptest instruction to check + // whether the first TAGLEN bytes of the result are zero. + _xor_mem_to_reg (TAG), GHASH_ACC, tmp=%xmm0 + movl 8(%rsp), TAGLEN + lea .Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR + sub TAGLEN64, ZEROPAD_MASK_PTR + xor %eax, %eax + _test_mem (ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0 + sete %al +.endif + RET +.endm + +.set USE_AVX, 0 +SYM_FUNC_START(aes_gcm_precompute_aesni) + _aes_gcm_precompute +SYM_FUNC_END(aes_gcm_precompute_aesni) +SYM_FUNC_START(aes_gcm_aad_update_aesni) + _aes_gcm_aad_update +SYM_FUNC_END(aes_gcm_aad_update_aesni) +SYM_FUNC_START(aes_gcm_enc_update_aesni) + _aes_gcm_update 1 +SYM_FUNC_END(aes_gcm_enc_update_aesni) +SYM_FUNC_START(aes_gcm_dec_update_aesni) + _aes_gcm_update 0 +SYM_FUNC_END(aes_gcm_dec_update_aesni) +SYM_FUNC_START(aes_gcm_enc_final_aesni) + _aes_gcm_final 1 +SYM_FUNC_END(aes_gcm_enc_final_aesni) +SYM_FUNC_START(aes_gcm_dec_final_aesni) + _aes_gcm_final 0 +SYM_FUNC_END(aes_gcm_dec_final_aesni) + +.set USE_AVX, 1 +SYM_FUNC_START(aes_gcm_precompute_aesni_avx) + _aes_gcm_precompute +SYM_FUNC_END(aes_gcm_precompute_aesni_avx) +SYM_FUNC_START(aes_gcm_aad_update_aesni_avx) + _aes_gcm_aad_update +SYM_FUNC_END(aes_gcm_aad_update_aesni_avx) +SYM_FUNC_START(aes_gcm_enc_update_aesni_avx) + _aes_gcm_update 1 +SYM_FUNC_END(aes_gcm_enc_update_aesni_avx) +SYM_FUNC_START(aes_gcm_dec_update_aesni_avx) + _aes_gcm_update 0 +SYM_FUNC_END(aes_gcm_dec_update_aesni_avx) +SYM_FUNC_START(aes_gcm_enc_final_aesni_avx) + _aes_gcm_final 1 +SYM_FUNC_END(aes_gcm_enc_final_aesni_avx) +SYM_FUNC_START(aes_gcm_dec_final_aesni_avx) + _aes_gcm_final 0 +SYM_FUNC_END(aes_gcm_dec_final_aesni_avx) diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-avx10-x86_64.S new file mode 100644 index 000000000000..02ee11083d4f --- /dev/null +++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S @@ -0,0 +1,1199 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// VAES and VPCLMULQDQ optimized AES-GCM for x86_64 +// +// Copyright 2024 Google LLC +// +// Author: Eric Biggers <ebiggers@google.com> +// +//------------------------------------------------------------------------------ +// +// This file is dual-licensed, meaning that you can use it under your choice of +// either of the following two licenses: +// +// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +// or +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +//------------------------------------------------------------------------------ +// +// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that +// support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and +// either AVX512 or AVX10. Some of the functions, notably the encryption and +// decryption update functions which are the most performance-critical, are +// provided in two variants generated from a macro: one using 256-bit vectors +// (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512). The +// other, "shared" functions (vaes_avx10) use at most 256-bit vectors. +// +// The functions that use 512-bit vectors are intended for CPUs that support +// 512-bit vectors *and* where using them doesn't cause significant +// downclocking. They require the following CPU features: +// +// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512) +// +// The other functions require the following CPU features: +// +// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256) +// +// All functions use the "System V" ABI. The Windows ABI is not supported. +// +// Note that we use "avx10" in the names of the functions as a shorthand to +// really mean "AVX10 or a certain set of AVX512 features". Due to Intel's +// introduction of AVX512 and then its replacement by AVX10, there doesn't seem +// to be a simple way to name things that makes sense on all CPUs. +// +// Note that the macros that support both 256-bit and 512-bit vectors could +// fairly easily be changed to support 128-bit too. However, this would *not* +// be sufficient to allow the code to run on CPUs without AVX512 or AVX10, +// because the code heavily uses several features of these extensions other than +// the vector length: the increase in the number of SIMD registers from 16 to +// 32, masking support, and new instructions such as vpternlogd (which can do a +// three-argument XOR). These features are very useful for AES-GCM. 
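A side note on vpternlogd, mentioned above as one of the AVX512 / AVX10 features this code depends on: its 8-bit immediate is simply the truth table of the desired three-input boolean function, indexed by the three input bits (with the destination operand's bit in the high position). The 0x96 (three-way XOR) and 0x78 (dst ^= b & c) immediates used later in this file can be derived mechanically, as this illustrative Python snippet shows:

    def ternlog_imm(func) -> int:
        """Build a vpternlogd immediate: bit (a<<2 | b<<1 | c) of the
        immediate holds func(a, b, c)."""
        imm = 0
        for a in (0, 1):
            for b in (0, 1):
                for c in (0, 1):
                    imm |= func(a, b, c) << ((a << 2) | (b << 1) | c)
        return imm

    assert ternlog_imm(lambda a, b, c: a ^ b ^ c) == 0x96     # three-argument XOR
    assert ternlog_imm(lambda a, b, c: a ^ (b & c)) == 0x78   # masked-XOR form used in precompute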
+ +#include <linux/linkage.h> + +.section .rodata +.p2align 6 + + // A shuffle mask that reflects the bytes of 16-byte blocks +.Lbswap_mask: + .octa 0x000102030405060708090a0b0c0d0e0f + + // This is the GHASH reducing polynomial without its constant term, i.e. + // x^128 + x^7 + x^2 + x, represented using the backwards mapping + // between bits and polynomial coefficients. + // + // Alternatively, it can be interpreted as the naturally-ordered + // representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the + // "reversed" GHASH reducing polynomial without its x^128 term. +.Lgfpoly: + .octa 0xc2000000000000000000000000000001 + + // Same as above, but with the (1 << 64) bit set. +.Lgfpoly_and_internal_carrybit: + .octa 0xc2000000000000010000000000000001 + + // The below constants are used for incrementing the counter blocks. + // ctr_pattern points to the four 128-bit values [0, 1, 2, 3]. + // inc_2blocks and inc_4blocks point to the single 128-bit values 2 and + // 4. Note that the same '2' is reused in ctr_pattern and inc_2blocks. +.Lctr_pattern: + .octa 0 + .octa 1 +.Linc_2blocks: + .octa 2 + .octa 3 +.Linc_4blocks: + .octa 4 + +// Number of powers of the hash key stored in the key struct. The powers are +// stored from highest (H^NUM_H_POWERS) to lowest (H^1). +#define NUM_H_POWERS 16 + +// Offset to AES key length (in bytes) in the key struct +#define OFFSETOF_AESKEYLEN 480 + +// Offset to start of hash key powers array in the key struct +#define OFFSETOF_H_POWERS 512 + +// Offset to end of hash key powers array in the key struct. +// +// This is immediately followed by three zeroized padding blocks, which are +// included so that partial vectors can be handled more easily. E.g. if VL=64 +// and two blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most +// padding blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded. +#define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16)) + +.text + +// Set the vector length in bytes. This sets the VL variable and defines +// register aliases V0-V31 that map to the ymm or zmm registers. +.macro _set_veclen vl + .set VL, \vl +.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 +.if VL == 32 + .set V\i, %ymm\i +.elseif VL == 64 + .set V\i, %zmm\i +.else + .error "Unsupported vector length" +.endif +.endr +.endm + +// The _ghash_mul_step macro does one step of GHASH multiplication of the +// 128-bit lanes of \a by the corresponding 128-bit lanes of \b, storing the +// reduced products in \dst. \t0, \t1, and \t2 are temporary registers of the +// same size as \a and \b. To complete all steps, this must be invoked with \i=0 +// through \i=9. The division into steps allows users of this macro to +// optionally interleave the computation with other instructions. Users of this +// macro must preserve the parameter registers across steps. +// +// The multiplications are done in GHASH's representation of the finite field +// GF(2^128). Elements of GF(2^128) are represented as binary polynomials +// (i.e. polynomials whose coefficients are bits) modulo a reducing polynomial +// G. The GCM specification uses G = x^128 + x^7 + x^2 + x + 1. Addition is +// just XOR, while multiplication is more complex and has two parts: (a) do +// carryless multiplication of two 128-bit input polynomials to get a 256-bit +// intermediate product polynomial, and (b) reduce the intermediate product to +// 128 bits by adding multiples of G that cancel out terms in it.
(Adding +// multiples of G doesn't change which field element the polynomial represents.) +// +// Unfortunately, the GCM specification maps bits to/from polynomial +// coefficients backwards from the natural order. In each byte it specifies the +// highest bit to be the lowest order polynomial coefficient, *not* the highest! +// This makes it nontrivial to work with the GHASH polynomials. We could +// reflect the bits, but x86 doesn't have an instruction that does that. +// +// Instead, we operate on the values without bit-reflecting them. This *mostly* +// just works, since XOR and carryless multiplication are symmetric with respect +// to bit order, but it has some consequences. First, due to GHASH's byte +// order, by skipping bit reflection, *byte* reflection becomes necessary to +// give the polynomial terms a consistent order. E.g., considering an N-bit +// value interpreted using the G = x^128 + x^7 + x^2 + x + 1 convention, bits 0 +// through N-1 of the byte-reflected value represent the coefficients of x^(N-1) +// through x^0, whereas bits 0 through N-1 of the non-byte-reflected value +// represent x^7...x^0, x^15...x^8, ..., x^(N-1)...x^(N-8) which can't be worked +// with. Fortunately, x86's vpshufb instruction can do byte reflection. +// +// Second, forgoing the bit reflection causes an extra multiple of x (still +// using the G = x^128 + x^7 + x^2 + x + 1 convention) to be introduced by each +// multiplication. This is because an M-bit by N-bit carryless multiplication +// really produces a (M+N-1)-bit product, but in practice it's zero-extended to +// M+N bits. In the G = x^128 + x^7 + x^2 + x + 1 convention, which maps bits +// to polynomial coefficients backwards, this zero-extension actually changes +// the product by introducing an extra factor of x. Therefore, users of this +// macro must ensure that one of the inputs has an extra factor of x^-1, i.e. +// the multiplicative inverse of x, to cancel out the extra x. +// +// Third, the backwards coefficients convention is just confusing to work with, +// since it makes "low" and "high" in the polynomial math mean the opposite of +// their normal meaning in computer programming. This can be solved by using an +// alternative interpretation: the polynomial coefficients are understood to be +// in the natural order, and the multiplication is actually \a * \b * x^-128 mod +// x^128 + x^127 + x^126 + x^121 + 1. This doesn't change the inputs, outputs, +// or the implementation at all; it just changes the mathematical interpretation +// of what each instruction is doing. Starting from here, we'll use this +// alternative interpretation, as it's easier to understand the code that way. +// +// Moving onto the implementation, the vpclmulqdq instruction does 64 x 64 => +// 128-bit carryless multiplication, so we break the 128 x 128 multiplication +// into parts as follows (the _L and _H suffixes denote low and high 64 bits): +// +// LO = a_L * b_L +// MI = (a_L * b_H) + (a_H * b_L) +// HI = a_H * b_H +// +// The 256-bit product is x^128*HI + x^64*MI + LO. LO, MI, and HI are 128-bit. +// Note that MI "overlaps" with LO and HI. We don't consolidate MI into LO and +// HI right away, since the way the reduction works makes that unnecessary. +// +// For the reduction, we cancel out the low 128 bits by adding multiples of G = +// x^128 + x^127 + x^126 + x^121 + 1. This is done by two iterations, each of +// which cancels out the next lowest 64 bits. Consider a value x^64*A + B, +// where A and B are 128-bit. 
Adding B_L*G to that value gives: +// +// x^64*A + B + B_L*G +// = x^64*A + x^64*B_H + B_L + B_L*(x^128 + x^127 + x^126 + x^121 + 1) +// = x^64*A + x^64*B_H + B_L + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L +// = x^64*A + x^64*B_H + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L + B_L +// = x^64*(A + B_H + x^64*B_L + B_L*(x^63 + x^62 + x^57)) +// +// So: if we sum A, B with its halves swapped, and the low half of B times x^63 +// + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the +// original value x^64*A + B. I.e., the low 64 bits got canceled out. +// +// We just need to apply this twice: first to fold LO into MI, and second to +// fold the updated MI into HI. +// +// The needed three-argument XORs are done using the vpternlogd instruction with +// immediate 0x96, since this is faster than two vpxord instructions. +// +// A potential optimization, assuming that b is fixed per-key (if a is fixed +// per-key it would work the other way around), is to use one iteration of the +// reduction described above to precompute a value c such that x^64*c = b mod G, +// and then multiply a_L by c (and implicitly by x^64) instead of by b: +// +// MI = (a_L * c_L) + (a_H * b_L) +// HI = (a_L * c_H) + (a_H * b_H) +// +// This would eliminate the LO part of the intermediate product, which would +// eliminate the need to fold LO into MI. This would save two instructions, +// including a vpclmulqdq. However, we currently don't use this optimization +// because it would require twice as many per-key precomputed values. +// +// Using Karatsuba multiplication instead of "schoolbook" multiplication +// similarly would save a vpclmulqdq but does not seem to be worth it. +.macro _ghash_mul_step i, a, b, dst, gfpoly, t0, t1, t2 +.if \i == 0 + vpclmulqdq $0x00, \a, \b, \t0 // LO = a_L * b_L + vpclmulqdq $0x01, \a, \b, \t1 // MI_0 = a_L * b_H +.elseif \i == 1 + vpclmulqdq $0x10, \a, \b, \t2 // MI_1 = a_H * b_L +.elseif \i == 2 + vpxord \t2, \t1, \t1 // MI = MI_0 + MI_1 +.elseif \i == 3 + vpclmulqdq $0x01, \t0, \gfpoly, \t2 // LO_L*(x^63 + x^62 + x^57) +.elseif \i == 4 + vpshufd $0x4e, \t0, \t0 // Swap halves of LO +.elseif \i == 5 + vpternlogd $0x96, \t2, \t0, \t1 // Fold LO into MI +.elseif \i == 6 + vpclmulqdq $0x11, \a, \b, \dst // HI = a_H * b_H +.elseif \i == 7 + vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) +.elseif \i == 8 + vpshufd $0x4e, \t1, \t1 // Swap halves of MI +.elseif \i == 9 + vpternlogd $0x96, \t0, \t1, \dst // Fold MI into HI +.endif +.endm + +// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store +// the reduced products in \dst. See _ghash_mul_step for full explanation. +.macro _ghash_mul a, b, dst, gfpoly, t0, t1, t2 +.irp i, 0,1,2,3,4,5,6,7,8,9 + _ghash_mul_step \i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2 +.endr +.endm + +// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the +// *unreduced* products to \lo, \mi, and \hi. +.macro _ghash_mul_noreduce a, b, lo, mi, hi, t0, t1, t2, t3 + vpclmulqdq $0x00, \a, \b, \t0 // a_L * b_L + vpclmulqdq $0x01, \a, \b, \t1 // a_L * b_H + vpclmulqdq $0x10, \a, \b, \t2 // a_H * b_L + vpclmulqdq $0x11, \a, \b, \t3 // a_H * b_H + vpxord \t0, \lo, \lo + vpternlogd $0x96, \t2, \t1, \mi + vpxord \t3, \hi, \hi +.endm + +// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit +// reduced products in \hi. See _ghash_mul_step for explanation of reduction. 
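The 64-bits-at-a-time folding identity derived above is easy to check numerically. The following standalone Python sketch (illustrative, not kernel code) verifies that one folding step C = A + swap(B) + B_L*(x^63 + x^62 + x^57) leaves x^64*C congruent to x^64*A + B modulo G = x^128 + x^127 + x^126 + x^121 + 1:

    import random

    def clmul(a: int, b: int) -> int:
        r = 0
        while b:
            if b & 1:
                r ^= a
            a <<= 1
            b >>= 1
        return r

    def poly_mod(a: int, m: int) -> int:
        while a.bit_length() >= m.bit_length():
            a ^= m << (a.bit_length() - m.bit_length())
        return a

    G = (1 << 128) | (0xc2 << 120) | 1   # x^128 + x^127 + x^126 + x^121 + 1
    FOLD = 0xc2 << 56                    # x^63 + x^62 + x^57
    MASK64 = (1 << 64) - 1

    for _ in range(100):
        A, B = random.getrandbits(128), random.getrandbits(128)
        B_lo, B_hi = B & MASK64, B >> 64
        C = A ^ ((B_lo << 64) | B_hi) ^ clmul(B_lo, FOLD)   # one folding step
        # Only the multiple B_L*G was added, so x^64*C == x^64*A + B (mod G).
        assert poly_mod((C << 64) ^ (A << 64) ^ B, G) == 0

The _ghash_reduce macro below applies exactly this step twice: once to fold LO into MI, and once to fold the updated MI into HI.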
+.macro _ghash_reduce lo, mi, hi, gfpoly, t0 + vpclmulqdq $0x01, \lo, \gfpoly, \t0 + vpshufd $0x4e, \lo, \lo + vpternlogd $0x96, \t0, \lo, \mi + vpclmulqdq $0x01, \mi, \gfpoly, \t0 + vpshufd $0x4e, \mi, \mi + vpternlogd $0x96, \t0, \mi, \hi +.endm + +// void aes_gcm_precompute_##suffix(struct aes_gcm_key_avx10 *key); +// +// Given the expanded AES key |key->aes_key|, this function derives the GHASH +// subkey and initializes |key->ghash_key_powers| with powers of it. +// +// The number of key powers initialized is NUM_H_POWERS, and they are stored in +// the order H^NUM_H_POWERS to H^1. The zeroized padding blocks after the key +// powers themselves are also initialized. +// +// This macro supports both VL=32 and VL=64. _set_veclen must have been invoked +// with the desired length. In the VL=32 case, the function computes twice as +// many key powers than are actually used by the VL=32 GCM update functions. +// This is done to keep the key format the same regardless of vector length. +.macro _aes_gcm_precompute + + // Function arguments + .set KEY, %rdi + + // Additional local variables. V0-V2 and %rax are used as temporaries. + .set POWERS_PTR, %rsi + .set RNDKEYLAST_PTR, %rdx + .set H_CUR, V3 + .set H_CUR_YMM, %ymm3 + .set H_CUR_XMM, %xmm3 + .set H_INC, V4 + .set H_INC_YMM, %ymm4 + .set H_INC_XMM, %xmm4 + .set GFPOLY, V5 + .set GFPOLY_YMM, %ymm5 + .set GFPOLY_XMM, %xmm5 + + // Get pointer to lowest set of key powers (located at end of array). + lea OFFSETOFEND_H_POWERS-VL(KEY), POWERS_PTR + + // Encrypt an all-zeroes block to get the raw hash subkey. + movl OFFSETOF_AESKEYLEN(KEY), %eax + lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR + vmovdqu (KEY), %xmm0 // Zero-th round key XOR all-zeroes block + add $16, KEY +1: + vaesenc (KEY), %xmm0, %xmm0 + add $16, KEY + cmp KEY, RNDKEYLAST_PTR + jne 1b + vaesenclast (RNDKEYLAST_PTR), %xmm0, %xmm0 + + // Reflect the bytes of the raw hash subkey. + vpshufb .Lbswap_mask(%rip), %xmm0, H_CUR_XMM + + // Zeroize the padding blocks. + vpxor %xmm0, %xmm0, %xmm0 + vmovdqu %ymm0, VL(POWERS_PTR) + vmovdqu %xmm0, VL+2*16(POWERS_PTR) + + // Finish preprocessing the first key power, H^1. Since this GHASH + // implementation operates directly on values with the backwards bit + // order specified by the GCM standard, it's necessary to preprocess the + // raw key as follows. First, reflect its bytes. Second, multiply it + // by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards + // interpretation of polynomial coefficients), which can also be + // interpreted as multiplication by x mod x^128 + x^127 + x^126 + x^121 + // + 1 using the alternative, natural interpretation of polynomial + // coefficients. For details, see the comment above _ghash_mul_step. + // + // Either way, for the multiplication the concrete operation performed + // is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2 + // << 120) | 1 if a 1 bit was carried out. However, there's no 128-bit + // wide shift instruction, so instead double each of the two 64-bit + // halves and incorporate the internal carry bit into the value XOR'd. + vpshufd $0xd3, H_CUR_XMM, %xmm0 + vpsrad $31, %xmm0, %xmm0 + vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM + // H_CUR_XMM ^= xmm0 & gfpoly_and_internal_carrybit + vpternlogd $0x78, .Lgfpoly_and_internal_carrybit(%rip), %xmm0, H_CUR_XMM + + // Load the gfpoly constant. + vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY + + // Square H^1 to get H^2. 
+ // + // Note that as with H^1, all higher key powers also need an extra + // factor of x^-1 (or x using the natural interpretation). Nothing + // special needs to be done to make this happen, though: H^1 * H^1 would + // end up with two factors of x^-1, but the multiplication consumes one. + // So the product H^2 ends up with the desired one factor of x^-1. + _ghash_mul H_CUR_XMM, H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, \ + %xmm0, %xmm1, %xmm2 + + // Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2]. + vinserti128 $1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM + vinserti128 $1, H_INC_XMM, H_INC_YMM, H_INC_YMM + +.if VL == 64 + // Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4]. + _ghash_mul H_INC_YMM, H_CUR_YMM, H_INC_YMM, GFPOLY_YMM, \ + %ymm0, %ymm1, %ymm2 + vinserti64x4 $1, H_CUR_YMM, H_INC, H_CUR + vshufi64x2 $0, H_INC, H_INC, H_INC +.endif + + // Store the lowest set of key powers. + vmovdqu8 H_CUR, (POWERS_PTR) + + // Compute and store the remaining key powers. With VL=32, repeatedly + // multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)]. + // With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by + // [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)]. + mov $(NUM_H_POWERS*16/VL) - 1, %eax +.Lprecompute_next\@: + sub $VL, POWERS_PTR + _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, V0, V1, V2 + vmovdqu8 H_CUR, (POWERS_PTR) + dec %eax + jnz .Lprecompute_next\@ + + vzeroupper // This is needed after using ymm or zmm registers. + RET +.endm + +// XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store +// the result in \dst_xmm. This implicitly zeroizes the other lanes of dst. +.macro _horizontal_xor src, src_xmm, dst_xmm, t0_xmm, t1_xmm, t2_xmm + vextracti32x4 $1, \src, \t0_xmm +.if VL == 32 + vpxord \t0_xmm, \src_xmm, \dst_xmm +.elseif VL == 64 + vextracti32x4 $2, \src, \t1_xmm + vextracti32x4 $3, \src, \t2_xmm + vpxord \t0_xmm, \src_xmm, \dst_xmm + vpternlogd $0x96, \t1_xmm, \t2_xmm, \dst_xmm +.else + .error "Unsupported vector length" +.endif +.endm + +// Do one step of the GHASH update of the data blocks given in the vector +// registers GHASHDATA[0-3]. \i specifies the step to do, 0 through 9. The +// division into steps allows users of this macro to optionally interleave the +// computation with other instructions. This macro uses the vector register +// GHASH_ACC as input/output; GHASHDATA[0-3] as inputs that are clobbered; +// H_POW[4-1], GFPOLY, and BSWAP_MASK as inputs that aren't clobbered; and +// GHASHTMP[0-2] as temporaries. This macro handles the byte-reflection of the +// data blocks. The parameter registers must be preserved across steps. +// +// The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) + +// H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the +// operations are vectorized operations on vectors of 16-byte blocks. E.g., +// with VL=32 there are 2 blocks per vector and the vectorized terms correspond +// to the following non-vectorized terms: +// +// H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0) +// H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3 +// H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5 +// H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7 +// +// With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15. +// +// More concretely, this code does: +// - Do vectorized "schoolbook" multiplications to compute the intermediate +// 256-bit product of each block and its corresponding hash key power. 
+// There are 4*VL/16 of these intermediate products. +// - Sum (XOR) the intermediate 256-bit products across vectors. This leaves +// VL/16 256-bit intermediate values. +// - Do a vectorized reduction of these 256-bit intermediate values to +// 128-bits each. This leaves VL/16 128-bit intermediate values. +// - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM. +// +// See _ghash_mul_step for the full explanation of the operations performed for +// each individual finite field multiplication and reduction. +.macro _ghash_step_4x i +.if \i == 0 + vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0 + vpxord GHASH_ACC, GHASHDATA0, GHASHDATA0 + vpshufb BSWAP_MASK, GHASHDATA1, GHASHDATA1 + vpshufb BSWAP_MASK, GHASHDATA2, GHASHDATA2 +.elseif \i == 1 + vpshufb BSWAP_MASK, GHASHDATA3, GHASHDATA3 + vpclmulqdq $0x00, H_POW4, GHASHDATA0, GHASH_ACC // LO_0 + vpclmulqdq $0x00, H_POW3, GHASHDATA1, GHASHTMP0 // LO_1 + vpclmulqdq $0x00, H_POW2, GHASHDATA2, GHASHTMP1 // LO_2 +.elseif \i == 2 + vpxord GHASHTMP0, GHASH_ACC, GHASH_ACC // sum(LO_{1,0}) + vpclmulqdq $0x00, H_POW1, GHASHDATA3, GHASHTMP2 // LO_3 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASH_ACC // LO = sum(LO_{3,2,1,0}) + vpclmulqdq $0x01, H_POW4, GHASHDATA0, GHASHTMP0 // MI_0 +.elseif \i == 3 + vpclmulqdq $0x01, H_POW3, GHASHDATA1, GHASHTMP1 // MI_1 + vpclmulqdq $0x01, H_POW2, GHASHDATA2, GHASHTMP2 // MI_2 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{2,1,0}) + vpclmulqdq $0x01, H_POW1, GHASHDATA3, GHASHTMP1 // MI_3 +.elseif \i == 4 + vpclmulqdq $0x10, H_POW4, GHASHDATA0, GHASHTMP2 // MI_4 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{4,3,2,1,0}) + vpclmulqdq $0x10, H_POW3, GHASHDATA1, GHASHTMP1 // MI_5 + vpclmulqdq $0x10, H_POW2, GHASHDATA2, GHASHTMP2 // MI_6 +.elseif \i == 5 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{6,5,4,3,2,1,0}) + vpclmulqdq $0x01, GHASH_ACC, GFPOLY, GHASHTMP2 // LO_L*(x^63 + x^62 + x^57) + vpclmulqdq $0x10, H_POW1, GHASHDATA3, GHASHTMP1 // MI_7 + vpxord GHASHTMP1, GHASHTMP0, GHASHTMP0 // MI = sum(MI_{7,6,5,4,3,2,1,0}) +.elseif \i == 6 + vpshufd $0x4e, GHASH_ACC, GHASH_ACC // Swap halves of LO + vpclmulqdq $0x11, H_POW4, GHASHDATA0, GHASHDATA0 // HI_0 + vpclmulqdq $0x11, H_POW3, GHASHDATA1, GHASHDATA1 // HI_1 + vpclmulqdq $0x11, H_POW2, GHASHDATA2, GHASHDATA2 // HI_2 +.elseif \i == 7 + vpternlogd $0x96, GHASHTMP2, GHASH_ACC, GHASHTMP0 // Fold LO into MI + vpclmulqdq $0x11, H_POW1, GHASHDATA3, GHASHDATA3 // HI_3 + vpternlogd $0x96, GHASHDATA2, GHASHDATA1, GHASHDATA0 // sum(HI_{2,1,0}) + vpclmulqdq $0x01, GHASHTMP0, GFPOLY, GHASHTMP1 // MI_L*(x^63 + x^62 + x^57) +.elseif \i == 8 + vpxord GHASHDATA3, GHASHDATA0, GHASH_ACC // HI = sum(HI_{3,2,1,0}) + vpshufd $0x4e, GHASHTMP0, GHASHTMP0 // Swap halves of MI + vpternlogd $0x96, GHASHTMP1, GHASHTMP0, GHASH_ACC // Fold MI into HI +.elseif \i == 9 + _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \ + GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM +.endif +.endm + +// Do one non-last round of AES encryption on the counter blocks in V0-V3 using +// the round key that has been broadcast to all 128-bit lanes of \round_key. +.macro _vaesenc_4x round_key + vaesenc \round_key, V0, V0 + vaesenc \round_key, V1, V1 + vaesenc \round_key, V2, V2 + vaesenc \round_key, V3, V3 +.endm + +// Start the AES encryption of four vectors of counter blocks. +.macro _ctr_begin_4x + + // Increment LE_CTR four times to generate four vectors of little-endian + // counter blocks, swap each to big-endian, and store them in V0-V3. 
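In scalar terms, the counter handling being set up here reduces to: take the little-endian counter words, increment only the low 32-bit word (per the update functions' contract), and byte-reflect the 16-byte block before feeding it to AES. A hypothetical Python sketch of that net effect, assuming le_ctr is the u32[4] array passed to the update functions:

    def counter_blocks(le_ctr, count):
        """Big-endian CTR blocks derived from the little-endian counter words."""
        blocks = []
        for i in range(count):
            words = list(le_ctr)
            words[0] = (words[0] + i) & 0xffffffff   # only the low 32-bit word is incremented
            le_block = b''.join(w.to_bytes(4, 'little') for w in words)
            blocks.append(le_block[::-1])            # what the per-block vpshufb BSWAP_MASK does
        return blocks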
+ vpshufb BSWAP_MASK, LE_CTR, V0 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + vpshufb BSWAP_MASK, LE_CTR, V1 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + vpshufb BSWAP_MASK, LE_CTR, V2 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + vpshufb BSWAP_MASK, LE_CTR, V3 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + + // AES "round zero": XOR in the zero-th round key. + vpxord RNDKEY0, V0, V0 + vpxord RNDKEY0, V1, V1 + vpxord RNDKEY0, V2, V2 + vpxord RNDKEY0, V3, V3 +.endm + +// Do the last AES round for four vectors of counter blocks V0-V3, XOR source +// data with the resulting keystream, and write the result to DST and +// GHASHDATA[0-3]. (Implementation differs slightly, but has the same effect.) +.macro _aesenclast_and_xor_4x + // XOR the source data with the last round key, saving the result in + // GHASHDATA[0-3]. This reduces latency by taking advantage of the + // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). + vpxord 0*VL(SRC), RNDKEYLAST, GHASHDATA0 + vpxord 1*VL(SRC), RNDKEYLAST, GHASHDATA1 + vpxord 2*VL(SRC), RNDKEYLAST, GHASHDATA2 + vpxord 3*VL(SRC), RNDKEYLAST, GHASHDATA3 + + // Do the last AES round. This handles the XOR with the source data + // too, as per the optimization described above. + vaesenclast GHASHDATA0, V0, GHASHDATA0 + vaesenclast GHASHDATA1, V1, GHASHDATA1 + vaesenclast GHASHDATA2, V2, GHASHDATA2 + vaesenclast GHASHDATA3, V3, GHASHDATA3 + + // Store the en/decrypted data to DST. + vmovdqu8 GHASHDATA0, 0*VL(DST) + vmovdqu8 GHASHDATA1, 1*VL(DST) + vmovdqu8 GHASHDATA2, 2*VL(DST) + vmovdqu8 GHASHDATA3, 3*VL(DST) +.endm + +// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// const u8 *src, u8 *dst, int datalen); +// +// This macro generates a GCM encryption or decryption update function with the +// above prototype (with \enc selecting which one). This macro supports both +// VL=32 and VL=64. _set_veclen must have been invoked with the desired length. +// +// This function computes the next portion of the CTR keystream, XOR's it with +// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted +// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the +// next |datalen| ciphertext bytes. +// +// |datalen| must be a multiple of 16, except on the last call where it can be +// any length. The caller must do any buffering needed to ensure this. Both +// in-place and out-of-place en/decryption are supported. +// +// |le_ctr| must give the current counter in little-endian format. For a new +// message, the low word of the counter must be 2. This function loads the +// counter from |le_ctr| and increments the loaded counter as needed, but it +// does *not* store the updated counter back to |le_ctr|. The caller must +// update |le_ctr| if any more data segments follow. Internally, only the low +// 32-bit word of the counter is incremented, following the GCM standard. +.macro _aes_gcm_update enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi + .set GHASH_ACC_PTR, %rdx + .set SRC, %rcx + .set DST, %r8 + .set DATALEN, %r9d + .set DATALEN64, %r9 // Zero-extend DATALEN before using! + + // Additional local variables + + // %rax and %k1 are used as temporary registers. LE_CTR_PTR is also + // available as a temporary register after the counter is loaded. 
+ + // AES key length in bytes + .set AESKEYLEN, %r10d + .set AESKEYLEN64, %r10 + + // Pointer to the last AES round key for the chosen AES variant + .set RNDKEYLAST_PTR, %r11 + + // In the main loop, V0-V3 are used as AES input and output. Elsewhere + // they are used as temporary registers. + + // GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data. + .set GHASHDATA0, V4 + .set GHASHDATA0_XMM, %xmm4 + .set GHASHDATA1, V5 + .set GHASHDATA1_XMM, %xmm5 + .set GHASHDATA2, V6 + .set GHASHDATA2_XMM, %xmm6 + .set GHASHDATA3, V7 + + // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values + // using vpshufb, copied to all 128-bit lanes. + .set BSWAP_MASK, V8 + + // RNDKEY temporarily holds the next AES round key. + .set RNDKEY, V9 + + // GHASH_ACC is the accumulator variable for GHASH. When fully reduced, + // only the lowest 128-bit lane can be nonzero. When not fully reduced, + // more than one lane may be used, and they need to be XOR'd together. + .set GHASH_ACC, V10 + .set GHASH_ACC_XMM, %xmm10 + + // LE_CTR_INC is the vector of 32-bit words that need to be added to a + // vector of little-endian counter blocks to advance it forwards. + .set LE_CTR_INC, V11 + + // LE_CTR contains the next set of little-endian counter blocks. + .set LE_CTR, V12 + + // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys, + // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key, + // RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last. + .set RNDKEY0, V13 + .set RNDKEYLAST, V14 + .set RNDKEY_M9, V15 + .set RNDKEY_M8, V16 + .set RNDKEY_M7, V17 + .set RNDKEY_M6, V18 + .set RNDKEY_M5, V19 + .set RNDKEY_M4, V20 + .set RNDKEY_M3, V21 + .set RNDKEY_M2, V22 + .set RNDKEY_M1, V23 + + // GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These + // cannot coincide with anything used for AES encryption, since for + // performance reasons GHASH and AES encryption are interleaved. + .set GHASHTMP0, V24 + .set GHASHTMP1, V25 + .set GHASHTMP2, V26 + + // H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1. The + // descending numbering reflects the order of the key powers. + .set H_POW4, V27 + .set H_POW3, V28 + .set H_POW2, V29 + .set H_POW1, V30 + + // GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes. + .set GFPOLY, V31 + + // Load some constants. + vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK + vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY + + // Load the GHASH accumulator and the starting counter. + vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM + vbroadcasti32x4 (LE_CTR_PTR), LE_CTR + + // Load the AES key length in bytes. + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + + // Make RNDKEYLAST_PTR point to the last AES round key. This is the + // round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256 + // respectively. Then load the zero-th and last round keys. + lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR + vbroadcasti32x4 (KEY), RNDKEY0 + vbroadcasti32x4 (RNDKEYLAST_PTR), RNDKEYLAST + + // Finish initializing LE_CTR by adding [0, 1, ...] to its low words. + vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR + + // Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes. +.if VL == 32 + vbroadcasti32x4 .Linc_2blocks(%rip), LE_CTR_INC +.elseif VL == 64 + vbroadcasti32x4 .Linc_4blocks(%rip), LE_CTR_INC +.else + .error "Unsupported vector length" +.endif + + // If there are at least 4*VL bytes of data, then continue into the loop + // that processes 4*VL bytes of data at a time. Otherwise skip it. 
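+	//
+	// ('add $-4*VL' is used below rather than 'sub $4*VL' because when
+	// VL=32 the immediate -128 fits in a sign-extended 8-bit field while
+	// +128 does not, giving a shorter instruction encoding.)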
+ // + // Pre-subtracting 4*VL from DATALEN saves an instruction from the main + // loop and also ensures that at least one write always occurs to + // DATALEN, zero-extending it and allowing DATALEN64 to be used later. + add $-4*VL, DATALEN // shorter than 'sub 4*VL' when VL=32 + jl .Lcrypt_loop_4x_done\@ + + // Load powers of the hash key. + vmovdqu8 OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4 + vmovdqu8 OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3 + vmovdqu8 OFFSETOFEND_H_POWERS-2*VL(KEY), H_POW2 + vmovdqu8 OFFSETOFEND_H_POWERS-1*VL(KEY), H_POW1 + + // Main loop: en/decrypt and hash 4 vectors at a time. + // + // When possible, interleave the AES encryption of the counter blocks + // with the GHASH update of the ciphertext blocks. This improves + // performance on many CPUs because the execution ports used by the VAES + // instructions often differ from those used by vpclmulqdq and other + // instructions used in GHASH. For example, many Intel CPUs dispatch + // vaesenc to ports 0 and 1 and vpclmulqdq to port 5. + // + // The interleaving is easiest to do during decryption, since during + // decryption the ciphertext blocks are immediately available. For + // encryption, instead encrypt the first set of blocks, then hash those + // blocks while encrypting the next set of blocks, repeat that as + // needed, and finally hash the last set of blocks. + +.if \enc + // Encrypt the first 4 vectors of plaintext blocks. Leave the resulting + // ciphertext in GHASHDATA[0-3] for GHASH. + _ctr_begin_4x + lea 16(KEY), %rax +1: + vbroadcasti32x4 (%rax), RNDKEY + _vaesenc_4x RNDKEY + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne 1b + _aesenclast_and_xor_4x + sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 + sub $-4*VL, DST + add $-4*VL, DATALEN + jl .Lghash_last_ciphertext_4x\@ +.endif + + // Cache as many additional AES round keys as possible. +.irp i, 9,8,7,6,5,4,3,2,1 + vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY_M\i +.endr + +.Lcrypt_loop_4x\@: + + // If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. If + // encrypting, GHASHDATA[0-3] already contain the previous ciphertext. +.if !\enc + vmovdqu8 0*VL(SRC), GHASHDATA0 + vmovdqu8 1*VL(SRC), GHASHDATA1 + vmovdqu8 2*VL(SRC), GHASHDATA2 + vmovdqu8 3*VL(SRC), GHASHDATA3 +.endif + + // Start the AES encryption of the counter blocks. + _ctr_begin_4x + cmp $24, AESKEYLEN + jl 128f // AES-128? + je 192f // AES-192? + // AES-256 + vbroadcasti32x4 -13*16(RNDKEYLAST_PTR), RNDKEY + _vaesenc_4x RNDKEY + vbroadcasti32x4 -12*16(RNDKEYLAST_PTR), RNDKEY + _vaesenc_4x RNDKEY +192: + vbroadcasti32x4 -11*16(RNDKEYLAST_PTR), RNDKEY + _vaesenc_4x RNDKEY + vbroadcasti32x4 -10*16(RNDKEYLAST_PTR), RNDKEY + _vaesenc_4x RNDKEY +128: + + // Finish the AES encryption of the counter blocks in V0-V3, interleaved + // with the GHASH update of the ciphertext blocks in GHASHDATA[0-3]. +.irp i, 9,8,7,6,5,4,3,2,1 + _ghash_step_4x (9 - \i) + _vaesenc_4x RNDKEY_M\i +.endr + _ghash_step_4x 9 + _aesenclast_and_xor_4x + sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 + sub $-4*VL, DST + add $-4*VL, DATALEN + jge .Lcrypt_loop_4x\@ + +.if \enc +.Lghash_last_ciphertext_4x\@: + // Update GHASH with the last set of ciphertext blocks. +.irp i, 0,1,2,3,4,5,6,7,8,9 + _ghash_step_4x \i +.endr +.endif + +.Lcrypt_loop_4x_done\@: + + // Undo the extra subtraction by 4*VL and check whether data remains. + sub $-4*VL, DATALEN // shorter than 'add 4*VL' when VL=32 + jz .Ldone\@ + + // The data length isn't a multiple of 4*VL. 
Process the remaining data
+ // of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time.
+ // Going one vector at a time may seem inefficient compared to having
+ // separate code paths for each possible number of vectors remaining.
+ // However, using a loop keeps the code size down, and it performs
+ // surprisingly well; modern CPUs will start executing the next iteration
+ // before the previous one finishes and also predict the number of loop
+ // iterations. For a similar reason, we roll up the AES rounds.
+ //
+ // On the last iteration, the remaining length may be less than VL.
+ // Handle this using masking.
+ //
+ // Since there are enough key powers available for all remaining data,
+ // there is no need to do a GHASH reduction after each iteration.
+ // Instead, multiply each remaining block by its own key power, and only
+ // do a GHASH reduction at the very end.
+
+ // Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
+ // is the number of blocks that remain.
+ .set POWERS_PTR, LE_CTR_PTR // LE_CTR_PTR is free to be reused.
+ mov DATALEN, %eax
+ neg %rax
+ and $~15, %rax // -round_up(DATALEN, 16)
+ lea OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR
+
+ // Start collecting the unreduced GHASH intermediate value LO, MI, HI.
+ .set LO, GHASHDATA0
+ .set LO_XMM, GHASHDATA0_XMM
+ .set MI, GHASHDATA1
+ .set MI_XMM, GHASHDATA1_XMM
+ .set HI, GHASHDATA2
+ .set HI_XMM, GHASHDATA2_XMM
+ vpxor LO_XMM, LO_XMM, LO_XMM
+ vpxor MI_XMM, MI_XMM, MI_XMM
+ vpxor HI_XMM, HI_XMM, HI_XMM
+
+.Lcrypt_loop_1x\@:
+
+ // Select the appropriate mask for this iteration: all 1's if
+ // DATALEN >= VL, otherwise DATALEN 1's. Do this branchlessly using the
+ // bzhi instruction from BMI2. (This relies on DATALEN <= 255.)
+.if VL < 64
+ mov $-1, %eax
+ bzhi DATALEN, %eax, %eax
+ kmovd %eax, %k1
+.else
+ mov $-1, %rax
+ bzhi DATALEN64, %rax, %rax
+ kmovq %rax, %k1
+.endif
+
+ // Encrypt a vector of counter blocks. This does not need to be masked.
+ vpshufb BSWAP_MASK, LE_CTR, V0
+ vpaddd LE_CTR_INC, LE_CTR, LE_CTR
+ vpxord RNDKEY0, V0, V0
+ lea 16(KEY), %rax
+1:
+ vbroadcasti32x4 (%rax), RNDKEY
+ vaesenc RNDKEY, V0, V0
+ add $16, %rax
+ cmp %rax, RNDKEYLAST_PTR
+ jne 1b
+ vaesenclast RNDKEYLAST, V0, V0
+
+ // XOR the data with the appropriate number of keystream bytes.
+ vmovdqu8 (SRC), V1{%k1}{z}
+ vpxord V1, V0, V0
+ vmovdqu8 V0, (DST){%k1}
+
+ // Update GHASH with the ciphertext block(s), without reducing.
+ //
+ // In the case of DATALEN < VL, the ciphertext is zero-padded to VL.
+ // (If decrypting, it's done by the above masked load. If encrypting,
+ // it's done by the below masked register-to-register move.) Note that
+ // if DATALEN <= VL - 16, there will be additional padding beyond the
+ // padding of the last block specified by GHASH itself; i.e., there may
+ // be whole block(s) that get processed by the GHASH multiplication and
+ // reduction instructions but should not actually be included in the
+ // GHASH. However, any such blocks are all-zeroes, and the values that
+ // they're multiplied with are also all-zeroes. Therefore they just add
+ // 0 * 0 = 0 to the final GHASH result, which makes no difference.
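+ //
+ // For example, with VL=64 and DATALEN=20, two blocks remain (one full
+ // and one partial): bytes 20 through 63 of the ciphertext vector are
+ // zero, so the only potentially nonzero contributions to LO, MI, and HI
+ // come from lane 0 (the full block times H^2) and lane 1 (the
+ // zero-padded partial block times H^1).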
+ vmovdqu8 (POWERS_PTR), H_POW1 +.if \enc + vmovdqu8 V0, V1{%k1}{z} +.endif + vpshufb BSWAP_MASK, V1, V0 + vpxord GHASH_ACC, V0, V0 + _ghash_mul_noreduce H_POW1, V0, LO, MI, HI, GHASHDATA3, V1, V2, V3 + vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM + + add $VL, POWERS_PTR + add $VL, SRC + add $VL, DST + sub $VL, DATALEN + jg .Lcrypt_loop_1x\@ + + // Finally, do the GHASH reduction. + _ghash_reduce LO, MI, HI, GFPOLY, V0 + _horizontal_xor HI, HI_XMM, GHASH_ACC_XMM, %xmm0, %xmm1, %xmm2 + +.Ldone\@: + // Store the updated GHASH accumulator back to memory. + vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) + + vzeroupper // This is needed after using ymm or zmm registers. + RET +.endm + +// void aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen); +// bool aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, +// const u32 le_ctr[4], +// const u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen, +// const u8 tag[16], int taglen); +// +// This macro generates one of the above two functions (with \enc selecting +// which one). Both functions finish computing the GCM authentication tag by +// updating GHASH with the lengths block and encrypting the GHASH accumulator. +// |total_aadlen| and |total_datalen| must be the total length of the additional +// authenticated data and the en/decrypted data in bytes, respectively. +// +// The encryption function then stores the full-length (16-byte) computed +// authentication tag to |ghash_acc|. The decryption function instead loads the +// expected authentication tag (the one that was transmitted) from the 16-byte +// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the +// computed tag in constant time, and returns true if and only if they match. +.macro _aes_gcm_final enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi + .set GHASH_ACC_PTR, %rdx + .set TOTAL_AADLEN, %rcx + .set TOTAL_DATALEN, %r8 + .set TAG, %r9 + .set TAGLEN, %r10d // Originally at 8(%rsp) + + // Additional local variables. + // %rax, %xmm0-%xmm3, and %k1 are used as temporary registers. + .set AESKEYLEN, %r11d + .set AESKEYLEN64, %r11 + .set GFPOLY, %xmm4 + .set BSWAP_MASK, %xmm5 + .set LE_CTR, %xmm6 + .set GHASH_ACC, %xmm7 + .set H_POW1, %xmm8 + + // Load some constants. + vmovdqa .Lgfpoly(%rip), GFPOLY + vmovdqa .Lbswap_mask(%rip), BSWAP_MASK + + // Load the AES key length in bytes. + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + + // Set up a counter block with 1 in the low 32-bit word. This is the + // counter that produces the ciphertext needed to encrypt the auth tag. + // GFPOLY has 1 in the low word, so grab the 1 from there using a blend. + vpblendd $0xe, (LE_CTR_PTR), GFPOLY, LE_CTR + + // Build the lengths block and XOR it with the GHASH accumulator. + // Although the lengths block is defined as the AAD length followed by + // the en/decrypted data length, both in big-endian byte order, a byte + // reflection of the full block is needed because of the way we compute + // GHASH (see _ghash_mul_step). By using little-endian values in the + // opposite order, we avoid having to reflect any bytes here. + vmovq TOTAL_DATALEN, %xmm0 + vpinsrq $1, TOTAL_AADLEN, %xmm0, %xmm0 + vpsllq $3, %xmm0, %xmm0 // Bytes to bits + vpxor (GHASH_ACC_PTR), %xmm0, GHASH_ACC + + // Load the first hash key power (H^1), which is stored last. + vmovdqu8 OFFSETOFEND_H_POWERS-16(KEY), H_POW1 + +.if !\enc + // Prepare a mask of TAGLEN one bits. 
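+ // TAGLEN is the 7th argument, so it arrives on the stack.  The bzhi
+ // below clears all bits of %eax at positions TAGLEN and above, leaving
+ // TAGLEN low one bits, and kmovd turns that into the per-byte mask %k1
+ // used to truncate the tag comparison to TAGLEN bytes.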
+ movl 8(%rsp), TAGLEN
+ mov $-1, %eax
+ bzhi TAGLEN, %eax, %eax
+ kmovd %eax, %k1
+.endif
+
+ // Make %rax point to the last AES round key for the chosen AES variant.
+ lea 6*16(KEY,AESKEYLEN64,4), %rax
+
+ // Start the AES encryption of the counter block by swapping the counter
+ // block to big-endian and XOR-ing it with the zero-th AES round key.
+ vpshufb BSWAP_MASK, LE_CTR, %xmm0
+ vpxor (KEY), %xmm0, %xmm0
+
+ // Complete the AES encryption and multiply GHASH_ACC by H^1.
+ // Interleave the AES and GHASH instructions to improve performance.
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ vaesenc -13*16(%rax), %xmm0, %xmm0
+ vaesenc -12*16(%rax), %xmm0, %xmm0
+192:
+ vaesenc -11*16(%rax), %xmm0, %xmm0
+ vaesenc -10*16(%rax), %xmm0, %xmm0
+128:
+.irp i, 0,1,2,3,4,5,6,7,8
+ _ghash_mul_step \i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ %xmm1, %xmm2, %xmm3
+ vaesenc (\i-9)*16(%rax), %xmm0, %xmm0
+.endr
+ _ghash_mul_step 9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ %xmm1, %xmm2, %xmm3
+
+ // Undo the byte reflection of the GHASH accumulator.
+ vpshufb BSWAP_MASK, GHASH_ACC, GHASH_ACC
+
+ // Do the last AES round and XOR the resulting keystream block with the
+ // GHASH accumulator to produce the full computed authentication tag.
+ //
+ // Reduce latency by taking advantage of the property vaesenclast(key,
+ // a) ^ b == vaesenclast(key ^ b, a). I.e., XOR GHASH_ACC into the last
+ // round key, instead of XOR'ing the final AES output with GHASH_ACC.
+ //
+ // enc_final then returns the computed auth tag, while dec_final
+ // compares it with the transmitted one and returns a bool. To compare
+ // the tags, dec_final XORs them together and uses vptest to check
+ // whether the result is all-zeroes. This should be constant-time.
+ // dec_final applies the vaesenclast optimization to this additional
+ // value XOR'd too, using vpternlogd to XOR the last round key, GHASH
+ // accumulator, and transmitted auth tag together in one instruction.
+.if \enc
+ vpxor (%rax), GHASH_ACC, %xmm1
+ vaesenclast %xmm1, %xmm0, GHASH_ACC
+ vmovdqu GHASH_ACC, (GHASH_ACC_PTR)
+.else
+ vmovdqu (TAG), %xmm1
+ vpternlogd $0x96, (%rax), GHASH_ACC, %xmm1
+ vaesenclast %xmm1, %xmm0, %xmm0
+ xor %eax, %eax
+ vmovdqu8 %xmm0, %xmm0{%k1}{z} // Truncate to TAGLEN bytes
+ vptest %xmm0, %xmm0
+ sete %al
+.endif
+ // No need for vzeroupper here, since only xmm registers were used.
+ RET
+.endm
+
+_set_veclen 32
+SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_256)
+ _aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_256)
+SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_256)
+ _aes_gcm_update 1
+SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_256)
+SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_256)
+ _aes_gcm_update 0
+SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_256)
+
+_set_veclen 64
+SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_512)
+ _aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_512)
+SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_512)
+ _aes_gcm_update 1
+SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_512)
+SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_512)
+ _aes_gcm_update 0
+SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_512)
+
+// void aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+// u8 ghash_acc[16],
+// const u8 *aad, int aadlen);
+//
+// This function processes the AAD (Additional Authenticated Data) in GCM.
+// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
+// data given by |aad| and |aadlen|.
|key->ghash_key_powers| must have been +// initialized. On the first call, |ghash_acc| must be all zeroes. |aadlen| +// must be a multiple of 16, except on the last call where it can be any length. +// The caller must do any buffering needed to ensure this. +// +// AES-GCM is almost always used with small amounts of AAD, less than 32 bytes. +// Therefore, for AAD processing we currently only provide this implementation +// which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop. This +// keeps the code size down, and it enables some micro-optimizations, e.g. using +// VEX-coded instructions instead of EVEX-coded to save some instruction bytes. +// To optimize for large amounts of AAD, we could implement a 4x-wide loop and +// provide a version using 512-bit vectors, but that doesn't seem to be useful. +SYM_FUNC_START(aes_gcm_aad_update_vaes_avx10) + + // Function arguments + .set KEY, %rdi + .set GHASH_ACC_PTR, %rsi + .set AAD, %rdx + .set AADLEN, %ecx + .set AADLEN64, %rcx // Zero-extend AADLEN before using! + + // Additional local variables. + // %rax, %ymm0-%ymm3, and %k1 are used as temporary registers. + .set BSWAP_MASK, %ymm4 + .set GFPOLY, %ymm5 + .set GHASH_ACC, %ymm6 + .set GHASH_ACC_XMM, %xmm6 + .set H_POW1, %ymm7 + + // Load some constants. + vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK + vbroadcasti128 .Lgfpoly(%rip), GFPOLY + + // Load the GHASH accumulator. + vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM + + // Update GHASH with 32 bytes of AAD at a time. + // + // Pre-subtracting 32 from AADLEN saves an instruction from the loop and + // also ensures that at least one write always occurs to AADLEN, + // zero-extending it and allowing AADLEN64 to be used later. + sub $32, AADLEN + jl .Laad_loop_1x_done + vmovdqu8 OFFSETOFEND_H_POWERS-32(KEY), H_POW1 // [H^2, H^1] +.Laad_loop_1x: + vmovdqu (AAD), %ymm0 + vpshufb BSWAP_MASK, %ymm0, %ymm0 + vpxor %ymm0, GHASH_ACC, GHASH_ACC + _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + %ymm0, %ymm1, %ymm2 + vextracti128 $1, GHASH_ACC, %xmm0 + vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM + add $32, AAD + sub $32, AADLEN + jge .Laad_loop_1x +.Laad_loop_1x_done: + add $32, AADLEN + jz .Laad_done + + // Update GHASH with the remaining 1 <= AADLEN < 32 bytes of AAD. + mov $-1, %eax + bzhi AADLEN, %eax, %eax + kmovd %eax, %k1 + vmovdqu8 (AAD), %ymm0{%k1}{z} + neg AADLEN64 + and $~15, AADLEN64 // -round_up(AADLEN, 16) + vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1 + vpshufb BSWAP_MASK, %ymm0, %ymm0 + vpxor %ymm0, GHASH_ACC, GHASH_ACC + _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + %ymm0, %ymm1, %ymm2 + vextracti128 $1, GHASH_ACC, %xmm0 + vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM + +.Laad_done: + // Store the updated GHASH accumulator back to memory. + vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) + + vzeroupper // This is needed after using ymm or zmm registers. 
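+ // (Without vzeroupper, legacy-SSE code executed later could incur the
+ // AVX-SSE transition penalty caused by dirty upper halves of the vector
+ // registers.)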
+ RET +SYM_FUNC_END(aes_gcm_aad_update_vaes_avx10) + +SYM_FUNC_START(aes_gcm_enc_final_vaes_avx10) + _aes_gcm_final 1 +SYM_FUNC_END(aes_gcm_enc_final_vaes_avx10) +SYM_FUNC_START(aes_gcm_dec_final_vaes_avx10) + _aes_gcm_final 0 +SYM_FUNC_END(aes_gcm_dec_final_vaes_avx10) diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S new file mode 100644 index 000000000000..93ba0ddbe009 --- /dev/null +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -0,0 +1,891 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// AES-XTS for modern x86_64 CPUs +// +// Copyright 2024 Google LLC +// +// Author: Eric Biggers <ebiggers@google.com> +// +//------------------------------------------------------------------------------ +// +// This file is dual-licensed, meaning that you can use it under your choice of +// either of the following two licenses: +// +// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// or +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/* + * This file implements AES-XTS for modern x86_64 CPUs. To handle the + * complexities of coding for x86 SIMD, e.g. where every vector length needs + * different code, it uses a macro to generate several implementations that + * share similar source code but are targeted at different CPUs, listed below: + * + * AES-NI + AVX + * - 128-bit vectors (1 AES block per vector) + * - VEX-coded instructions + * - xmm0-xmm15 + * - This is for older CPUs that lack VAES but do have AVX. + * + * VAES + VPCLMULQDQ + AVX2 + * - 256-bit vectors (2 AES blocks per vector) + * - VEX-coded instructions + * - ymm0-ymm15 + * - This is for CPUs that have VAES but lack AVX512 or AVX10, + * e.g. Intel's Alder Lake and AMD's Zen 3. 
+ * + * VAES + VPCLMULQDQ + AVX10/256 + BMI2 + * - 256-bit vectors (2 AES blocks per vector) + * - EVEX-coded instructions + * - ymm0-ymm31 + * - This is for CPUs that have AVX512 but where using zmm registers causes + * downclocking, and for CPUs that have AVX10/256 but not AVX10/512. + * - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256. + * To avoid confusion with 512-bit, we just write AVX10/256. + * + * VAES + VPCLMULQDQ + AVX10/512 + BMI2 + * - Same as the previous one, but upgrades to 512-bit vectors + * (4 AES blocks per vector) in zmm0-zmm31. + * - This is for CPUs that have good AVX512 or AVX10/512 support. + * + * This file doesn't have an implementation for AES-NI alone (without AVX), as + * the lack of VEX would make all the assembly code different. + * + * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of + * the XTS tweaks. This avoids a bottleneck. Currently there don't seem to be + * any CPUs that support VAES but not VPCLMULQDQ. If that changes, we might + * need to start also providing an implementation using VAES alone. + * + * The AES-XTS implementations in this file support everything required by the + * crypto API, including support for arbitrary input lengths and multi-part + * processing. However, they are most heavily optimized for the common case of + * power-of-2 length inputs that are processed in a single part (disk sectors). + */ + +#include <linux/linkage.h> +#include <linux/cfi_types.h> + +.section .rodata +.p2align 4 +.Lgf_poly: + // The low 64 bits of this value represent the polynomial x^7 + x^2 + x + // + 1. It is the value that must be XOR'd into the low 64 bits of the + // tweak each time a 1 is carried out of the high 64 bits. + // + // The high 64 bits of this value is just the internal carry bit that + // exists when there's a carry out of the low 64 bits of the tweak. + .quad 0x87, 1 + + // This table contains constants for vpshufb and vpblendvb, used to + // handle variable byte shifts and blending during ciphertext stealing + // on CPUs that don't support AVX10-style masking. +.Lcts_permute_table: + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 +.text + +.macro _define_Vi i +.if VL == 16 + .set V\i, %xmm\i +.elseif VL == 32 + .set V\i, %ymm\i +.elseif VL == 64 + .set V\i, %zmm\i +.else + .error "Unsupported Vector Length (VL)" +.endif +.endm + +.macro _define_aliases + // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers + // are available, that map to the xmm, ymm, or zmm registers according + // to the selected Vector Length (VL). +.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + _define_Vi \i +.endr +.if USE_AVX10 +.irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 + _define_Vi \i +.endr +.endif + + // Function parameters + .set KEY, %rdi // Initially points to crypto_aes_ctx, then is + // advanced to point to 7th-from-last round key + .set SRC, %rsi // Pointer to next source data + .set DST, %rdx // Pointer to next destination data + .set LEN, %ecx // Remaining length in bytes + .set LEN8, %cl + .set LEN64, %rcx + .set TWEAK, %r8 // Pointer to next tweak + + // %rax holds the AES key length in bytes. + .set KEYLEN, %eax + .set KEYLEN64, %rax + + // %r9-r11 are available as temporaries. 
+ + // V0-V3 hold the data blocks during the main loop, or temporary values + // otherwise. V4-V5 hold temporary values. + + // V6-V9 hold XTS tweaks. Each 128-bit lane holds one tweak. + .set TWEAK0_XMM, %xmm6 + .set TWEAK0, V6 + .set TWEAK1_XMM, %xmm7 + .set TWEAK1, V7 + .set TWEAK2, V8 + .set TWEAK3, V9 + + // V10-V13 are used for computing the next values of TWEAK[0-3]. + .set NEXT_TWEAK0, V10 + .set NEXT_TWEAK1, V11 + .set NEXT_TWEAK2, V12 + .set NEXT_TWEAK3, V13 + + // V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes. + .set GF_POLY_XMM, %xmm14 + .set GF_POLY, V14 + + // V15 holds the key for AES "round 0", copied to all 128-bit lanes. + .set KEY0_XMM, %xmm15 + .set KEY0, V15 + + // If 32 SIMD registers are available, then V16-V29 hold the remaining + // AES round keys, copied to all 128-bit lanes. + // + // AES-128, AES-192, and AES-256 use different numbers of round keys. + // To allow handling all three variants efficiently, we align the round + // keys to the *end* of this register range. I.e., AES-128 uses + // KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14. + // (All also use KEY0 for the XOR-only "round" at the beginning.) +.if USE_AVX10 + .set KEY1_XMM, %xmm16 + .set KEY1, V16 + .set KEY2_XMM, %xmm17 + .set KEY2, V17 + .set KEY3_XMM, %xmm18 + .set KEY3, V18 + .set KEY4_XMM, %xmm19 + .set KEY4, V19 + .set KEY5_XMM, %xmm20 + .set KEY5, V20 + .set KEY6_XMM, %xmm21 + .set KEY6, V21 + .set KEY7_XMM, %xmm22 + .set KEY7, V22 + .set KEY8_XMM, %xmm23 + .set KEY8, V23 + .set KEY9_XMM, %xmm24 + .set KEY9, V24 + .set KEY10_XMM, %xmm25 + .set KEY10, V25 + .set KEY11_XMM, %xmm26 + .set KEY11, V26 + .set KEY12_XMM, %xmm27 + .set KEY12, V27 + .set KEY13_XMM, %xmm28 + .set KEY13, V28 + .set KEY14_XMM, %xmm29 + .set KEY14, V29 +.endif + // V30-V31 are currently unused. +.endm + +// Move a vector between memory and a register. +// The register operand must be in the first 16 vector registers. +.macro _vmovdqu src, dst +.if VL < 64 + vmovdqu \src, \dst +.else + vmovdqu8 \src, \dst +.endif +.endm + +// Broadcast a 128-bit value into a vector. +.macro _vbroadcast128 src, dst +.if VL == 16 && !USE_AVX10 + vmovdqu \src, \dst +.elseif VL == 32 && !USE_AVX10 + vbroadcasti128 \src, \dst +.else + vbroadcasti32x4 \src, \dst +.endif +.endm + +// XOR two vectors together. +// Any register operands must be in the first 16 vector registers. +.macro _vpxor src1, src2, dst +.if VL < 64 + vpxor \src1, \src2, \dst +.else + vpxord \src1, \src2, \dst +.endif +.endm + +// XOR three vectors together. +.macro _xor3 src1, src2, src3_and_dst +.if USE_AVX10 + // vpternlogd with immediate 0x96 is a three-argument XOR. + vpternlogd $0x96, \src1, \src2, \src3_and_dst +.else + vpxor \src1, \src3_and_dst, \src3_and_dst + vpxor \src2, \src3_and_dst, \src3_and_dst +.endif +.endm + +// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak +// (by multiplying by the polynomial 'x') and write it to \dst. +.macro _next_tweak src, tmp, dst + vpshufd $0x13, \src, \tmp + vpaddq \src, \src, \dst + vpsrad $31, \tmp, \tmp +.if USE_AVX10 + vpternlogd $0x78, GF_POLY_XMM, \tmp, \dst +.else + vpand GF_POLY_XMM, \tmp, \tmp + vpxor \tmp, \dst, \dst +.endif +.endm + +// Given the XTS tweak(s) in the vector \src, compute the next vector of +// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst. +// +// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute +// all tweaks in the vector in parallel. 
If VL=16, we just do the regular +// computation without vpclmulqdq, as it's the faster method for a single tweak. +.macro _next_tweakvec src, tmp1, tmp2, dst +.if VL == 16 + _next_tweak \src, \tmp1, \dst +.else + vpsrlq $64 - VL/16, \src, \tmp1 + vpclmulqdq $0x01, GF_POLY, \tmp1, \tmp2 + vpslldq $8, \tmp1, \tmp1 + vpsllq $VL/16, \src, \dst + _xor3 \tmp1, \tmp2, \dst +.endif +.endm + +// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and +// store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5. +.macro _compute_first_set_of_tweaks + vmovdqu (TWEAK), TWEAK0_XMM + _vbroadcast128 .Lgf_poly(%rip), GF_POLY +.if VL == 16 + // With VL=16, multiplying by x serially is fastest. + _next_tweak TWEAK0, %xmm0, TWEAK1 + _next_tweak TWEAK1, %xmm0, TWEAK2 + _next_tweak TWEAK2, %xmm0, TWEAK3 +.else +.if VL == 32 + // Compute the second block of TWEAK0. + _next_tweak TWEAK0_XMM, %xmm0, %xmm1 + vinserti128 $1, %xmm1, TWEAK0, TWEAK0 +.elseif VL == 64 + // Compute the remaining blocks of TWEAK0. + _next_tweak TWEAK0_XMM, %xmm0, %xmm1 + _next_tweak %xmm1, %xmm0, %xmm2 + _next_tweak %xmm2, %xmm0, %xmm3 + vinserti32x4 $1, %xmm1, TWEAK0, TWEAK0 + vinserti32x4 $2, %xmm2, TWEAK0, TWEAK0 + vinserti32x4 $3, %xmm3, TWEAK0, TWEAK0 +.endif + // Compute TWEAK[1-3] from TWEAK0. + vpsrlq $64 - 1*VL/16, TWEAK0, V0 + vpsrlq $64 - 2*VL/16, TWEAK0, V2 + vpsrlq $64 - 3*VL/16, TWEAK0, V4 + vpclmulqdq $0x01, GF_POLY, V0, V1 + vpclmulqdq $0x01, GF_POLY, V2, V3 + vpclmulqdq $0x01, GF_POLY, V4, V5 + vpslldq $8, V0, V0 + vpslldq $8, V2, V2 + vpslldq $8, V4, V4 + vpsllq $1*VL/16, TWEAK0, TWEAK1 + vpsllq $2*VL/16, TWEAK0, TWEAK2 + vpsllq $3*VL/16, TWEAK0, TWEAK3 +.if USE_AVX10 + vpternlogd $0x96, V0, V1, TWEAK1 + vpternlogd $0x96, V2, V3, TWEAK2 + vpternlogd $0x96, V4, V5, TWEAK3 +.else + vpxor V0, TWEAK1, TWEAK1 + vpxor V2, TWEAK2, TWEAK2 + vpxor V4, TWEAK3, TWEAK3 + vpxor V1, TWEAK1, TWEAK1 + vpxor V3, TWEAK2, TWEAK2 + vpxor V5, TWEAK3, TWEAK3 +.endif +.endif +.endm + +// Do one step in computing the next set of tweaks using the method of just +// multiplying by x repeatedly (the same method _next_tweak uses). +.macro _tweak_step_mulx i +.if \i == 0 + .set PREV_TWEAK, TWEAK3 + .set NEXT_TWEAK, NEXT_TWEAK0 +.elseif \i == 5 + .set PREV_TWEAK, NEXT_TWEAK0 + .set NEXT_TWEAK, NEXT_TWEAK1 +.elseif \i == 10 + .set PREV_TWEAK, NEXT_TWEAK1 + .set NEXT_TWEAK, NEXT_TWEAK2 +.elseif \i == 15 + .set PREV_TWEAK, NEXT_TWEAK2 + .set NEXT_TWEAK, NEXT_TWEAK3 +.endif +.if \i >= 0 && \i < 20 && \i % 5 == 0 + vpshufd $0x13, PREV_TWEAK, V5 +.elseif \i >= 0 && \i < 20 && \i % 5 == 1 + vpaddq PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK +.elseif \i >= 0 && \i < 20 && \i % 5 == 2 + vpsrad $31, V5, V5 +.elseif \i >= 0 && \i < 20 && \i % 5 == 3 + vpand GF_POLY, V5, V5 +.elseif \i >= 0 && \i < 20 && \i % 5 == 4 + vpxor V5, NEXT_TWEAK, NEXT_TWEAK +.elseif \i == 1000 + vmovdqa NEXT_TWEAK0, TWEAK0 + vmovdqa NEXT_TWEAK1, TWEAK1 + vmovdqa NEXT_TWEAK2, TWEAK2 + vmovdqa NEXT_TWEAK3, TWEAK3 +.endif +.endm + +// Do one step in computing the next set of tweaks using the VPCLMULQDQ method +// (the same method _next_tweakvec uses for VL > 16). This means multiplying +// each tweak by x^(4*VL/16) independently. +// +// Since 4*VL/16 is a multiple of 8 when VL > 16 (which it is here), the needed +// shift amounts are byte-aligned, which allows the use of vpsrldq and vpslldq +// to do 128-bit wide shifts. The 128-bit left shift (vpslldq) saves +// instructions directly. 
The 128-bit right shift (vpsrldq) performs better +// than a 64-bit right shift on Intel CPUs in the context where it is used here, +// because it runs on a different execution port from the AES instructions. +.macro _tweak_step_pclmul i +.if \i == 0 + vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0 +.elseif \i == 2 + vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1 +.elseif \i == 4 + vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2 +.elseif \i == 6 + vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3 +.elseif \i == 8 + vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0 +.elseif \i == 10 + vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1 +.elseif \i == 12 + vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2 +.elseif \i == 14 + vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3 +.elseif \i == 1000 + vpslldq $(4*VL/16) / 8, TWEAK0, TWEAK0 + vpslldq $(4*VL/16) / 8, TWEAK1, TWEAK1 + vpslldq $(4*VL/16) / 8, TWEAK2, TWEAK2 + vpslldq $(4*VL/16) / 8, TWEAK3, TWEAK3 + _vpxor NEXT_TWEAK0, TWEAK0, TWEAK0 + _vpxor NEXT_TWEAK1, TWEAK1, TWEAK1 + _vpxor NEXT_TWEAK2, TWEAK2, TWEAK2 + _vpxor NEXT_TWEAK3, TWEAK3, TWEAK3 +.endif +.endm + +// _tweak_step does one step of the computation of the next set of tweaks from +// TWEAK[0-3]. To complete all steps, this is invoked with increasing values of +// \i that include at least 0 through 19, then 1000 which signals the last step. +// +// This is used to interleave the computation of the next set of tweaks with the +// AES en/decryptions, which increases performance in some cases. Clobbers V5. +.macro _tweak_step i +.if VL == 16 + _tweak_step_mulx \i +.else + _tweak_step_pclmul \i +.endif +.endm + +.macro _setup_round_keys enc + + // Select either the encryption round keys or the decryption round keys. +.if \enc + .set OFFS, 0 +.else + .set OFFS, 240 +.endif + + // Load the round key for "round 0". + _vbroadcast128 OFFS(KEY), KEY0 + + // Increment KEY to make it so that 7*16(KEY) is the last round key. + // For AES-128, increment by 3*16, resulting in the 10 round keys (not + // counting the zero-th round key which was just loaded into KEY0) being + // -2*16(KEY) through 7*16(KEY). For AES-192, increment by 5*16 and use + // 12 round keys -4*16(KEY) through 7*16(KEY). For AES-256, increment + // by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY). + // + // This rebasing provides two benefits. First, it makes the offset to + // any round key be in the range [-96, 112], fitting in a signed byte. + // This shortens VEX-encoded instructions that access the later round + // keys which otherwise would need 4-byte offsets. Second, it makes it + // easy to do AES-128 and AES-192 by skipping irrelevant rounds at the + // beginning. Skipping rounds at the end doesn't work as well because + // the last round needs different instructions. + // + // An alternative approach would be to roll up all the round loops. We + // don't do that because (a) it isn't compatible with caching the round + // keys in registers which we do when possible (see below), (b) we + // interleave the AES rounds with the XTS tweak computation, and (c) it + // seems unwise to rely *too* heavily on the CPU's branch predictor. + lea OFFS-16(KEY, KEYLEN64, 4), KEY + + // If all 32 SIMD registers are available, cache all the round keys. 
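+	// (This is the USE_AVX10 case, since KEY1-KEY14 alias SIMD registers
+	// 16-29, which exist only with AVX512/AVX10.  AES-128 and AES-192 skip
+	// the loads for the round keys they don't have, mirroring the round
+	// skipping done at en/decryption time.)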
+.if USE_AVX10 + cmp $24, KEYLEN + jl .Laes128\@ + je .Laes192\@ + _vbroadcast128 -6*16(KEY), KEY1 + _vbroadcast128 -5*16(KEY), KEY2 +.Laes192\@: + _vbroadcast128 -4*16(KEY), KEY3 + _vbroadcast128 -3*16(KEY), KEY4 +.Laes128\@: + _vbroadcast128 -2*16(KEY), KEY5 + _vbroadcast128 -1*16(KEY), KEY6 + _vbroadcast128 0*16(KEY), KEY7 + _vbroadcast128 1*16(KEY), KEY8 + _vbroadcast128 2*16(KEY), KEY9 + _vbroadcast128 3*16(KEY), KEY10 + _vbroadcast128 4*16(KEY), KEY11 + _vbroadcast128 5*16(KEY), KEY12 + _vbroadcast128 6*16(KEY), KEY13 + _vbroadcast128 7*16(KEY), KEY14 +.endif +.endm + +// Do a single non-last round of AES encryption (if \enc==1) or decryption (if +// \enc==0) on the block(s) in \data using the round key(s) in \key. The +// register length determines the number of AES blocks en/decrypted. +.macro _vaes enc, key, data +.if \enc + vaesenc \key, \data, \data +.else + vaesdec \key, \data, \data +.endif +.endm + +// Same as _vaes, but does the last round. +.macro _vaeslast enc, key, data +.if \enc + vaesenclast \key, \data, \data +.else + vaesdeclast \key, \data, \data +.endif +.endm + +// Do a single non-last round of AES en/decryption on the block(s) in \data, +// using the same key for all block(s). The round key is loaded from the +// appropriate register or memory location for round \i. May clobber \tmp. +.macro _vaes_1x enc, i, xmm_suffix, data, tmp +.if USE_AVX10 + _vaes \enc, KEY\i\xmm_suffix, \data +.else +.ifnb \xmm_suffix + _vaes \enc, (\i-7)*16(KEY), \data +.else + _vbroadcast128 (\i-7)*16(KEY), \tmp + _vaes \enc, \tmp, \data +.endif +.endif +.endm + +// Do a single non-last round of AES en/decryption on the blocks in registers +// V0-V3, using the same key for all blocks. The round key is loaded from the +// appropriate register or memory location for round \i. In addition, does two +// steps of the computation of the next set of tweaks. May clobber V4 and V5. +.macro _vaes_4x enc, i +.if USE_AVX10 + _tweak_step (2*(\i-5)) + _vaes \enc, KEY\i, V0 + _vaes \enc, KEY\i, V1 + _tweak_step (2*(\i-5) + 1) + _vaes \enc, KEY\i, V2 + _vaes \enc, KEY\i, V3 +.else + _vbroadcast128 (\i-7)*16(KEY), V4 + _tweak_step (2*(\i-5)) + _vaes \enc, V4, V0 + _vaes \enc, V4, V1 + _tweak_step (2*(\i-5) + 1) + _vaes \enc, V4, V2 + _vaes \enc, V4, V3 +.endif +.endm + +// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt, +// then XOR with \tweak again) of the block(s) in \data. To process a single +// block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of +// length VL, use V* registers and leave \xmm_suffix empty. Clobbers \tmp. +.macro _aes_crypt enc, xmm_suffix, tweak, data, tmp + _xor3 KEY0\xmm_suffix, \tweak, \data + cmp $24, KEYLEN + jl .Laes128\@ + je .Laes192\@ + _vaes_1x \enc, 1, \xmm_suffix, \data, tmp=\tmp + _vaes_1x \enc, 2, \xmm_suffix, \data, tmp=\tmp +.Laes192\@: + _vaes_1x \enc, 3, \xmm_suffix, \data, tmp=\tmp + _vaes_1x \enc, 4, \xmm_suffix, \data, tmp=\tmp +.Laes128\@: +.irp i, 5,6,7,8,9,10,11,12,13 + _vaes_1x \enc, \i, \xmm_suffix, \data, tmp=\tmp +.endr +.if USE_AVX10 + vpxord KEY14\xmm_suffix, \tweak, \tmp +.else +.ifnb \xmm_suffix + vpxor 7*16(KEY), \tweak, \tmp +.else + _vbroadcast128 7*16(KEY), \tmp + vpxor \tweak, \tmp, \tmp +.endif +.endif + _vaeslast \enc, \tmp, \data +.endm + +.macro _aes_xts_crypt enc + _define_aliases + +.if !\enc + // When decrypting a message whose length isn't a multiple of the AES + // block length, exclude the last full block from the main loop by + // subtracting 16 from LEN. 
This is needed because ciphertext stealing + // decryption uses the last two tweaks in reverse order. We'll handle + // the last full block and the partial block specially at the end. + lea -16(LEN), %eax + test $15, LEN8 + cmovnz %eax, LEN +.endif + + // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256). + movl 480(KEY), KEYLEN + + // Setup the pointer to the round keys and cache as many as possible. + _setup_round_keys \enc + + // Compute the first set of tweaks TWEAK[0-3]. + _compute_first_set_of_tweaks + + add $-4*VL, LEN // shorter than 'sub 4*VL' when VL=32 + jl .Lhandle_remainder\@ + +.Lmain_loop\@: + // This is the main loop, en/decrypting 4*VL bytes per iteration. + + // XOR each source block with its tweak and the zero-th round key. +.if USE_AVX10 + _vmovdqu 0*VL(SRC), V0 + _vmovdqu 1*VL(SRC), V1 + _vmovdqu 2*VL(SRC), V2 + _vmovdqu 3*VL(SRC), V3 + vpternlogd $0x96, TWEAK0, KEY0, V0 + vpternlogd $0x96, TWEAK1, KEY0, V1 + vpternlogd $0x96, TWEAK2, KEY0, V2 + vpternlogd $0x96, TWEAK3, KEY0, V3 +.else + vpxor 0*VL(SRC), KEY0, V0 + vpxor 1*VL(SRC), KEY0, V1 + vpxor 2*VL(SRC), KEY0, V2 + vpxor 3*VL(SRC), KEY0, V3 + vpxor TWEAK0, V0, V0 + vpxor TWEAK1, V1, V1 + vpxor TWEAK2, V2, V2 + vpxor TWEAK3, V3, V3 +.endif + cmp $24, KEYLEN + jl .Laes128\@ + je .Laes192\@ + // Do all the AES rounds on the data blocks, interleaved with + // the computation of the next set of tweaks. + _vaes_4x \enc, 1 + _vaes_4x \enc, 2 +.Laes192\@: + _vaes_4x \enc, 3 + _vaes_4x \enc, 4 +.Laes128\@: +.irp i, 5,6,7,8,9,10,11,12,13 + _vaes_4x \enc, \i +.endr + // Do the last AES round, then XOR the results with the tweaks again. + // Reduce latency by doing the XOR before the vaesenclast, utilizing the + // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) + // (and likewise for vaesdeclast). +.if USE_AVX10 + _tweak_step 18 + _tweak_step 19 + vpxord TWEAK0, KEY14, V4 + vpxord TWEAK1, KEY14, V5 + _vaeslast \enc, V4, V0 + _vaeslast \enc, V5, V1 + vpxord TWEAK2, KEY14, V4 + vpxord TWEAK3, KEY14, V5 + _vaeslast \enc, V4, V2 + _vaeslast \enc, V5, V3 +.else + _vbroadcast128 7*16(KEY), V4 + _tweak_step 18 // uses V5 + _tweak_step 19 // uses V5 + vpxor TWEAK0, V4, V5 + _vaeslast \enc, V5, V0 + vpxor TWEAK1, V4, V5 + _vaeslast \enc, V5, V1 + vpxor TWEAK2, V4, V5 + vpxor TWEAK3, V4, V4 + _vaeslast \enc, V5, V2 + _vaeslast \enc, V4, V3 +.endif + + // Store the destination blocks. + _vmovdqu V0, 0*VL(DST) + _vmovdqu V1, 1*VL(DST) + _vmovdqu V2, 2*VL(DST) + _vmovdqu V3, 3*VL(DST) + + // Finish computing the next set of tweaks. + _tweak_step 1000 + + sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 + sub $-4*VL, DST + add $-4*VL, LEN + jge .Lmain_loop\@ + + // Check for the uncommon case where the data length isn't a multiple of + // 4*VL. Handle it out-of-line in order to optimize for the common + // case. In the common case, just fall through to the ret. + test $4*VL-1, LEN8 + jnz .Lhandle_remainder\@ +.Ldone\@: + // Store the next tweak back to *TWEAK to support continuation calls. + vmovdqu TWEAK0_XMM, (TWEAK) +.if VL > 16 + vzeroupper +.endif + RET + +.Lhandle_remainder\@: + + // En/decrypt any remaining full blocks, one vector at a time. +.if VL > 16 + add $3*VL, LEN // Undo extra sub of 4*VL, then sub VL. 
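+	// LEN now equals the number of remaining bytes minus VL, so the loop
+	// below executes once for every full vector of data that remains.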
+ jl .Lvec_at_a_time_done\@ +.Lvec_at_a_time\@: + _vmovdqu (SRC), V0 + _aes_crypt \enc, , TWEAK0, V0, tmp=V1 + _vmovdqu V0, (DST) + _next_tweakvec TWEAK0, V0, V1, TWEAK0 + add $VL, SRC + add $VL, DST + sub $VL, LEN + jge .Lvec_at_a_time\@ +.Lvec_at_a_time_done\@: + add $VL-16, LEN // Undo extra sub of VL, then sub 16. +.else + add $4*VL-16, LEN // Undo extra sub of 4*VL, then sub 16. +.endif + + // En/decrypt any remaining full blocks, one at a time. + jl .Lblock_at_a_time_done\@ +.Lblock_at_a_time\@: + vmovdqu (SRC), %xmm0 + _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1 + vmovdqu %xmm0, (DST) + _next_tweak TWEAK0_XMM, %xmm0, TWEAK0_XMM + add $16, SRC + add $16, DST + sub $16, LEN + jge .Lblock_at_a_time\@ +.Lblock_at_a_time_done\@: + add $16, LEN // Undo the extra sub of 16. + // Now 0 <= LEN <= 15. If LEN is zero, we're done. + jz .Ldone\@ + + // Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN. + // Do ciphertext stealing to process the last 16 + LEN bytes. + +.if \enc + // If encrypting, the main loop already encrypted the last full block to + // create the CTS intermediate ciphertext. Prepare for the rest of CTS + // by rewinding the pointers and loading the intermediate ciphertext. + sub $16, SRC + sub $16, DST + vmovdqu (DST), %xmm0 +.else + // If decrypting, the main loop didn't decrypt the last full block + // because CTS decryption uses the last two tweaks in reverse order. + // Do it now by advancing the tweak and decrypting the last full block. + _next_tweak TWEAK0_XMM, %xmm0, TWEAK1_XMM + vmovdqu (SRC), %xmm0 + _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1 +.endif + +.if USE_AVX10 + // Create a mask that has the first LEN bits set. + mov $-1, %r9d + bzhi LEN, %r9d, %r9d + kmovd %r9d, %k1 + + // Swap the first LEN bytes of the en/decryption of the last full block + // with the partial block. Note that to support in-place en/decryption, + // the load from the src partial block must happen before the store to + // the dst partial block. + vmovdqa %xmm0, %xmm1 + vmovdqu8 16(SRC), %xmm0{%k1} + vmovdqu8 %xmm1, 16(DST){%k1} +.else + lea .Lcts_permute_table(%rip), %r9 + + // Load the src partial block, left-aligned. Note that to support + // in-place en/decryption, this must happen before the store to the dst + // partial block. + vmovdqu (SRC, LEN64, 1), %xmm1 + + // Shift the first LEN bytes of the en/decryption of the last full block + // to the end of a register, then store it to DST+LEN. This stores the + // dst partial block. It also writes to the second part of the dst last + // full block, but that part is overwritten later. + vpshufb (%r9, LEN64, 1), %xmm0, %xmm2 + vmovdqu %xmm2, (DST, LEN64, 1) + + // Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...]. + sub LEN64, %r9 + vmovdqu 32(%r9), %xmm3 + + // Shift the src partial block to the beginning of its register. + vpshufb %xmm3, %xmm1, %xmm1 + + // Do a blend to generate the src partial block followed by the second + // part of the en/decryption of the last full block. + vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 +.endif + // En/decrypt again and store the last full block. + _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1 + vmovdqu %xmm0, (DST) + jmp .Ldone\@ +.endm + +// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, +// u8 iv[AES_BLOCK_SIZE]); +// +// Encrypt |iv| using the AES key |tweak_key| to get the first tweak. Assumes +// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10. 
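+//
+// A sketch of the call sequence expected from the C glue code, together with
+// the bulk functions defined below (illustrative only; the actual glue code
+// and its key handling may differ):
+//
+//	u8 iv[AES_BLOCK_SIZE];
+//	memcpy(iv, req_iv, AES_BLOCK_SIZE);
+//	aes_xts_encrypt_iv(tweak_key, iv);	// IV -> first tweak
+//	aes_xts_encrypt_aesni_avx(crypt_key, src, dst, len, iv);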
+SYM_TYPED_FUNC_START(aes_xts_encrypt_iv) + .set TWEAK_KEY, %rdi + .set IV, %rsi + .set KEYLEN, %eax + .set KEYLEN64, %rax + + vmovdqu (IV), %xmm0 + vpxor (TWEAK_KEY), %xmm0, %xmm0 + movl 480(TWEAK_KEY), KEYLEN + lea -16(TWEAK_KEY, KEYLEN64, 4), TWEAK_KEY + cmp $24, KEYLEN + jl .Lencrypt_iv_aes128 + je .Lencrypt_iv_aes192 + vaesenc -6*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc -5*16(TWEAK_KEY), %xmm0, %xmm0 +.Lencrypt_iv_aes192: + vaesenc -4*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc -3*16(TWEAK_KEY), %xmm0, %xmm0 +.Lencrypt_iv_aes128: +.irp i, -2,-1,0,1,2,3,4,5,6 + vaesenc \i*16(TWEAK_KEY), %xmm0, %xmm0 +.endr + vaesenclast 7*16(TWEAK_KEY), %xmm0, %xmm0 + vmovdqu %xmm0, (IV) + RET +SYM_FUNC_END(aes_xts_encrypt_iv) + +// Below are the actual AES-XTS encryption and decryption functions, +// instantiated from the above macro. They all have the following prototype: +// +// void (*xts_crypt_func)(const struct crypto_aes_ctx *key, +// const u8 *src, u8 *dst, int len, +// u8 tweak[AES_BLOCK_SIZE]); +// +// |key| is the data key. |tweak| contains the next tweak; the encryption of +// the original IV with the tweak key was already done. This function supports +// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and +// |len| must be a multiple of 16 except on the last call. If |len| is a +// multiple of 16, then this function updates |tweak| to contain the next tweak. + +.set VL, 16 +.set USE_AVX10, 0 +SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx) + _aes_xts_crypt 1 +SYM_FUNC_END(aes_xts_encrypt_aesni_avx) +SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx) + _aes_xts_crypt 0 +SYM_FUNC_END(aes_xts_decrypt_aesni_avx) + +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) +.set VL, 32 +.set USE_AVX10, 0 +SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2) + _aes_xts_crypt 1 +SYM_FUNC_END(aes_xts_encrypt_vaes_avx2) +SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2) + _aes_xts_crypt 0 +SYM_FUNC_END(aes_xts_decrypt_vaes_avx2) + +.set VL, 32 +.set USE_AVX10, 1 +SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256) + _aes_xts_crypt 1 +SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256) +SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256) + _aes_xts_crypt 0 +SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256) + +.set VL, 64 +.set USE_AVX10, 1 +SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512) + _aes_xts_crypt 1 +SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512) +SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512) + _aes_xts_crypt 0 +SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512) +#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S deleted file mode 100644 index 2402b9418cd7..000000000000 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ /dev/null @@ -1,597 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */ -/* - * AES CTR mode by8 optimization with AVX instructions. (x86_64) - * - * Copyright(c) 2014 Intel Corporation. - * - * Contact Information: - * James Guilford <james.guilford@intel.com> - * Sean Gulley <sean.m.gulley@intel.com> - * Chandramouli Narayanan <mouli@linux.intel.com> - */ -/* - * This is AES128/192/256 CTR mode optimization implementation. It requires - * the support of Intel(R) AESNI and AVX instructions. - * - * This work was inspired by the AES CTR mode optimization published - * in Intel Optimized IPSEC Cryptographic library. 
- * Additional information on it can be found at: - * https://github.com/intel/intel-ipsec-mb - */ - -#include <linux/linkage.h> - -#define VMOVDQ vmovdqu - -/* - * Note: the "x" prefix in these aliases means "this is an xmm register". The - * alias prefixes have no relation to XCTR where the "X" prefix means "XOR - * counter". - */ -#define xdata0 %xmm0 -#define xdata1 %xmm1 -#define xdata2 %xmm2 -#define xdata3 %xmm3 -#define xdata4 %xmm4 -#define xdata5 %xmm5 -#define xdata6 %xmm6 -#define xdata7 %xmm7 -#define xcounter %xmm8 // CTR mode only -#define xiv %xmm8 // XCTR mode only -#define xbyteswap %xmm9 // CTR mode only -#define xtmp %xmm9 // XCTR mode only -#define xkey0 %xmm10 -#define xkey4 %xmm11 -#define xkey8 %xmm12 -#define xkey12 %xmm13 -#define xkeyA %xmm14 -#define xkeyB %xmm15 - -#define p_in %rdi -#define p_iv %rsi -#define p_keys %rdx -#define p_out %rcx -#define num_bytes %r8 -#define counter %r9 // XCTR mode only -#define tmp %r10 -#define DDQ_DATA 0 -#define XDATA 1 -#define KEY_128 1 -#define KEY_192 2 -#define KEY_256 3 - -.section .rodata -.align 16 - -byteswap_const: - .octa 0x000102030405060708090A0B0C0D0E0F -ddq_low_msk: - .octa 0x0000000000000000FFFFFFFFFFFFFFFF -ddq_high_add_1: - .octa 0x00000000000000010000000000000000 -ddq_add_1: - .octa 0x00000000000000000000000000000001 -ddq_add_2: - .octa 0x00000000000000000000000000000002 -ddq_add_3: - .octa 0x00000000000000000000000000000003 -ddq_add_4: - .octa 0x00000000000000000000000000000004 -ddq_add_5: - .octa 0x00000000000000000000000000000005 -ddq_add_6: - .octa 0x00000000000000000000000000000006 -ddq_add_7: - .octa 0x00000000000000000000000000000007 -ddq_add_8: - .octa 0x00000000000000000000000000000008 - -.text - -/* generate a unique variable for ddq_add_x */ - -/* generate a unique variable for xmm register */ -.macro setxdata n - var_xdata = %xmm\n -.endm - -/* club the numeric 'id' to the symbol 'name' */ - -.macro club name, id -.altmacro - .if \name == XDATA - setxdata %\id - .endif -.noaltmacro -.endm - -/* - * do_aes num_in_par load_keys key_len - * This increments p_in, but not p_out - */ -.macro do_aes b, k, key_len, xctr - .set by, \b - .set load_keys, \k - .set klen, \key_len - - .if (load_keys) - vmovdqa 0*16(p_keys), xkey0 - .endif - - .if \xctr - movq counter, xtmp - .set i, 0 - .rept (by) - club XDATA, i - vpaddq (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata - .set i, (i +1) - .endr - .set i, 0 - .rept (by) - club XDATA, i - vpxor xiv, var_xdata, var_xdata - .set i, (i +1) - .endr - .else - vpshufb xbyteswap, xcounter, xdata0 - .set i, 1 - .rept (by - 1) - club XDATA, i - vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata - vptest ddq_low_msk(%rip), var_xdata - jnz 1f - vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata - vpaddq ddq_high_add_1(%rip), xcounter, xcounter - 1: - vpshufb xbyteswap, var_xdata, var_xdata - .set i, (i +1) - .endr - .endif - - vmovdqa 1*16(p_keys), xkeyA - - vpxor xkey0, xdata0, xdata0 - .if \xctr - add $by, counter - .else - vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter - vptest ddq_low_msk(%rip), xcounter - jnz 1f - vpaddq ddq_high_add_1(%rip), xcounter, xcounter - 1: - .endif - - .set i, 1 - .rept (by - 1) - club XDATA, i - vpxor xkey0, var_xdata, var_xdata - .set i, (i +1) - .endr - - vmovdqa 2*16(p_keys), xkeyB - - .set i, 0 - .rept by - club XDATA, i - vaesenc xkeyA, var_xdata, var_xdata /* key 1 */ - .set i, (i +1) - .endr - - .if (klen == KEY_128) - .if (load_keys) - vmovdqa 3*16(p_keys), xkey4 - .endif - .else - vmovdqa 3*16(p_keys), xkeyA - 
.endif - - .set i, 0 - .rept by - club XDATA, i - vaesenc xkeyB, var_xdata, var_xdata /* key 2 */ - .set i, (i +1) - .endr - - add $(16*by), p_in - - .if (klen == KEY_128) - vmovdqa 4*16(p_keys), xkeyB - .else - .if (load_keys) - vmovdqa 4*16(p_keys), xkey4 - .endif - .endif - - .set i, 0 - .rept by - club XDATA, i - /* key 3 */ - .if (klen == KEY_128) - vaesenc xkey4, var_xdata, var_xdata - .else - vaesenc xkeyA, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - vmovdqa 5*16(p_keys), xkeyA - - .set i, 0 - .rept by - club XDATA, i - /* key 4 */ - .if (klen == KEY_128) - vaesenc xkeyB, var_xdata, var_xdata - .else - vaesenc xkey4, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - .if (klen == KEY_128) - .if (load_keys) - vmovdqa 6*16(p_keys), xkey8 - .endif - .else - vmovdqa 6*16(p_keys), xkeyB - .endif - - .set i, 0 - .rept by - club XDATA, i - vaesenc xkeyA, var_xdata, var_xdata /* key 5 */ - .set i, (i +1) - .endr - - vmovdqa 7*16(p_keys), xkeyA - - .set i, 0 - .rept by - club XDATA, i - /* key 6 */ - .if (klen == KEY_128) - vaesenc xkey8, var_xdata, var_xdata - .else - vaesenc xkeyB, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - .if (klen == KEY_128) - vmovdqa 8*16(p_keys), xkeyB - .else - .if (load_keys) - vmovdqa 8*16(p_keys), xkey8 - .endif - .endif - - .set i, 0 - .rept by - club XDATA, i - vaesenc xkeyA, var_xdata, var_xdata /* key 7 */ - .set i, (i +1) - .endr - - .if (klen == KEY_128) - .if (load_keys) - vmovdqa 9*16(p_keys), xkey12 - .endif - .else - vmovdqa 9*16(p_keys), xkeyA - .endif - - .set i, 0 - .rept by - club XDATA, i - /* key 8 */ - .if (klen == KEY_128) - vaesenc xkeyB, var_xdata, var_xdata - .else - vaesenc xkey8, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - vmovdqa 10*16(p_keys), xkeyB - - .set i, 0 - .rept by - club XDATA, i - /* key 9 */ - .if (klen == KEY_128) - vaesenc xkey12, var_xdata, var_xdata - .else - vaesenc xkeyA, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - .if (klen != KEY_128) - vmovdqa 11*16(p_keys), xkeyA - .endif - - .set i, 0 - .rept by - club XDATA, i - /* key 10 */ - .if (klen == KEY_128) - vaesenclast xkeyB, var_xdata, var_xdata - .else - vaesenc xkeyB, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - .if (klen != KEY_128) - .if (load_keys) - vmovdqa 12*16(p_keys), xkey12 - .endif - - .set i, 0 - .rept by - club XDATA, i - vaesenc xkeyA, var_xdata, var_xdata /* key 11 */ - .set i, (i +1) - .endr - - .if (klen == KEY_256) - vmovdqa 13*16(p_keys), xkeyA - .endif - - .set i, 0 - .rept by - club XDATA, i - .if (klen == KEY_256) - /* key 12 */ - vaesenc xkey12, var_xdata, var_xdata - .else - vaesenclast xkey12, var_xdata, var_xdata - .endif - .set i, (i +1) - .endr - - .if (klen == KEY_256) - vmovdqa 14*16(p_keys), xkeyB - - .set i, 0 - .rept by - club XDATA, i - /* key 13 */ - vaesenc xkeyA, var_xdata, var_xdata - .set i, (i +1) - .endr - - .set i, 0 - .rept by - club XDATA, i - /* key 14 */ - vaesenclast xkeyB, var_xdata, var_xdata - .set i, (i +1) - .endr - .endif - .endif - - .set i, 0 - .rept (by / 2) - .set j, (i+1) - VMOVDQ (i*16 - 16*by)(p_in), xkeyA - VMOVDQ (j*16 - 16*by)(p_in), xkeyB - club XDATA, i - vpxor xkeyA, var_xdata, var_xdata - club XDATA, j - vpxor xkeyB, var_xdata, var_xdata - .set i, (i+2) - .endr - - .if (i < by) - VMOVDQ (i*16 - 16*by)(p_in), xkeyA - club XDATA, i - vpxor xkeyA, var_xdata, var_xdata - .endif - - .set i, 0 - .rept by - club XDATA, i - VMOVDQ var_xdata, i*16(p_out) - .set i, (i+1) - .endr -.endm - -.macro do_aes_load val, key_len, xctr - 
do_aes \val, 1, \key_len, \xctr -.endm - -.macro do_aes_noload val, key_len, xctr - do_aes \val, 0, \key_len, \xctr -.endm - -/* main body of aes ctr load */ - -.macro do_aes_ctrmain key_len, xctr - cmp $16, num_bytes - jb .Ldo_return2\xctr\key_len - - .if \xctr - shr $4, counter - vmovdqu (p_iv), xiv - .else - vmovdqa byteswap_const(%rip), xbyteswap - vmovdqu (p_iv), xcounter - vpshufb xbyteswap, xcounter, xcounter - .endif - - mov num_bytes, tmp - and $(7*16), tmp - jz .Lmult_of_8_blks\xctr\key_len - - /* 1 <= tmp <= 7 */ - cmp $(4*16), tmp - jg .Lgt4\xctr\key_len - je .Leq4\xctr\key_len - -.Llt4\xctr\key_len: - cmp $(2*16), tmp - jg .Leq3\xctr\key_len - je .Leq2\xctr\key_len - -.Leq1\xctr\key_len: - do_aes_load 1, \key_len, \xctr - add $(1*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - -.Leq2\xctr\key_len: - do_aes_load 2, \key_len, \xctr - add $(2*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - - -.Leq3\xctr\key_len: - do_aes_load 3, \key_len, \xctr - add $(3*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - -.Leq4\xctr\key_len: - do_aes_load 4, \key_len, \xctr - add $(4*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - -.Lgt4\xctr\key_len: - cmp $(6*16), tmp - jg .Leq7\xctr\key_len - je .Leq6\xctr\key_len - -.Leq5\xctr\key_len: - do_aes_load 5, \key_len, \xctr - add $(5*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - -.Leq6\xctr\key_len: - do_aes_load 6, \key_len, \xctr - add $(6*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - -.Leq7\xctr\key_len: - do_aes_load 7, \key_len, \xctr - add $(7*16), p_out - and $(~7*16), num_bytes - jz .Ldo_return2\xctr\key_len - jmp .Lmain_loop2\xctr\key_len - -.Lmult_of_8_blks\xctr\key_len: - .if (\key_len != KEY_128) - vmovdqa 0*16(p_keys), xkey0 - vmovdqa 4*16(p_keys), xkey4 - vmovdqa 8*16(p_keys), xkey8 - vmovdqa 12*16(p_keys), xkey12 - .else - vmovdqa 0*16(p_keys), xkey0 - vmovdqa 3*16(p_keys), xkey4 - vmovdqa 6*16(p_keys), xkey8 - vmovdqa 9*16(p_keys), xkey12 - .endif -.align 16 -.Lmain_loop2\xctr\key_len: - /* num_bytes is a multiple of 8 and >0 */ - do_aes_noload 8, \key_len, \xctr - add $(8*16), p_out - sub $(8*16), num_bytes - jne .Lmain_loop2\xctr\key_len - -.Ldo_return2\xctr\key_len: - .if !\xctr - /* return updated IV */ - vpshufb xbyteswap, xcounter, xcounter - vmovdqu xcounter, (p_iv) - .endif - RET -.endm - -/* - * routine to do AES128 CTR enc/decrypt "by8" - * XMM registers are clobbered. - * Saving/restoring must be done at a higher level - * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out, - * unsigned int num_bytes) - */ -SYM_FUNC_START(aes_ctr_enc_128_avx_by8) - /* call the aes main loop */ - do_aes_ctrmain KEY_128 0 - -SYM_FUNC_END(aes_ctr_enc_128_avx_by8) - -/* - * routine to do AES192 CTR enc/decrypt "by8" - * XMM registers are clobbered. - * Saving/restoring must be done at a higher level - * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out, - * unsigned int num_bytes) - */ -SYM_FUNC_START(aes_ctr_enc_192_avx_by8) - /* call the aes main loop */ - do_aes_ctrmain KEY_192 0 - -SYM_FUNC_END(aes_ctr_enc_192_avx_by8) - -/* - * routine to do AES256 CTR enc/decrypt "by8" - * XMM registers are clobbered. 
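The by8 routines above differ between CTR and XCTR only in how each keystream input block is built before it enters the shared do_aes round pipeline: CTR keeps a 128-bit big-endian counter (byteswapped into arithmetic order, with ddq_low_msk/ddq_high_add_1 propagating the carry between the two 64-bit halves), while XCTR simply XORs a little-endian block index into the IV and needs no byte swap. A minimal C sketch of just that block-construction step, using illustrative helper names rather than anything from the kernel tree:

#include <stdint.h>
#include <string.h>

/* CTR: the IV is a 128-bit big-endian counter; message block n uses IV + n. */
static void ctr_block_input(uint8_t out[16], const uint8_t iv[16], uint64_t n)
{
        memcpy(out, iv, 16);
        for (int i = 15; i >= 0 && n; i--) {    /* big-endian add with carry */
                n += out[i];
                out[i] = (uint8_t)n;
                n >>= 8;
        }
}

/* XCTR: message block n (1-based) uses IV XOR little-endian(n); no byte swap. */
static void xctr_block_input(uint8_t out[16], const uint8_t iv[16], uint64_t n)
{
        for (int i = 0; i < 16; i++)
                out[i] = iv[i] ^ (i < 8 ? (uint8_t)(n >> (8 * i)) : 0);
}

Encrypting that block and XORing the result into the data is the part the eight-wide unrolled loop above amortizes.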
- * Saving/restoring must be done at a higher level - * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out, - * unsigned int num_bytes) - */ -SYM_FUNC_START(aes_ctr_enc_256_avx_by8) - /* call the aes main loop */ - do_aes_ctrmain KEY_256 0 - -SYM_FUNC_END(aes_ctr_enc_256_avx_by8) - -/* - * routine to do AES128 XCTR enc/decrypt "by8" - * XMM registers are clobbered. - * Saving/restoring must be done at a higher level - * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys, - * u8* out, unsigned int num_bytes, unsigned int byte_ctr) - */ -SYM_FUNC_START(aes_xctr_enc_128_avx_by8) - /* call the aes main loop */ - do_aes_ctrmain KEY_128 1 - -SYM_FUNC_END(aes_xctr_enc_128_avx_by8) - -/* - * routine to do AES192 XCTR enc/decrypt "by8" - * XMM registers are clobbered. - * Saving/restoring must be done at a higher level - * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys, - * u8* out, unsigned int num_bytes, unsigned int byte_ctr) - */ -SYM_FUNC_START(aes_xctr_enc_192_avx_by8) - /* call the aes main loop */ - do_aes_ctrmain KEY_192 1 - -SYM_FUNC_END(aes_xctr_enc_192_avx_by8) - -/* - * routine to do AES256 XCTR enc/decrypt "by8" - * XMM registers are clobbered. - * Saving/restoring must be done at a higher level - * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys, - * u8* out, unsigned int num_bytes, unsigned int byte_ctr) - */ -SYM_FUNC_START(aes_xctr_enc_256_avx_by8) - /* call the aes main loop */ - do_aes_ctrmain KEY_256 1 - -SYM_FUNC_END(aes_xctr_enc_256_avx_by8) diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 411d8c83e88a..b37881bb9f15 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -10,120 +10,15 @@ * Vinodh Gopal <vinodh.gopal@intel.com> * Kahraman Akdemir * - * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD - * interface for 64-bit kernels. - * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com) - * Aidan O'Mahony (aidan.o.mahony@intel.com) - * Adrian Hoban <adrian.hoban@intel.com> - * James Guilford (james.guilford@intel.com) - * Gabriele Paoloni <gabriele.paoloni@intel.com> - * Tadeusz Struk (tadeusz.struk@intel.com) - * Wajdi Feghali (wajdi.k.feghali@intel.com) - * Copyright (c) 2010, Intel Corporation. + * Copyright (c) 2010, Intel Corporation. * * Ported x86_64 version to x86: * Author: Mathias Krause <minipli@googlemail.com> */ #include <linux/linkage.h> +#include <linux/objtool.h> #include <asm/frame.h> -#include <asm/nospec-branch.h> - -/* - * The following macros are used to move an (un)aligned 16 byte value to/from - * an XMM register. This can done for either FP or integer values, for FP use - * movaps (move aligned packed single) or integer use movdqa (move double quad - * aligned). It doesn't make a performance difference which instruction is used - * since Nehalem (original Core i7) was released. However, the movaps is a byte - * shorter, so that is the one we'll use for now. (same for unaligned). 
- */ -#define MOVADQ movaps -#define MOVUDQ movups - -#ifdef __x86_64__ - -# constants in mergeable sections, linker can reorder and merge -.section .rodata.cst16.POLY, "aM", @progbits, 16 -.align 16 -POLY: .octa 0xC2000000000000000000000000000001 -.section .rodata.cst16.TWOONE, "aM", @progbits, 16 -.align 16 -TWOONE: .octa 0x00000001000000000000000000000001 - -.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 -.align 16 -SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F -.section .rodata.cst16.MASK1, "aM", @progbits, 16 -.align 16 -MASK1: .octa 0x0000000000000000ffffffffffffffff -.section .rodata.cst16.MASK2, "aM", @progbits, 16 -.align 16 -MASK2: .octa 0xffffffffffffffff0000000000000000 -.section .rodata.cst16.ONE, "aM", @progbits, 16 -.align 16 -ONE: .octa 0x00000000000000000000000000000001 -.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16 -.align 16 -F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 -.section .rodata.cst16.dec, "aM", @progbits, 16 -.align 16 -dec: .octa 0x1 -.section .rodata.cst16.enc, "aM", @progbits, 16 -.align 16 -enc: .octa 0x2 - -# order of these constants should not change. -# more specifically, ALL_F should follow SHIFT_MASK, -# and zero should follow ALL_F -.section .rodata, "a", @progbits -.align 16 -SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 -ALL_F: .octa 0xffffffffffffffffffffffffffffffff - .octa 0x00000000000000000000000000000000 - -.text - - -#define STACK_OFFSET 8*3 - -#define AadHash 16*0 -#define AadLen 16*1 -#define InLen (16*1)+8 -#define PBlockEncKey 16*2 -#define OrigIV 16*3 -#define CurCount 16*4 -#define PBlockLen 16*5 -#define HashKey 16*6 // store HashKey <<1 mod poly here -#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here -#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here -#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here -#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64 - // bits of HashKey <<1 mod poly here - //(for Karatsuba purposes) -#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64 - // bits of HashKey^2 <<1 mod poly here - // (for Karatsuba purposes) -#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64 - // bits of HashKey^3 <<1 mod poly here - // (for Karatsuba purposes) -#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64 - // bits of HashKey^4 <<1 mod poly here - // (for Karatsuba purposes) - -#define arg1 rdi -#define arg2 rsi -#define arg3 rdx -#define arg4 rcx -#define arg5 r8 -#define arg6 r9 -#define arg7 STACK_OFFSET+8(%rsp) -#define arg8 STACK_OFFSET+16(%rsp) -#define arg9 STACK_OFFSET+24(%rsp) -#define arg10 STACK_OFFSET+32(%rsp) -#define arg11 STACK_OFFSET+40(%rsp) -#define keysize 2*15*16(%arg1) -#endif - #define STATE1 %xmm0 #define STATE2 %xmm4 @@ -170,1587 +65,6 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff #define TKEYP T1 #endif -.macro FUNC_SAVE - push %r12 - push %r13 - push %r14 -# -# states of %xmm registers %xmm6:%xmm15 not saved -# all %xmm registers are clobbered -# -.endm - - -.macro FUNC_RESTORE - pop %r14 - pop %r13 - pop %r12 -.endm - -# Precompute hashkeys. -# Input: Hash subkey. -# Output: HashKeys stored in gcm_context_data. Only needs to be called -# once per key. -# clobbers r12, and tmp xmm registers. 
-.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7 - mov \SUBKEY, %r12 - movdqu (%r12), \TMP3 - movdqa SHUF_MASK(%rip), \TMP2 - pshufb \TMP2, \TMP3 - - # precompute HashKey<<1 mod poly from the HashKey (required for GHASH) - - movdqa \TMP3, \TMP2 - psllq $1, \TMP3 - psrlq $63, \TMP2 - movdqa \TMP2, \TMP1 - pslldq $8, \TMP2 - psrldq $8, \TMP1 - por \TMP2, \TMP3 - - # reduce HashKey<<1 - - pshufd $0x24, \TMP1, \TMP2 - pcmpeqd TWOONE(%rip), \TMP2 - pand POLY(%rip), \TMP2 - pxor \TMP2, \TMP3 - movdqu \TMP3, HashKey(%arg2) - - movdqa \TMP3, \TMP5 - pshufd $78, \TMP3, \TMP1 - pxor \TMP3, \TMP1 - movdqu \TMP1, HashKey_k(%arg2) - - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^2<<1 (mod poly) - movdqu \TMP5, HashKey_2(%arg2) -# HashKey_2 = HashKey^2<<1 (mod poly) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqu \TMP1, HashKey_2_k(%arg2) - - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^3<<1 (mod poly) - movdqu \TMP5, HashKey_3(%arg2) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqu \TMP1, HashKey_3_k(%arg2) - - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^3<<1 (mod poly) - movdqu \TMP5, HashKey_4(%arg2) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqu \TMP1, HashKey_4_k(%arg2) -.endm - -# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding. -# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13 -.macro GCM_INIT Iv SUBKEY AAD AADLEN - mov \AADLEN, %r11 - mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length - xor %r11d, %r11d - mov %r11, InLen(%arg2) # ctx_data.in_length = 0 - mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0 - mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0 - mov \Iv, %rax - movdqu (%rax), %xmm0 - movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv - - movdqa SHUF_MASK(%rip), %xmm2 - pshufb %xmm2, %xmm0 - movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv - - PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7 - movdqu HashKey(%arg2), %xmm13 - - CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \ - %xmm4, %xmm5, %xmm6 -.endm - -# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context -# struct has been initialized by GCM_INIT. 
-# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK -# Clobbers rax, r10-r13, and xmm0-xmm15 -.macro GCM_ENC_DEC operation - movdqu AadHash(%arg2), %xmm8 - movdqu HashKey(%arg2), %xmm13 - add %arg5, InLen(%arg2) - - xor %r11d, %r11d # initialise the data pointer offset as zero - PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation - - sub %r11, %arg5 # sub partial block data used - mov %arg5, %r13 # save the number of bytes - - and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) - mov %r13, %r12 - # Encrypt/Decrypt first few blocks - - and $(3<<4), %r12 - jz .L_initial_num_blocks_is_0_\@ - cmp $(2<<4), %r12 - jb .L_initial_num_blocks_is_1_\@ - je .L_initial_num_blocks_is_2_\@ -.L_initial_num_blocks_is_3_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation - sub $48, %r13 - jmp .L_initial_blocks_\@ -.L_initial_num_blocks_is_2_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation - sub $32, %r13 - jmp .L_initial_blocks_\@ -.L_initial_num_blocks_is_1_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation - sub $16, %r13 - jmp .L_initial_blocks_\@ -.L_initial_num_blocks_is_0_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation -.L_initial_blocks_\@: - - # Main loop - Encrypt/Decrypt remaining blocks - - test %r13, %r13 - je .L_zero_cipher_left_\@ - sub $64, %r13 - je .L_four_cipher_left_\@ -.L_crypt_by_4_\@: - GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \ - %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \ - %xmm7, %xmm8, enc - add $64, %r11 - sub $64, %r13 - jne .L_crypt_by_4_\@ -.L_four_cipher_left_\@: - GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ -%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 -.L_zero_cipher_left_\@: - movdqu %xmm8, AadHash(%arg2) - movdqu %xmm0, CurCount(%arg2) - - mov %arg5, %r13 - and $15, %r13 # %r13 = arg5 (mod 16) - je .L_multiple_of_16_bytes_\@ - - mov %r13, PBlockLen(%arg2) - - # Handle the last <16 Byte block separately - paddd ONE(%rip), %xmm0 # INCR CNT to get Yn - movdqu %xmm0, CurCount(%arg2) - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10, %xmm0 - - ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) - movdqu %xmm0, PBlockEncKey(%arg2) - - cmp $16, %arg5 - jge .L_large_enough_update_\@ - - lea (%arg4,%r11,1), %r10 - mov %r13, %r12 - READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 - jmp .L_data_read_\@ - -.L_large_enough_update_\@: - sub $16, %r11 - add %r13, %r11 - - # receive the last <16 Byte block - movdqu (%arg4, %r11, 1), %xmm1 - - sub %r13, %r11 - add $16, %r11 - - lea SHIFT_MASK+16(%rip), %r12 - # adjust the shuffle mask pointer to be able to shift 16-r13 bytes - # (r13 is the number of bytes in plaintext mod 16) - sub %r13, %r12 - # get the appropriate shuffle mask - movdqu (%r12), %xmm2 - # shift right 16-r13 bytes - pshufb %xmm2, %xmm1 - -.L_data_read_\@: - lea ALL_F+16(%rip), %r12 - sub %r13, %r12 - -.ifc \operation, dec - movdqa %xmm1, %xmm2 -.endif - pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn) - movdqu (%r12), %xmm1 - # get the appropriate mask to mask out top 16-r13 bytes of xmm0 - pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 -.ifc \operation, dec - pand %xmm1, %xmm2 - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10 
,%xmm2 - - pxor %xmm2, %xmm8 -.else - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10,%xmm0 - - pxor %xmm0, %xmm8 -.endif - - movdqu %xmm8, AadHash(%arg2) -.ifc \operation, enc - # GHASH computation for the last <16 byte block - movdqa SHUF_MASK(%rip), %xmm10 - # shuffle xmm0 back to output as ciphertext - pshufb %xmm10, %xmm0 -.endif - - # Output %r13 bytes - movq %xmm0, %rax - cmp $8, %r13 - jle .L_less_than_8_bytes_left_\@ - mov %rax, (%arg3 , %r11, 1) - add $8, %r11 - psrldq $8, %xmm0 - movq %xmm0, %rax - sub $8, %r13 -.L_less_than_8_bytes_left_\@: - mov %al, (%arg3, %r11, 1) - add $1, %r11 - shr $8, %rax - sub $1, %r13 - jne .L_less_than_8_bytes_left_\@ -.L_multiple_of_16_bytes_\@: -.endm - -# GCM_COMPLETE Finishes update of tag of last partial block -# Output: Authorization Tag (AUTH_TAG) -# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 -.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN - movdqu AadHash(%arg2), %xmm8 - movdqu HashKey(%arg2), %xmm13 - - mov PBlockLen(%arg2), %r12 - - test %r12, %r12 - je .L_partial_done\@ - - GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 - -.L_partial_done\@: - mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes) - shl $3, %r12 # convert into number of bits - movd %r12d, %xmm15 # len(A) in %xmm15 - mov InLen(%arg2), %r12 - shl $3, %r12 # len(C) in bits (*128) - movq %r12, %xmm1 - - pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 - pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) - pxor %xmm15, %xmm8 - GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 - # final GHASH computation - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10, %xmm8 - - movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0 - ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) - pxor %xmm8, %xmm0 -.L_return_T_\@: - mov \AUTHTAG, %r10 # %r10 = authTag - mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len - cmp $16, %r11 - je .L_T_16_\@ - cmp $8, %r11 - jl .L_T_4_\@ -.L_T_8_\@: - movq %xmm0, %rax - mov %rax, (%r10) - add $8, %r10 - sub $8, %r11 - psrldq $8, %xmm0 - test %r11, %r11 - je .L_return_T_done_\@ -.L_T_4_\@: - movd %xmm0, %eax - mov %eax, (%r10) - add $4, %r10 - sub $4, %r11 - psrldq $4, %xmm0 - test %r11, %r11 - je .L_return_T_done_\@ -.L_T_123_\@: - movd %xmm0, %eax - cmp $2, %r11 - jl .L_T_1_\@ - mov %ax, (%r10) - cmp $2, %r11 - je .L_return_T_done_\@ - add $2, %r10 - sar $16, %eax -.L_T_1_\@: - mov %al, (%r10) - jmp .L_return_T_done_\@ -.L_T_16_\@: - movdqu %xmm0, (%r10) -.L_return_T_done_\@: -.endm - -#ifdef __x86_64__ -/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) -* -* -* Input: A and B (128-bits each, bit-reflected) -* Output: C = A*B*x mod poly, (i.e. >>1 ) -* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input -* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 
-* -*/ -.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5 - movdqa \GH, \TMP1 - pshufd $78, \GH, \TMP2 - pshufd $78, \HK, \TMP3 - pxor \GH, \TMP2 # TMP2 = a1+a0 - pxor \HK, \TMP3 # TMP3 = b1+b0 - pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \HK, \GH # GH = a0*b0 - pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) - pxor \GH, \TMP2 - pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0) - movdqa \TMP2, \TMP3 - pslldq $8, \TMP3 # left shift TMP3 2 DWs - psrldq $8, \TMP2 # right shift TMP2 2 DWs - pxor \TMP3, \GH - pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK - - # first phase of the reduction - - movdqa \GH, \TMP2 - movdqa \GH, \TMP3 - movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4 - # in in order to perform - # independent shifts - pslld $31, \TMP2 # packed right shift <<31 - pslld $30, \TMP3 # packed right shift <<30 - pslld $25, \TMP4 # packed right shift <<25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP5 - psrldq $4, \TMP5 # right shift TMP5 1 DW - pslldq $12, \TMP2 # left shift TMP2 3 DWs - pxor \TMP2, \GH - - # second phase of the reduction - - movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4 - # in in order to perform - # independent shifts - movdqa \GH,\TMP3 - movdqa \GH,\TMP4 - psrld $1,\TMP2 # packed left shift >>1 - psrld $2,\TMP3 # packed left shift >>2 - psrld $7,\TMP4 # packed left shift >>7 - pxor \TMP3,\TMP2 # xor the shifted versions - pxor \TMP4,\TMP2 - pxor \TMP5, \TMP2 - pxor \TMP2, \GH - pxor \TMP1, \GH # result is in TMP1 -.endm - -# Reads DLEN bytes starting at DPTR and stores in XMMDst -# where 0 < DLEN < 16 -# Clobbers %rax, DLEN and XMM1 -.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst - cmp $8, \DLEN - jl .L_read_lt8_\@ - mov (\DPTR), %rax - movq %rax, \XMMDst - sub $8, \DLEN - jz .L_done_read_partial_block_\@ - xor %eax, %eax -.L_read_next_byte_\@: - shl $8, %rax - mov 7(\DPTR, \DLEN, 1), %al - dec \DLEN - jnz .L_read_next_byte_\@ - movq %rax, \XMM1 - pslldq $8, \XMM1 - por \XMM1, \XMMDst - jmp .L_done_read_partial_block_\@ -.L_read_lt8_\@: - xor %eax, %eax -.L_read_next_byte_lt8_\@: - shl $8, %rax - mov -1(\DPTR, \DLEN, 1), %al - dec \DLEN - jnz .L_read_next_byte_lt8_\@ - movq %rax, \XMMDst -.L_done_read_partial_block_\@: -.endm - -# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. -# clobbers r10-11, xmm14 -.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \ - TMP6 TMP7 - MOVADQ SHUF_MASK(%rip), %xmm14 - mov \AAD, %r10 # %r10 = AAD - mov \AADLEN, %r11 # %r11 = aadLen - pxor \TMP7, \TMP7 - pxor \TMP6, \TMP6 - - cmp $16, %r11 - jl .L_get_AAD_rest\@ -.L_get_AAD_blocks\@: - movdqu (%r10), \TMP7 - pshufb %xmm14, \TMP7 # byte-reflect the AAD data - pxor \TMP7, \TMP6 - GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 - add $16, %r10 - sub $16, %r11 - cmp $16, %r11 - jge .L_get_AAD_blocks\@ - - movdqu \TMP6, \TMP7 - - /* read the last <16B of AAD */ -.L_get_AAD_rest\@: - test %r11, %r11 - je .L_get_AAD_done\@ - - READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7 - pshufb %xmm14, \TMP7 # byte-reflect the AAD data - pxor \TMP6, \TMP7 - GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 - movdqu \TMP7, \TMP6 - -.L_get_AAD_done\@: - movdqu \TMP6, AadHash(%arg2) -.endm - -# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks -# between update calls. 
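GHASH_MUL above is the PCLMULQDQ Karatsuba multiply followed by the two-phase reduction; CALC_AAD_HASH then folds each 16-byte AAD block (the short final block zero-padded via READ_PARTIAL_BLOCK) into the running hash and multiplies by the hash key. A C sketch of that accumulation, with the field multiplication abstracted as a caller-supplied callback standing in for GHASH_MUL, and with the SHUF_MASK byte reflection assumed to live inside that callback:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical stand-in for GHASH_MUL: acc = acc * h in GF(2^128). */
typedef void (*gf128_mul_fn)(uint8_t acc[16], const uint8_t h[16]);

static void ghash_aad(uint8_t hash[16], const uint8_t h[16],
                      const uint8_t *aad, size_t aad_len, gf128_mul_fn mul)
{
        uint8_t block[16];

        while (aad_len) {
                size_t n = aad_len < 16 ? aad_len : 16;

                memset(block, 0, sizeof(block));  /* zero-pad a short final block */
                memcpy(block, aad, n);
                for (int i = 0; i < 16; i++)      /* hash ^= block */
                        hash[i] ^= block[i];
                mul(hash, h);                     /* hash = hash * H */
                aad += n;
                aad_len -= n;
        }
}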
-# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK -# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context -# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 -.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ - AAD_HASH operation - mov PBlockLen(%arg2), %r13 - test %r13, %r13 - je .L_partial_block_done_\@ # Leave Macro if no partial blocks - # Read in input data without over reading - cmp $16, \PLAIN_CYPH_LEN - jl .L_fewer_than_16_bytes_\@ - movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm - jmp .L_data_read_\@ - -.L_fewer_than_16_bytes_\@: - lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 - mov \PLAIN_CYPH_LEN, %r12 - READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1 - - mov PBlockLen(%arg2), %r13 - -.L_data_read_\@: # Finished reading in data - - movdqu PBlockEncKey(%arg2), %xmm9 - movdqu HashKey(%arg2), %xmm13 - - lea SHIFT_MASK(%rip), %r12 - - # adjust the shuffle mask pointer to be able to shift r13 bytes - # r16-r13 is the number of bytes in plaintext mod 16) - add %r13, %r12 - movdqu (%r12), %xmm2 # get the appropriate shuffle mask - pshufb %xmm2, %xmm9 # shift right r13 bytes - -.ifc \operation, dec - movdqa %xmm1, %xmm3 - pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn) - - mov \PLAIN_CYPH_LEN, %r10 - add %r13, %r10 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling - sub $16, %r10 - # Determine if partial block is not being filled and - # shift mask accordingly - jge .L_no_extra_mask_1_\@ - sub %r10, %r12 -.L_no_extra_mask_1_\@: - - movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 - pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9 - - pand %xmm1, %xmm3 - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10, %xmm3 - pshufb %xmm2, %xmm3 - pxor %xmm3, \AAD_HASH - - test %r10, %r10 - jl .L_partial_incomplete_1_\@ - - # GHASH computation for the last <16 Byte block - GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %eax, %eax - - mov %rax, PBlockLen(%arg2) - jmp .L_dec_done_\@ -.L_partial_incomplete_1_\@: - add \PLAIN_CYPH_LEN, PBlockLen(%arg2) -.L_dec_done_\@: - movdqu \AAD_HASH, AadHash(%arg2) -.else - pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn) - - mov \PLAIN_CYPH_LEN, %r10 - add %r13, %r10 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling - sub $16, %r10 - # Determine if partial block is not being filled and - # shift mask accordingly - jge .L_no_extra_mask_2_\@ - sub %r10, %r12 -.L_no_extra_mask_2_\@: - - movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 - pand %xmm1, %xmm9 - - movdqa SHUF_MASK(%rip), %xmm1 - pshufb %xmm1, %xmm9 - pshufb %xmm2, %xmm9 - pxor %xmm9, \AAD_HASH - - test %r10, %r10 - jl .L_partial_incomplete_2_\@ - - # GHASH computation for the last <16 Byte block - GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %eax, %eax - - mov %rax, PBlockLen(%arg2) - jmp .L_encode_done_\@ -.L_partial_incomplete_2_\@: - add \PLAIN_CYPH_LEN, PBlockLen(%arg2) -.L_encode_done_\@: - movdqu \AAD_HASH, AadHash(%arg2) - - movdqa SHUF_MASK(%rip), %xmm10 - # shuffle xmm9 back to output as ciphertext - pshufb %xmm10, %xmm9 - pshufb %xmm2, %xmm9 -.endif - # output encrypted Bytes - test %r10, %r10 - jl .L_partial_fill_\@ - mov %r13, %r12 - mov $16, %r13 - # Set r13 to be the number of bytes to write out - sub %r12, %r13 - jmp .L_count_set_\@ -.L_partial_fill_\@: - mov \PLAIN_CYPH_LEN, %r13 -.L_count_set_\@: - movdqa 
%xmm9, %xmm0 - movq %xmm0, %rax - cmp $8, %r13 - jle .L_less_than_8_bytes_left_\@ - - mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) - add $8, \DATA_OFFSET - psrldq $8, %xmm0 - movq %xmm0, %rax - sub $8, %r13 -.L_less_than_8_bytes_left_\@: - movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) - add $1, \DATA_OFFSET - shr $8, %rax - sub $1, %r13 - jne .L_less_than_8_bytes_left_\@ -.L_partial_block_done_\@: -.endm # PARTIAL_BLOCK - -/* -* if a = number of total plaintext bytes -* b = floor(a/16) -* num_initial_blocks = b mod 4 -* encrypt the initial num_initial_blocks blocks and apply ghash on -* the ciphertext -* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers -* are clobbered -* arg1, %arg2, %arg3 are used as a pointer only, not modified -*/ - - -.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ - XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation - MOVADQ SHUF_MASK(%rip), %xmm14 - - movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0 - - # start AES for num_initial_blocks blocks - - movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0 - -.if (\i == 5) || (\i == 6) || (\i == 7) - - MOVADQ ONE(%RIP),\TMP1 - MOVADQ 0(%arg1),\TMP2 -.irpc index, \i_seq - paddd \TMP1, \XMM0 # INCR Y0 -.ifc \operation, dec - movdqa \XMM0, %xmm\index -.else - MOVADQ \XMM0, %xmm\index -.endif - pshufb %xmm14, %xmm\index # perform a 16 byte swap - pxor \TMP2, %xmm\index -.endr - lea 0x10(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - add $5,%eax # 128->9, 192->11, 256->13 - -.Laes_loop_initial_\@: - MOVADQ (%r10),\TMP1 -.irpc index, \i_seq - aesenc \TMP1, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_initial_\@ - - MOVADQ (%r10), \TMP1 -.irpc index, \i_seq - aesenclast \TMP1, %xmm\index # Last Round -.endr -.irpc index, \i_seq - movdqu (%arg4 , %r11, 1), \TMP1 - pxor \TMP1, %xmm\index - movdqu %xmm\index, (%arg3 , %r11, 1) - # write back plaintext/ciphertext for num_initial_blocks - add $16, %r11 - -.ifc \operation, dec - movdqa \TMP1, %xmm\index -.endif - pshufb %xmm14, %xmm\index - - # prepare plaintext/ciphertext for GHASH computation -.endr -.endif - - # apply GHASH on num_initial_blocks blocks - -.if \i == 5 - pxor %xmm5, %xmm6 - GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - pxor %xmm6, %xmm7 - GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - pxor %xmm7, %xmm8 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 -.elseif \i == 6 - pxor %xmm6, %xmm7 - GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - pxor %xmm7, %xmm8 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 -.elseif \i == 7 - pxor %xmm7, %xmm8 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 -.endif - cmp $64, %r13 - jl .L_initial_blocks_done\@ - # no need for precomputed values -/* -* -* Precomputations for HashKey parallel with encryption of first 4 blocks. 
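The shr $2 / add $5 pair in .Laes_loop_initial_ above (and in the other round loops and in ENCRYPT_SINGLE_BLOCK further down) derives the number of middle aesenc rounds from the key length stored at the end of the context. As a one-line C equivalent, with an illustrative name rather than a real kernel helper:

/* Key length in bytes -> aesenc rounds before the final aesenclast:
 * 16 -> 9, 24 -> 11, 32 -> 13 (AES-128/192/256 use 10/12/14 rounds in total). */
static inline int aes_inner_rounds(unsigned int key_len)
{
        return (key_len >> 2) + 5;
}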
-* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i -*/ - MOVADQ ONE(%RIP),\TMP1 - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM1 - pshufb %xmm14, \XMM1 # perform a 16 byte swap - - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM2 - pshufb %xmm14, \XMM2 # perform a 16 byte swap - - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM3 - pshufb %xmm14, \XMM3 # perform a 16 byte swap - - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM4 - pshufb %xmm14, \XMM4 # perform a 16 byte swap - - MOVADQ 0(%arg1),\TMP1 - pxor \TMP1, \XMM1 - pxor \TMP1, \XMM2 - pxor \TMP1, \XMM3 - pxor \TMP1, \XMM4 -.irpc index, 1234 # do 4 rounds - movaps 0x10*\index(%arg1), \TMP1 - aesenc \TMP1, \XMM1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 -.endr -.irpc index, 56789 # do next 5 rounds - movaps 0x10*\index(%arg1), \TMP1 - aesenc \TMP1, \XMM1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 -.endr - lea 0xa0(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - sub $4,%eax # 128->0, 192->2, 256->4 - jz .Laes_loop_pre_done\@ - -.Laes_loop_pre_\@: - MOVADQ (%r10),\TMP2 -.irpc index, 1234 - aesenc \TMP2, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_pre_\@ - -.Laes_loop_pre_done\@: - MOVADQ (%r10), \TMP2 - aesenclast \TMP2, \XMM1 - aesenclast \TMP2, \XMM2 - aesenclast \TMP2, \XMM3 - aesenclast \TMP2, \XMM4 - movdqu 16*0(%arg4 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM1 -.ifc \operation, dec - movdqu \XMM1, 16*0(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM1 -.endif - movdqu 16*1(%arg4 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM2 -.ifc \operation, dec - movdqu \XMM2, 16*1(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM2 -.endif - movdqu 16*2(%arg4 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM3 -.ifc \operation, dec - movdqu \XMM3, 16*2(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM3 -.endif - movdqu 16*3(%arg4 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM4 -.ifc \operation, dec - movdqu \XMM4, 16*3(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM4 -.else - movdqu \XMM1, 16*0(%arg3 , %r11 , 1) - movdqu \XMM2, 16*1(%arg3 , %r11 , 1) - movdqu \XMM3, 16*2(%arg3 , %r11 , 1) - movdqu \XMM4, 16*3(%arg3 , %r11 , 1) -.endif - - add $64, %r11 - pshufb %xmm14, \XMM1 # perform a 16 byte swap - pxor \XMMDst, \XMM1 -# combine GHASHed value with the corresponding ciphertext - pshufb %xmm14, \XMM2 # perform a 16 byte swap - pshufb %xmm14, \XMM3 # perform a 16 byte swap - pshufb %xmm14, \XMM4 # perform a 16 byte swap - -.L_initial_blocks_done\@: - -.endm - -/* -* encrypt 4 blocks at a time -* ghash the 4 previously encrypted ciphertext blocks -* arg1, %arg3, %arg4 are used as pointers only, not modified -* %r11 is the data offset value -*/ -.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \ -TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation - - movdqa \XMM1, \XMM5 - movdqa \XMM2, \XMM6 - movdqa \XMM3, \XMM7 - movdqa \XMM4, \XMM8 - - movdqa SHUF_MASK(%rip), %xmm15 - # multiply TMP5 * HashKey using karatsuba - - movdqa \XMM5, \TMP4 - pshufd $78, \XMM5, \TMP6 - pxor \XMM5, \TMP6 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqu HashKey_4(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 - movdqa \XMM0, \XMM1 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM2 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM3 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM4 - pshufb %xmm15, \XMM1 # perform a 16 byte swap - pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 - pshufb %xmm15, \XMM2 # perform a 16 byte swap - pshufb %xmm15, \XMM3 # perform a 16 byte swap - 
pshufb %xmm15, \XMM4 # perform a 16 byte swap - - pxor (%arg1), \XMM1 - pxor (%arg1), \XMM2 - pxor (%arg1), \XMM3 - pxor (%arg1), \XMM4 - movdqu HashKey_4_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) - movaps 0x10(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movaps 0x20(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 2 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movdqa \XMM6, \TMP1 - pshufd $78, \XMM6, \TMP2 - pxor \XMM6, \TMP2 - movdqu HashKey_3(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 - movaps 0x30(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 3 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 - movaps 0x40(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 4 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - movdqu HashKey_3_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movaps 0x50(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 5 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM6, \XMM5 - pxor \TMP2, \TMP6 - movdqa \XMM7, \TMP1 - pshufd $78, \XMM7, \TMP2 - pxor \XMM7, \TMP2 - movdqu HashKey_2(%arg2), \TMP5 - - # Multiply TMP5 * HashKey using karatsuba - - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x60(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 6 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 - movaps 0x70(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 7 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - movdqu HashKey_2_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movaps 0x80(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 8 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM7, \XMM5 - pxor \TMP2, \TMP6 - - # Multiply XMM8 * HashKey - # XMM8 and TMP5 hold the values for the two operands - - movdqa \XMM8, \TMP1 - pshufd $78, \XMM8, \TMP2 - pxor \XMM8, \TMP2 - movdqu HashKey(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x90(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 9 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 - lea 0xa0(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - sub $4,%eax # 128->0, 192->2, 256->4 - jz .Laes_loop_par_enc_done\@ - -.Laes_loop_par_enc\@: - MOVADQ (%r10),\TMP3 -.irpc index, 1234 - aesenc \TMP3, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_par_enc\@ - -.Laes_loop_par_enc_done\@: - MOVADQ (%r10), \TMP3 - aesenclast \TMP3, \XMM1 # Round 10 - aesenclast \TMP3, \XMM2 - aesenclast \TMP3, \XMM3 - aesenclast \TMP3, \XMM4 - movdqu HashKey_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movdqu (%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK - movdqu 16(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK - movdqu 32(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK - movdqu 48(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK - movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer - movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer - 
movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer - movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer - pshufb %xmm15, \XMM1 # perform a 16 byte swap - pshufb %xmm15, \XMM2 # perform a 16 byte swap - pshufb %xmm15, \XMM3 # perform a 16 byte swap - pshufb %xmm15, \XMM4 # perform a 16 byte swap - - pxor \TMP4, \TMP1 - pxor \XMM8, \XMM5 - pxor \TMP6, \TMP2 - pxor \TMP1, \TMP2 - pxor \XMM5, \TMP2 - movdqa \TMP2, \TMP3 - pslldq $8, \TMP3 # left shift TMP3 2 DWs - psrldq $8, \TMP2 # right shift TMP2 2 DWs - pxor \TMP3, \XMM5 - pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 - - # first phase of reduction - - movdqa \XMM5, \TMP2 - movdqa \XMM5, \TMP3 - movdqa \XMM5, \TMP4 -# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently - pslld $31, \TMP2 # packed right shift << 31 - pslld $30, \TMP3 # packed right shift << 30 - pslld $25, \TMP4 # packed right shift << 25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP5 - psrldq $4, \TMP5 # right shift T5 1 DW - pslldq $12, \TMP2 # left shift T2 3 DWs - pxor \TMP2, \XMM5 - - # second phase of reduction - - movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 - movdqa \XMM5,\TMP3 - movdqa \XMM5,\TMP4 - psrld $1, \TMP2 # packed left shift >>1 - psrld $2, \TMP3 # packed left shift >>2 - psrld $7, \TMP4 # packed left shift >>7 - pxor \TMP3,\TMP2 # xor the shifted versions - pxor \TMP4,\TMP2 - pxor \TMP5, \TMP2 - pxor \TMP2, \XMM5 - pxor \TMP1, \XMM5 # result is in TMP1 - - pxor \XMM5, \XMM1 -.endm - -/* -* decrypt 4 blocks at a time -* ghash the 4 previously decrypted ciphertext blocks -* arg1, %arg3, %arg4 are used as pointers only, not modified -* %r11 is the data offset value -*/ -.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \ -TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation - - movdqa \XMM1, \XMM5 - movdqa \XMM2, \XMM6 - movdqa \XMM3, \XMM7 - movdqa \XMM4, \XMM8 - - movdqa SHUF_MASK(%rip), %xmm15 - # multiply TMP5 * HashKey using karatsuba - - movdqa \XMM5, \TMP4 - pshufd $78, \XMM5, \TMP6 - pxor \XMM5, \TMP6 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqu HashKey_4(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 - movdqa \XMM0, \XMM1 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM2 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM3 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM4 - pshufb %xmm15, \XMM1 # perform a 16 byte swap - pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 - pshufb %xmm15, \XMM2 # perform a 16 byte swap - pshufb %xmm15, \XMM3 # perform a 16 byte swap - pshufb %xmm15, \XMM4 # perform a 16 byte swap - - pxor (%arg1), \XMM1 - pxor (%arg1), \XMM2 - pxor (%arg1), \XMM3 - pxor (%arg1), \XMM4 - movdqu HashKey_4_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) - movaps 0x10(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movaps 0x20(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 2 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movdqa \XMM6, \TMP1 - pshufd $78, \XMM6, \TMP2 - pxor \XMM6, \TMP2 - movdqu HashKey_3(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 - movaps 0x30(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 3 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 - movaps 0x40(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 4 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - 
movdqu HashKey_3_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movaps 0x50(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 5 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM6, \XMM5 - pxor \TMP2, \TMP6 - movdqa \XMM7, \TMP1 - pshufd $78, \XMM7, \TMP2 - pxor \XMM7, \TMP2 - movdqu HashKey_2(%arg2), \TMP5 - - # Multiply TMP5 * HashKey using karatsuba - - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x60(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 6 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 - movaps 0x70(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 7 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - movdqu HashKey_2_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movaps 0x80(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 8 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM7, \XMM5 - pxor \TMP2, \TMP6 - - # Multiply XMM8 * HashKey - # XMM8 and TMP5 hold the values for the two operands - - movdqa \XMM8, \TMP1 - pshufd $78, \XMM8, \TMP2 - pxor \XMM8, \TMP2 - movdqu HashKey(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x90(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 9 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 - lea 0xa0(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - sub $4,%eax # 128->0, 192->2, 256->4 - jz .Laes_loop_par_dec_done\@ - -.Laes_loop_par_dec\@: - MOVADQ (%r10),\TMP3 -.irpc index, 1234 - aesenc \TMP3, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_par_dec\@ - -.Laes_loop_par_dec_done\@: - MOVADQ (%r10), \TMP3 - aesenclast \TMP3, \XMM1 # last round - aesenclast \TMP3, \XMM2 - aesenclast \TMP3, \XMM3 - aesenclast \TMP3, \XMM4 - movdqu HashKey_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movdqu (%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK - movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM1 - movdqu 16(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK - movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM2 - movdqu 32(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK - movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM3 - movdqu 48(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK - movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM4 - pshufb %xmm15, \XMM1 # perform a 16 byte swap - pshufb %xmm15, \XMM2 # perform a 16 byte swap - pshufb %xmm15, \XMM3 # perform a 16 byte swap - pshufb %xmm15, \XMM4 # perform a 16 byte swap - - pxor \TMP4, \TMP1 - pxor \XMM8, \XMM5 - pxor \TMP6, \TMP2 - pxor \TMP1, \TMP2 - pxor \XMM5, \TMP2 - movdqa \TMP2, \TMP3 - pslldq $8, \TMP3 # left shift TMP3 2 DWs - psrldq $8, \TMP2 # right shift TMP2 2 DWs - pxor \TMP3, \XMM5 - pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 - - # first phase of reduction - - movdqa \XMM5, \TMP2 - movdqa \XMM5, \TMP3 - movdqa \XMM5, \TMP4 -# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently - pslld $31, \TMP2 # packed right shift << 31 - pslld $30, \TMP3 # packed right 
shift << 30 - pslld $25, \TMP4 # packed right shift << 25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP5 - psrldq $4, \TMP5 # right shift T5 1 DW - pslldq $12, \TMP2 # left shift T2 3 DWs - pxor \TMP2, \XMM5 - - # second phase of reduction - - movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 - movdqa \XMM5,\TMP3 - movdqa \XMM5,\TMP4 - psrld $1, \TMP2 # packed left shift >>1 - psrld $2, \TMP3 # packed left shift >>2 - psrld $7, \TMP4 # packed left shift >>7 - pxor \TMP3,\TMP2 # xor the shifted versions - pxor \TMP4,\TMP2 - pxor \TMP5, \TMP2 - pxor \TMP2, \XMM5 - pxor \TMP1, \XMM5 # result is in TMP1 - - pxor \XMM5, \XMM1 -.endm - -/* GHASH the last 4 ciphertext blocks. */ -.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ -TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst - - # Multiply TMP6 * HashKey (using Karatsuba) - - movdqa \XMM1, \TMP6 - pshufd $78, \XMM1, \TMP2 - pxor \XMM1, \TMP2 - movdqu HashKey_4(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0 - movdqu HashKey_4_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movdqa \XMM1, \XMMDst - movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 - - # Multiply TMP1 * HashKey (using Karatsuba) - - movdqa \XMM2, \TMP1 - pshufd $78, \XMM2, \TMP2 - pxor \XMM2, \TMP2 - movdqu HashKey_3(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0 - movdqu HashKey_3_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - pxor \TMP1, \TMP6 - pxor \XMM2, \XMMDst - pxor \TMP2, \XMM1 -# results accumulated in TMP6, XMMDst, XMM1 - - # Multiply TMP1 * HashKey (using Karatsuba) - - movdqa \XMM3, \TMP1 - pshufd $78, \XMM3, \TMP2 - pxor \XMM3, \TMP2 - movdqu HashKey_2(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0 - movdqu HashKey_2_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - pxor \TMP1, \TMP6 - pxor \XMM3, \XMMDst - pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 - - # Multiply TMP1 * HashKey (using Karatsuba) - movdqa \XMM4, \TMP1 - pshufd $78, \XMM4, \TMP2 - pxor \XMM4, \TMP2 - movdqu HashKey(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0 - movdqu HashKey_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - pxor \TMP1, \TMP6 - pxor \XMM4, \XMMDst - pxor \XMM1, \TMP2 - pxor \TMP6, \TMP2 - pxor \XMMDst, \TMP2 - # middle section of the temp results combined as in karatsuba algorithm - movdqa \TMP2, \TMP4 - pslldq $8, \TMP4 # left shift TMP4 2 DWs - psrldq $8, \TMP2 # right shift TMP2 2 DWs - pxor \TMP4, \XMMDst - pxor \TMP2, \TMP6 -# TMP6:XMMDst holds the result of the accumulated carry-less multiplications - # first phase of the reduction - movdqa \XMMDst, \TMP2 - movdqa \XMMDst, \TMP3 - movdqa \XMMDst, \TMP4 -# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently - pslld $31, \TMP2 # packed right shifting << 31 - pslld $30, \TMP3 # packed right shifting << 30 - pslld $25, \TMP4 # packed right shifting << 25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP7 - psrldq $4, \TMP7 # right shift TMP7 1 DW - pslldq $12, \TMP2 # left shift TMP2 3 DWs - pxor \TMP2, \XMMDst - - # second phase of the reduction - movdqa \XMMDst, \TMP2 - # make 3 copies of XMMDst for doing 3 shift operations - movdqa \XMMDst, \TMP3 - movdqa 
\XMMDst, \TMP4 - psrld $1, \TMP2 # packed left shift >> 1 - psrld $2, \TMP3 # packed left shift >> 2 - psrld $7, \TMP4 # packed left shift >> 7 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - pxor \TMP7, \TMP2 - pxor \TMP2, \XMMDst - pxor \TMP6, \XMMDst # reduced result is in XMMDst -.endm - - -/* Encryption of a single block -* uses eax & r10 -*/ - -.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 - - pxor (%arg1), \XMM0 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - add $5,%eax # 128->9, 192->11, 256->13 - lea 16(%arg1), %r10 # get first expanded key address - -_esb_loop_\@: - MOVADQ (%r10),\TMP1 - aesenc \TMP1,\XMM0 - add $16,%r10 - sub $1,%eax - jnz _esb_loop_\@ - - MOVADQ (%r10),\TMP1 - aesenclast \TMP1,\XMM0 -.endm -/***************************************************************************** -* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data -* // Context data -* u8 *out, // Plaintext output. Encrypt in-place is allowed. -* const u8 *in, // Ciphertext input -* u64 plaintext_len, // Length of data in bytes for decryption. -* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) -* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) -* // concatenated with 0x00000001. 16-byte aligned pointer. -* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. -* const u8 *aad, // Additional Authentication Data (AAD) -* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes -* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the -* // given authentication tag and only return the plaintext if they match. -* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 -* // (most likely), 12 or 8. -* -* Assumptions: -* -* keys: -* keys are pre-expanded and aligned to 16 bytes. 
we are using the first -* set of 11 keys in the data structure void *aes_ctx -* -* iv: -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | Salt (From the SA) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | Initialization Vector | -* | (This is the sequence number from IPSec header) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x1 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* -* -* AAD: -* AAD padded to 128 bits with 0 -* for example, assume AAD is a u32 vector -* -* if AAD is 8 bytes: -* AAD[3] = {A0, A1}; -* padded AAD in xmm register = {A1 A0 0 0} -* -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | SPI (A1) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 32-bit Sequence Number (A0) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x0 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* AAD Format with 32-bit Sequence Number -* -* if AAD is 12 bytes: -* AAD[3] = {A0, A1, A2}; -* padded AAD in xmm register = {A2 A1 A0 0} -* -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | SPI (A2) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 64-bit Extended Sequence Number {A1,A0} | -* | | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x0 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* AAD Format with 64-bit Extended Sequence Number -* -* poly = x^128 + x^127 + x^126 + x^121 + 1 -* -*****************************************************************************/ -SYM_FUNC_START(aesni_gcm_dec) - FUNC_SAVE - - GCM_INIT %arg6, arg7, arg8, arg9 - GCM_ENC_DEC dec - GCM_COMPLETE arg10, arg11 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_dec) - - -/***************************************************************************** -* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data -* // Context data -* u8 *out, // Ciphertext output. Encrypt in-place is allowed. -* const u8 *in, // Plaintext input -* u64 plaintext_len, // Length of data in bytes for encryption. -* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) -* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) -* // concatenated with 0x00000001. 16-byte aligned pointer. -* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. -* const u8 *aad, // Additional Authentication Data (AAD) -* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes -* u8 *auth_tag, // Authenticated Tag output. -* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), -* // 12 or 8. -* -* Assumptions: -* -* keys: -* keys are pre-expanded and aligned to 16 bytes. 
we are using the -* first set of 11 keys in the data structure void *aes_ctx -* -* -* iv: -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | Salt (From the SA) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | Initialization Vector | -* | (This is the sequence number from IPSec header) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x1 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* -* -* AAD: -* AAD padded to 128 bits with 0 -* for example, assume AAD is a u32 vector -* -* if AAD is 8 bytes: -* AAD[3] = {A0, A1}; -* padded AAD in xmm register = {A1 A0 0 0} -* -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | SPI (A1) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 32-bit Sequence Number (A0) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x0 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* AAD Format with 32-bit Sequence Number -* -* if AAD is 12 bytes: -* AAD[3] = {A0, A1, A2}; -* padded AAD in xmm register = {A2 A1 A0 0} -* -* 0 1 2 3 -* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | SPI (A2) | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 64-bit Extended Sequence Number {A1,A0} | -* | | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* | 0x0 | -* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -* -* AAD Format with 64-bit Extended Sequence Number -* -* poly = x^128 + x^127 + x^126 + x^121 + 1 -***************************************************************************/ -SYM_FUNC_START(aesni_gcm_enc) - FUNC_SAVE - - GCM_INIT %arg6, arg7, arg8, arg9 - GCM_ENC_DEC enc - - GCM_COMPLETE arg10, arg11 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_enc) - -/***************************************************************************** -* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) -* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) -* // concatenated with 0x00000001. 16-byte aligned pointer. -* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. -* const u8 *aad, // Additional Authentication Data (AAD) -* u64 aad_len) // Length of AAD in bytes. -*/ -SYM_FUNC_START(aesni_gcm_init) - FUNC_SAVE - GCM_INIT %arg3, %arg4,%arg5, %arg6 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_init) - -/***************************************************************************** -* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *out, // Ciphertext output. Encrypt in-place is allowed. -* const u8 *in, // Plaintext input -* u64 plaintext_len, // Length of data in bytes for encryption. -*/ -SYM_FUNC_START(aesni_gcm_enc_update) - FUNC_SAVE - GCM_ENC_DEC enc - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_enc_update) - -/***************************************************************************** -* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. 
Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *out, // Ciphertext output. Encrypt in-place is allowed. -* const u8 *in, // Plaintext input -* u64 plaintext_len, // Length of data in bytes for encryption. -*/ -SYM_FUNC_START(aesni_gcm_dec_update) - FUNC_SAVE - GCM_ENC_DEC dec - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_dec_update) - -/***************************************************************************** -* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *auth_tag, // Authenticated Tag output. -* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), -* // 12 or 8. -*/ -SYM_FUNC_START(aesni_gcm_finalize) - FUNC_SAVE - GCM_COMPLETE %arg3 %arg4 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_finalize) - -#endif - SYM_FUNC_START_LOCAL(_key_expansion_256a) pshufd $0b11111111, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 @@ -1820,8 +134,8 @@ SYM_FUNC_START_LOCAL(_key_expansion_256b) SYM_FUNC_END(_key_expansion_256b) /* - * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, - * unsigned int key_len) + * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, + * unsigned int key_len) */ SYM_FUNC_START(aesni_set_key) FRAME_BEGIN @@ -1926,7 +240,6 @@ SYM_FUNC_START(aesni_set_key) sub $0x10, UKEYP cmp TKEYP, KEYP jb .Ldec_key_loop - xor AREG, AREG #ifndef __x86_64__ popl KEYP #endif @@ -2759,6 +1072,7 @@ SYM_FUNC_END(_aesni_inc) * size_t len, u8 *iv) */ SYM_FUNC_START(aesni_ctr_enc) + ANNOTATE_NOENDBR FRAME_BEGIN cmp $16, LEN jb .Lctr_enc_just_ret @@ -2826,28 +1140,24 @@ SYM_FUNC_END(aesni_ctr_enc) .previous /* - * _aesni_gf128mul_x_ble: internal ABI - * Multiply in GF(2^128) for XTS IVs + * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs * input: * IV: current IV * GF128MUL_MASK == mask with 0x87 and 0x01 * output: * IV: next IV * changed: - * CTR: == temporary value + * KEY: == temporary value */ -#define _aesni_gf128mul_x_ble() \ - pshufd $0x13, IV, KEY; \ - paddq IV, IV; \ - psrad $31, KEY; \ - pand GF128MUL_MASK, KEY; \ - pxor KEY, IV; +.macro _aesni_gf128mul_x_ble + pshufd $0x13, IV, KEY + paddq IV, IV + psrad $31, KEY + pand GF128MUL_MASK, KEY + pxor KEY, IV +.endm -/* - * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst, - * const u8 *src, unsigned int len, le128 *iv) - */ -SYM_FUNC_START(aesni_xts_encrypt) +.macro _aesni_xts_crypt enc FRAME_BEGIN #ifndef __x86_64__ pushl IVP @@ -2866,35 +1176,46 @@ SYM_FUNC_START(aesni_xts_encrypt) movups (IVP), IV mov 480(KEYP), KLEN +.if !\enc + add $240, KEYP + + test $15, LEN + jz .Lxts_loop4\@ + sub $16, LEN +.endif -.Lxts_enc_loop4: +.Lxts_loop4\@: sub $64, LEN - jl .Lxts_enc_1x + jl .Lxts_1x\@ movdqa IV, STATE1 movdqu 0x00(INP), IN pxor IN, STATE1 movdqu IV, 0x00(OUTP) - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble movdqa IV, STATE2 movdqu 0x10(INP), IN pxor IN, STATE2 movdqu IV, 0x10(OUTP) - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble movdqa IV, STATE3 movdqu 0x20(INP), IN pxor IN, STATE3 movdqu IV, 0x20(OUTP) - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble movdqa IV, STATE4 movdqu 0x30(INP), IN pxor IN, STATE4 movdqu IV, 0x30(OUTP) +.if \enc call _aesni_enc4 +.else + call _aesni_dec4 +.endif movdqu 0x00(OUTP), IN pxor IN, STATE1 @@ -2912,17 +1233,17 @@ SYM_FUNC_START(aesni_xts_encrypt) pxor IN, STATE4 movdqu STATE4, 0x30(OUTP) - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble add $64, INP add $64, OUTP test 
LEN, LEN - jnz .Lxts_enc_loop4 + jnz .Lxts_loop4\@ -.Lxts_enc_ret_iv: +.Lxts_ret_iv\@: movups IV, (IVP) -.Lxts_enc_ret: +.Lxts_ret\@: #ifndef __x86_64__ popl KLEN popl KEYP @@ -2932,201 +1253,60 @@ SYM_FUNC_START(aesni_xts_encrypt) FRAME_END RET -.Lxts_enc_1x: +.Lxts_1x\@: add $64, LEN - jz .Lxts_enc_ret_iv + jz .Lxts_ret_iv\@ +.if \enc sub $16, LEN - jl .Lxts_enc_cts4 + jl .Lxts_cts4\@ +.endif -.Lxts_enc_loop1: +.Lxts_loop1\@: movdqu (INP), STATE +.if \enc pxor IV, STATE call _aesni_enc1 - pxor IV, STATE - _aesni_gf128mul_x_ble() - - test LEN, LEN - jz .Lxts_enc_out - +.else add $16, INP sub $16, LEN - jl .Lxts_enc_cts1 - - movdqu STATE, (OUTP) - add $16, OUTP - jmp .Lxts_enc_loop1 - -.Lxts_enc_out: - movdqu STATE, (OUTP) - jmp .Lxts_enc_ret_iv - -.Lxts_enc_cts4: - movdqa STATE4, STATE - sub $16, OUTP - -.Lxts_enc_cts1: -#ifndef __x86_64__ - lea .Lcts_permute_table, T1 -#else - lea .Lcts_permute_table(%rip), T1 -#endif - add LEN, INP /* rewind input pointer */ - add $16, LEN /* # bytes in final block */ - movups (INP), IN1 - - mov T1, IVP - add $32, IVP - add LEN, T1 - sub LEN, IVP - add OUTP, LEN - - movups (T1), %xmm4 - movaps STATE, IN2 - pshufb %xmm4, STATE - movups STATE, (LEN) - - movups (IVP), %xmm0 - pshufb %xmm0, IN1 - pblendvb IN2, IN1 - movaps IN1, STATE - + jl .Lxts_cts1\@ pxor IV, STATE - call _aesni_enc1 + call _aesni_dec1 +.endif pxor IV, STATE + _aesni_gf128mul_x_ble - movups STATE, (OUTP) - jmp .Lxts_enc_ret -SYM_FUNC_END(aesni_xts_encrypt) - -/* - * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst, - * const u8 *src, unsigned int len, le128 *iv) - */ -SYM_FUNC_START(aesni_xts_decrypt) - FRAME_BEGIN -#ifndef __x86_64__ - pushl IVP - pushl LEN - pushl KEYP - pushl KLEN - movl (FRAME_OFFSET+20)(%esp), KEYP # ctx - movl (FRAME_OFFSET+24)(%esp), OUTP # dst - movl (FRAME_OFFSET+28)(%esp), INP # src - movl (FRAME_OFFSET+32)(%esp), LEN # len - movl (FRAME_OFFSET+36)(%esp), IVP # iv - movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK -#else - movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK -#endif - movups (IVP), IV - - mov 480(KEYP), KLEN - add $240, KEYP - - test $15, LEN - jz .Lxts_dec_loop4 - sub $16, LEN - -.Lxts_dec_loop4: - sub $64, LEN - jl .Lxts_dec_1x - - movdqa IV, STATE1 - movdqu 0x00(INP), IN - pxor IN, STATE1 - movdqu IV, 0x00(OUTP) - - _aesni_gf128mul_x_ble() - movdqa IV, STATE2 - movdqu 0x10(INP), IN - pxor IN, STATE2 - movdqu IV, 0x10(OUTP) - - _aesni_gf128mul_x_ble() - movdqa IV, STATE3 - movdqu 0x20(INP), IN - pxor IN, STATE3 - movdqu IV, 0x20(OUTP) - - _aesni_gf128mul_x_ble() - movdqa IV, STATE4 - movdqu 0x30(INP), IN - pxor IN, STATE4 - movdqu IV, 0x30(OUTP) - - call _aesni_dec4 - - movdqu 0x00(OUTP), IN - pxor IN, STATE1 - movdqu STATE1, 0x00(OUTP) - - movdqu 0x10(OUTP), IN - pxor IN, STATE2 - movdqu STATE2, 0x10(OUTP) - - movdqu 0x20(OUTP), IN - pxor IN, STATE3 - movdqu STATE3, 0x20(OUTP) - - movdqu 0x30(OUTP), IN - pxor IN, STATE4 - movdqu STATE4, 0x30(OUTP) - - _aesni_gf128mul_x_ble() - - add $64, INP - add $64, OUTP test LEN, LEN - jnz .Lxts_dec_loop4 - -.Lxts_dec_ret_iv: - movups IV, (IVP) - -.Lxts_dec_ret: -#ifndef __x86_64__ - popl KLEN - popl KEYP - popl LEN - popl IVP -#endif - FRAME_END - RET - -.Lxts_dec_1x: - add $64, LEN - jz .Lxts_dec_ret_iv - -.Lxts_dec_loop1: - movdqu (INP), STATE + jz .Lxts_out\@ +.if \enc add $16, INP sub $16, LEN - jl .Lxts_dec_cts1 - - pxor IV, STATE - call _aesni_dec1 - pxor IV, STATE - _aesni_gf128mul_x_ble() - - test LEN, LEN - jz .Lxts_dec_out + jl .Lxts_cts1\@ +.endif movdqu STATE, (OUTP) add $16, OUTP - 
jmp .Lxts_dec_loop1 + jmp .Lxts_loop1\@ -.Lxts_dec_out: +.Lxts_out\@: movdqu STATE, (OUTP) - jmp .Lxts_dec_ret_iv + jmp .Lxts_ret_iv\@ -.Lxts_dec_cts1: +.if \enc +.Lxts_cts4\@: + movdqa STATE4, STATE + sub $16, OUTP +.Lxts_cts1\@: +.else +.Lxts_cts1\@: movdqa IV, STATE4 - _aesni_gf128mul_x_ble() + _aesni_gf128mul_x_ble pxor IV, STATE call _aesni_dec1 pxor IV, STATE - +.endif #ifndef __x86_64__ lea .Lcts_permute_table, T1 #else @@ -3152,10 +1332,32 @@ SYM_FUNC_START(aesni_xts_decrypt) pblendvb IN2, IN1 movaps IN1, STATE +.if \enc + pxor IV, STATE + call _aesni_enc1 + pxor IV, STATE +.else pxor STATE4, STATE call _aesni_dec1 pxor STATE4, STATE +.endif movups STATE, (OUTP) - jmp .Lxts_dec_ret -SYM_FUNC_END(aesni_xts_decrypt) + jmp .Lxts_ret\@ +.endm + +/* + * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst, + * const u8 *src, unsigned int len, le128 *iv) + */ +SYM_FUNC_START(aesni_xts_enc) + _aesni_xts_crypt 1 +SYM_FUNC_END(aesni_xts_enc) + +/* + * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst, + * const u8 *src, unsigned int len, le128 *iv) + */ +SYM_FUNC_START(aesni_xts_dec) + _aesni_xts_crypt 0 +SYM_FUNC_END(aesni_xts_dec) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S deleted file mode 100644 index 8c9749ed0651..000000000000 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ /dev/null @@ -1,2804 +0,0 @@ -######################################################################## -# Copyright (c) 2013, Intel Corporation -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# * Neither the name of the Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# -# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR -# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
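For reference, the tweak step performed by the _aesni_gf128mul_x_ble macro in the hunk above is the usual XTS multiplication by x in GF(2^128) over a little-endian 128-bit block. A minimal C sketch of the same update, assuming a hypothetical two-word helper type rather than the kernel's own le128 definitions:

    #include <stdint.h>

    /* Double the XTS tweak in GF(2^128), little-endian block convention:
     * shift the 128-bit value left by one bit and, if a bit falls off the
     * top, fold it back in with the reduction constant 0x87. */
    struct le128_sketch { uint64_t lo, hi; };   /* hypothetical helper type */

    static void gf128mul_x_ble_sketch(struct le128_sketch *t)
    {
            uint64_t carry_lo = t->lo >> 63;    /* bit 63 moves into the high word */
            uint64_t carry_hi = t->hi >> 63;    /* bit 127 selects the reduction   */

            t->hi = (t->hi << 1) | carry_lo;
            t->lo = (t->lo << 1) ^ (carry_hi ? 0x87 : 0);
    }

The GF128MUL_MASK constant (0x87 in the low quadword, 0x01 in the high quadword) lets the assembly derive both carries branchlessly with pshufd/psrad/pand, since paddq doubles each quadword independently and the mask restores the cross-quadword carry and the polynomial fold in one pxor.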
-######################################################################## -## -## Authors: -## Erdinc Ozturk <erdinc.ozturk@intel.com> -## Vinodh Gopal <vinodh.gopal@intel.com> -## James Guilford <james.guilford@intel.com> -## Tim Chen <tim.c.chen@linux.intel.com> -## -## References: -## This code was derived and highly optimized from the code described in paper: -## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation -## on Intel Architecture Processors. August, 2010 -## The details of the implementation is explained in: -## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode -## on Intel Architecture Processors. October, 2012. -## -## Assumptions: -## -## -## -## iv: -## 0 1 2 3 -## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | Salt (From the SA) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | Initialization Vector | -## | (This is the sequence number from IPSec header) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 0x1 | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## -## -## -## AAD: -## AAD padded to 128 bits with 0 -## for example, assume AAD is a u32 vector -## -## if AAD is 8 bytes: -## AAD[3] = {A0, A1}# -## padded AAD in xmm register = {A1 A0 0 0} -## -## 0 1 2 3 -## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | SPI (A1) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 32-bit Sequence Number (A0) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 0x0 | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## -## AAD Format with 32-bit Sequence Number -## -## if AAD is 12 bytes: -## AAD[3] = {A0, A1, A2}# -## padded AAD in xmm register = {A2 A1 A0 0} -## -## 0 1 2 3 -## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | SPI (A2) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 64-bit Extended Sequence Number {A1,A0} | -## | | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 0x0 | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## -## AAD Format with 64-bit Extended Sequence Number -## -## -## aadLen: -## from the definition of the spec, aadLen can only be 8 or 12 bytes. -## The code additionally supports aadLen of length 16 bytes. -## -## TLen: -## from the definition of the spec, TLen can only be 8, 12 or 16 bytes. -## -## poly = x^128 + x^127 + x^126 + x^121 + 1 -## throughout the code, one tab and two tab indentations are used. one tab is -## for GHASH part, two tabs is for AES part. 
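To make the AAD layouts above concrete: before GHASH sees it, the 8- or 12-byte RFC4106 AAD (SPI plus 32-bit sequence number, or SPI plus 64-bit extended sequence number) is simply copied into a zeroed 16-byte block, so the remaining bytes stay zero. A minimal sketch, using a hypothetical helper name that is not part of the kernel interface:

    #include <stdint.h>
    #include <string.h>

    /* Pad the AAD (8, 12 or 16 bytes per the spec notes above) to a full
     * 128-bit block with zeroes, matching the "AAD padded to 128 bits
     * with 0" layouts diagrammed in the header comment. */
    static void pad_aad_block(uint8_t block[16], const uint8_t *aad, uint64_t aad_len)
    {
            memset(block, 0, 16);
            memcpy(block, aad, aad_len < 16 ? aad_len : 16);
    }

The assembly additionally byte-swaps the padded block with SHUF_MASK before handing it to the GHASH multiply, since GHASH works on bit-reflected values.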
-## - -#include <linux/linkage.h> - -# constants in mergeable sections, linker can reorder and merge -.section .rodata.cst16.POLY, "aM", @progbits, 16 -.align 16 -POLY: .octa 0xC2000000000000000000000000000001 - -.section .rodata.cst16.POLY2, "aM", @progbits, 16 -.align 16 -POLY2: .octa 0xC20000000000000000000001C2000000 - -.section .rodata.cst16.TWOONE, "aM", @progbits, 16 -.align 16 -TWOONE: .octa 0x00000001000000000000000000000001 - -.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 -.align 16 -SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F - -.section .rodata.cst16.ONE, "aM", @progbits, 16 -.align 16 -ONE: .octa 0x00000000000000000000000000000001 - -.section .rodata.cst16.ONEf, "aM", @progbits, 16 -.align 16 -ONEf: .octa 0x01000000000000000000000000000000 - -# order of these constants should not change. -# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F -.section .rodata, "a", @progbits -.align 16 -SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 -ALL_F: .octa 0xffffffffffffffffffffffffffffffff - .octa 0x00000000000000000000000000000000 - -.text - - -#define AadHash 16*0 -#define AadLen 16*1 -#define InLen (16*1)+8 -#define PBlockEncKey 16*2 -#define OrigIV 16*3 -#define CurCount 16*4 -#define PBlockLen 16*5 - -HashKey = 16*6 # store HashKey <<1 mod poly here -HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here -HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here -HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here -HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here -HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here -HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here -HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here -HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) -HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) -HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) -HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) -HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) -HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) -HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) -HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) - -#define arg1 %rdi -#define arg2 %rsi -#define arg3 %rdx -#define arg4 %rcx -#define arg5 %r8 -#define arg6 %r9 -#define keysize 2*15*16(arg1) - -i = 0 -j = 0 - -out_order = 0 -in_order = 1 -DEC = 0 -ENC = 1 - -.macro define_reg r n -reg_\r = %xmm\n -.endm - -.macro setreg -.altmacro -define_reg i %i -define_reg j %j -.noaltmacro -.endm - -TMP1 = 16*0 # Temporary storage for AAD -TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register) -TMP3 = 16*2 # Temporary storage for AES State 3 -TMP4 = 16*3 # Temporary storage for AES State 4 -TMP5 = 16*4 # Temporary storage for AES State 5 -TMP6 = 16*5 # Temporary storage for AES State 6 -TMP7 = 16*6 # Temporary storage for AES State 7 -TMP8 = 16*7 # Temporary storage for AES State 8 - -VARIABLE_OFFSET = 16*8 - -################################ -# Utility Macros -################################ - -.macro FUNC_SAVE - push %r12 - push %r13 - push %r15 - - push %rbp - mov %rsp, %rbp - - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp # align rsp to 64 bytes -.endm - -.macro FUNC_RESTORE - mov %rbp, %rsp - pop %rbp - - pop %r15 - pop %r13 - pop %r12 
-.endm - -# Encryption of a single block -.macro ENCRYPT_SINGLE_BLOCK REP XMM0 - vpxor (arg1), \XMM0, \XMM0 - i = 1 - setreg -.rep \REP - vaesenc 16*i(arg1), \XMM0, \XMM0 - i = (i+1) - setreg -.endr - vaesenclast 16*i(arg1), \XMM0, \XMM0 -.endm - -# combined for GCM encrypt and decrypt functions -# clobbering all xmm registers -# clobbering r10, r11, r12, r13, r15, rax -.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP - vmovdqu AadHash(arg2), %xmm8 - vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey - add arg5, InLen(arg2) - - # initialize the data pointer offset as zero - xor %r11d, %r11d - - PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC - sub %r11, arg5 - - mov arg5, %r13 # save the number of bytes of plaintext/ciphertext - and $-16, %r13 # r13 = r13 - (r13 mod 16) - - mov %r13, %r12 - shr $4, %r12 - and $7, %r12 - jz .L_initial_num_blocks_is_0\@ - - cmp $7, %r12 - je .L_initial_num_blocks_is_7\@ - cmp $6, %r12 - je .L_initial_num_blocks_is_6\@ - cmp $5, %r12 - je .L_initial_num_blocks_is_5\@ - cmp $4, %r12 - je .L_initial_num_blocks_is_4\@ - cmp $3, %r12 - je .L_initial_num_blocks_is_3\@ - cmp $2, %r12 - je .L_initial_num_blocks_is_2\@ - - jmp .L_initial_num_blocks_is_1\@ - -.L_initial_num_blocks_is_7\@: - \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*7, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_6\@: - \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*6, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_5\@: - \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*5, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_4\@: - \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*4, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_3\@: - \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*3, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_2\@: - \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*2, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_1\@: - \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*1, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_0\@: - \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - - -.L_initial_blocks_encrypted\@: - test %r13, %r13 - je .L_zero_cipher_left\@ - - sub $128, %r13 - je .L_eight_cipher_left\@ - - - - - vmovd %xmm9, %r15d - and $255, %r15d - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - - -.L_encrypt_by_8_new\@: - cmp $(255-8), %r15d - jg .L_encrypt_by_8\@ - - - - add $8, %r15b - \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, 
%xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC - add $128, %r11 - sub $128, %r13 - jne .L_encrypt_by_8_new\@ - - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - jmp .L_eight_cipher_left\@ - -.L_encrypt_by_8\@: - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $8, %r15b - \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $128, %r11 - sub $128, %r13 - jne .L_encrypt_by_8_new\@ - - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - - - - -.L_eight_cipher_left\@: - \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 - - -.L_zero_cipher_left\@: - vmovdqu %xmm14, AadHash(arg2) - vmovdqu %xmm9, CurCount(arg2) - - # check for 0 length - mov arg5, %r13 - and $15, %r13 # r13 = (arg5 mod 16) - - je .L_multiple_of_16_bytes\@ - - # handle the last <16 Byte block separately - - mov %r13, PBlockLen(arg2) - - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn - vmovdqu %xmm9, CurCount(arg2) - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - - ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) - vmovdqu %xmm9, PBlockEncKey(arg2) - - cmp $16, arg5 - jge .L_large_enough_update\@ - - lea (arg4,%r11,1), %r10 - mov %r13, %r12 - - READ_PARTIAL_BLOCK %r10 %r12 %xmm1 - - lea SHIFT_MASK+16(%rip), %r12 - sub %r13, %r12 # adjust the shuffle mask pointer to be - # able to shift 16-r13 bytes (r13 is the - # number of bytes in plaintext mod 16) - - jmp .L_final_ghash_mul\@ - -.L_large_enough_update\@: - sub $16, %r11 - add %r13, %r11 - - # receive the last <16 Byte block - vmovdqu (arg4, %r11, 1), %xmm1 - - sub %r13, %r11 - add $16, %r11 - - lea SHIFT_MASK+16(%rip), %r12 - # adjust the shuffle mask pointer to be able to shift 16-r13 bytes - # (r13 is the number of bytes in plaintext mod 16) - sub %r13, %r12 - # get the appropriate shuffle mask - vmovdqu (%r12), %xmm2 - # shift right 16-r13 bytes - vpshufb %xmm2, %xmm1, %xmm1 - -.L_final_ghash_mul\@: - .if \ENC_DEC == DEC - vmovdqa %xmm1, %xmm2 - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to - # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm2, %xmm2 - vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 - vpxor %xmm2, %xmm14, %xmm14 - - vmovdqu %xmm14, AadHash(arg2) - .else - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to - # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - vpxor %xmm9, %xmm14, %xmm14 - - vmovdqu %xmm14, AadHash(arg2) - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext - .endif - - - ############################# - # output r13 Bytes - vmovq %xmm9, %rax - cmp $8, %r13 - jle .L_less_than_8_bytes_left\@ - - mov %rax, (arg3 , %r11) - add $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - vmovq %xmm9, %rax - sub $8, %r13 - -.L_less_than_8_bytes_left\@: - movb %al, (arg3 , %r11) - add $1, %r11 - shr $8, %rax - sub $1, %r13 - jne .L_less_than_8_bytes_left\@ - ############################# - -.L_multiple_of_16_bytes\@: -.endm - - -# GCM_COMPLETE Finishes update of tag of last partial block -# Output: Authorization Tag (AUTH_TAG) -# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 -.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN - 
vmovdqu AadHash(arg2), %xmm14 - vmovdqu HashKey(arg2), %xmm13 - - mov PBlockLen(arg2), %r12 - test %r12, %r12 - je .L_partial_done\@ - - #GHASH computation for the last <16 Byte block - \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - -.L_partial_done\@: - mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes) - shl $3, %r12 # convert into number of bits - vmovd %r12d, %xmm15 # len(A) in xmm15 - - mov InLen(arg2), %r12 - shl $3, %r12 # len(C) in bits (*128) - vmovq %r12, %xmm1 - vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 - vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) - - vpxor %xmm15, %xmm14, %xmm14 - \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation - vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap - - vmovdqu OrigIV(arg2), %xmm9 - - ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0) - - vpxor %xmm14, %xmm9, %xmm9 - - - -.L_return_T\@: - mov \AUTH_TAG, %r10 # r10 = authTag - mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len - - cmp $16, %r11 - je .L_T_16\@ - - cmp $8, %r11 - jl .L_T_4\@ - -.L_T_8\@: - vmovq %xmm9, %rax - mov %rax, (%r10) - add $8, %r10 - sub $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - test %r11, %r11 - je .L_return_T_done\@ -.L_T_4\@: - vmovd %xmm9, %eax - mov %eax, (%r10) - add $4, %r10 - sub $4, %r11 - vpsrldq $4, %xmm9, %xmm9 - test %r11, %r11 - je .L_return_T_done\@ -.L_T_123\@: - vmovd %xmm9, %eax - cmp $2, %r11 - jl .L_T_1\@ - mov %ax, (%r10) - cmp $2, %r11 - je .L_return_T_done\@ - add $2, %r10 - sar $16, %eax -.L_T_1\@: - mov %al, (%r10) - jmp .L_return_T_done\@ - -.L_T_16\@: - vmovdqu %xmm9, (%r10) - -.L_return_T_done\@: -.endm - -.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8 - - mov \AAD, %r10 # r10 = AAD - mov \AADLEN, %r12 # r12 = aadLen - - - mov %r12, %r11 - - vpxor \T8, \T8, \T8 - vpxor \T7, \T7, \T7 - cmp $16, %r11 - jl .L_get_AAD_rest8\@ -.L_get_AAD_blocks\@: - vmovdqu (%r10), \T7 - vpshufb SHUF_MASK(%rip), \T7, \T7 - vpxor \T7, \T8, \T8 - \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6 - add $16, %r10 - sub $16, %r12 - sub $16, %r11 - cmp $16, %r11 - jge .L_get_AAD_blocks\@ - vmovdqu \T8, \T7 - test %r11, %r11 - je .L_get_AAD_done\@ - - vpxor \T7, \T7, \T7 - - /* read the last <16B of AAD. since we have at least 4B of - data right after the AAD (the ICV, and maybe some CT), we can - read 4B/8B blocks safely, and then get rid of the extra stuff */ -.L_get_AAD_rest8\@: - cmp $4, %r11 - jle .L_get_AAD_rest4\@ - movq (%r10), \T1 - add $8, %r10 - sub $8, %r11 - vpslldq $8, \T1, \T1 - vpsrldq $8, \T7, \T7 - vpxor \T1, \T7, \T7 - jmp .L_get_AAD_rest8\@ -.L_get_AAD_rest4\@: - test %r11, %r11 - jle .L_get_AAD_rest0\@ - mov (%r10), %eax - movq %rax, \T1 - add $4, %r10 - sub $4, %r11 - vpslldq $12, \T1, \T1 - vpsrldq $4, \T7, \T7 - vpxor \T1, \T7, \T7 -.L_get_AAD_rest0\@: - /* finalize: shift out the extra bytes we read, and align - left. 
since pslldq can only shift by an immediate, we use - vpshufb and a pair of shuffle masks */ - leaq ALL_F(%rip), %r11 - subq %r12, %r11 - vmovdqu 16(%r11), \T1 - andq $~3, %r11 - vpshufb (%r11), \T7, \T7 - vpand \T1, \T7, \T7 -.L_get_AAD_rest_final\@: - vpshufb SHUF_MASK(%rip), \T7, \T7 - vpxor \T8, \T7, \T7 - \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6 - -.L_get_AAD_done\@: - vmovdqu \T7, AadHash(arg2) -.endm - -.macro INIT GHASH_MUL PRECOMPUTE - mov arg6, %r11 - mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length - xor %r11d, %r11d - mov %r11, InLen(arg2) # ctx_data.in_length = 0 - - mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0 - mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0 - mov arg3, %rax - movdqu (%rax), %xmm0 - movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv - - vpshufb SHUF_MASK(%rip), %xmm0, %xmm0 - movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv - - vmovdqu (arg4), %xmm6 # xmm6 = HashKey - - vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 - ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey - vmovdqa %xmm6, %xmm2 - vpsllq $1, %xmm6, %xmm6 - vpsrlq $63, %xmm2, %xmm2 - vmovdqa %xmm2, %xmm1 - vpslldq $8, %xmm2, %xmm2 - vpsrldq $8, %xmm1, %xmm1 - vpor %xmm2, %xmm6, %xmm6 - #reduction - vpshufd $0b00100100, %xmm1, %xmm2 - vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 - vpand POLY(%rip), %xmm2, %xmm2 - vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly - ####################################################################### - vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly - - CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 - - \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 -.endm - - -# Reads DLEN bytes starting at DPTR and stores in XMMDst -# where 0 < DLEN < 16 -# Clobbers %rax, DLEN -.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst - vpxor \XMMDst, \XMMDst, \XMMDst - - cmp $8, \DLEN - jl .L_read_lt8_\@ - mov (\DPTR), %rax - vpinsrq $0, %rax, \XMMDst, \XMMDst - sub $8, \DLEN - jz .L_done_read_partial_block_\@ - xor %eax, %eax -.L_read_next_byte_\@: - shl $8, %rax - mov 7(\DPTR, \DLEN, 1), %al - dec \DLEN - jnz .L_read_next_byte_\@ - vpinsrq $1, %rax, \XMMDst, \XMMDst - jmp .L_done_read_partial_block_\@ -.L_read_lt8_\@: - xor %eax, %eax -.L_read_next_byte_lt8_\@: - shl $8, %rax - mov -1(\DPTR, \DLEN, 1), %al - dec \DLEN - jnz .L_read_next_byte_lt8_\@ - vpinsrq $0, %rax, \XMMDst, \XMMDst -.L_done_read_partial_block_\@: -.endm - -# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks -# between update calls. 
-# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK -# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context -# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 -.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ - AAD_HASH ENC_DEC - mov PBlockLen(arg2), %r13 - test %r13, %r13 - je .L_partial_block_done_\@ # Leave Macro if no partial blocks - # Read in input data without over reading - cmp $16, \PLAIN_CYPH_LEN - jl .L_fewer_than_16_bytes_\@ - vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm - jmp .L_data_read_\@ - -.L_fewer_than_16_bytes_\@: - lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 - mov \PLAIN_CYPH_LEN, %r12 - READ_PARTIAL_BLOCK %r10 %r12 %xmm1 - - mov PBlockLen(arg2), %r13 - -.L_data_read_\@: # Finished reading in data - - vmovdqu PBlockEncKey(arg2), %xmm9 - vmovdqu HashKey(arg2), %xmm13 - - lea SHIFT_MASK(%rip), %r12 - - # adjust the shuffle mask pointer to be able to shift r13 bytes - # r16-r13 is the number of bytes in plaintext mod 16) - add %r13, %r12 - vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask - vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes - -.if \ENC_DEC == DEC - vmovdqa %xmm1, %xmm3 - pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn) - - mov \PLAIN_CYPH_LEN, %r10 - add %r13, %r10 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling - sub $16, %r10 - # Determine if partial block is not being filled and - # shift mask accordingly - jge .L_no_extra_mask_1_\@ - sub %r10, %r12 -.L_no_extra_mask_1_\@: - - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9 - - vpand %xmm1, %xmm3, %xmm3 - vmovdqa SHUF_MASK(%rip), %xmm10 - vpshufb %xmm10, %xmm3, %xmm3 - vpshufb %xmm2, %xmm3, %xmm3 - vpxor %xmm3, \AAD_HASH, \AAD_HASH - - test %r10, %r10 - jl .L_partial_incomplete_1_\@ - - # GHASH computation for the last <16 Byte block - \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %eax,%eax - - mov %rax, PBlockLen(arg2) - jmp .L_dec_done_\@ -.L_partial_incomplete_1_\@: - add \PLAIN_CYPH_LEN, PBlockLen(arg2) -.L_dec_done_\@: - vmovdqu \AAD_HASH, AadHash(arg2) -.else - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - - mov \PLAIN_CYPH_LEN, %r10 - add %r13, %r10 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling - sub $16, %r10 - # Determine if partial block is not being filled and - # shift mask accordingly - jge .L_no_extra_mask_2_\@ - sub %r10, %r12 -.L_no_extra_mask_2_\@: - - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 - - vmovdqa SHUF_MASK(%rip), %xmm1 - vpshufb %xmm1, %xmm9, %xmm9 - vpshufb %xmm2, %xmm9, %xmm9 - vpxor %xmm9, \AAD_HASH, \AAD_HASH - - test %r10, %r10 - jl .L_partial_incomplete_2_\@ - - # GHASH computation for the last <16 Byte block - \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %eax,%eax - - mov %rax, PBlockLen(arg2) - jmp .L_encode_done_\@ -.L_partial_incomplete_2_\@: - add \PLAIN_CYPH_LEN, PBlockLen(arg2) -.L_encode_done_\@: - vmovdqu \AAD_HASH, AadHash(arg2) - - vmovdqa SHUF_MASK(%rip), %xmm10 - # shuffle xmm9 back to output as ciphertext - vpshufb %xmm10, %xmm9, %xmm9 - vpshufb %xmm2, %xmm9, %xmm9 -.endif - # output encrypted Bytes - test %r10, %r10 - jl .L_partial_fill_\@ - mov %r13, %r12 - mov $16, %r13 - # Set r13 to be the number of bytes to write out - 
sub %r12, %r13 - jmp .L_count_set_\@ -.L_partial_fill_\@: - mov \PLAIN_CYPH_LEN, %r13 -.L_count_set_\@: - vmovdqa %xmm9, %xmm0 - vmovq %xmm0, %rax - cmp $8, %r13 - jle .L_less_than_8_bytes_left_\@ - - mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) - add $8, \DATA_OFFSET - psrldq $8, %xmm0 - vmovq %xmm0, %rax - sub $8, %r13 -.L_less_than_8_bytes_left_\@: - movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) - add $1, \DATA_OFFSET - shr $8, %rax - sub $1, %r13 - jne .L_less_than_8_bytes_left_\@ -.L_partial_block_done_\@: -.endm # PARTIAL_BLOCK - -############################################################################### -# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) -# Input: A and B (128-bits each, bit-reflected) -# Output: C = A*B*x mod poly, (i.e. >>1 ) -# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input -# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. -############################################################################### -.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 - - vpshufd $0b01001110, \GH, \T2 - vpshufd $0b01001110, \HK, \T3 - vpxor \GH , \T2, \T2 # T2 = (a1+a0) - vpxor \HK , \T3, \T3 # T3 = (b1+b0) - - vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 - vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 - vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) - vpxor \GH, \T2,\T2 - vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 - - vpslldq $8, \T2,\T3 # shift-L T3 2 DWs - vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs - vpxor \T3, \GH, \GH - vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK - - #first phase of the reduction - vpslld $31, \GH, \T2 # packed right shifting << 31 - vpslld $30, \GH, \T3 # packed right shifting shift << 30 - vpslld $25, \GH, \T4 # packed right shifting shift << 25 - - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpsrldq $4, \T2, \T5 # shift-R T5 1 DW - - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \GH, \GH # first phase of the reduction complete - - #second phase of the reduction - - vpsrld $1,\GH, \T2 # packed left shifting >> 1 - vpsrld $2,\GH, \T3 # packed left shifting >> 2 - vpsrld $7,\GH, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpxor \T5, \T2, \T2 - vpxor \T2, \GH, \GH - vpxor \T1, \GH, \GH # the result is in GH - - -.endm - -.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 - - # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vmovdqa \HK, \T5 - - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly - vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_2_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly - vmovdqu \T5, HashKey_3(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_3_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly - vmovdqu \T5, HashKey_4(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_4_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly - vmovdqu \T5, HashKey_5(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_5_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly - vmovdqu \T5, HashKey_6(arg2) - vpshufd $0b01001110, \T5, 
\T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_6_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly - vmovdqu \T5, HashKey_7(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_7_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly - vmovdqu \T5, HashKey_8(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_8_k(arg2) - -.endm - -## if a = number of total plaintext bytes -## b = floor(a/16) -## num_initial_blocks = b mod 4# -## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext -## r10, r11, r12, rax are clobbered -## arg1, arg2, arg3, arg4 are used as pointers only, not modified - -.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC - i = (8-\num_initial_blocks) - setreg - vmovdqu AadHash(arg2), reg_i - - # start AES for num_initial_blocks blocks - vmovdqu CurCount(arg2), \CTR - - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, reg_i - vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap - i = (i+1) - setreg -.endr - - vmovdqa (arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpxor \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - j = 1 - setreg -.rep \REP - vmovdqa 16*j(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenc \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - j = (j+1) - setreg -.endr - - vmovdqa 16*j(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenclast \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vmovdqu (arg4, %r11), \T1 - vpxor \T1, reg_i, reg_i - vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks - add $16, %r11 -.if \ENC_DEC == DEC - vmovdqa \T1, reg_i -.endif - vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations - i = (i+1) - setreg -.endr - - - i = (8-\num_initial_blocks) - j = (9-\num_initial_blocks) - setreg - -.rep \num_initial_blocks - vpxor reg_i, reg_j, reg_j - GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks - i = (i+1) - j = (j+1) - setreg -.endr - # XMM8 has the combined result here - - vmovdqa \XMM8, TMP1(%rsp) - vmovdqa \XMM8, \T3 - - cmp $128, %r13 - jl .L_initial_blocks_done\@ # no need for precomputed constants - -############################################################################### -# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM1 - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM2 - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM3 - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM4 - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM5 - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM6 - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, 
\CTR # INCR Y0 - vmovdqa \CTR, \XMM7 - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM8 - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - - vmovdqa (arg1), \T_key - vpxor \T_key, \XMM1, \XMM1 - vpxor \T_key, \XMM2, \XMM2 - vpxor \T_key, \XMM3, \XMM3 - vpxor \T_key, \XMM4, \XMM4 - vpxor \T_key, \XMM5, \XMM5 - vpxor \T_key, \XMM6, \XMM6 - vpxor \T_key, \XMM7, \XMM7 - vpxor \T_key, \XMM8, \XMM8 - - i = 1 - setreg -.rep \REP # do REP rounds - vmovdqa 16*i(arg1), \T_key - vaesenc \T_key, \XMM1, \XMM1 - vaesenc \T_key, \XMM2, \XMM2 - vaesenc \T_key, \XMM3, \XMM3 - vaesenc \T_key, \XMM4, \XMM4 - vaesenc \T_key, \XMM5, \XMM5 - vaesenc \T_key, \XMM6, \XMM6 - vaesenc \T_key, \XMM7, \XMM7 - vaesenc \T_key, \XMM8, \XMM8 - i = (i+1) - setreg -.endr - - vmovdqa 16*i(arg1), \T_key - vaesenclast \T_key, \XMM1, \XMM1 - vaesenclast \T_key, \XMM2, \XMM2 - vaesenclast \T_key, \XMM3, \XMM3 - vaesenclast \T_key, \XMM4, \XMM4 - vaesenclast \T_key, \XMM5, \XMM5 - vaesenclast \T_key, \XMM6, \XMM6 - vaesenclast \T_key, \XMM7, \XMM7 - vaesenclast \T_key, \XMM8, \XMM8 - - vmovdqu (arg4, %r11), \T1 - vpxor \T1, \XMM1, \XMM1 - vmovdqu \XMM1, (arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM1 - .endif - - vmovdqu 16*1(arg4, %r11), \T1 - vpxor \T1, \XMM2, \XMM2 - vmovdqu \XMM2, 16*1(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM2 - .endif - - vmovdqu 16*2(arg4, %r11), \T1 - vpxor \T1, \XMM3, \XMM3 - vmovdqu \XMM3, 16*2(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM3 - .endif - - vmovdqu 16*3(arg4, %r11), \T1 - vpxor \T1, \XMM4, \XMM4 - vmovdqu \XMM4, 16*3(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM4 - .endif - - vmovdqu 16*4(arg4, %r11), \T1 - vpxor \T1, \XMM5, \XMM5 - vmovdqu \XMM5, 16*4(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM5 - .endif - - vmovdqu 16*5(arg4, %r11), \T1 - vpxor \T1, \XMM6, \XMM6 - vmovdqu \XMM6, 16*5(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM6 - .endif - - vmovdqu 16*6(arg4, %r11), \T1 - vpxor \T1, \XMM7, \XMM7 - vmovdqu \XMM7, 16*6(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM7 - .endif - - vmovdqu 16*7(arg4, %r11), \T1 - vpxor \T1, \XMM8, \XMM8 - vmovdqu \XMM8, 16*7(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM8 - .endif - - add $128, %r11 - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - -############################################################################### - -.L_initial_blocks_done\@: - -.endm - -# encrypt 8 blocks at a time -# ghash the 8 previously encrypted ciphertext blocks -# arg1, arg2, arg3, arg4 are used as pointers only, not modified -# r11 is the data offset value -.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC - - vmovdqa \XMM1, \T2 - vmovdqa \XMM2, TMP2(%rsp) - vmovdqa \XMM3, TMP3(%rsp) - vmovdqa \XMM4, TMP4(%rsp) - vmovdqa \XMM5, TMP5(%rsp) - vmovdqa \XMM6, TMP6(%rsp) - vmovdqa \XMM7, 
TMP7(%rsp) - vmovdqa \XMM8, TMP8(%rsp) - -.if \loop_idx == in_order - vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONE(%rip), \XMM1, \XMM2 - vpaddd ONE(%rip), \XMM2, \XMM3 - vpaddd ONE(%rip), \XMM3, \XMM4 - vpaddd ONE(%rip), \XMM4, \XMM5 - vpaddd ONE(%rip), \XMM5, \XMM6 - vpaddd ONE(%rip), \XMM6, \XMM7 - vpaddd ONE(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap -.else - vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONEf(%rip), \XMM1, \XMM2 - vpaddd ONEf(%rip), \XMM2, \XMM3 - vpaddd ONEf(%rip), \XMM3, \XMM4 - vpaddd ONEf(%rip), \XMM4, \XMM5 - vpaddd ONEf(%rip), \XMM5, \XMM6 - vpaddd ONEf(%rip), \XMM6, \XMM7 - vpaddd ONEf(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR -.endif - - - ####################################################################### - - vmovdqu (arg1), \T1 - vpxor \T1, \XMM1, \XMM1 - vpxor \T1, \XMM2, \XMM2 - vpxor \T1, \XMM3, \XMM3 - vpxor \T1, \XMM4, \XMM4 - vpxor \T1, \XMM5, \XMM5 - vpxor \T1, \XMM6, \XMM6 - vpxor \T1, \XMM7, \XMM7 - vpxor \T1, \XMM8, \XMM8 - - ####################################################################### - - - - - - vmovdqu 16*1(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqu 16*2(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - - ####################################################################### - - vmovdqu HashKey_8(arg2), \T5 - vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 - vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 - - vpshufd $0b01001110, \T2, \T6 - vpxor \T2, \T6, \T6 - - vmovdqu HashKey_8_k(arg2), \T5 - vpclmulqdq $0x00, \T5, \T6, \T6 - - vmovdqu 16*3(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP2(%rsp), \T1 - vmovdqu HashKey_7(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_7_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*4(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - ####################################################################### - - vmovdqa TMP3(%rsp), \T1 - vmovdqu HashKey_6(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - 
vmovdqu HashKey_6_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*5(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP4(%rsp), \T1 - vmovdqu HashKey_5(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_5_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*6(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - - vmovdqa TMP5(%rsp), \T1 - vmovdqu HashKey_4(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_4_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*7(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP6(%rsp), \T1 - vmovdqu HashKey_3(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_3_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - - vmovdqu 16*8(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP7(%rsp), \T1 - vmovdqu HashKey_2(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_2_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - ####################################################################### - - vmovdqu 16*9(arg1), \T5 - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 - - vmovdqa TMP8(%rsp), \T1 - vmovdqu HashKey(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vpxor \T4, \T6, \T6 - vpxor \T7, \T6, \T6 - - vmovdqu 16*10(arg1), \T5 - - i = 11 - setreg -.rep (\REP-9) - - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 - - vmovdqu 16*i(arg1), \T5 - i = i + 1 - setreg -.endr - - i = 0 - j = 1 - setreg -.rep 8 - vpxor 16*i(arg4, %r11), \T5, \T2 - .if \ENC_DEC == ENC - vaesenclast \T2, reg_j, reg_j - .else - vaesenclast \T2, reg_j, 
\T3 - vmovdqu 16*i(arg4, %r11), reg_j - vmovdqu \T3, 16*i(arg3, %r11) - .endif - i = (i+1) - j = (j+1) - setreg -.endr - ####################################################################### - - - vpslldq $8, \T6, \T3 # shift-L T3 2 DWs - vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs - vpxor \T3, \T7, \T7 - vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 - - - - ####################################################################### - #first phase of the reduction - ####################################################################### - vpslld $31, \T7, \T2 # packed right shifting << 31 - vpslld $30, \T7, \T3 # packed right shifting shift << 30 - vpslld $25, \T7, \T4 # packed right shifting shift << 25 - - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpsrldq $4, \T2, \T1 # shift-R T1 1 DW - - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - .if \ENC_DEC == ENC - vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer - .endif - - ####################################################################### - #second phase of the reduction - vpsrld $1, \T7, \T2 # packed left shifting >> 1 - vpsrld $2, \T7, \T3 # packed left shifting >> 2 - vpsrld $7, \T7, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpxor \T1, \T2, \T2 - vpxor \T2, \T7, \T7 - vpxor \T7, \T6, \T6 # the result is in T6 - ####################################################################### - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - - - vpxor \T6, \XMM1, \XMM1 - - - -.endm - - -# GHASH the last 4 ciphertext blocks. 
-.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 - - ## Karatsuba Method - - - vpshufd $0b01001110, \XMM1, \T2 - vpxor \XMM1, \T2, \T2 - vmovdqu HashKey_8(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM1, \T6 - vpclmulqdq $0x00, \T5, \XMM1, \T7 - - vmovdqu HashKey_8_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM2, \T2 - vpxor \XMM2, \T2, \T2 - vmovdqu HashKey_7(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM2, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM2, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_7_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM3, \T2 - vpxor \XMM3, \T2, \T2 - vmovdqu HashKey_6(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM3, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM3, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_6_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM4, \T2 - vpxor \XMM4, \T2, \T2 - vmovdqu HashKey_5(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM4, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM4, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_5_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM5, \T2 - vpxor \XMM5, \T2, \T2 - vmovdqu HashKey_4(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM5, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM5, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_4_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM6, \T2 - vpxor \XMM6, \T2, \T2 - vmovdqu HashKey_3(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM6, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM6, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_3_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM7, \T2 - vpxor \XMM7, \T2, \T2 - vmovdqu HashKey_2(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM7, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM7, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_2_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM8, \T2 - vpxor \XMM8, \T2, \T2 - vmovdqu HashKey(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM8, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM8, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - vpxor \T6, \XMM1, \XMM1 - vpxor \T7, \XMM1, \T2 - - - - - vpslldq $8, \T2, \T4 - vpsrldq $8, \T2, \T2 - - vpxor \T4, \T7, \T7 - vpxor \T2, \T6, \T6 # <T6:T7> holds the result of - # the accumulated carry-less multiplications - - ####################################################################### - #first phase of the reduction - vpslld $31, \T7, \T2 # packed right shifting << 31 - vpslld $30, \T7, \T3 # packed right shifting shift << 30 - vpslld $25, \T7, \T4 # packed right shifting shift << 25 - - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpsrldq $4, \T2, \T1 # shift-R T1 1 DW - - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - - - #second phase of the reduction - vpsrld $1, \T7, \T2 # packed left shifting >> 1 - 
vpsrld $2, \T7, \T3 # packed left shifting >> 2 - vpsrld $7, \T7, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpxor \T1, \T2, \T2 - vpxor \T2, \T7, \T7 - vpxor \T7, \T6, \T6 # the result is in T6 - -.endm - -############################################################# -#void aesni_gcm_precomp_avx_gen2 -# (gcm_data *my_ctx_data, -# gcm_context_data *data, -# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ -# u8 *iv, /* Pre-counter block j0: 4 byte salt -# (from Security Association) concatenated with 8 byte -# Initialisation Vector (from IPSec ESP Payload) -# concatenated with 0x00000001. 16-byte aligned pointer. */ -# const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ -############################################################# -SYM_FUNC_START(aesni_gcm_init_avx_gen2) - FUNC_SAVE - INIT GHASH_MUL_AVX, PRECOMPUTE_AVX - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_init_avx_gen2) - -############################################################################### -#void aesni_gcm_enc_update_avx_gen2( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ -# const u8 *in, /* Plaintext input */ -# u64 plaintext_len) /* Length of data in Bytes for encryption. */ -############################################################################### -SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2) - FUNC_SAVE - mov keysize, %eax - cmp $32, %eax - je key_256_enc_update - cmp $16, %eax - je key_128_enc_update - # must be 192 - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11 - FUNC_RESTORE - RET -key_128_enc_update: - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9 - FUNC_RESTORE - RET -key_256_enc_update: - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2) - -############################################################################### -#void aesni_gcm_dec_update_avx_gen2( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ -# const u8 *in, /* Ciphertext input */ -# u64 plaintext_len) /* Length of data in Bytes for encryption. */ -############################################################################### -SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_dec_update - cmp $16, %eax - je key_128_dec_update - # must be 192 - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11 - FUNC_RESTORE - RET -key_128_dec_update: - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9 - FUNC_RESTORE - RET -key_256_dec_update: - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2) - -############################################################################### -#void aesni_gcm_finalize_avx_gen2( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *auth_tag, /* Authenticated Tag output. 
*/ -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. -# Valid values are 16 (most likely), 12 or 8. */ -############################################################################### -SYM_FUNC_START(aesni_gcm_finalize_avx_gen2) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_finalize - cmp $16, %eax - je key_128_finalize - # must be 192 - GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4 - FUNC_RESTORE - RET -key_128_finalize: - GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4 - FUNC_RESTORE - RET -key_256_finalize: - GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) - -############################################################################### -# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) -# Input: A and B (128-bits each, bit-reflected) -# Output: C = A*B*x mod poly, (i.e. >>1 ) -# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input -# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. -############################################################################### -.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5 - - vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1 - vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0 - vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0 - vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1 - vpxor \T3, \GH, \GH - - - vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs - vpslldq $8 , \GH, \GH # shift-L GH 2 DWs - - vpxor \T3, \T1, \T1 - vpxor \T2, \GH, \GH - - ####################################################################### - #first phase of the reduction - vmovdqa POLY2(%rip), \T3 - - vpclmulqdq $0x01, \GH, \T3, \T2 - vpslldq $8, \T2, \T2 # shift-L T2 2 DWs - - vpxor \T2, \GH, \GH # first phase of the reduction complete - ####################################################################### - #second phase of the reduction - vpclmulqdq $0x00, \GH, \T3, \T2 - vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) - - vpclmulqdq $0x10, \GH, \T3, \GH - vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts) - - vpxor \T2, \GH, \GH # second phase of the reduction complete - ####################################################################### - vpxor \T1, \GH, \GH # the result is in GH - - -.endm - -.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6 - - # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vmovdqa \HK, \T5 - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly - vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly - vmovdqu \T5, HashKey_3(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly - vmovdqu \T5, HashKey_4(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly - vmovdqu \T5, HashKey_5(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly - vmovdqu \T5, HashKey_6(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly - vmovdqu \T5, HashKey_7(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly - vmovdqu \T5, HashKey_8(arg2) - -.endm - -## if a = number of total plaintext bytes -## b = floor(a/16) -## num_initial_blocks = b mod 4# -## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext -## r10, r11, r12, rax are clobbered -## arg1, arg2, arg3, arg4 are used 
as pointers only, not modified - -.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER - i = (8-\num_initial_blocks) - setreg - vmovdqu AadHash(arg2), reg_i - - # start AES for num_initial_blocks blocks - vmovdqu CurCount(arg2), \CTR - - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, reg_i - vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap - i = (i+1) - setreg -.endr - - vmovdqa (arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpxor \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - j = 1 - setreg -.rep \REP - vmovdqa 16*j(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenc \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - j = (j+1) - setreg -.endr - - - vmovdqa 16*j(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenclast \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vmovdqu (arg4, %r11), \T1 - vpxor \T1, reg_i, reg_i - vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for - # num_initial_blocks blocks - add $16, %r11 -.if \ENC_DEC == DEC - vmovdqa \T1, reg_i -.endif - vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations - i = (i+1) - setreg -.endr - - - i = (8-\num_initial_blocks) - j = (9-\num_initial_blocks) - setreg - -.rep \num_initial_blocks - vpxor reg_i, reg_j, reg_j - GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks - i = (i+1) - j = (j+1) - setreg -.endr - # XMM8 has the combined result here - - vmovdqa \XMM8, TMP1(%rsp) - vmovdqa \XMM8, \T3 - - cmp $128, %r13 - jl .L_initial_blocks_done\@ # no need for precomputed constants - -############################################################################### -# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM1 - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM2 - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM3 - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM4 - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM5 - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM6 - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM7 - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM8 - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - - vmovdqa (arg1), \T_key - vpxor \T_key, \XMM1, \XMM1 - vpxor \T_key, \XMM2, \XMM2 - vpxor \T_key, \XMM3, \XMM3 - vpxor \T_key, \XMM4, \XMM4 - vpxor \T_key, \XMM5, \XMM5 - vpxor \T_key, \XMM6, \XMM6 - vpxor \T_key, \XMM7, \XMM7 - vpxor \T_key, \XMM8, \XMM8 - - i = 1 - setreg -.rep \REP # do REP rounds - vmovdqa 16*i(arg1), \T_key - vaesenc \T_key, \XMM1, \XMM1 - vaesenc \T_key, \XMM2, \XMM2 - vaesenc \T_key, \XMM3, \XMM3 - vaesenc \T_key, \XMM4, \XMM4 - vaesenc \T_key, \XMM5, 
\XMM5 - vaesenc \T_key, \XMM6, \XMM6 - vaesenc \T_key, \XMM7, \XMM7 - vaesenc \T_key, \XMM8, \XMM8 - i = (i+1) - setreg -.endr - - - vmovdqa 16*i(arg1), \T_key - vaesenclast \T_key, \XMM1, \XMM1 - vaesenclast \T_key, \XMM2, \XMM2 - vaesenclast \T_key, \XMM3, \XMM3 - vaesenclast \T_key, \XMM4, \XMM4 - vaesenclast \T_key, \XMM5, \XMM5 - vaesenclast \T_key, \XMM6, \XMM6 - vaesenclast \T_key, \XMM7, \XMM7 - vaesenclast \T_key, \XMM8, \XMM8 - - vmovdqu (arg4, %r11), \T1 - vpxor \T1, \XMM1, \XMM1 - vmovdqu \XMM1, (arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM1 - .endif - - vmovdqu 16*1(arg4, %r11), \T1 - vpxor \T1, \XMM2, \XMM2 - vmovdqu \XMM2, 16*1(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM2 - .endif - - vmovdqu 16*2(arg4, %r11), \T1 - vpxor \T1, \XMM3, \XMM3 - vmovdqu \XMM3, 16*2(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM3 - .endif - - vmovdqu 16*3(arg4, %r11), \T1 - vpxor \T1, \XMM4, \XMM4 - vmovdqu \XMM4, 16*3(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM4 - .endif - - vmovdqu 16*4(arg4, %r11), \T1 - vpxor \T1, \XMM5, \XMM5 - vmovdqu \XMM5, 16*4(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM5 - .endif - - vmovdqu 16*5(arg4, %r11), \T1 - vpxor \T1, \XMM6, \XMM6 - vmovdqu \XMM6, 16*5(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM6 - .endif - - vmovdqu 16*6(arg4, %r11), \T1 - vpxor \T1, \XMM7, \XMM7 - vmovdqu \XMM7, 16*6(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM7 - .endif - - vmovdqu 16*7(arg4, %r11), \T1 - vpxor \T1, \XMM8, \XMM8 - vmovdqu \XMM8, 16*7(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM8 - .endif - - add $128, %r11 - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with - # the corresponding ciphertext - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - -############################################################################### - -.L_initial_blocks_done\@: - - -.endm - - - -# encrypt 8 blocks at a time -# ghash the 8 previously encrypted ciphertext blocks -# arg1, arg2, arg3, arg4 are used as pointers only, not modified -# r11 is the data offset value -.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC - - vmovdqa \XMM1, \T2 - vmovdqa \XMM2, TMP2(%rsp) - vmovdqa \XMM3, TMP3(%rsp) - vmovdqa \XMM4, TMP4(%rsp) - vmovdqa \XMM5, TMP5(%rsp) - vmovdqa \XMM6, TMP6(%rsp) - vmovdqa \XMM7, TMP7(%rsp) - vmovdqa \XMM8, TMP8(%rsp) - -.if \loop_idx == in_order - vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONE(%rip), \XMM1, \XMM2 - vpaddd ONE(%rip), \XMM2, \XMM3 - vpaddd ONE(%rip), \XMM3, \XMM4 - vpaddd ONE(%rip), \XMM4, \XMM5 - vpaddd ONE(%rip), \XMM5, \XMM6 - vpaddd ONE(%rip), \XMM6, \XMM7 - vpaddd ONE(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb 
SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap -.else - vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONEf(%rip), \XMM1, \XMM2 - vpaddd ONEf(%rip), \XMM2, \XMM3 - vpaddd ONEf(%rip), \XMM3, \XMM4 - vpaddd ONEf(%rip), \XMM4, \XMM5 - vpaddd ONEf(%rip), \XMM5, \XMM6 - vpaddd ONEf(%rip), \XMM6, \XMM7 - vpaddd ONEf(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR -.endif - - - ####################################################################### - - vmovdqu (arg1), \T1 - vpxor \T1, \XMM1, \XMM1 - vpxor \T1, \XMM2, \XMM2 - vpxor \T1, \XMM3, \XMM3 - vpxor \T1, \XMM4, \XMM4 - vpxor \T1, \XMM5, \XMM5 - vpxor \T1, \XMM6, \XMM6 - vpxor \T1, \XMM7, \XMM7 - vpxor \T1, \XMM8, \XMM8 - - ####################################################################### - - - - - - vmovdqu 16*1(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqu 16*2(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - - ####################################################################### - - vmovdqu HashKey_8(arg2), \T5 - vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 - vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 - vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 - vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1 - vpxor \T5, \T6, \T6 - - vmovdqu 16*3(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP2(%rsp), \T1 - vmovdqu HashKey_7(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*4(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - ####################################################################### - - vmovdqa TMP3(%rsp), \T1 - vmovdqu HashKey_6(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*5(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP4(%rsp), \T1 - vmovdqu HashKey_5(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*6(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, 
\XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - - vmovdqa TMP5(%rsp), \T1 - vmovdqu HashKey_4(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*7(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP6(%rsp), \T1 - vmovdqu HashKey_3(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*8(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP7(%rsp), \T1 - vmovdqu HashKey_2(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - - ####################################################################### - - vmovdqu 16*9(arg1), \T5 - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 - - vmovdqa TMP8(%rsp), \T1 - vmovdqu HashKey(arg2), \T5 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T1 - - - vmovdqu 16*10(arg1), \T5 - - i = 11 - setreg -.rep (\REP-9) - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 - - vmovdqu 16*i(arg1), \T5 - i = i + 1 - setreg -.endr - - i = 0 - j = 1 - setreg -.rep 8 - vpxor 16*i(arg4, %r11), \T5, \T2 - .if \ENC_DEC == ENC - vaesenclast \T2, reg_j, reg_j - .else - vaesenclast \T2, reg_j, \T3 - vmovdqu 16*i(arg4, %r11), reg_j - vmovdqu \T3, 16*i(arg3, %r11) - .endif - i = (i+1) - j = (j+1) - setreg -.endr - ####################################################################### - - - vpslldq $8, \T6, \T3 # shift-L T3 2 DWs - vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs - vpxor \T3, \T7, \T7 - vpxor \T6, \T1, \T1 # accumulate the results in T1:T7 - - - - ####################################################################### - #first phase of the reduction - vmovdqa POLY2(%rip), \T3 - - vpclmulqdq $0x01, \T7, \T3, \T2 - vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs - - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - .if \ENC_DEC == ENC - vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer - 
vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer - .endif - - ####################################################################### - #second phase of the reduction - vpclmulqdq $0x00, \T7, \T3, \T2 - vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) - - vpclmulqdq $0x10, \T7, \T3, \T4 - vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) - - vpxor \T2, \T4, \T4 # second phase of the reduction complete - ####################################################################### - vpxor \T4, \T1, \T1 # the result is in T1 - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - - - vpxor \T1, \XMM1, \XMM1 - - - -.endm - - -# GHASH the last 4 ciphertext blocks. -.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 - - ## Karatsuba Method - - vmovdqu HashKey_8(arg2), \T5 - - vpshufd $0b01001110, \XMM1, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM1, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM1, \T6 - vpclmulqdq $0x00, \T5, \XMM1, \T7 - - vpclmulqdq $0x00, \T3, \T2, \XMM1 - - ###################### - - vmovdqu HashKey_7(arg2), \T5 - vpshufd $0b01001110, \XMM2, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM2, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM2, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM2, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_6(arg2), \T5 - vpshufd $0b01001110, \XMM3, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM3, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM3, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM3, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_5(arg2), \T5 - vpshufd $0b01001110, \XMM4, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM4, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM4, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM4, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_4(arg2), \T5 - vpshufd $0b01001110, \XMM5, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM5, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM5, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM5, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_3(arg2), \T5 - vpshufd $0b01001110, \XMM6, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM6, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM6, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM6, \T4 - vpxor \T4, 
\T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_2(arg2), \T5 - vpshufd $0b01001110, \XMM7, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM7, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM7, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM7, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey(arg2), \T5 - vpshufd $0b01001110, \XMM8, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM8, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM8, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM8, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - vpxor \T6, \XMM1, \XMM1 - vpxor \T7, \XMM1, \T2 - - - - - vpslldq $8, \T2, \T4 - vpsrldq $8, \T2, \T2 - - vpxor \T4, \T7, \T7 - vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the - # accumulated carry-less multiplications - - ####################################################################### - #first phase of the reduction - vmovdqa POLY2(%rip), \T3 - - vpclmulqdq $0x01, \T7, \T3, \T2 - vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs - - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - - - #second phase of the reduction - vpclmulqdq $0x00, \T7, \T3, \T2 - vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) - - vpclmulqdq $0x10, \T7, \T3, \T4 - vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) - - vpxor \T2, \T4, \T4 # second phase of the reduction complete - ####################################################################### - vpxor \T4, \T6, \T6 # the result is in T6 -.endm - - - -############################################################# -#void aesni_gcm_init_avx_gen4 -# (gcm_data *my_ctx_data, -# gcm_context_data *data, -# u8 *iv, /* Pre-counter block j0: 4 byte salt -# (from Security Association) concatenated with 8 byte -# Initialisation Vector (from IPSec ESP Payload) -# concatenated with 0x00000001. 16-byte aligned pointer. */ -# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ -# const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ -############################################################# -SYM_FUNC_START(aesni_gcm_init_avx_gen4) - FUNC_SAVE - INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_init_avx_gen4) - -############################################################################### -#void aesni_gcm_enc_avx_gen4( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ -# const u8 *in, /* Plaintext input */ -# u64 plaintext_len) /* Length of data in Bytes for encryption. 
*/ -############################################################################### -SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_enc_update4 - cmp $16, %eax - je key_128_enc_update4 - # must be 192 - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11 - FUNC_RESTORE - RET -key_128_enc_update4: - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9 - FUNC_RESTORE - RET -key_256_enc_update4: - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4) - -############################################################################### -#void aesni_gcm_dec_update_avx_gen4( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ -# const u8 *in, /* Ciphertext input */ -# u64 plaintext_len) /* Length of data in Bytes for encryption. */ -############################################################################### -SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_dec_update4 - cmp $16, %eax - je key_128_dec_update4 - # must be 192 - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11 - FUNC_RESTORE - RET -key_128_dec_update4: - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9 - FUNC_RESTORE - RET -key_256_dec_update4: - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4) - -############################################################################### -#void aesni_gcm_finalize_avx_gen4( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *auth_tag, /* Authenticated Tag output. */ -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. -# Valid values are 16 (most likely), 12 or 8. */ -############################################################################### -SYM_FUNC_START(aesni_gcm_finalize_avx_gen4) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_finalize4 - cmp $16, %eax - je key_128_finalize4 - # must be 192 - GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4 - FUNC_RESTORE - RET -key_128_finalize4: - GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4 - FUNC_RESTORE - RET -key_256_finalize4: - GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_finalize_avx_gen4) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index b1d90c25975a..bc655d794a95 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * Support for Intel AES-NI instructions. This file contains glue - * code, the real AES implementation is in intel-aes_asm.S. + * Support for AES-NI and VAES instructions. This file contains glue code. + * The real AES implementations are in aesni-intel_asm.S and other .S files. * * Copyright (C) 2008, Intel Corp. * Author: Huang Ying <ying.huang@intel.com> @@ -13,6 +13,8 @@ * Tadeusz Struk (tadeusz.struk@intel.com) * Aidan O'Mahony (aidan.o.mahony@intel.com) * Copyright (c) 2010, Intel Corporation. 
+ * + * Copyright 2024 Google LLC */ #include <linux/hardirq.h> @@ -21,7 +23,6 @@ #include <linux/err.h> #include <crypto/algapi.h> #include <crypto/aes.h> -#include <crypto/ctr.h> #include <crypto/b128ops.h> #include <crypto/gcm.h> #include <crypto/xts.h> @@ -40,46 +41,15 @@ #define AESNI_ALIGN 16 #define AESNI_ALIGN_ATTR __attribute__ ((__aligned__(AESNI_ALIGN))) #define AES_BLOCK_MASK (~(AES_BLOCK_SIZE - 1)) -#define RFC4106_HASH_SUBKEY_SIZE 16 #define AESNI_ALIGN_EXTRA ((AESNI_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1)) #define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA) #define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA) -/* This data is stored at the end of the crypto_tfm struct. - * It's a type of per "session" data storage location. - * This needs to be 16 byte aligned. - */ -struct aesni_rfc4106_gcm_ctx { - u8 hash_subkey[16] AESNI_ALIGN_ATTR; - struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR; - u8 nonce[4]; -}; - -struct generic_gcmaes_ctx { - u8 hash_subkey[16] AESNI_ALIGN_ATTR; - struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR; -}; - struct aesni_xts_ctx { struct crypto_aes_ctx tweak_ctx AESNI_ALIGN_ATTR; struct crypto_aes_ctx crypt_ctx AESNI_ALIGN_ATTR; }; -#define GCM_BLOCK_LEN 16 - -struct gcm_context_data { - /* init, update and finalize context data */ - u8 aad_hash[GCM_BLOCK_LEN]; - u64 aad_length; - u64 in_length; - u8 partial_block_enc_key[GCM_BLOCK_LEN]; - u8 orig_IV[GCM_BLOCK_LEN]; - u8 current_counter[GCM_BLOCK_LEN]; - u64 partial_block_len; - u64 unused; - u8 hash_keys[GCM_BLOCK_LEN * 16]; -}; - static inline void *aes_align_addr(void *addr) { if (crypto_tfm_ctx_alignment() >= AESNI_ALIGN) @@ -87,8 +57,8 @@ static inline void *aes_align_addr(void *addr) return PTR_ALIGN(addr, AESNI_ALIGN); } -asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, - unsigned int key_len); +asmlinkage void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, + unsigned int key_len); asmlinkage void aesni_enc(const void *ctx, u8 *out, const u8 *in); asmlinkage void aesni_dec(const void *ctx, u8 *out, const u8 *in); asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out, @@ -104,118 +74,15 @@ asmlinkage void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); -#define AVX_GEN2_OPTSIZE 640 -#define AVX_GEN4_OPTSIZE 4096 - -asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, unsigned int len, u8 *iv); +asmlinkage void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); -asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, unsigned int len, u8 *iv); +asmlinkage void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); #ifdef CONFIG_X86_64 - asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); -DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc); - -/* Scatter / Gather routines, with args similar to above */ -asmlinkage void aesni_gcm_init(void *ctx, - struct gcm_context_data *gdata, - u8 *iv, - u8 *hash_subkey, const u8 *aad, - unsigned long aad_len); -asmlinkage void aesni_gcm_enc_update(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len); -asmlinkage void aesni_gcm_dec_update(void *ctx, - struct 
gcm_context_data *gdata, u8 *out, - const u8 *in, - unsigned long ciphertext_len); -asmlinkage void aesni_gcm_finalize(void *ctx, - struct gcm_context_data *gdata, - u8 *auth_tag, unsigned long auth_tag_len); - -asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv, - void *keys, u8 *out, unsigned int num_bytes); -asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv, - void *keys, u8 *out, unsigned int num_bytes); -asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv, - void *keys, u8 *out, unsigned int num_bytes); - - -asmlinkage void aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, - const void *keys, u8 *out, unsigned int num_bytes, - unsigned int byte_ctr); - -asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, - const void *keys, u8 *out, unsigned int num_bytes, - unsigned int byte_ctr); - -asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, - const void *keys, u8 *out, unsigned int num_bytes, - unsigned int byte_ctr); - -/* - * asmlinkage void aesni_gcm_init_avx_gen2() - * gcm_data *my_ctx_data, context data - * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. - */ -asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data, - struct gcm_context_data *gdata, - u8 *iv, - u8 *hash_subkey, - const u8 *aad, - unsigned long aad_len); - -asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len); -asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, - unsigned long ciphertext_len); -asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx, - struct gcm_context_data *gdata, - u8 *auth_tag, unsigned long auth_tag_len); - -/* - * asmlinkage void aesni_gcm_init_avx_gen4() - * gcm_data *my_ctx_data, context data - * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. 
- */ -asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data, - struct gcm_context_data *gdata, - u8 *iv, - u8 *hash_subkey, - const u8 *aad, - unsigned long aad_len); - -asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len); -asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, - unsigned long ciphertext_len); -asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx, - struct gcm_context_data *gdata, - u8 *auth_tag, unsigned long auth_tag_len); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx2); - -static inline struct -aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) -{ - return aes_align_addr(crypto_aead_ctx(tfm)); -} - -static inline struct -generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm) -{ - return aes_align_addr(crypto_aead_ctx(tfm)); -} #endif static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) @@ -233,19 +100,17 @@ static int aes_set_key_common(struct crypto_aes_ctx *ctx, { int err; - if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 && - key_len != AES_KEYSIZE_256) - return -EINVAL; - if (!crypto_simd_usable()) - err = aes_expandkey(ctx, in_key, key_len); - else { - kernel_fpu_begin(); - err = aesni_set_key(ctx, in_key, key_len); - kernel_fpu_end(); - } + return aes_expandkey(ctx, in_key, key_len); - return err; + err = aes_check_keylen(key_len); + if (err) + return err; + + kernel_fpu_begin(); + aesni_set_key(ctx, in_key, key_len); + kernel_fpu_end(); + return 0; } static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, @@ -488,24 +353,8 @@ static int cts_cbc_decrypt(struct skcipher_request *req) } #ifdef CONFIG_X86_64 -static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, unsigned int len, u8 *iv) -{ - /* - * based on key length, override with the by8 version - * of ctr mode encryption/decryption for improved performance - * aes_set_key_common() ensures that key length is one of - * {128,192,256} - */ - if (ctx->key_length == AES_KEYSIZE_128) - aes_ctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len); - else if (ctx->key_length == AES_KEYSIZE_192) - aes_ctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len); - else - aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len); -} - -static int ctr_crypt(struct skcipher_request *req) +/* This is the non-AVX version. 
*/ +static int ctr_crypt_aesni(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); @@ -519,10 +368,9 @@ static int ctr_crypt(struct skcipher_request *req) while ((nbytes = walk.nbytes) > 0) { kernel_fpu_begin(); if (nbytes & AES_BLOCK_MASK) - static_call(aesni_ctr_enc_tfm)(ctx, walk.dst.virt.addr, - walk.src.virt.addr, - nbytes & AES_BLOCK_MASK, - walk.iv); + aesni_ctr_enc(ctx, walk.dst.virt.addr, + walk.src.virt.addr, + nbytes & AES_BLOCK_MASK, walk.iv); nbytes &= ~AES_BLOCK_MASK; if (walk.nbytes == walk.total && nbytes > 0) { @@ -538,346 +386,9 @@ static int ctr_crypt(struct skcipher_request *req) } return err; } - -static void aesni_xctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, - const u8 *in, unsigned int len, u8 *iv, - unsigned int byte_ctr) -{ - if (ctx->key_length == AES_KEYSIZE_128) - aes_xctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len, - byte_ctr); - else if (ctx->key_length == AES_KEYSIZE_192) - aes_xctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len, - byte_ctr); - else - aes_xctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len, - byte_ctr); -} - -static int xctr_crypt(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); - u8 keystream[AES_BLOCK_SIZE]; - struct skcipher_walk walk; - unsigned int nbytes; - unsigned int byte_ctr = 0; - int err; - __le32 block[AES_BLOCK_SIZE / sizeof(__le32)]; - - err = skcipher_walk_virt(&walk, req, false); - - while ((nbytes = walk.nbytes) > 0) { - kernel_fpu_begin(); - if (nbytes & AES_BLOCK_MASK) - aesni_xctr_enc_avx_tfm(ctx, walk.dst.virt.addr, - walk.src.virt.addr, nbytes & AES_BLOCK_MASK, - walk.iv, byte_ctr); - nbytes &= ~AES_BLOCK_MASK; - byte_ctr += walk.nbytes - nbytes; - - if (walk.nbytes == walk.total && nbytes > 0) { - memcpy(block, walk.iv, AES_BLOCK_SIZE); - block[0] ^= cpu_to_le32(1 + byte_ctr / AES_BLOCK_SIZE); - aesni_enc(ctx, keystream, (u8 *)block); - crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes - - nbytes, walk.src.virt.addr + walk.nbytes - - nbytes, keystream, nbytes); - byte_ctr += nbytes; - nbytes = 0; - } - kernel_fpu_end(); - err = skcipher_walk_done(&walk, nbytes); - } - return err; -} - -static int -rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) -{ - struct crypto_aes_ctx ctx; - int ret; - - ret = aes_expandkey(&ctx, key, key_len); - if (ret) - return ret; - - /* Clear the data in the hash sub key container to zero.*/ - /* We want to cipher all zeros to create the hash sub key. */ - memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE); - - aes_encrypt(&ctx, hash_subkey, hash_subkey); - - memzero_explicit(&ctx, sizeof(ctx)); - return 0; -} - -static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, - unsigned int key_len) -{ - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead); - - if (key_len < 4) - return -EINVAL; - - /*Account for 4 byte nonce at the end.*/ - key_len -= 4; - - memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce)); - - return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?: - rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); -} - -/* This is the Integrity Check Value (aka the authentication tag) length and can - * be 8, 12 or 16 bytes long. 
*/ -static int common_rfc4106_set_authsize(struct crypto_aead *aead, - unsigned int authsize) -{ - switch (authsize) { - case 8: - case 12: - case 16: - break; - default: - return -EINVAL; - } - - return 0; -} - -static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, - unsigned int authsize) -{ - switch (authsize) { - case 4: - case 8: - case 12: - case 13: - case 14: - case 15: - case 16: - break; - default: - return -EINVAL; - } - - return 0; -} - -static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, - unsigned int assoclen, u8 *hash_subkey, - u8 *iv, void *aes_ctx, u8 *auth_tag, - unsigned long auth_tag_len) -{ - u8 databuf[sizeof(struct gcm_context_data) + (AESNI_ALIGN - 8)] __aligned(8); - struct gcm_context_data *data = PTR_ALIGN((void *)databuf, AESNI_ALIGN); - unsigned long left = req->cryptlen; - struct scatter_walk assoc_sg_walk; - struct skcipher_walk walk; - bool do_avx, do_avx2; - u8 *assocmem = NULL; - u8 *assoc; - int err; - - if (!enc) - left -= auth_tag_len; - - do_avx = (left >= AVX_GEN2_OPTSIZE); - do_avx2 = (left >= AVX_GEN4_OPTSIZE); - - /* Linearize assoc, if not already linear */ - if (req->src->length >= assoclen && req->src->length) { - scatterwalk_start(&assoc_sg_walk, req->src); - assoc = scatterwalk_map(&assoc_sg_walk); - } else { - gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? - GFP_KERNEL : GFP_ATOMIC; - - /* assoc can be any length, so must be on heap */ - assocmem = kmalloc(assoclen, flags); - if (unlikely(!assocmem)) - return -ENOMEM; - assoc = assocmem; - - scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0); - } - - kernel_fpu_begin(); - if (static_branch_likely(&gcm_use_avx2) && do_avx2) - aesni_gcm_init_avx_gen4(aes_ctx, data, iv, hash_subkey, assoc, - assoclen); - else if (static_branch_likely(&gcm_use_avx) && do_avx) - aesni_gcm_init_avx_gen2(aes_ctx, data, iv, hash_subkey, assoc, - assoclen); - else - aesni_gcm_init(aes_ctx, data, iv, hash_subkey, assoc, assoclen); - kernel_fpu_end(); - - if (!assocmem) - scatterwalk_unmap(assoc); - else - kfree(assocmem); - - err = enc ? 
skcipher_walk_aead_encrypt(&walk, req, false) - : skcipher_walk_aead_decrypt(&walk, req, false); - - while (walk.nbytes > 0) { - kernel_fpu_begin(); - if (static_branch_likely(&gcm_use_avx2) && do_avx2) { - if (enc) - aesni_gcm_enc_update_avx_gen4(aes_ctx, data, - walk.dst.virt.addr, - walk.src.virt.addr, - walk.nbytes); - else - aesni_gcm_dec_update_avx_gen4(aes_ctx, data, - walk.dst.virt.addr, - walk.src.virt.addr, - walk.nbytes); - } else if (static_branch_likely(&gcm_use_avx) && do_avx) { - if (enc) - aesni_gcm_enc_update_avx_gen2(aes_ctx, data, - walk.dst.virt.addr, - walk.src.virt.addr, - walk.nbytes); - else - aesni_gcm_dec_update_avx_gen2(aes_ctx, data, - walk.dst.virt.addr, - walk.src.virt.addr, - walk.nbytes); - } else if (enc) { - aesni_gcm_enc_update(aes_ctx, data, walk.dst.virt.addr, - walk.src.virt.addr, walk.nbytes); - } else { - aesni_gcm_dec_update(aes_ctx, data, walk.dst.virt.addr, - walk.src.virt.addr, walk.nbytes); - } - kernel_fpu_end(); - - err = skcipher_walk_done(&walk, 0); - } - - if (err) - return err; - - kernel_fpu_begin(); - if (static_branch_likely(&gcm_use_avx2) && do_avx2) - aesni_gcm_finalize_avx_gen4(aes_ctx, data, auth_tag, - auth_tag_len); - else if (static_branch_likely(&gcm_use_avx) && do_avx) - aesni_gcm_finalize_avx_gen2(aes_ctx, data, auth_tag, - auth_tag_len); - else - aesni_gcm_finalize(aes_ctx, data, auth_tag, auth_tag_len); - kernel_fpu_end(); - - return 0; -} - -static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, - u8 *hash_subkey, u8 *iv, void *aes_ctx) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 auth_tag[16]; - int err; - - err = gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, aes_ctx, - auth_tag, auth_tag_len); - if (err) - return err; - - scatterwalk_map_and_copy(auth_tag, req->dst, - req->assoclen + req->cryptlen, - auth_tag_len, 1); - return 0; -} - -static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, - u8 *hash_subkey, u8 *iv, void *aes_ctx) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 auth_tag_msg[16]; - u8 auth_tag[16]; - int err; - - err = gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, aes_ctx, - auth_tag, auth_tag_len); - if (err) - return err; - - /* Copy out original auth_tag */ - scatterwalk_map_and_copy(auth_tag_msg, req->src, - req->assoclen + req->cryptlen - auth_tag_len, - auth_tag_len, 0); - - /* Compare generated tag with passed in tag. 
*/ - if (crypto_memneq(auth_tag_msg, auth_tag, auth_tag_len)) { - memzero_explicit(auth_tag, sizeof(auth_tag)); - return -EBADMSG; - } - return 0; -} - -static int helper_rfc4106_encrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); - unsigned int i; - __be32 counter = cpu_to_be32(1); - - /* Assuming we are supporting rfc4106 64-bit extended */ - /* sequence numbers We need to have the AAD length equal */ - /* to 16 or 20 bytes */ - if (unlikely(req->assoclen != 16 && req->assoclen != 20)) - return -EINVAL; - - /* IV below built */ - for (i = 0; i < 4; i++) - *(iv+i) = ctx->nonce[i]; - for (i = 0; i < 8; i++) - *(iv+4+i) = req->iv[i]; - *((__be32 *)(iv+12)) = counter; - - return gcmaes_encrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, - aes_ctx); -} - -static int helper_rfc4106_decrypt(struct aead_request *req) -{ - __be32 counter = cpu_to_be32(1); - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); - unsigned int i; - - if (unlikely(req->assoclen != 16 && req->assoclen != 20)) - return -EINVAL; - - /* Assuming we are supporting rfc4106 64-bit extended */ - /* sequence numbers We need to have the AAD length */ - /* equal to 16 or 20 bytes */ - - /* IV below built */ - for (i = 0; i < 4; i++) - *(iv+i) = ctx->nonce[i]; - for (i = 0; i < 8; i++) - *(iv+4+i) = req->iv[i]; - *((__be32 *)(iv+12)) = counter; - - return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, - aes_ctx); -} #endif -static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, +static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); @@ -898,108 +409,139 @@ static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key, return aes_set_key_common(&ctx->tweak_ctx, key + keylen, keylen); } -static int xts_crypt(struct skcipher_request *req, bool encrypt) +typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key, + u8 iv[AES_BLOCK_SIZE]); +typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key, + const u8 *src, u8 *dst, int len, + u8 tweak[AES_BLOCK_SIZE]); + +/* This handles cases where the source and/or destination span pages. */ +static noinline int +xts_crypt_slowpath(struct skcipher_request *req, xts_crypt_func crypt_func) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); + const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); int tail = req->cryptlen % AES_BLOCK_SIZE; + struct scatterlist sg_src[2], sg_dst[2]; struct skcipher_request subreq; struct skcipher_walk walk; + struct scatterlist *src, *dst; int err; - if (req->cryptlen < AES_BLOCK_SIZE) - return -EINVAL; - - err = skcipher_walk_virt(&walk, req, false); - if (!walk.nbytes) - return err; - - if (unlikely(tail > 0 && walk.nbytes < walk.total)) { - int blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2; - - skcipher_walk_abort(&walk); - + /* + * If the message length isn't divisible by the AES block size, then + * separate off the last full block and the partial block. 
This ensures + * that they are processed in the same call to the assembly function, + * which is required for ciphertext stealing. + */ + if (tail) { skcipher_request_set_tfm(&subreq, tfm); skcipher_request_set_callback(&subreq, skcipher_request_flags(req), NULL, NULL); skcipher_request_set_crypt(&subreq, req->src, req->dst, - blocks * AES_BLOCK_SIZE, req->iv); + req->cryptlen - tail - AES_BLOCK_SIZE, + req->iv); req = &subreq; - - err = skcipher_walk_virt(&walk, req, false); - if (!walk.nbytes) - return err; - } else { - tail = 0; } - kernel_fpu_begin(); + err = skcipher_walk_virt(&walk, req, false); - /* calculate first value of T */ - aesni_enc(&ctx->tweak_ctx, walk.iv, walk.iv); + while (walk.nbytes) { + kernel_fpu_begin(); + (*crypt_func)(&ctx->crypt_ctx, + walk.src.virt.addr, walk.dst.virt.addr, + walk.nbytes & ~(AES_BLOCK_SIZE - 1), req->iv); + kernel_fpu_end(); + err = skcipher_walk_done(&walk, + walk.nbytes & (AES_BLOCK_SIZE - 1)); + } - while (walk.nbytes > 0) { - int nbytes = walk.nbytes; + if (err || !tail) + return err; - if (nbytes < walk.total) - nbytes &= ~(AES_BLOCK_SIZE - 1); + /* Do ciphertext stealing with the last full block and partial block. */ - if (encrypt) - aesni_xts_encrypt(&ctx->crypt_ctx, - walk.dst.virt.addr, walk.src.virt.addr, - nbytes, walk.iv); - else - aesni_xts_decrypt(&ctx->crypt_ctx, - walk.dst.virt.addr, walk.src.virt.addr, - nbytes, walk.iv); - kernel_fpu_end(); + dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, + req->iv); - if (walk.nbytes > 0) - kernel_fpu_begin(); - } + err = skcipher_walk_virt(&walk, req, false); + if (err) + return err; - if (unlikely(tail > 0 && !err)) { - struct scatterlist sg_src[2], sg_dst[2]; - struct scatterlist *src, *dst; + kernel_fpu_begin(); + (*crypt_func)(&ctx->crypt_ctx, walk.src.virt.addr, walk.dst.virt.addr, + walk.nbytes, req->iv); + kernel_fpu_end(); - dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); - if (req->dst != req->src) - dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); + return skcipher_walk_done(&walk, 0); +} - skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, - req->iv); +/* __always_inline to avoid indirect call in fastpath */ +static __always_inline int +xts_crypt(struct skcipher_request *req, xts_encrypt_iv_func encrypt_iv, + xts_crypt_func crypt_func) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); - err = skcipher_walk_virt(&walk, &subreq, false); - if (err) - return err; + if (unlikely(req->cryptlen < AES_BLOCK_SIZE)) + return -EINVAL; - kernel_fpu_begin(); - if (encrypt) - aesni_xts_encrypt(&ctx->crypt_ctx, - walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes, walk.iv); - else - aesni_xts_decrypt(&ctx->crypt_ctx, - walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes, walk.iv); - kernel_fpu_end(); + kernel_fpu_begin(); + (*encrypt_iv)(&ctx->tweak_ctx, req->iv); - err = skcipher_walk_done(&walk, 0); + /* + * In practice, virtually all XTS plaintexts and ciphertexts are either + * 512 or 4096 bytes and do not use multiple scatterlist elements. To + * optimize the performance of these cases, the below fast-path handles + * single-scatterlist-element messages as efficiently as possible. The + * code is 64-bit specific, as it assumes no page mapping is needed. 
+ */ + if (IS_ENABLED(CONFIG_X86_64) && + likely(req->src->length >= req->cryptlen && + req->dst->length >= req->cryptlen)) { + (*crypt_func)(&ctx->crypt_ctx, sg_virt(req->src), + sg_virt(req->dst), req->cryptlen, req->iv); + kernel_fpu_end(); + return 0; } - return err; + kernel_fpu_end(); + return xts_crypt_slowpath(req, crypt_func); +} + +static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, + u8 iv[AES_BLOCK_SIZE]) +{ + aesni_enc(tweak_key, iv, iv); +} + +static void aesni_xts_encrypt(const struct crypto_aes_ctx *key, + const u8 *src, u8 *dst, int len, + u8 tweak[AES_BLOCK_SIZE]) +{ + aesni_xts_enc(key, dst, src, len, tweak); +} + +static void aesni_xts_decrypt(const struct crypto_aes_ctx *key, + const u8 *src, u8 *dst, int len, + u8 tweak[AES_BLOCK_SIZE]) +{ + aesni_xts_dec(key, dst, src, len, tweak); } -static int xts_encrypt(struct skcipher_request *req) +static int xts_encrypt_aesni(struct skcipher_request *req) { - return xts_crypt(req, true); + return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_encrypt); } -static int xts_decrypt(struct skcipher_request *req) +static int xts_decrypt_aesni(struct skcipher_request *req) { - return xts_crypt(req, false); + return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_decrypt); } static struct crypto_alg aesni_cipher_alg = { @@ -1086,8 +628,8 @@ static struct skcipher_alg aesni_skciphers[] = { .ivsize = AES_BLOCK_SIZE, .chunksize = AES_BLOCK_SIZE, .setkey = aesni_skcipher_setkey, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, + .encrypt = ctr_crypt_aesni, + .decrypt = ctr_crypt_aesni, #endif }, { .base = { @@ -1103,9 +645,9 @@ static struct skcipher_alg aesni_skciphers[] = { .max_keysize = 2 * AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, .walksize = 2 * AES_BLOCK_SIZE, - .setkey = xts_aesni_setkey, - .encrypt = xts_encrypt, - .decrypt = xts_decrypt, + .setkey = xts_setkey_aesni, + .encrypt = xts_encrypt_aesni, + .decrypt = xts_decrypt_aesni, } }; @@ -1113,114 +655,1023 @@ static struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)]; #ifdef CONFIG_X86_64 +asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, + u8 iv[AES_BLOCK_SIZE]); + +/* __always_inline to avoid indirect call */ +static __always_inline int +ctr_crypt(struct skcipher_request *req, + void (*ctr64_func)(const struct crypto_aes_ctx *key, + const u8 *src, u8 *dst, int len, + const u64 le_ctr[2])) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm)); + unsigned int nbytes, p1_nbytes, nblocks; + struct skcipher_walk walk; + u64 le_ctr[2]; + u64 ctr64; + int err; + + ctr64 = le_ctr[0] = get_unaligned_be64(&req->iv[8]); + le_ctr[1] = get_unaligned_be64(&req->iv[0]); + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) != 0) { + if (nbytes < walk.total) { + /* Not the end yet, so keep the length block-aligned. */ + nbytes = round_down(nbytes, AES_BLOCK_SIZE); + nblocks = nbytes / AES_BLOCK_SIZE; + } else { + /* It's the end, so include any final partial block. */ + nblocks = DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE); + } + ctr64 += nblocks; + + kernel_fpu_begin(); + if (likely(ctr64 >= nblocks)) { + /* The low 64 bits of the counter won't overflow. */ + (*ctr64_func)(key, walk.src.virt.addr, + walk.dst.virt.addr, nbytes, le_ctr); + } else { + /* + * The low 64 bits of the counter will overflow. 
The + * assembly doesn't handle this case, so split the + * operation into two at the point where the overflow + * will occur. After the first part, add the carry bit. + */ + p1_nbytes = min_t(unsigned int, nbytes, + (nblocks - ctr64) * AES_BLOCK_SIZE); + (*ctr64_func)(key, walk.src.virt.addr, + walk.dst.virt.addr, p1_nbytes, le_ctr); + le_ctr[0] = 0; + le_ctr[1]++; + (*ctr64_func)(key, walk.src.virt.addr + p1_nbytes, + walk.dst.virt.addr + p1_nbytes, + nbytes - p1_nbytes, le_ctr); + } + kernel_fpu_end(); + le_ctr[0] = ctr64; + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + put_unaligned_be64(ctr64, &req->iv[8]); + put_unaligned_be64(le_ctr[1], &req->iv[0]); + + return err; +} + +/* __always_inline to avoid indirect call */ +static __always_inline int +xctr_crypt(struct skcipher_request *req, + void (*xctr_func)(const struct crypto_aes_ctx *key, + const u8 *src, u8 *dst, int len, + const u8 iv[AES_BLOCK_SIZE], u64 ctr)) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm)); + struct skcipher_walk walk; + unsigned int nbytes; + u64 ctr = 1; + int err; + + err = skcipher_walk_virt(&walk, req, false); + while ((nbytes = walk.nbytes) != 0) { + if (nbytes < walk.total) + nbytes = round_down(nbytes, AES_BLOCK_SIZE); + + kernel_fpu_begin(); + (*xctr_func)(key, walk.src.virt.addr, walk.dst.virt.addr, + nbytes, req->iv, ctr); + kernel_fpu_end(); + + ctr += DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + return err; +} + +#define DEFINE_AVX_SKCIPHER_ALGS(suffix, driver_name_suffix, priority) \ + \ +asmlinkage void \ +aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ + u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \ +asmlinkage void \ +aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ + u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \ + \ +static int xts_encrypt_##suffix(struct skcipher_request *req) \ +{ \ + return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_encrypt_##suffix); \ +} \ + \ +static int xts_decrypt_##suffix(struct skcipher_request *req) \ +{ \ + return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_decrypt_##suffix); \ +} \ + \ +asmlinkage void \ +aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key, \ + const u8 *src, u8 *dst, int len, const u64 le_ctr[2]);\ + \ +static int ctr_crypt_##suffix(struct skcipher_request *req) \ +{ \ + return ctr_crypt(req, aes_ctr64_crypt_##suffix); \ +} \ + \ +asmlinkage void \ +aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key, \ + const u8 *src, u8 *dst, int len, \ + const u8 iv[AES_BLOCK_SIZE], u64 ctr); \ + \ +static int xctr_crypt_##suffix(struct skcipher_request *req) \ +{ \ + return xctr_crypt(req, aes_xctr_crypt_##suffix); \ +} \ + \ +static struct skcipher_alg skcipher_algs_##suffix[] = {{ \ + .base.cra_name = "__xts(aes)", \ + .base.cra_driver_name = "__xts-aes-" driver_name_suffix, \ + .base.cra_priority = priority, \ + .base.cra_flags = CRYPTO_ALG_INTERNAL, \ + .base.cra_blocksize = AES_BLOCK_SIZE, \ + .base.cra_ctxsize = XTS_AES_CTX_SIZE, \ + .base.cra_module = THIS_MODULE, \ + .min_keysize = 2 * AES_MIN_KEY_SIZE, \ + .max_keysize = 2 * AES_MAX_KEY_SIZE, \ + .ivsize = AES_BLOCK_SIZE, \ + .walksize = 2 * AES_BLOCK_SIZE, \ + .setkey = xts_setkey_aesni, \ + .encrypt = xts_encrypt_##suffix, \ + .decrypt = xts_decrypt_##suffix, \ +}, { \ + .base.cra_name = "__ctr(aes)", \ + .base.cra_driver_name = "__ctr-aes-" driver_name_suffix, \ + 
.base.cra_priority = priority, \ + .base.cra_flags = CRYPTO_ALG_INTERNAL, \ + .base.cra_blocksize = 1, \ + .base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \ + .base.cra_module = THIS_MODULE, \ + .min_keysize = AES_MIN_KEY_SIZE, \ + .max_keysize = AES_MAX_KEY_SIZE, \ + .ivsize = AES_BLOCK_SIZE, \ + .chunksize = AES_BLOCK_SIZE, \ + .setkey = aesni_skcipher_setkey, \ + .encrypt = ctr_crypt_##suffix, \ + .decrypt = ctr_crypt_##suffix, \ +}, { \ + .base.cra_name = "__xctr(aes)", \ + .base.cra_driver_name = "__xctr-aes-" driver_name_suffix, \ + .base.cra_priority = priority, \ + .base.cra_flags = CRYPTO_ALG_INTERNAL, \ + .base.cra_blocksize = 1, \ + .base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \ + .base.cra_module = THIS_MODULE, \ + .min_keysize = AES_MIN_KEY_SIZE, \ + .max_keysize = AES_MAX_KEY_SIZE, \ + .ivsize = AES_BLOCK_SIZE, \ + .chunksize = AES_BLOCK_SIZE, \ + .setkey = aesni_skcipher_setkey, \ + .encrypt = xctr_crypt_##suffix, \ + .decrypt = xctr_crypt_##suffix, \ +}}; \ + \ +static struct simd_skcipher_alg * \ +simd_skcipher_algs_##suffix[ARRAY_SIZE(skcipher_algs_##suffix)] + +DEFINE_AVX_SKCIPHER_ALGS(aesni_avx, "aesni-avx", 500); +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) +DEFINE_AVX_SKCIPHER_ALGS(vaes_avx2, "vaes-avx2", 600); +DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_256, "vaes-avx10_256", 700); +DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_512, "vaes-avx10_512", 800); +#endif + +/* The common part of the x86_64 AES-GCM key struct */ +struct aes_gcm_key { + /* Expanded AES key and the AES key length in bytes */ + struct crypto_aes_ctx aes_key; + + /* RFC4106 nonce (used only by the rfc4106 algorithms) */ + u32 rfc4106_nonce; +}; + +/* Key struct used by the AES-NI implementations of AES-GCM */ +struct aes_gcm_key_aesni { + /* + * Common part of the key. The assembly code requires 16-byte alignment + * for the round keys; we get this by them being located at the start of + * the struct and the whole struct being 16-byte aligned. + */ + struct aes_gcm_key base; + + /* + * Powers of the hash key H^8 through H^1. These are 128-bit values. + * They all have an extra factor of x^-1 and are byte-reversed. 16-byte + * alignment is required by the assembly code. + */ + u64 h_powers[8][2] __aligned(16); + + /* + * h_powers_xored[i] contains the two 64-bit halves of h_powers[i] XOR'd + * together. It's used for Karatsuba multiplication. 16-byte alignment + * is required by the assembly code. + */ + u64 h_powers_xored[8] __aligned(16); + + /* + * H^1 times x^64 (and also the usual extra factor of x^-1). 16-byte + * alignment is required by the assembly code. + */ + u64 h_times_x64[2] __aligned(16); +}; +#define AES_GCM_KEY_AESNI(key) \ + container_of((key), struct aes_gcm_key_aesni, base) +#define AES_GCM_KEY_AESNI_SIZE \ + (sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1))) + +/* Key struct used by the VAES + AVX10 implementations of AES-GCM */ +struct aes_gcm_key_avx10 { + /* + * Common part of the key. The assembly code prefers 16-byte alignment + * for the round keys; we get this by them being located at the start of + * the struct and the whole struct being 64-byte aligned. + */ + struct aes_gcm_key base; + + /* + * Powers of the hash key H^16 through H^1. These are 128-bit values. + * They all have an extra factor of x^-1 and are byte-reversed. This + * array is aligned to a 64-byte boundary to make it naturally aligned + * for 512-bit loads, which can improve performance. (The assembly code + * doesn't *need* the alignment; this is just an optimization.) 
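As an illustration of why the AES_GCM_KEY_*_SIZE macros reserve extra bytes, here is a minimal userspace sketch (the values and helper names are hypothetical): the aligned key pointer is later recovered inside the possibly less-aligned context with the same arithmetic PTR_ALIGN() uses in aes_gcm_key_get() below, and the reserved slack guarantees the aligned struct still fits.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define WANT_ALIGN	16u	/* alignment the assembly wants                  */
#define MIN_ALIGN	8u	/* assumed allocator guarantee (CRYPTO_MINALIGN) */

/* Same shape as AES_GCM_KEY_AESNI_SIZE: the real size plus worst-case slack,
 * given that the allocation is already MIN_ALIGN-aligned. */
#define CTX_SIZE(real)	((real) + ((WANT_ALIGN - 1) & ~(MIN_ALIGN - 1)))

static void *ptr_align(void *p, uintptr_t a)
{
	return (void *)(((uintptr_t)p + a - 1) & ~(a - 1));
}

int main(void)
{
	size_t real = 720;	/* stand-in for sizeof(struct aes_gcm_key_aesni) */
	unsigned char *ctx = malloc(CTX_SIZE(real));
	unsigned char *key = ptr_align(ctx, WANT_ALIGN);

	/* key + real never goes past ctx + CTX_SIZE(real): aligning up can
	 * skip at most WANT_ALIGN - MIN_ALIGN bytes, which is exactly the
	 * slack that CTX_SIZE() reserves. */
	printf("slack used: %zu of %zu reserved\n",
	       (size_t)(key - ctx), (size_t)(CTX_SIZE(real) - real));
	free(ctx);
	return 0;
}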
+ */ + u64 h_powers[16][2] __aligned(64); + + /* Three padding blocks required by the assembly code */ + u64 padding[3][2]; +}; +#define AES_GCM_KEY_AVX10(key) \ + container_of((key), struct aes_gcm_key_avx10, base) +#define AES_GCM_KEY_AVX10_SIZE \ + (sizeof(struct aes_gcm_key_avx10) + (63 & ~(CRYPTO_MINALIGN - 1))) + /* - * XCTR does not have a non-AVX implementation, so it must be enabled - * conditionally. + * These flags are passed to the AES-GCM helper functions to specify the + * specific version of AES-GCM (RFC4106 or not), whether it's encryption or + * decryption, and which assembly functions should be called. Assembly + * functions are selected using flags instead of function pointers to avoid + * indirect calls (which are very expensive on x86) regardless of inlining. */ -static struct skcipher_alg aesni_xctr = { - .base = { - .cra_name = "__xctr(aes)", - .cra_driver_name = "__xctr-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = CRYPTO_AES_CTX_SIZE, - .cra_module = THIS_MODULE, - }, - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .chunksize = AES_BLOCK_SIZE, - .setkey = aesni_skcipher_setkey, - .encrypt = xctr_crypt, - .decrypt = xctr_crypt, -}; +#define FLAG_RFC4106 BIT(0) +#define FLAG_ENC BIT(1) +#define FLAG_AVX BIT(2) +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) +# define FLAG_AVX10_256 BIT(3) +# define FLAG_AVX10_512 BIT(4) +#else + /* + * This should cause all calls to the AVX10 assembly functions to be + * optimized out, avoiding the need to ifdef each call individually. + */ +# define FLAG_AVX10_256 0 +# define FLAG_AVX10_512 0 +#endif -static struct simd_skcipher_alg *aesni_simd_xctr; -#endif /* CONFIG_X86_64 */ +static inline struct aes_gcm_key * +aes_gcm_key_get(struct crypto_aead *tfm, int flags) +{ + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) + return PTR_ALIGN(crypto_aead_ctx(tfm), 64); + else + return PTR_ALIGN(crypto_aead_ctx(tfm), 16); +} -#ifdef CONFIG_X86_64 -static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key, - unsigned int key_len) +asmlinkage void +aes_gcm_precompute_aesni(struct aes_gcm_key_aesni *key); +asmlinkage void +aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key); +asmlinkage void +aes_gcm_precompute_vaes_avx10_256(struct aes_gcm_key_avx10 *key); +asmlinkage void +aes_gcm_precompute_vaes_avx10_512(struct aes_gcm_key_avx10 *key); + +static void aes_gcm_precompute(struct aes_gcm_key *key, int flags) { - struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead); + /* + * To make things a bit easier on the assembly side, the AVX10 + * implementations use the same key format. Therefore, a single + * function using 256-bit vectors would suffice here. However, it's + * straightforward to provide a 512-bit one because of how the assembly + * code is structured, and it works nicely because the total size of the + * key powers is a multiple of 512 bits. So we take advantage of that. + * + * A similar situation applies to the AES-NI implementations. 
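The note about avoiding indirect calls deserves a concrete illustration. In this hedged sketch (the names are stand-ins, not the real glue functions), the dispatcher is always_inline and every caller passes a compile-time-constant flags value, so the compiler folds the branch away and each wrapper compiles down to a direct call:

#include <stdio.h>

#define FLAG_ENC	(1 << 1)
#define FLAG_AVX	(1 << 2)

static void update_aesni(const char *what) { printf("aesni: %s\n", what); }
static void update_avx(const char *what)   { printf("avx:   %s\n", what); }

/* always_inline + constant flags: the branch below disappears after inlining,
 * leaving a direct call in each wrapper, with no function pointer involved. */
static inline __attribute__((always_inline))
void update_dispatch(int flags, const char *what)
{
	if (flags & FLAG_AVX)
		update_avx(what);
	else
		update_aesni(what);
}

static void encrypt_avx(void)   { update_dispatch(FLAG_ENC | FLAG_AVX, "enc"); }
static void encrypt_plain(void) { update_dispatch(FLAG_ENC, "enc"); }

int main(void)
{
	encrypt_avx();
	encrypt_plain();
	return 0;
}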
+ */ + if (flags & FLAG_AVX10_512) + aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key)); + else if (flags & FLAG_AVX10_256) + aes_gcm_precompute_vaes_avx10_256(AES_GCM_KEY_AVX10(key)); + else if (flags & FLAG_AVX) + aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key)); + else + aes_gcm_precompute_aesni(AES_GCM_KEY_AESNI(key)); +} - return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?: - rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); +asmlinkage void +aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key, + u8 ghash_acc[16], const u8 *aad, int aadlen); +asmlinkage void +aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key, + u8 ghash_acc[16], const u8 *aad, int aadlen); +asmlinkage void +aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, + u8 ghash_acc[16], const u8 *aad, int aadlen); + +static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16], + const u8 *aad, int aadlen, int flags) +{ + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) + aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key), ghash_acc, + aad, aadlen); + else if (flags & FLAG_AVX) + aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key), ghash_acc, + aad, aadlen); + else + aes_gcm_aad_update_aesni(AES_GCM_KEY_AESNI(key), ghash_acc, + aad, aadlen); } -static int generic_gcmaes_encrypt(struct aead_request *req) +asmlinkage void +aes_gcm_enc_update_aesni(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_enc_update_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_enc_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); + +asmlinkage void +aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_dec_update_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_dec_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_dec_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); + +/* __always_inline to optimize out the branches based on @flags */ +static __always_inline void +aes_gcm_update(const struct aes_gcm_key *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen, int flags) { - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); - __be32 counter = cpu_to_be32(1); + if (flags & FLAG_ENC) { + if (flags & FLAG_AVX10_512) + aes_gcm_enc_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + src, dst, datalen); + else if (flags & FLAG_AVX10_256) + aes_gcm_enc_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + src, dst, datalen); + else if (flags 
& FLAG_AVX) + aes_gcm_enc_update_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + src, dst, datalen); + else + aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr, + ghash_acc, src, dst, datalen); + } else { + if (flags & FLAG_AVX10_512) + aes_gcm_dec_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + src, dst, datalen); + else if (flags & FLAG_AVX10_256) + aes_gcm_dec_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + src, dst, datalen); + else if (flags & FLAG_AVX) + aes_gcm_dec_update_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + src, dst, datalen); + else + aes_gcm_dec_update_aesni(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + src, dst, datalen); + } +} - memcpy(iv, req->iv, 12); - *((__be32 *)(iv+12)) = counter; +asmlinkage void +aes_gcm_enc_final_aesni(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen); +asmlinkage void +aes_gcm_enc_final_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen); +asmlinkage void +aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen); + +/* __always_inline to optimize out the branches based on @flags */ +static __always_inline void +aes_gcm_enc_final(const struct aes_gcm_key *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, int flags) +{ + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) + aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen); + else if (flags & FLAG_AVX) + aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen); + else + aes_gcm_enc_final_aesni(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen); +} - return gcmaes_encrypt(req, req->assoclen, ctx->hash_subkey, iv, - aes_ctx); +asmlinkage bool __must_check +aes_gcm_dec_final_aesni(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], const u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, + const u8 tag[16], int taglen); +asmlinkage bool __must_check +aes_gcm_dec_final_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], const u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, + const u8 tag[16], int taglen); +asmlinkage bool __must_check +aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], const u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, + const u8 tag[16], int taglen); + +/* __always_inline to optimize out the branches based on @flags */ +static __always_inline bool __must_check +aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4], + u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, + u8 tag[16], int taglen, int flags) +{ + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) + return aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen, + tag, taglen); + else if (flags & FLAG_AVX) + return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen, + tag, taglen); + else + return aes_gcm_dec_final_aesni(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen, + tag, taglen); } -static int generic_gcmaes_decrypt(struct aead_request *req) +/* + * This is the Integrity Check Value (aka the authentication tag) length and can + * be 8, 
12 or 16 bytes long. + */ +static int common_rfc4106_set_authsize(struct crypto_aead *aead, + unsigned int authsize) +{ + switch (authsize) { + case 8: + case 12: + case 16: + break; + default: + return -EINVAL; + } + + return 0; +} + +static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, + unsigned int authsize) +{ + switch (authsize) { + case 4: + case 8: + case 12: + case 13: + case 14: + case 15: + case 16: + break; + default: + return -EINVAL; + } + + return 0; +} + +/* + * This is the setkey function for the x86_64 implementations of AES-GCM. It + * saves the RFC4106 nonce if applicable, expands the AES key, and precomputes + * powers of the hash key. + * + * To comply with the crypto_aead API, this has to be usable in no-SIMD context. + * For that reason, this function includes a portable C implementation of the + * needed logic. However, the portable C implementation is very slow, taking + * about the same time as encrypting 37 KB of data. To be ready for users that + * may set a key even somewhat frequently, we therefore also include a SIMD + * assembly implementation, expanding the AES key using AES-NI and precomputing + * the hash key powers using PCLMULQDQ or VPCLMULQDQ. + */ +static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, + unsigned int keylen, int flags) +{ + struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags); + int err; + + if (flags & FLAG_RFC4106) { + if (keylen < 4) + return -EINVAL; + keylen -= 4; + key->rfc4106_nonce = get_unaligned_be32(raw_key + keylen); + } + + /* The assembly code assumes the following offsets. */ + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_enc) != 0); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_length) != 480); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers) != 496); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers_xored) != 624); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_times_x64) != 688); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_enc) != 0); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_length) != 480); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, h_powers) != 512); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, padding) != 768); + + if (likely(crypto_simd_usable())) { + err = aes_check_keylen(keylen); + if (err) + return err; + kernel_fpu_begin(); + aesni_set_key(&key->aes_key, raw_key, keylen); + aes_gcm_precompute(key, flags); + kernel_fpu_end(); + } else { + static const u8 x_to_the_minus1[16] __aligned(__alignof__(be128)) = { + [0] = 0xc2, [15] = 1 + }; + static const u8 x_to_the_63[16] __aligned(__alignof__(be128)) = { + [7] = 1, + }; + be128 h1 = {}; + be128 h; + int i; + + err = aes_expandkey(&key->aes_key, raw_key, keylen); + if (err) + return err; + + /* Encrypt the all-zeroes block to get the hash key H^1 */ + aes_encrypt(&key->aes_key, (u8 *)&h1, (u8 *)&h1); + + /* Compute H^1 * x^-1 */ + h = h1; + gf128mul_lle(&h, (const be128 *)x_to_the_minus1); + + /* Compute the needed key powers */ + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) { + struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key); + + for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { + k->h_powers[i][0] = be64_to_cpu(h.b); + k->h_powers[i][1] = be64_to_cpu(h.a); + gf128mul_lle(&h, &h1); + } + memset(k->padding, 0, sizeof(k->padding)); + } else { + struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key); + + for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { + k->h_powers[i][0] = be64_to_cpu(h.b); + 
k->h_powers[i][1] = be64_to_cpu(h.a); + k->h_powers_xored[i] = k->h_powers[i][0] ^ + k->h_powers[i][1]; + gf128mul_lle(&h, &h1); + } + gf128mul_lle(&h1, (const be128 *)x_to_the_63); + k->h_times_x64[0] = be64_to_cpu(h1.b); + k->h_times_x64[1] = be64_to_cpu(h1.a); + } + } + return 0; +} + +/* + * Initialize @ghash_acc, then pass all @assoclen bytes of associated data + * (a.k.a. additional authenticated data) from @sg_src through the GHASH update + * assembly function. kernel_fpu_begin() must have already been called. + */ +static void gcm_process_assoc(const struct aes_gcm_key *key, u8 ghash_acc[16], + struct scatterlist *sg_src, unsigned int assoclen, + int flags) +{ + struct scatter_walk walk; + /* + * The assembly function requires that the length of any non-last + * segment of associated data be a multiple of 16 bytes, so this + * function does the buffering needed to achieve that. + */ + unsigned int pos = 0; + u8 buf[16]; + + memset(ghash_acc, 0, 16); + scatterwalk_start(&walk, sg_src); + + while (assoclen) { + unsigned int orig_len_this_step = scatterwalk_next( + &walk, assoclen); + unsigned int len_this_step = orig_len_this_step; + unsigned int len; + const u8 *src = walk.addr; + + if (unlikely(pos)) { + len = min(len_this_step, 16 - pos); + memcpy(&buf[pos], src, len); + pos += len; + src += len; + len_this_step -= len; + if (pos < 16) + goto next; + aes_gcm_aad_update(key, ghash_acc, buf, 16, flags); + pos = 0; + } + len = len_this_step; + if (unlikely(assoclen)) /* Not the last segment yet? */ + len = round_down(len, 16); + aes_gcm_aad_update(key, ghash_acc, src, len, flags); + src += len; + len_this_step -= len; + if (unlikely(len_this_step)) { + memcpy(buf, src, len_this_step); + pos = len_this_step; + } +next: + scatterwalk_done_src(&walk, orig_len_this_step); + if (need_resched()) { + kernel_fpu_end(); + kernel_fpu_begin(); + } + assoclen -= orig_len_this_step; + } + if (unlikely(pos)) + aes_gcm_aad_update(key, ghash_acc, buf, pos, flags); +} + + +/* __always_inline to optimize out the branches based on @flags */ +static __always_inline int +gcm_crypt(struct aead_request *req, int flags) { - __be32 counter = cpu_to_be32(1); struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); + const struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags); + unsigned int assoclen = req->assoclen; + struct skcipher_walk walk; + unsigned int nbytes; + u8 ghash_acc[16]; /* GHASH accumulator */ + u32 le_ctr[4]; /* Counter in little-endian format */ + int taglen; + int err; - memcpy(iv, req->iv, 12); - *((__be32 *)(iv+12)) = counter; + /* Initialize the counter and determine the associated data length. */ + le_ctr[0] = 2; + if (flags & FLAG_RFC4106) { + if (unlikely(assoclen != 16 && assoclen != 20)) + return -EINVAL; + assoclen -= 8; + le_ctr[1] = get_unaligned_be32(req->iv + 4); + le_ctr[2] = get_unaligned_be32(req->iv + 0); + le_ctr[3] = key->rfc4106_nonce; /* already byte-swapped */ + } else { + le_ctr[1] = get_unaligned_be32(req->iv + 8); + le_ctr[2] = get_unaligned_be32(req->iv + 4); + le_ctr[3] = get_unaligned_be32(req->iv + 0); + } - return gcmaes_decrypt(req, req->assoclen, ctx->hash_subkey, iv, - aes_ctx); + /* Begin walking through the plaintext or ciphertext. 
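The buffering done by gcm_process_assoc() above is easy to get wrong, so here is a standalone sketch of the same idea under simplified assumptions (plain arrays instead of a scatterlist, and ghash_chunk() as a hypothetical stand-in for the assembly hook): every call for a non-final chunk is a multiple of 16 bytes, and partial pieces are staged in a small buffer.

#include <stdio.h>
#include <string.h>

static void ghash_chunk(const unsigned char *p, unsigned int len)
{
	(void)p;
	printf("hashing %u bytes\n", len);	/* 16*n except possibly the last call */
}

static void process_assoc(const unsigned char *segs[],
			  const unsigned int seglen[], int nsegs)
{
	unsigned char buf[16];
	unsigned int pos = 0;

	for (int i = 0; i < nsegs; i++) {
		const unsigned char *src = segs[i];
		unsigned int len = seglen[i];
		int last = (i == nsegs - 1);

		if (pos) {			/* top up the staging buffer */
			unsigned int n = len < 16 - pos ? len : 16 - pos;

			memcpy(&buf[pos], src, n);
			pos += n; src += n; len -= n;
			if (pos < 16 && !last)
				continue;	/* keep buffering */
			ghash_chunk(buf, pos);
			pos = 0;
		}
		if (!last) {			/* hash the 16-byte-aligned part */
			unsigned int aligned = len & ~15u;

			if (aligned)
				ghash_chunk(src, aligned);
			memcpy(buf, src + aligned, len - aligned);
			pos = len - aligned;
		} else if (len) {		/* final piece may be partial */
			ghash_chunk(src, len);
		}
	}
	if (pos)
		ghash_chunk(buf, pos);
}

int main(void)
{
	static const unsigned char a[20], b[7], c[9];
	const unsigned char *segs[] = { a, b, c };
	const unsigned int lens[] = { 20, 7, 9 };

	process_assoc(segs, lens, 3);		/* prints 16, 16, 4 */
	return 0;
}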
*/ + if (flags & FLAG_ENC) + err = skcipher_walk_aead_encrypt(&walk, req, false); + else + err = skcipher_walk_aead_decrypt(&walk, req, false); + if (err) + return err; + + /* + * Since the AES-GCM assembly code requires that at least three assembly + * functions be called to process any message (this is needed to support + * incremental updates cleanly), to reduce overhead we try to do all + * three calls in the same kernel FPU section if possible. We close the + * section and start a new one if there are multiple data segments or if + * rescheduling is needed while processing the associated data. + */ + kernel_fpu_begin(); + + /* Pass the associated data through GHASH. */ + gcm_process_assoc(key, ghash_acc, req->src, assoclen, flags); + + /* En/decrypt the data and pass the ciphertext through GHASH. */ + while (unlikely((nbytes = walk.nbytes) < walk.total)) { + /* + * Non-last segment. In this case, the assembly function + * requires that the length be a multiple of 16 (AES_BLOCK_SIZE) + * bytes. The needed buffering of up to 16 bytes is handled by + * the skcipher_walk. Here we just need to round down to a + * multiple of 16. + */ + nbytes = round_down(nbytes, AES_BLOCK_SIZE); + aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr, + walk.dst.virt.addr, nbytes, flags); + le_ctr[0] += nbytes / AES_BLOCK_SIZE; + kernel_fpu_end(); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + if (err) + return err; + kernel_fpu_begin(); + } + /* Last segment: process all remaining data. */ + aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr, + walk.dst.virt.addr, nbytes, flags); + /* + * The low word of the counter isn't used by the finalize, so there's no + * need to increment it here. + */ + + /* Finalize */ + taglen = crypto_aead_authsize(tfm); + if (flags & FLAG_ENC) { + /* Finish computing the auth tag. */ + aes_gcm_enc_final(key, le_ctr, ghash_acc, assoclen, + req->cryptlen, flags); + + /* Store the computed auth tag in the dst scatterlist. */ + scatterwalk_map_and_copy(ghash_acc, req->dst, req->assoclen + + req->cryptlen, taglen, 1); + } else { + unsigned int datalen = req->cryptlen - taglen; + u8 tag[16]; + + /* Get the transmitted auth tag from the src scatterlist. */ + scatterwalk_map_and_copy(tag, req->src, req->assoclen + datalen, + taglen, 0); + /* + * Finish computing the auth tag and compare it to the + * transmitted one. The assembly function does the actual tag + * comparison. Here, just check the boolean result. 
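Since the tag comparison itself happens inside the assembly here, the following is only a hedged standalone sketch of the finalize step's shape (the helper names are hypothetical, and the byte-wise constant-time compare stands in for what the assembly does): encryption appends the computed tag, decryption compares it with the transmitted one and reports -EBADMSG on mismatch.

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Compare without early exit so timing does not depend on where they differ. */
static int tags_equal(const unsigned char *a, const unsigned char *b, unsigned int n)
{
	unsigned char diff = 0;

	while (n--)
		diff |= *a++ ^ *b++;
	return diff == 0;
}

static int finalize(int encrypting, unsigned char *wire_tag,
		    const unsigned char computed[16], unsigned int taglen)
{
	if (encrypting) {
		memcpy(wire_tag, computed, taglen);	/* append the auth tag */
		return 0;
	}
	return tags_equal(wire_tag, computed, taglen) ? 0 : -EBADMSG;
}

int main(void)
{
	unsigned char computed[16] = { 1, 2, 3 };
	unsigned char wire[16];

	finalize(1, wire, computed, 16);
	printf("intact:   %d\n", finalize(0, wire, computed, 16));
	wire[0] ^= 1;
	printf("tampered: %d\n", finalize(0, wire, computed, 16));
	return 0;
}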
+ */ + if (!aes_gcm_dec_final(key, le_ctr, ghash_acc, assoclen, + datalen, tag, taglen, flags)) + err = -EBADMSG; + } + kernel_fpu_end(); + if (nbytes) + skcipher_walk_done(&walk, 0); + return err; } -static struct aead_alg aesni_aeads[] = { { - .setkey = common_rfc4106_set_key, - .setauthsize = common_rfc4106_set_authsize, - .encrypt = helper_rfc4106_encrypt, - .decrypt = helper_rfc4106_decrypt, - .ivsize = GCM_RFC4106_IV_SIZE, - .maxauthsize = 16, - .base = { - .cra_name = "__rfc4106(gcm(aes))", - .cra_driver_name = "__rfc4106-gcm-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx), - .cra_alignmask = 0, - .cra_module = THIS_MODULE, - }, -}, { - .setkey = generic_gcmaes_set_key, - .setauthsize = generic_gcmaes_set_authsize, - .encrypt = generic_gcmaes_encrypt, - .decrypt = generic_gcmaes_decrypt, - .ivsize = GCM_AES_IV_SIZE, - .maxauthsize = 16, - .base = { - .cra_name = "__gcm(aes)", - .cra_driver_name = "__generic-gcm-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct generic_gcmaes_ctx), - .cra_alignmask = 0, - .cra_module = THIS_MODULE, - }, -} }; -#else -static struct aead_alg aesni_aeads[0]; +#define DEFINE_GCM_ALGS(suffix, flags, generic_driver_name, rfc_driver_name, \ + ctxsize, priority) \ + \ +static int gcm_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \ + unsigned int keylen) \ +{ \ + return gcm_setkey(tfm, raw_key, keylen, (flags)); \ +} \ + \ +static int gcm_encrypt_##suffix(struct aead_request *req) \ +{ \ + return gcm_crypt(req, (flags) | FLAG_ENC); \ +} \ + \ +static int gcm_decrypt_##suffix(struct aead_request *req) \ +{ \ + return gcm_crypt(req, (flags)); \ +} \ + \ +static int rfc4106_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \ + unsigned int keylen) \ +{ \ + return gcm_setkey(tfm, raw_key, keylen, (flags) | FLAG_RFC4106); \ +} \ + \ +static int rfc4106_encrypt_##suffix(struct aead_request *req) \ +{ \ + return gcm_crypt(req, (flags) | FLAG_RFC4106 | FLAG_ENC); \ +} \ + \ +static int rfc4106_decrypt_##suffix(struct aead_request *req) \ +{ \ + return gcm_crypt(req, (flags) | FLAG_RFC4106); \ +} \ + \ +static struct aead_alg aes_gcm_algs_##suffix[] = { { \ + .setkey = gcm_setkey_##suffix, \ + .setauthsize = generic_gcmaes_set_authsize, \ + .encrypt = gcm_encrypt_##suffix, \ + .decrypt = gcm_decrypt_##suffix, \ + .ivsize = GCM_AES_IV_SIZE, \ + .chunksize = AES_BLOCK_SIZE, \ + .maxauthsize = 16, \ + .base = { \ + .cra_name = "__gcm(aes)", \ + .cra_driver_name = "__" generic_driver_name, \ + .cra_priority = (priority), \ + .cra_flags = CRYPTO_ALG_INTERNAL, \ + .cra_blocksize = 1, \ + .cra_ctxsize = (ctxsize), \ + .cra_module = THIS_MODULE, \ + }, \ +}, { \ + .setkey = rfc4106_setkey_##suffix, \ + .setauthsize = common_rfc4106_set_authsize, \ + .encrypt = rfc4106_encrypt_##suffix, \ + .decrypt = rfc4106_decrypt_##suffix, \ + .ivsize = GCM_RFC4106_IV_SIZE, \ + .chunksize = AES_BLOCK_SIZE, \ + .maxauthsize = 16, \ + .base = { \ + .cra_name = "__rfc4106(gcm(aes))", \ + .cra_driver_name = "__" rfc_driver_name, \ + .cra_priority = (priority), \ + .cra_flags = CRYPTO_ALG_INTERNAL, \ + .cra_blocksize = 1, \ + .cra_ctxsize = (ctxsize), \ + .cra_module = THIS_MODULE, \ + }, \ +} }; \ + \ +static struct simd_aead_alg *aes_gcm_simdalgs_##suffix[2] \ + +/* aes_gcm_algs_aesni */ +DEFINE_GCM_ALGS(aesni, /* no flags */ 0, + "generic-gcm-aesni", "rfc4106-gcm-aesni", + AES_GCM_KEY_AESNI_SIZE, 400); + 
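The DEFINE_GCM_ALGS() pattern above is just token pasting around a shared worker that takes constant flags; a compact, self-contained sketch of the same idiom (with made-up names, and printf standing in for the real work) looks like this:

#include <stdio.h>

#define FLAG_ENC	(1 << 1)
#define FLAG_AVX	(1 << 2)

static int gcm_crypt(const char *impl, int flags)
{
	printf("%s: flags=%#x\n", impl, flags);
	return 0;
}

/* One macro stamps out per-implementation entry points; only the constant
 * flags (and the generated names) differ between instantiations. */
#define DEFINE_GCM_WRAPPERS(suffix, flags)				\
static int gcm_encrypt_##suffix(void)					\
{									\
	return gcm_crypt(#suffix, (flags) | FLAG_ENC);			\
}									\
static int gcm_decrypt_##suffix(void)					\
{									\
	return gcm_crypt(#suffix, (flags));				\
}

DEFINE_GCM_WRAPPERS(aesni, 0)
DEFINE_GCM_WRAPPERS(aesni_avx, FLAG_AVX)

int main(void)
{
	gcm_encrypt_aesni();
	gcm_decrypt_aesni();
	gcm_encrypt_aesni_avx();
	gcm_decrypt_aesni_avx();
	return 0;
}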
+/* aes_gcm_algs_aesni_avx */ +DEFINE_GCM_ALGS(aesni_avx, FLAG_AVX, + "generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx", + AES_GCM_KEY_AESNI_SIZE, 500); + +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) +/* aes_gcm_algs_vaes_avx10_256 */ +DEFINE_GCM_ALGS(vaes_avx10_256, FLAG_AVX10_256, + "generic-gcm-vaes-avx10_256", "rfc4106-gcm-vaes-avx10_256", + AES_GCM_KEY_AVX10_SIZE, 700); + +/* aes_gcm_algs_vaes_avx10_512 */ +DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512, + "generic-gcm-vaes-avx10_512", "rfc4106-gcm-vaes-avx10_512", + AES_GCM_KEY_AVX10_SIZE, 800); +#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ + +static int __init register_avx_algs(void) +{ + int err; + + if (!boot_cpu_has(X86_FEATURE_AVX)) + return 0; + err = simd_register_skciphers_compat(skcipher_algs_aesni_avx, + ARRAY_SIZE(skcipher_algs_aesni_avx), + simd_skcipher_algs_aesni_avx); + if (err) + return err; + err = simd_register_aeads_compat(aes_gcm_algs_aesni_avx, + ARRAY_SIZE(aes_gcm_algs_aesni_avx), + aes_gcm_simdalgs_aesni_avx); + if (err) + return err; + /* + * Note: not all the algorithms registered below actually require + * VPCLMULQDQ. But in practice every CPU with VAES also has VPCLMULQDQ. + * Similarly, the assembler support was added at about the same time. + * For simplicity, just always check for VAES and VPCLMULQDQ together. + */ +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) + if (!boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_VAES) || + !boot_cpu_has(X86_FEATURE_VPCLMULQDQ) || + !boot_cpu_has(X86_FEATURE_PCLMULQDQ) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + return 0; + err = simd_register_skciphers_compat(skcipher_algs_vaes_avx2, + ARRAY_SIZE(skcipher_algs_vaes_avx2), + simd_skcipher_algs_vaes_avx2); + if (err) + return err; + + if (!boot_cpu_has(X86_FEATURE_AVX512BW) || + !boot_cpu_has(X86_FEATURE_AVX512VL) || + !boot_cpu_has(X86_FEATURE_BMI2) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | + XFEATURE_MASK_AVX512, NULL)) + return 0; + + err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_256, + ARRAY_SIZE(skcipher_algs_vaes_avx10_256), + simd_skcipher_algs_vaes_avx10_256); + if (err) + return err; + err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_256, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256), + aes_gcm_simdalgs_vaes_avx10_256); + if (err) + return err; + + if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) { + int i; + + for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx10_512); i++) + skcipher_algs_vaes_avx10_512[i].base.cra_priority = 1; + for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++) + aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1; + } + + err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_512, + ARRAY_SIZE(skcipher_algs_vaes_avx10_512), + simd_skcipher_algs_vaes_avx10_512); + if (err) + return err; + err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_512, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512), + aes_gcm_simdalgs_vaes_avx10_512); + if (err) + return err; +#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ + return 0; +} + +static void unregister_avx_algs(void) +{ + if (simd_skcipher_algs_aesni_avx[0]) + simd_unregister_skciphers(skcipher_algs_aesni_avx, + ARRAY_SIZE(skcipher_algs_aesni_avx), + simd_skcipher_algs_aesni_avx); + if (aes_gcm_simdalgs_aesni_avx[0]) + simd_unregister_aeads(aes_gcm_algs_aesni_avx, + ARRAY_SIZE(aes_gcm_algs_aesni_avx), + aes_gcm_simdalgs_aesni_avx); +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) + if 
(simd_skcipher_algs_vaes_avx2[0]) + simd_unregister_skciphers(skcipher_algs_vaes_avx2, + ARRAY_SIZE(skcipher_algs_vaes_avx2), + simd_skcipher_algs_vaes_avx2); + if (simd_skcipher_algs_vaes_avx10_256[0]) + simd_unregister_skciphers(skcipher_algs_vaes_avx10_256, + ARRAY_SIZE(skcipher_algs_vaes_avx10_256), + simd_skcipher_algs_vaes_avx10_256); + if (aes_gcm_simdalgs_vaes_avx10_256[0]) + simd_unregister_aeads(aes_gcm_algs_vaes_avx10_256, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256), + aes_gcm_simdalgs_vaes_avx10_256); + if (simd_skcipher_algs_vaes_avx10_512[0]) + simd_unregister_skciphers(skcipher_algs_vaes_avx10_512, + ARRAY_SIZE(skcipher_algs_vaes_avx10_512), + simd_skcipher_algs_vaes_avx10_512); + if (aes_gcm_simdalgs_vaes_avx10_512[0]) + simd_unregister_aeads(aes_gcm_algs_vaes_avx10_512, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512), + aes_gcm_simdalgs_vaes_avx10_512); #endif +} +#else /* CONFIG_X86_64 */ +static struct aead_alg aes_gcm_algs_aesni[0]; +static struct simd_aead_alg *aes_gcm_simdalgs_aesni[0]; + +static int __init register_avx_algs(void) +{ + return 0; +} -static struct simd_aead_alg *aesni_simd_aeads[ARRAY_SIZE(aesni_aeads)]; +static void unregister_avx_algs(void) +{ +} +#endif /* !CONFIG_X86_64 */ static const struct x86_cpu_id aesni_cpu_id[] = { X86_MATCH_FEATURE(X86_FEATURE_AES, NULL), @@ -1234,24 +1685,6 @@ static int __init aesni_init(void) if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; -#ifdef CONFIG_X86_64 - if (boot_cpu_has(X86_FEATURE_AVX2)) { - pr_info("AVX2 version of gcm_enc/dec engaged.\n"); - static_branch_enable(&gcm_use_avx); - static_branch_enable(&gcm_use_avx2); - } else - if (boot_cpu_has(X86_FEATURE_AVX)) { - pr_info("AVX version of gcm_enc/dec engaged.\n"); - static_branch_enable(&gcm_use_avx); - } else { - pr_info("SSE version of gcm_enc/dec engaged.\n"); - } - if (boot_cpu_has(X86_FEATURE_AVX)) { - /* optimize performance of ctr mode encryption transform */ - static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm); - pr_info("AES CTR mode by8 optimization enabled\n"); - } -#endif /* CONFIG_X86_64 */ err = crypto_register_alg(&aesni_cipher_alg); if (err) @@ -1263,27 +1696,23 @@ static int __init aesni_init(void) if (err) goto unregister_cipher; - err = simd_register_aeads_compat(aesni_aeads, ARRAY_SIZE(aesni_aeads), - aesni_simd_aeads); + err = simd_register_aeads_compat(aes_gcm_algs_aesni, + ARRAY_SIZE(aes_gcm_algs_aesni), + aes_gcm_simdalgs_aesni); if (err) goto unregister_skciphers; -#ifdef CONFIG_X86_64 - if (boot_cpu_has(X86_FEATURE_AVX)) - err = simd_register_skciphers_compat(&aesni_xctr, 1, - &aesni_simd_xctr); + err = register_avx_algs(); if (err) - goto unregister_aeads; -#endif /* CONFIG_X86_64 */ + goto unregister_avx; return 0; -#ifdef CONFIG_X86_64 -unregister_aeads: - simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), - aesni_simd_aeads); -#endif /* CONFIG_X86_64 */ - +unregister_avx: + unregister_avx_algs(); + simd_unregister_aeads(aes_gcm_algs_aesni, + ARRAY_SIZE(aes_gcm_algs_aesni), + aes_gcm_simdalgs_aesni); unregister_skciphers: simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); @@ -1294,20 +1723,18 @@ unregister_cipher: static void __exit aesni_exit(void) { - simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), - aesni_simd_aeads); + simd_unregister_aeads(aes_gcm_algs_aesni, + ARRAY_SIZE(aes_gcm_algs_aesni), + aes_gcm_simdalgs_aesni); simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); crypto_unregister_alg(&aesni_cipher_alg); 
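register_avx_algs() above registers implementations in tiers, and a missing CPU feature is deliberately not an error: it simply stops at the highest supported tier. A minimal sketch of that control flow, with invented feature booleans and a printf placeholder for the registration calls:

#include <stdbool.h>
#include <stdio.h>

static bool has_avx = true, has_vaes = true, has_avx512 = false;	/* example CPU */

static int register_tier(const char *name)
{
	printf("registered %s algorithms\n", name);
	return 0;
}

static int register_tiers(void)
{
	int err;

	if (!has_avx)
		return 0;		/* unsupported tier: not an error */
	err = register_tier("aesni-avx");
	if (err)
		return err;

	if (!has_vaes)
		return 0;
	err = register_tier("vaes-avx2");
	if (err)
		return err;

	if (!has_avx512)
		return 0;		/* this example CPU stops here */
	return register_tier("vaes-avx10_512");
}

int main(void)
{
	return register_tiers();
}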
-#ifdef CONFIG_X86_64 - if (boot_cpu_has(X86_FEATURE_AVX)) - simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); -#endif /* CONFIG_X86_64 */ + unregister_avx_algs(); } -late_initcall(aesni_init); +module_init(aesni_init); module_exit(aesni_exit); -MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized"); +MODULE_DESCRIPTION("AES cipher and modes, optimized with AES-NI or VAES instructions"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("aes"); diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 552f2df0643f..26c5f2ee5d10 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -94,7 +94,6 @@ static struct crypto_alg bf_cipher_alg = { .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = BF_BLOCK_SIZE, .cra_ctxsize = sizeof(struct bf_ctx), - .cra_alignmask = 0, .cra_module = THIS_MODULE, .cra_u = { .cipher = { diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index 646477a13e11..1dfef28c1266 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -16,6 +16,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/frame.h> #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -882,7 +883,7 @@ SYM_FUNC_START_LOCAL(__camellia_dec_blk16) jmp .Ldec_max24; SYM_FUNC_END(__camellia_dec_blk16) -SYM_FUNC_START(camellia_ecb_enc_16way) +SYM_TYPED_FUNC_START(camellia_ecb_enc_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -907,7 +908,7 @@ SYM_FUNC_START(camellia_ecb_enc_16way) RET; SYM_FUNC_END(camellia_ecb_enc_16way) -SYM_FUNC_START(camellia_ecb_dec_16way) +SYM_TYPED_FUNC_START(camellia_ecb_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -937,7 +938,7 @@ SYM_FUNC_START(camellia_ecb_dec_16way) RET; SYM_FUNC_END(camellia_ecb_dec_16way) -SYM_FUNC_START(camellia_cbc_dec_16way) +SYM_TYPED_FUNC_START(camellia_cbc_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index a0eb94e53b1b..b1c9b9450555 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -6,6 +6,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/frame.h> #define CAMELLIA_TABLE_BYTE_LEN 272 diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S index 816b6bb8bded..824cb94de6c2 100644 --- a/arch/x86/crypto/camellia-x86_64-asm_64.S +++ b/arch/x86/crypto/camellia-x86_64-asm_64.S @@ -6,6 +6,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> .file "camellia-x86_64-asm_64.S" .text @@ -177,7 +178,7 @@ bswapq RAB0; \ movq RAB0, 4*2(RIO); -SYM_FUNC_START(__camellia_enc_blk) +SYM_TYPED_FUNC_START(__camellia_enc_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -224,7 +225,7 @@ SYM_FUNC_START(__camellia_enc_blk) RET; SYM_FUNC_END(__camellia_enc_blk) -SYM_FUNC_START(camellia_dec_blk) +SYM_TYPED_FUNC_START(camellia_dec_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -411,7 +412,7 @@ SYM_FUNC_END(camellia_dec_blk) bswapq RAB1; \ movq RAB1, 12*2(RIO); -SYM_FUNC_START(__camellia_enc_blk_2way) +SYM_TYPED_FUNC_START(__camellia_enc_blk_2way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -460,7 +461,7 @@ SYM_FUNC_START(__camellia_enc_blk_2way) RET; SYM_FUNC_END(__camellia_enc_blk_2way) -SYM_FUNC_START(camellia_dec_blk_2way) +SYM_TYPED_FUNC_START(camellia_dec_blk_2way) /* input: * 
%rdi: ctx, CTX * %rsi: dst diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index d45e9c0c42ac..3bd37d664121 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -8,7 +8,7 @@ * Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation) */ -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/crypto.h> #include <linux/init.h> #include <linux/module.h> @@ -1313,7 +1313,6 @@ static struct crypto_alg camellia_cipher_alg = { .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = CAMELLIA_BLOCK_SIZE, .cra_ctxsize = sizeof(struct camellia_ctx), - .cra_alignmask = 0, .cra_module = THIS_MODULE, .cra_u = { .cipher = { diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index b4e460a87f18..fb95a614249d 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -487,79 +487,3 @@ SYM_FUNC_START(cast5_cbc_dec_16way) FRAME_END RET; SYM_FUNC_END(cast5_cbc_dec_16way) - -SYM_FUNC_START(cast5_ctr_16way) - /* input: - * %rdi: ctx - * %rsi: dst - * %rdx: src - * %rcx: iv (big endian, 64bit) - */ - FRAME_BEGIN - pushq %r12; - pushq %r15; - - movq %rdi, CTX; - movq %rsi, %r11; - movq %rdx, %r12; - - vpcmpeqd RTMP, RTMP, RTMP; - vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */ - - vpcmpeqd RKR, RKR, RKR; - vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */ - vmovdqa .Lbswap_iv_mask(%rip), R1ST; - vmovdqa .Lbswap128_mask(%rip), RKM; - - /* load IV and byteswap */ - vmovq (%rcx), RX; - vpshufb R1ST, RX, RX; - - /* construct IVs */ - vpsubq RTMP, RX, RX; /* le: IV1, IV0 */ - vpshufb RKM, RX, RL1; /* be: IV0, IV1 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RR1; /* be: IV2, IV3 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RL2; /* be: IV4, IV5 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RR2; /* be: IV6, IV7 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RL3; /* be: IV8, IV9 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RR3; /* be: IV10, IV11 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RL4; /* be: IV12, IV13 */ - vpsubq RKR, RX, RX; - vpshufb RKM, RX, RR4; /* be: IV14, IV15 */ - - /* store last IV */ - vpsubq RTMP, RX, RX; /* le: IV16, IV14 */ - vpshufb R1ST, RX, RX; /* be: IV16, IV16 */ - vmovq RX, (%rcx); - - call __cast5_enc_blk16; - - /* dst = src ^ iv */ - vpxor (0*16)(%r12), RR1, RR1; - vpxor (1*16)(%r12), RL1, RL1; - vpxor (2*16)(%r12), RR2, RR2; - vpxor (3*16)(%r12), RL2, RL2; - vpxor (4*16)(%r12), RR3, RR3; - vpxor (5*16)(%r12), RL3, RL3; - vpxor (6*16)(%r12), RR4, RR4; - vpxor (7*16)(%r12), RL4, RL4; - vmovdqu RR1, (0*16)(%r11); - vmovdqu RL1, (1*16)(%r11); - vmovdqu RR2, (2*16)(%r11); - vmovdqu RL2, (3*16)(%r11); - vmovdqu RR3, (4*16)(%r11); - vmovdqu RL3, (5*16)(%r11); - vmovdqu RR4, (6*16)(%r11); - vmovdqu RL4, (7*16)(%r11); - - popq %r15; - popq %r12; - FRAME_END - RET; -SYM_FUNC_END(cast5_ctr_16way) diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index 7b3a1cf0984b..8bb74a272879 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -133,12 +133,6 @@ void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) } EXPORT_SYMBOL(hchacha_block_arch); -void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) -{ - chacha_init_generic(state, key, iv); -} -EXPORT_SYMBOL(chacha_init_arch); - void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) { @@ -169,7 +163,7 @@ static int chacha_simd_stream_xor(struct skcipher_request *req, err = 
skcipher_walk_virt(&walk, req, false); - chacha_init_generic(state, ctx->key, iv); + chacha_init(state, ctx->key, iv); while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; @@ -211,7 +205,7 @@ static int xchacha_simd(struct skcipher_request *req) struct chacha_ctx subctx; u8 real_iv[16]; - chacha_init_generic(state, ctx->key, req->iv); + chacha_init(state, ctx->key, req->iv); if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) { kernel_fpu_begin(); diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S deleted file mode 100644 index 5d31137e2c7d..000000000000 --- a/arch/x86/crypto/crc32-pclmul_asm.S +++ /dev/null @@ -1,218 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2012 Xyratex Technology Limited - * - * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32 - * calculation. - * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) - * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found - * at: - * http://www.intel.com/products/processor/manuals/ - * Intel(R) 64 and IA-32 Architectures Software Developer's Manual - * Volume 2B: Instruction Set Reference, N-Z - * - * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com> - * Alexander Boyko <Alexander_Boyko@xyratex.com> - */ - -#include <linux/linkage.h> - - -.section .rodata -.align 16 -/* - * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 - * #define CONSTANT_R1 0x154442bd4LL - * - * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596 - * #define CONSTANT_R2 0x1c6e41596LL - */ -.Lconstant_R2R1: - .octa 0x00000001c6e415960000000154442bd4 -/* - * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0 - * #define CONSTANT_R3 0x1751997d0LL - * - * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e - * #define CONSTANT_R4 0x0ccaa009eLL - */ -.Lconstant_R4R3: - .octa 0x00000000ccaa009e00000001751997d0 -/* - * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124 - * #define CONSTANT_R5 0x163cd6124LL - */ -.Lconstant_R5: - .octa 0x00000000000000000000000163cd6124 -.Lconstant_mask32: - .octa 0x000000000000000000000000FFFFFFFF -/* - * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL - * - * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL - * #define CONSTANT_RU 0x1F7011641LL - */ -.Lconstant_RUpoly: - .octa 0x00000001F701164100000001DB710641 - -#define CONSTANT %xmm0 - -#ifdef __x86_64__ -#define BUF %rdi -#define LEN %rsi -#define CRC %edx -#else -#define BUF %eax -#define LEN %edx -#define CRC %ecx -#endif - - - -.text -/** - * Calculate crc32 - * BUF - buffer (16 bytes aligned) - * LEN - sizeof buffer (16 bytes aligned), LEN should be grater than 63 - * CRC - initial crc32 - * return %eax crc32 - * uint crc32_pclmul_le_16(unsigned char const *buffer, - * size_t len, uint crc32) - */ - -SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */ - movdqa (BUF), %xmm1 - movdqa 0x10(BUF), %xmm2 - movdqa 0x20(BUF), %xmm3 - movdqa 0x30(BUF), %xmm4 - movd CRC, CONSTANT - pxor CONSTANT, %xmm1 - sub $0x40, LEN - add $0x40, BUF - cmp $0x40, LEN - jb .Lless_64 - -#ifdef __x86_64__ - movdqa .Lconstant_R2R1(%rip), CONSTANT -#else - movdqa .Lconstant_R2R1, CONSTANT -#endif - -.Lloop_64:/* 64 bytes Full cache line folding */ - prefetchnta 0x40(BUF) - movdqa %xmm1, %xmm5 - movdqa %xmm2, %xmm6 - movdqa %xmm3, %xmm7 -#ifdef __x86_64__ - movdqa %xmm4, %xmm8 -#endif - pclmulqdq $0x00, CONSTANT, %xmm1 - pclmulqdq $0x00, CONSTANT, %xmm2 - pclmulqdq $0x00, CONSTANT, %xmm3 -#ifdef __x86_64__ - pclmulqdq $0x00, CONSTANT, %xmm4 -#endif - 
pclmulqdq $0x11, CONSTANT, %xmm5 - pclmulqdq $0x11, CONSTANT, %xmm6 - pclmulqdq $0x11, CONSTANT, %xmm7 -#ifdef __x86_64__ - pclmulqdq $0x11, CONSTANT, %xmm8 -#endif - pxor %xmm5, %xmm1 - pxor %xmm6, %xmm2 - pxor %xmm7, %xmm3 -#ifdef __x86_64__ - pxor %xmm8, %xmm4 -#else - /* xmm8 unsupported for x32 */ - movdqa %xmm4, %xmm5 - pclmulqdq $0x00, CONSTANT, %xmm4 - pclmulqdq $0x11, CONSTANT, %xmm5 - pxor %xmm5, %xmm4 -#endif - - pxor (BUF), %xmm1 - pxor 0x10(BUF), %xmm2 - pxor 0x20(BUF), %xmm3 - pxor 0x30(BUF), %xmm4 - - sub $0x40, LEN - add $0x40, BUF - cmp $0x40, LEN - jge .Lloop_64 -.Lless_64:/* Folding cache line into 128bit */ -#ifdef __x86_64__ - movdqa .Lconstant_R4R3(%rip), CONSTANT -#else - movdqa .Lconstant_R4R3, CONSTANT -#endif - prefetchnta (BUF) - - movdqa %xmm1, %xmm5 - pclmulqdq $0x00, CONSTANT, %xmm1 - pclmulqdq $0x11, CONSTANT, %xmm5 - pxor %xmm5, %xmm1 - pxor %xmm2, %xmm1 - - movdqa %xmm1, %xmm5 - pclmulqdq $0x00, CONSTANT, %xmm1 - pclmulqdq $0x11, CONSTANT, %xmm5 - pxor %xmm5, %xmm1 - pxor %xmm3, %xmm1 - - movdqa %xmm1, %xmm5 - pclmulqdq $0x00, CONSTANT, %xmm1 - pclmulqdq $0x11, CONSTANT, %xmm5 - pxor %xmm5, %xmm1 - pxor %xmm4, %xmm1 - - cmp $0x10, LEN - jb .Lfold_64 -.Lloop_16:/* Folding rest buffer into 128bit */ - movdqa %xmm1, %xmm5 - pclmulqdq $0x00, CONSTANT, %xmm1 - pclmulqdq $0x11, CONSTANT, %xmm5 - pxor %xmm5, %xmm1 - pxor (BUF), %xmm1 - sub $0x10, LEN - add $0x10, BUF - cmp $0x10, LEN - jge .Lloop_16 - -.Lfold_64: - /* perform the last 64 bit fold, also adds 32 zeroes - * to the input stream */ - pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ - psrldq $0x08, %xmm1 - pxor CONSTANT, %xmm1 - - /* final 32-bit fold */ - movdqa %xmm1, %xmm2 -#ifdef __x86_64__ - movdqa .Lconstant_R5(%rip), CONSTANT - movdqa .Lconstant_mask32(%rip), %xmm3 -#else - movdqa .Lconstant_R5, CONSTANT - movdqa .Lconstant_mask32, %xmm3 -#endif - psrldq $0x04, %xmm2 - pand %xmm3, %xmm1 - pclmulqdq $0x00, CONSTANT, %xmm1 - pxor %xmm2, %xmm1 - - /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ -#ifdef __x86_64__ - movdqa .Lconstant_RUpoly(%rip), CONSTANT -#else - movdqa .Lconstant_RUpoly, CONSTANT -#endif - movdqa %xmm1, %xmm2 - pand %xmm3, %xmm1 - pclmulqdq $0x10, CONSTANT, %xmm1 - pand %xmm3, %xmm1 - pclmulqdq $0x00, CONSTANT, %xmm1 - pxor %xmm2, %xmm1 - pextrd $0x01, %xmm1, %eax - - RET -SYM_FUNC_END(crc32_pclmul_le_16) diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c deleted file mode 100644 index 98cf3b4e4c9f..000000000000 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ /dev/null @@ -1,201 +0,0 @@ -/* GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see http://www.gnu.org/licenses - * - * Please visit http://www.xyratex.com/contact if you need additional - * information or have any questions. 
- * - * GPL HEADER END - */ - -/* - * Copyright 2012 Xyratex Technology Limited - * - * Wrappers for kernel crypto shash api to pclmulqdq crc32 implementation. - */ -#include <linux/init.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/crc32.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> - -#include <asm/cpufeatures.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> - -#define CHKSUM_BLOCK_SIZE 1 -#define CHKSUM_DIGEST_SIZE 4 - -#define PCLMUL_MIN_LEN 64L /* minimum size of buffer - * for crc32_pclmul_le_16 */ -#define SCALE_F 16L /* size of xmm register */ -#define SCALE_F_MASK (SCALE_F - 1) - -u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32); - -static u32 __attribute__((pure)) - crc32_pclmul_le(u32 crc, unsigned char const *p, size_t len) -{ - unsigned int iquotient; - unsigned int iremainder; - unsigned int prealign; - - if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !crypto_simd_usable()) - return crc32_le(crc, p, len); - - if ((long)p & SCALE_F_MASK) { - /* align p to 16 byte */ - prealign = SCALE_F - ((long)p & SCALE_F_MASK); - - crc = crc32_le(crc, p, prealign); - len -= prealign; - p = (unsigned char *)(((unsigned long)p + SCALE_F_MASK) & - ~SCALE_F_MASK); - } - iquotient = len & (~SCALE_F_MASK); - iremainder = len & SCALE_F_MASK; - - kernel_fpu_begin(); - crc = crc32_pclmul_le_16(p, iquotient, crc); - kernel_fpu_end(); - - if (iremainder) - crc = crc32_le(crc, p + iquotient, iremainder); - - return crc; -} - -static int crc32_pclmul_cra_init(struct crypto_tfm *tfm) -{ - u32 *key = crypto_tfm_ctx(tfm); - - *key = 0; - - return 0; -} - -static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key, - unsigned int keylen) -{ - u32 *mctx = crypto_shash_ctx(hash); - - if (keylen != sizeof(u32)) - return -EINVAL; - *mctx = le32_to_cpup((__le32 *)key); - return 0; -} - -static int crc32_pclmul_init(struct shash_desc *desc) -{ - u32 *mctx = crypto_shash_ctx(desc->tfm); - u32 *crcp = shash_desc_ctx(desc); - - *crcp = *mctx; - - return 0; -} - -static int crc32_pclmul_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - u32 *crcp = shash_desc_ctx(desc); - - *crcp = crc32_pclmul_le(*crcp, data, len); - return 0; -} - -/* No final XOR 0xFFFFFFFF, like crc32_le */ -static int __crc32_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len, - u8 *out) -{ - *(__le32 *)out = cpu_to_le32(crc32_pclmul_le(*crcp, data, len)); - return 0; -} - -static int crc32_pclmul_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32_pclmul_finup(shash_desc_ctx(desc), data, len, out); -} - -static int crc32_pclmul_final(struct shash_desc *desc, u8 *out) -{ - u32 *crcp = shash_desc_ctx(desc); - - *(__le32 *)out = cpu_to_le32p(crcp); - return 0; -} - -static int crc32_pclmul_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32_pclmul_finup(crypto_shash_ctx(desc->tfm), data, len, - out); -} - -static struct shash_alg alg = { - .setkey = crc32_pclmul_setkey, - .init = crc32_pclmul_init, - .update = crc32_pclmul_update, - .final = crc32_pclmul_final, - .finup = crc32_pclmul_finup, - .digest = crc32_pclmul_digest, - .descsize = sizeof(u32), - .digestsize = CHKSUM_DIGEST_SIZE, - .base = { - .cra_name = "crc32", - .cra_driver_name = "crc32-pclmul", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .cra_blocksize = CHKSUM_BLOCK_SIZE, - .cra_ctxsize = sizeof(u32), - .cra_module = 
THIS_MODULE, - .cra_init = crc32_pclmul_cra_init, - } -}; - -static const struct x86_cpu_id crc32pclmul_cpu_id[] = { - X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, crc32pclmul_cpu_id); - - -static int __init crc32_pclmul_mod_init(void) -{ - - if (!x86_match_cpu(crc32pclmul_cpu_id)) { - pr_info("PCLMULQDQ-NI instructions are not detected.\n"); - return -ENODEV; - } - return crypto_register_shash(&alg); -} - -static void __exit crc32_pclmul_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(crc32_pclmul_mod_init); -module_exit(crc32_pclmul_mod_fini); - -MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>"); -MODULE_LICENSE("GPL"); - -MODULE_ALIAS_CRYPTO("crc32"); -MODULE_ALIAS_CRYPTO("crc32-pclmul"); diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c deleted file mode 100644 index feccb5254c7e..000000000000 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ /dev/null @@ -1,250 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal. - * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE) - * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at: - * http://www.intel.com/products/processor/manuals/ - * Intel(R) 64 and IA-32 Architectures Software Developer's Manual - * Volume 2A: Instruction Set Reference, A-M - * - * Copyright (C) 2008 Intel Corporation - * Authors: Austin Zhang <austin_zhang@linux.intel.com> - * Kent Liu <kent.liu@intel.com> - */ -#include <linux/init.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> - -#include <asm/cpufeatures.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> - -#define CHKSUM_BLOCK_SIZE 1 -#define CHKSUM_DIGEST_SIZE 4 - -#define SCALE_F sizeof(unsigned long) - -#ifdef CONFIG_X86_64 -#define CRC32_INST "crc32q %1, %q0" -#else -#define CRC32_INST "crc32l %1, %0" -#endif - -#ifdef CONFIG_X86_64 -/* - * use carryless multiply version of crc32c when buffer - * size is >= 512 to account - * for fpu state save/restore overhead. - */ -#define CRC32C_PCL_BREAKEVEN 512 - -asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, - unsigned int crc_init); -#endif /* CONFIG_X86_64 */ - -static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) -{ - while (length--) { - asm("crc32b %1, %0" - : "+r" (crc) : "rm" (*data)); - data++; - } - - return crc; -} - -static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len) -{ - unsigned int iquotient = len / SCALE_F; - unsigned int iremainder = len % SCALE_F; - unsigned long *ptmp = (unsigned long *)p; - - while (iquotient--) { - asm(CRC32_INST - : "+r" (crc) : "rm" (*ptmp)); - ptmp++; - } - - if (iremainder) - crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp, - iremainder); - - return crc; -} - -/* - * Setting the seed allows arbitrary accumulators and flexible XOR policy - * If your algorithm starts with ~0, then XOR with ~0 before you set - * the seed. 
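For reference alongside the deleted glue above, this is a software-only statement of what the SSE4.2 crc32 instruction family computes per byte: the bit-reflected CRC-32C (Castagnoli) with polynomial 0x82F63B78. The standard check value 0xE3069283 for the string "123456789" makes it easy to sanity-check.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32c_byte(uint32_t crc, uint8_t b)
{
	crc ^= b;
	for (int i = 0; i < 8; i++)		/* one bit at a time, reflected form */
		crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1u));
	return crc;
}

static uint32_t crc32c(uint32_t crc, const uint8_t *p, size_t len)
{
	while (len--)
		crc = crc32c_byte(crc, *p++);	/* same result as repeated crc32b */
	return crc;
}

int main(void)
{
	const char *msg = "123456789";

	/* Init with ~0 and invert at the end, as the shash wrappers above do. */
	printf("%#010x\n", ~crc32c(~0u, (const uint8_t *)msg, strlen(msg)));
	return 0;
}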
- */ -static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key, - unsigned int keylen) -{ - u32 *mctx = crypto_shash_ctx(hash); - - if (keylen != sizeof(u32)) - return -EINVAL; - *mctx = le32_to_cpup((__le32 *)key); - return 0; -} - -static int crc32c_intel_init(struct shash_desc *desc) -{ - u32 *mctx = crypto_shash_ctx(desc->tfm); - u32 *crcp = shash_desc_ctx(desc); - - *crcp = *mctx; - - return 0; -} - -static int crc32c_intel_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - u32 *crcp = shash_desc_ctx(desc); - - *crcp = crc32c_intel_le_hw(*crcp, data, len); - return 0; -} - -static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len, - u8 *out) -{ - *(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len)); - return 0; -} - -static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32c_intel_finup(shash_desc_ctx(desc), data, len, out); -} - -static int crc32c_intel_final(struct shash_desc *desc, u8 *out) -{ - u32 *crcp = shash_desc_ctx(desc); - - *(__le32 *)out = ~cpu_to_le32p(crcp); - return 0; -} - -static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len, - out); -} - -static int crc32c_intel_cra_init(struct crypto_tfm *tfm) -{ - u32 *key = crypto_tfm_ctx(tfm); - - *key = ~0; - - return 0; -} - -#ifdef CONFIG_X86_64 -static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - u32 *crcp = shash_desc_ctx(desc); - - /* - * use faster PCL version if datasize is large enough to - * overcome kernel fpu state save/restore overhead - */ - if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) { - kernel_fpu_begin(); - *crcp = crc_pcl(data, len, *crcp); - kernel_fpu_end(); - } else - *crcp = crc32c_intel_le_hw(*crcp, data, len); - return 0; -} - -static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len, - u8 *out) -{ - if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) { - kernel_fpu_begin(); - *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); - kernel_fpu_end(); - } else - *(__le32 *)out = - ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len)); - return 0; -} - -static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32c_pcl_intel_finup(shash_desc_ctx(desc), data, len, out); -} - -static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len, - out); -} -#endif /* CONFIG_X86_64 */ - -static struct shash_alg alg = { - .setkey = crc32c_intel_setkey, - .init = crc32c_intel_init, - .update = crc32c_intel_update, - .final = crc32c_intel_final, - .finup = crc32c_intel_finup, - .digest = crc32c_intel_digest, - .descsize = sizeof(u32), - .digestsize = CHKSUM_DIGEST_SIZE, - .base = { - .cra_name = "crc32c", - .cra_driver_name = "crc32c-intel", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .cra_blocksize = CHKSUM_BLOCK_SIZE, - .cra_ctxsize = sizeof(u32), - .cra_module = THIS_MODULE, - .cra_init = crc32c_intel_cra_init, - } -}; - -static const struct x86_cpu_id crc32c_cpu_id[] = { - X86_MATCH_FEATURE(X86_FEATURE_XMM4_2, NULL), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, crc32c_cpu_id); - -static int __init crc32c_intel_mod_init(void) -{ - if (!x86_match_cpu(crc32c_cpu_id)) - return -ENODEV; 
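The breakeven test in the deleted crc32c_pcl_intel_update() above is a common pattern: the PCLMULQDQ path is only worth the kernel_fpu_begin()/kernel_fpu_end() overhead for large buffers. A hedged userspace sketch of just that dispatch decision (the two paths are print stubs, and 512 mirrors the CRC32C_PCL_BREAKEVEN value above):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PCL_BREAKEVEN 512	/* amortizes the FPU state save/restore cost */

static uint32_t crc_by_insn(uint32_t crc, const uint8_t *p, size_t len)
{ printf("crc32-instruction path: %zu bytes\n", len); return crc; }

static uint32_t crc_by_pclmul(uint32_t crc, const uint8_t *p, size_t len)
{ printf("pclmulqdq path:         %zu bytes\n", len); return crc; }

static uint32_t crc_update(uint32_t crc, const uint8_t *p, size_t len, int simd_usable)
{
	if (len >= PCL_BREAKEVEN && simd_usable) {
		/* kernel_fpu_begin() would bracket this call in the kernel */
		crc = crc_by_pclmul(crc, p, len);
		/* kernel_fpu_end() */
	} else {
		crc = crc_by_insn(crc, p, len);
	}
	return crc;
}

int main(void)
{
	static const uint8_t buf[1024];

	crc_update(0, buf, 100, 1);	/* small buffer: plain instruction path */
	crc_update(0, buf, 1024, 1);	/* large buffer: PCLMULQDQ path */
	crc_update(0, buf, 1024, 0);	/* SIMD unusable: plain instruction path */
	return 0;
}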
-#ifdef CONFIG_X86_64 - if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { - alg.update = crc32c_pcl_intel_update; - alg.finup = crc32c_pcl_intel_finup; - alg.digest = crc32c_pcl_intel_digest; - } -#endif - return crypto_register_shash(&alg); -} - -static void __exit crc32c_intel_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(crc32c_intel_mod_init); -module_exit(crc32c_intel_mod_fini); - -MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.com>"); -MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware."); -MODULE_LICENSE("GPL"); - -MODULE_ALIAS_CRYPTO("crc32c"); -MODULE_ALIAS_CRYPTO("crc32c-intel"); diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S deleted file mode 100644 index bbcff1fb78cb..000000000000 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ /dev/null @@ -1,463 +0,0 @@ -/* - * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) - * - * The white papers on CRC32C calculations with PCLMULQDQ instruction can be - * downloaded from: - * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf - * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf - * - * Copyright (C) 2012 Intel Corporation. - * - * Authors: - * Wajdi Feghali <wajdi.k.feghali@intel.com> - * James Guilford <james.guilford@intel.com> - * David Cote <david.m.cote@intel.com> - * Tim Chen <tim.c.chen@linux.intel.com> - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include <linux/linkage.h> -#include <asm/nospec-branch.h> - -## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction - -.macro LABEL prefix n -.L\prefix\n\(): -.endm - -.macro JMPTBL_ENTRY i -.quad .Lcrc_\i -.endm - -.macro JNC_LESS_THAN j - jnc .Lless_than_\j -.endm - -# Define threshold where buffers are considered "small" and routed to more -# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so -# SMALL_SIZE can be no larger than 255. 
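For orientation: the "by-1" small-buffer path referred to above, like the glue code's fallback below CRC32C_PCL_BREAKEVEN, is simply the SSE4.2 crc32 instruction applied word by word. A minimal userspace sketch of that path using compiler intrinsics (the function name and loop structure are illustrative, not taken from the kernel sources):

#include <nmmintrin.h>		/* SSE4.2 intrinsics; build with -msse4.2 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t crc32c_hw(uint32_t crc, const unsigned char *p, size_t len)
{
	/* 8 bytes at a time with crc32q, then a byte-wise tail with crc32b. */
	while (len >= sizeof(uint64_t)) {
		uint64_t v;

		memcpy(&v, p, sizeof(v));	/* avoid unaligned access in C */
		crc = (uint32_t)_mm_crc32_u64(crc, v);
		p += sizeof(v);
		len -= sizeof(v);
	}
	while (len--)
		crc = _mm_crc32_u8(crc, *p++);
	return crc;
}

/* Standard CRC32C convention (seed ~0, invert the result), matching the
 * "crc32c" shash above:  crc = ~crc32c_hw(~0u, buf, len);  */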
- -#define SMALL_SIZE 200 - -.if (SMALL_SIZE > 255) -.error "SMALL_ SIZE must be < 256" -.endif - -# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); - -.text -SYM_FUNC_START(crc_pcl) -#define bufp rdi -#define bufp_dw %edi -#define bufp_w %di -#define bufp_b %dil -#define bufptmp %rcx -#define block_0 %rcx -#define block_1 %rdx -#define block_2 %r11 -#define len %rsi -#define len_dw %esi -#define len_w %si -#define len_b %sil -#define crc_init_arg %rdx -#define tmp %rbx -#define crc_init %r8 -#define crc_init_dw %r8d -#define crc1 %r9 -#define crc2 %r10 - - pushq %rbx - pushq %rdi - pushq %rsi - - ## Move crc_init for Linux to a different - mov crc_init_arg, crc_init - - ################################################################ - ## 1) ALIGN: - ################################################################ - - mov %bufp, bufptmp # rdi = *buf - neg %bufp - and $7, %bufp # calculate the unalignment amount of - # the address - je .Lproc_block # Skip if aligned - - ## If len is less than 8 and we're unaligned, we need to jump - ## to special code to avoid reading beyond the end of the buffer - cmp $8, len - jae .Ldo_align - # less_than_8 expects length in upper 3 bits of len_dw - # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] - shl $32-3+1, len_dw - jmp .Lless_than_8_post_shl1 - -.Ldo_align: - #### Calculate CRC of unaligned bytes of the buffer (if any) - movq (bufptmp), tmp # load a quadward from the buffer - add %bufp, bufptmp # align buffer pointer for quadword - # processing - sub %bufp, len # update buffer length -.Lalign_loop: - crc32b %bl, crc_init_dw # compute crc32 of 1-byte - shr $8, tmp # get next byte - dec %bufp - jne .Lalign_loop - -.Lproc_block: - - ################################################################ - ## 2) PROCESS BLOCKS: - ################################################################ - - ## compute num of bytes to be processed - movq len, tmp # save num bytes in tmp - - cmpq $128*24, len - jae .Lfull_block - -.Lcontinue_block: - cmpq $SMALL_SIZE, len - jb .Lsmall - - ## len < 128*24 - movq $2731, %rax # 2731 = ceil(2^16 / 24) - mul len_dw - shrq $16, %rax - - ## eax contains floor(bytes / 24) = num 24-byte chunks to do - - ## process rax 24-byte chunks (128 >= rax >= 0) - - ## compute end address of each block - ## block 0 (base addr + RAX * 8) - ## block 1 (base addr + RAX * 16) - ## block 2 (base addr + RAX * 24) - lea (bufptmp, %rax, 8), block_0 - lea (block_0, %rax, 8), block_1 - lea (block_1, %rax, 8), block_2 - - xor crc1, crc1 - xor crc2, crc2 - - ## branch into array - leaq jump_table(%rip), %bufp - mov (%bufp,%rax,8), %bufp - JMP_NOSPEC bufp - - ################################################################ - ## 2a) PROCESS FULL BLOCKS: - ################################################################ -.Lfull_block: - movl $128,%eax - lea 128*8*2(block_0), block_1 - lea 128*8*3(block_0), block_2 - add $128*8*1, block_0 - - xor crc1,crc1 - xor crc2,crc2 - - # Fall through into top of crc array (crc_128) - - ################################################################ - ## 3) CRC Array: - ################################################################ - - i=128 -.rept 128-1 -.altmacro -LABEL crc_ %i -.noaltmacro - ENDBR - crc32q -i*8(block_0), crc_init - crc32q -i*8(block_1), crc1 - crc32q -i*8(block_2), crc2 - i=(i-1) -.endr - -.altmacro -LABEL crc_ %i -.noaltmacro - ENDBR - crc32q -i*8(block_0), crc_init - crc32q -i*8(block_1), crc1 -# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet - - 
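The $2731 multiply above is a reciprocal trick: 2731 = ceil(2^16 / 24), and because this path only sees len < 128*24, (len * 2731) >> 16 equals len / 24 exactly, yielding the number of 24-byte chunks without a divide. A throwaway check of that claim (illustrative C, not part of the kernel):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	/* Verify the fixed-point reciprocal over the whole range this
	 * code path can see (len < 128*24 bytes). */
	for (unsigned int len = 0; len < 128 * 24; len++)
		assert(((len * 2731) >> 16) == len / 24);
	printf("multiply-by-2731 division check passed\n");
	return 0;
}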
mov block_2, block_0 - - ################################################################ - ## 4) Combine three results: - ################################################################ - - lea (K_table-8)(%rip), %bufp # first entry is for idx 1 - shlq $3, %rax # rax *= 8 - pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2 - leal (%eax,%eax,2), %eax # rax *= 3 (total *24) - subq %rax, tmp # tmp -= rax*24 - - movq crc_init, %xmm1 # CRC for block 1 - pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 - - movq crc1, %xmm2 # CRC for block 2 - pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 - - pxor %xmm2,%xmm1 - movq %xmm1, %rax - xor -i*8(block_2), %rax - mov crc2, crc_init - crc32 %rax, crc_init - - ################################################################ - ## 5) Check for end: - ################################################################ - -LABEL crc_ 0 - ENDBR - mov tmp, len - cmp $128*24, tmp - jae .Lfull_block - cmp $24, tmp - jae .Lcontinue_block - -.Lless_than_24: - shl $32-4, len_dw # less_than_16 expects length - # in upper 4 bits of len_dw - jnc .Lless_than_16 - crc32q (bufptmp), crc_init - crc32q 8(bufptmp), crc_init - jz .Ldo_return - add $16, bufptmp - # len is less than 8 if we got here - # less_than_8 expects length in upper 3 bits of len_dw - # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] - shl $2, len_dw - jmp .Lless_than_8_post_shl1 - - ####################################################################### - ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) - ####################################################################### -.Lsmall: - shl $32-8, len_dw # Prepare len_dw for less_than_256 - j=256 -.rept 5 # j = {256, 128, 64, 32, 16} -.altmacro -LABEL less_than_ %j # less_than_j: Length should be in - # upper lg(j) bits of len_dw - j=(j/2) - shl $1, len_dw # Get next MSB - JNC_LESS_THAN %j -.noaltmacro - i=0 -.rept (j/8) - crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data - i=i+8 -.endr - jz .Ldo_return # Return if remaining length is zero - add $j, bufptmp # Advance buf -.endr - -.Lless_than_8: # Length should be stored in - # upper 3 bits of len_dw - shl $1, len_dw -.Lless_than_8_post_shl1: - jnc .Lless_than_4 - crc32l (bufptmp), crc_init_dw # CRC of 4 bytes - jz .Ldo_return # return if remaining data is zero - add $4, bufptmp -.Lless_than_4: # Length should be stored in - # upper 2 bits of len_dw - shl $1, len_dw - jnc .Lless_than_2 - crc32w (bufptmp), crc_init_dw # CRC of 2 bytes - jz .Ldo_return # return if remaining data is zero - add $2, bufptmp -.Lless_than_2: # Length should be stored in the MSB - # of len_dw - shl $1, len_dw - jnc .Lless_than_1 - crc32b (bufptmp), crc_init_dw # CRC of 1 byte -.Lless_than_1: # Length should be zero -.Ldo_return: - movq crc_init, %rax - popq %rsi - popq %rdi - popq %rbx - RET -SYM_FUNC_END(crc_pcl) - -.section .rodata, "a", @progbits - ################################################################ - ## jump table Table is 129 entries x 2 bytes each - ################################################################ -.align 4 -jump_table: - i=0 -.rept 129 -.altmacro -JMPTBL_ENTRY %i -.noaltmacro - i=i+1 -.endr - - - ################################################################ - ## PCLMULQDQ tables - ## Table is 128 entries x 2 words (8 bytes) each - ################################################################ -.align 8 -K_table: - .long 0x493c7d27, 0x00000001 - .long 0xba4fc28e, 0x493c7d27 - .long 0xddc0152b, 0xf20c0dfe - .long 0x9e4addf8, 0xba4fc28e - .long 
0x39d3b296, 0x3da6d0cb - .long 0x0715ce53, 0xddc0152b - .long 0x47db8317, 0x1c291d04 - .long 0x0d3b6092, 0x9e4addf8 - .long 0xc96cfdc0, 0x740eef02 - .long 0x878a92a7, 0x39d3b296 - .long 0xdaece73e, 0x083a6eec - .long 0xab7aff2a, 0x0715ce53 - .long 0x2162d385, 0xc49f4f67 - .long 0x83348832, 0x47db8317 - .long 0x299847d5, 0x2ad91c30 - .long 0xb9e02b86, 0x0d3b6092 - .long 0x18b33a4e, 0x6992cea2 - .long 0xb6dd949b, 0xc96cfdc0 - .long 0x78d9ccb7, 0x7e908048 - .long 0xbac2fd7b, 0x878a92a7 - .long 0xa60ce07b, 0x1b3d8f29 - .long 0xce7f39f4, 0xdaece73e - .long 0x61d82e56, 0xf1d0f55e - .long 0xd270f1a2, 0xab7aff2a - .long 0xc619809d, 0xa87ab8a8 - .long 0x2b3cac5d, 0x2162d385 - .long 0x65863b64, 0x8462d800 - .long 0x1b03397f, 0x83348832 - .long 0xebb883bd, 0x71d111a8 - .long 0xb3e32c28, 0x299847d5 - .long 0x064f7f26, 0xffd852c6 - .long 0xdd7e3b0c, 0xb9e02b86 - .long 0xf285651c, 0xdcb17aa4 - .long 0x10746f3c, 0x18b33a4e - .long 0xc7a68855, 0xf37c5aee - .long 0x271d9844, 0xb6dd949b - .long 0x8e766a0c, 0x6051d5a2 - .long 0x93a5f730, 0x78d9ccb7 - .long 0x6cb08e5c, 0x18b0d4ff - .long 0x6b749fb2, 0xbac2fd7b - .long 0x1393e203, 0x21f3d99c - .long 0xcec3662e, 0xa60ce07b - .long 0x96c515bb, 0x8f158014 - .long 0xe6fc4e6a, 0xce7f39f4 - .long 0x8227bb8a, 0xa00457f7 - .long 0xb0cd4768, 0x61d82e56 - .long 0x39c7ff35, 0x8d6d2c43 - .long 0xd7a4825c, 0xd270f1a2 - .long 0x0ab3844b, 0x00ac29cf - .long 0x0167d312, 0xc619809d - .long 0xf6076544, 0xe9adf796 - .long 0x26f6a60a, 0x2b3cac5d - .long 0xa741c1bf, 0x96638b34 - .long 0x98d8d9cb, 0x65863b64 - .long 0x49c3cc9c, 0xe0e9f351 - .long 0x68bce87a, 0x1b03397f - .long 0x57a3d037, 0x9af01f2d - .long 0x6956fc3b, 0xebb883bd - .long 0x42d98888, 0x2cff42cf - .long 0x3771e98f, 0xb3e32c28 - .long 0xb42ae3d9, 0x88f25a3a - .long 0x2178513a, 0x064f7f26 - .long 0xe0ac139e, 0x4e36f0b0 - .long 0x170076fa, 0xdd7e3b0c - .long 0x444dd413, 0xbd6f81f8 - .long 0x6f345e45, 0xf285651c - .long 0x41d17b64, 0x91c9bd4b - .long 0xff0dba97, 0x10746f3c - .long 0xa2b73df1, 0x885f087b - .long 0xf872e54c, 0xc7a68855 - .long 0x1e41e9fc, 0x4c144932 - .long 0x86d8e4d2, 0x271d9844 - .long 0x651bd98b, 0x52148f02 - .long 0x5bb8f1bc, 0x8e766a0c - .long 0xa90fd27a, 0xa3c6f37a - .long 0xb3af077a, 0x93a5f730 - .long 0x4984d782, 0xd7c0557f - .long 0xca6ef3ac, 0x6cb08e5c - .long 0x234e0b26, 0x63ded06a - .long 0xdd66cbbb, 0x6b749fb2 - .long 0x4597456a, 0x4d56973c - .long 0xe9e28eb4, 0x1393e203 - .long 0x7b3ff57a, 0x9669c9df - .long 0xc9c8b782, 0xcec3662e - .long 0x3f70cc6f, 0xe417f38a - .long 0x93e106a4, 0x96c515bb - .long 0x62ec6c6d, 0x4b9e0f71 - .long 0xd813b325, 0xe6fc4e6a - .long 0x0df04680, 0xd104b8fc - .long 0x2342001e, 0x8227bb8a - .long 0x0a2a8d7e, 0x5b397730 - .long 0x6d9a4957, 0xb0cd4768 - .long 0xe8b6368b, 0xe78eb416 - .long 0xd2c3ed1a, 0x39c7ff35 - .long 0x995a5724, 0x61ff0e01 - .long 0x9ef68d35, 0xd7a4825c - .long 0x0c139b31, 0x8d96551c - .long 0xf2271e60, 0x0ab3844b - .long 0x0b0bf8ca, 0x0bf80dd2 - .long 0x2664fd8b, 0x0167d312 - .long 0xed64812d, 0x8821abed - .long 0x02ee03b2, 0xf6076544 - .long 0x8604ae0f, 0x6a45d2b2 - .long 0x363bd6b3, 0x26f6a60a - .long 0x135c83fd, 0xd8d26619 - .long 0x5fabe670, 0xa741c1bf - .long 0x35ec3279, 0xde87806c - .long 0x00bcf5f6, 0x98d8d9cb - .long 0x8ae00689, 0x14338754 - .long 0x17f27698, 0x49c3cc9c - .long 0x58ca5f00, 0x5bd2011f - .long 0xaa7c7ad5, 0x68bce87a - .long 0xb5cfca28, 0xdd07448e - .long 0xded288f8, 0x57a3d037 - .long 0x59f229bc, 0xdde8f5b9 - .long 0x6d390dec, 0x6956fc3b - .long 0x37170390, 0xa3e3e02c - .long 0x6353c1cc, 0x42d98888 - .long 0xc4584f5c, 
0xd73c7bea - .long 0xf48642e9, 0x3771e98f - .long 0x531377e2, 0x80ff0093 - .long 0xdd35bc8d, 0xb42ae3d9 - .long 0xb25b29f2, 0x8fe4c34d - .long 0x9a5ede41, 0x2178513a - .long 0xa563905d, 0xdf99fc11 - .long 0x45cddf4e, 0xe0ac139e - .long 0xacfa3103, 0x6c23e841 - .long 0xa51b6135, 0x170076fa diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S deleted file mode 100644 index 5286db5b8165..000000000000 --- a/arch/x86/crypto/crct10dif-pcl-asm_64.S +++ /dev/null @@ -1,332 +0,0 @@ -######################################################################## -# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions -# -# Copyright (c) 2013, Intel Corporation -# -# Authors: -# Erdinc Ozturk <erdinc.ozturk@intel.com> -# Vinodh Gopal <vinodh.gopal@intel.com> -# James Guilford <james.guilford@intel.com> -# Tim Chen <tim.c.chen@linux.intel.com> -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# * Neither the name of the Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# -# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# Reference paper titled "Fast CRC Computation for Generic -# Polynomials Using PCLMULQDQ Instruction" -# URL: http://www.intel.com/content/dam/www/public/us/en/documents -# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf -# - -#include <linux/linkage.h> - -.text - -#define init_crc %edi -#define buf %rsi -#define len %rdx - -#define FOLD_CONSTS %xmm10 -#define BSWAP_MASK %xmm11 - -# Fold reg1, reg2 into the next 32 data bytes, storing the result back into -# reg1, reg2. 
-.macro fold_32_bytes offset, reg1, reg2 - movdqu \offset(buf), %xmm9 - movdqu \offset+16(buf), %xmm12 - pshufb BSWAP_MASK, %xmm9 - pshufb BSWAP_MASK, %xmm12 - movdqa \reg1, %xmm8 - movdqa \reg2, %xmm13 - pclmulqdq $0x00, FOLD_CONSTS, \reg1 - pclmulqdq $0x11, FOLD_CONSTS, %xmm8 - pclmulqdq $0x00, FOLD_CONSTS, \reg2 - pclmulqdq $0x11, FOLD_CONSTS, %xmm13 - pxor %xmm9 , \reg1 - xorps %xmm8 , \reg1 - pxor %xmm12, \reg2 - xorps %xmm13, \reg2 -.endm - -# Fold src_reg into dst_reg. -.macro fold_16_bytes src_reg, dst_reg - movdqa \src_reg, %xmm8 - pclmulqdq $0x11, FOLD_CONSTS, \src_reg - pclmulqdq $0x00, FOLD_CONSTS, %xmm8 - pxor %xmm8, \dst_reg - xorps \src_reg, \dst_reg -.endm - -# -# u16 crc_t10dif_pcl(u16 init_crc, const *u8 buf, size_t len); -# -# Assumes len >= 16. -# -SYM_FUNC_START(crc_t10dif_pcl) - - movdqa .Lbswap_mask(%rip), BSWAP_MASK - - # For sizes less than 256 bytes, we can't fold 128 bytes at a time. - cmp $256, len - jl .Lless_than_256_bytes - - # Load the first 128 data bytes. Byte swapping is necessary to make the - # bit order match the polynomial coefficient order. - movdqu 16*0(buf), %xmm0 - movdqu 16*1(buf), %xmm1 - movdqu 16*2(buf), %xmm2 - movdqu 16*3(buf), %xmm3 - movdqu 16*4(buf), %xmm4 - movdqu 16*5(buf), %xmm5 - movdqu 16*6(buf), %xmm6 - movdqu 16*7(buf), %xmm7 - add $128, buf - pshufb BSWAP_MASK, %xmm0 - pshufb BSWAP_MASK, %xmm1 - pshufb BSWAP_MASK, %xmm2 - pshufb BSWAP_MASK, %xmm3 - pshufb BSWAP_MASK, %xmm4 - pshufb BSWAP_MASK, %xmm5 - pshufb BSWAP_MASK, %xmm6 - pshufb BSWAP_MASK, %xmm7 - - # XOR the first 16 data *bits* with the initial CRC value. - pxor %xmm8, %xmm8 - pinsrw $7, init_crc, %xmm8 - pxor %xmm8, %xmm0 - - movdqa .Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS - - # Subtract 128 for the 128 data bytes just consumed. Subtract another - # 128 to simplify the termination condition of the following loop. - sub $256, len - - # While >= 128 data bytes remain (not counting xmm0-7), fold the 128 - # bytes xmm0-7 into them, storing the result back into xmm0-7. -.Lfold_128_bytes_loop: - fold_32_bytes 0, %xmm0, %xmm1 - fold_32_bytes 32, %xmm2, %xmm3 - fold_32_bytes 64, %xmm4, %xmm5 - fold_32_bytes 96, %xmm6, %xmm7 - add $128, buf - sub $128, len - jge .Lfold_128_bytes_loop - - # Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7. - - # Fold across 64 bytes. - movdqa .Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS - fold_16_bytes %xmm0, %xmm4 - fold_16_bytes %xmm1, %xmm5 - fold_16_bytes %xmm2, %xmm6 - fold_16_bytes %xmm3, %xmm7 - # Fold across 32 bytes. - movdqa .Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS - fold_16_bytes %xmm4, %xmm6 - fold_16_bytes %xmm5, %xmm7 - # Fold across 16 bytes. - movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS - fold_16_bytes %xmm6, %xmm7 - - # Add 128 to get the correct number of data bytes remaining in 0...127 - # (not counting xmm7), following the previous extra subtraction by 128. - # Then subtract 16 to simplify the termination condition of the - # following loop. - add $128-16, len - - # While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes - # xmm7 into them, storing the result back into xmm7. 
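Every fold step above reduces to one primitive: pclmulqdq, a 64x64 -> 128-bit carry-less multiply of polynomials over GF(2). Multiplying the accumulated remainder by a precomputed constant of the form x^K mod G(x) and XOR-ing the product into data K bits further along preserves the message's residue mod G(x), which is what lets fold_32_bytes/fold_16_bytes skip ahead instead of running a serial CRC. A plain-C reference model of the multiply itself (struct and function names are mine, for illustration only):

#include <stdint.h>

struct clmul128 {
	uint64_t lo;
	uint64_t hi;
};

/* Bit-by-bit model of what a single pclmulqdq computes. */
static struct clmul128 clmul64(uint64_t a, uint64_t b)
{
	struct clmul128 r = { 0, 0 };

	for (int i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			r.lo ^= a << i;
			if (i)
				r.hi ^= a >> (64 - i);
		}
	}
	return r;
}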
- jl .Lfold_16_bytes_loop_done -.Lfold_16_bytes_loop: - movdqa %xmm7, %xmm8 - pclmulqdq $0x11, FOLD_CONSTS, %xmm7 - pclmulqdq $0x00, FOLD_CONSTS, %xmm8 - pxor %xmm8, %xmm7 - movdqu (buf), %xmm0 - pshufb BSWAP_MASK, %xmm0 - pxor %xmm0 , %xmm7 - add $16, buf - sub $16, len - jge .Lfold_16_bytes_loop - -.Lfold_16_bytes_loop_done: - # Add 16 to get the correct number of data bytes remaining in 0...15 - # (not counting xmm7), following the previous extra subtraction by 16. - add $16, len - je .Lreduce_final_16_bytes - -.Lhandle_partial_segment: - # Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16 - # bytes are in xmm7 and the rest are the remaining data in 'buf'. To do - # this without needing a fold constant for each possible 'len', redivide - # the bytes into a first chunk of 'len' bytes and a second chunk of 16 - # bytes, then fold the first chunk into the second. - - movdqa %xmm7, %xmm2 - - # xmm1 = last 16 original data bytes - movdqu -16(buf, len), %xmm1 - pshufb BSWAP_MASK, %xmm1 - - # xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes. - lea .Lbyteshift_table+16(%rip), %rax - sub len, %rax - movdqu (%rax), %xmm0 - pshufb %xmm0, %xmm2 - - # xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes. - pxor .Lmask1(%rip), %xmm0 - pshufb %xmm0, %xmm7 - - # xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes), - # then '16-len' bytes from xmm2 (high-order bytes). - pblendvb %xmm2, %xmm1 #xmm0 is implicit - - # Fold the first chunk into the second chunk, storing the result in xmm7. - movdqa %xmm7, %xmm8 - pclmulqdq $0x11, FOLD_CONSTS, %xmm7 - pclmulqdq $0x00, FOLD_CONSTS, %xmm8 - pxor %xmm8, %xmm7 - pxor %xmm1, %xmm7 - -.Lreduce_final_16_bytes: - # Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC - - # Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. - movdqa .Lfinal_fold_consts(%rip), FOLD_CONSTS - - # Fold the high 64 bits into the low 64 bits, while also multiplying by - # x^64. This produces a 128-bit value congruent to x^64 * M(x) and - # whose low 48 bits are 0. - movdqa %xmm7, %xmm0 - pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x)) - pslldq $8, %xmm0 - pxor %xmm0, %xmm7 # + low bits * x^64 - - # Fold the high 32 bits into the low 96 bits. This produces a 96-bit - # value congruent to x^64 * M(x) and whose low 48 bits are 0. - movdqa %xmm7, %xmm0 - pand .Lmask2(%rip), %xmm0 # zero high 32 bits - psrldq $12, %xmm7 # extract high 32 bits - pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x)) - pxor %xmm0, %xmm7 # + low bits - - # Load G(x) and floor(x^48 / G(x)). - movdqa .Lbarrett_reduction_consts(%rip), FOLD_CONSTS - - # Use Barrett reduction to compute the final CRC value. - movdqa %xmm7, %xmm0 - pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x)) - psrlq $32, %xmm7 # /= x^32 - pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # *= G(x) - psrlq $48, %xmm0 - pxor %xmm7, %xmm0 # + low 16 nonzero bits - # Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0. - - pextrw $0, %xmm0, %eax - RET - -.align 16 -.Lless_than_256_bytes: - # Checksumming a buffer of length 16...255 bytes - - # Load the first 16 data bytes. - movdqu (buf), %xmm7 - pshufb BSWAP_MASK, %xmm7 - add $16, buf - - # XOR the first 16 data *bits* with the initial CRC value. 
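Whichever path the input takes, the folded-and-Barrett-reduced result above must equal the plain bit-serial definition of CRC-T10DIF: G(x) = 0x18BB7 (0x8BB7 with the implicit x^16 term dropped), most-significant-bit first, zero initial value, no final XOR. A short reference model, handy for cross-checking (illustrative, not the kernel's crc_t10dif_generic):

#include <stddef.h>
#include <stdint.h>

static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= (uint16_t)(*p++) << 8;	/* MSB-first: feed the byte at the top */
		for (int i = 0; i < 8; i++)
			crc = (crc & 0x8000) ? (uint16_t)((crc << 1) ^ 0x8BB7)
					     : (uint16_t)(crc << 1);
	}
	return crc;
}

/* crc_t10dif_ref(0, buf, len) should match crc_t10dif_pcl(0, buf, len)
 * for the len >= 16 buffers the assembly routine accepts. */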
- pxor %xmm0, %xmm0 - pinsrw $7, init_crc, %xmm0 - pxor %xmm0, %xmm7 - - movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS - cmp $16, len - je .Lreduce_final_16_bytes # len == 16 - sub $32, len - jge .Lfold_16_bytes_loop # 32 <= len <= 255 - add $16, len - jmp .Lhandle_partial_segment # 17 <= len <= 31 -SYM_FUNC_END(crc_t10dif_pcl) - -.section .rodata, "a", @progbits -.align 16 - -# Fold constants precomputed from the polynomial 0x18bb7 -# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 -.Lfold_across_128_bytes_consts: - .quad 0x0000000000006123 # x^(8*128) mod G(x) - .quad 0x0000000000002295 # x^(8*128+64) mod G(x) -.Lfold_across_64_bytes_consts: - .quad 0x0000000000001069 # x^(4*128) mod G(x) - .quad 0x000000000000dd31 # x^(4*128+64) mod G(x) -.Lfold_across_32_bytes_consts: - .quad 0x000000000000857d # x^(2*128) mod G(x) - .quad 0x0000000000007acc # x^(2*128+64) mod G(x) -.Lfold_across_16_bytes_consts: - .quad 0x000000000000a010 # x^(1*128) mod G(x) - .quad 0x0000000000001faa # x^(1*128+64) mod G(x) -.Lfinal_fold_consts: - .quad 0x1368000000000000 # x^48 * (x^48 mod G(x)) - .quad 0x2d56000000000000 # x^48 * (x^80 mod G(x)) -.Lbarrett_reduction_consts: - .quad 0x0000000000018bb7 # G(x) - .quad 0x00000001f65a57f8 # floor(x^48 / G(x)) - -.section .rodata.cst16.mask1, "aM", @progbits, 16 -.align 16 -.Lmask1: - .octa 0x80808080808080808080808080808080 - -.section .rodata.cst16.mask2, "aM", @progbits, 16 -.align 16 -.Lmask2: - .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF - -.section .rodata.cst16.bswap_mask, "aM", @progbits, 16 -.align 16 -.Lbswap_mask: - .octa 0x000102030405060708090A0B0C0D0E0F - -.section .rodata.cst32.byteshift_table, "aM", @progbits, 32 -.align 16 -# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len] -# is the index vector to shift left by 'len' bytes, and is also {0x80, ..., -# 0x80} XOR the index vector to shift right by '16 - len' bytes. -.Lbyteshift_table: - .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 - .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f - .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 - .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c deleted file mode 100644 index 71291d5af9f4..000000000000 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Cryptographic API. - * - * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions - * - * Copyright (C) 2013 Intel Corporation - * Author: Tim Chen <tim.c.chen@linux.intel.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - */ - -#include <linux/types.h> -#include <linux/module.h> -#include <linux/crc-t10dif.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <asm/cpufeatures.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> - -asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len); - -struct chksum_desc_ctx { - __u16 crc; -}; - -static int chksum_init(struct shash_desc *desc) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - ctx->crc = 0; - - return 0; -} - -static int chksum_update(struct shash_desc *desc, const u8 *data, - unsigned int length) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - if (length >= 16 && crypto_simd_usable()) { - kernel_fpu_begin(); - ctx->crc = crc_t10dif_pcl(ctx->crc, data, length); - kernel_fpu_end(); - } else - ctx->crc = crc_t10dif_generic(ctx->crc, data, length); - return 0; -} - -static int chksum_final(struct shash_desc *desc, u8 *out) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - *(__u16 *)out = ctx->crc; - return 0; -} - -static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out) -{ - if (len >= 16 && crypto_simd_usable()) { - kernel_fpu_begin(); - *(__u16 *)out = crc_t10dif_pcl(crc, data, len); - kernel_fpu_end(); - } else - *(__u16 *)out = crc_t10dif_generic(crc, data, len); - return 0; -} - -static int chksum_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - return __chksum_finup(ctx->crc, data, len, out); -} - -static int chksum_digest(struct shash_desc *desc, const u8 *data, - unsigned int length, u8 *out) -{ - return __chksum_finup(0, data, length, out); -} - -static struct shash_alg alg = { - .digestsize = CRC_T10DIF_DIGEST_SIZE, - .init = chksum_init, - .update = chksum_update, - .final = chksum_final, - .finup = chksum_finup, - .digest = chksum_digest, - .descsize = sizeof(struct chksum_desc_ctx), - .base = { - .cra_name = "crct10dif", - .cra_driver_name = "crct10dif-pclmul", - .cra_priority = 200, - .cra_blocksize = CRC_T10DIF_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static const struct x86_cpu_id crct10dif_cpu_id[] = { - X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id); - -static int __init crct10dif_intel_mod_init(void) -{ - if (!x86_match_cpu(crct10dif_cpu_id)) - return -ENODEV; - - return crypto_register_shash(&alg); -} - -static void __exit crct10dif_intel_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(crct10dif_intel_mod_init); -module_exit(crct10dif_intel_mod_fini); - -MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>"); -MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ."); -MODULE_LICENSE("GPL"); - -MODULE_ALIAS_CRYPTO("crct10dif"); -MODULE_ALIAS_CRYPTO("crct10dif-pclmul"); diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c index d55fa9e9b9e6..dcfc0de333de 100644 --- a/arch/x86/crypto/curve25519-x86_64.c +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -1720,5 +1720,6 @@ module_exit(curve25519_mod_exit); MODULE_ALIAS_CRYPTO("curve25519"); MODULE_ALIAS_CRYPTO("curve25519-x86"); +MODULE_DESCRIPTION("Curve25519 algorithm, ADX optimized"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Jason A. 
Donenfeld <Jason@zx2c4.com>"); diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index abb8b1fe123b..34600f90d8a6 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ b/arch/x86/crypto/des3_ede_glue.c @@ -73,7 +73,7 @@ static int ecb_crypt(struct skcipher_request *req, const u32 *expkey) err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes)) { - u8 *wsrc = walk.src.virt.addr; + const u8 *wsrc = walk.src.virt.addr; u8 *wdst = walk.dst.virt.addr; /* Process four block batch */ @@ -291,7 +291,6 @@ static struct crypto_alg des3_ede_cipher = { .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = DES3_EDE_BLOCK_SIZE, .cra_ctxsize = sizeof(struct des3_ede_x86_ctx), - .cra_alignmask = 0, .cra_module = THIS_MODULE, .cra_u = { .cipher = { diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 700ecaee9a08..c759ec808bf1 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -19,7 +19,7 @@ #include <crypto/internal/simd.h> #include <asm/cpu_device_id.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 @@ -189,6 +189,20 @@ static int ghash_async_init(struct ahash_request *req) return crypto_shash_init(desc); } +static void ghash_init_cryptd_req(struct ahash_request *req) +{ + struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + ahash_request_set_callback(cryptd_req, req->base.flags, + req->base.complete, req->base.data); + ahash_request_set_crypt(cryptd_req, req->src, req->result, + req->nbytes); +} + static int ghash_async_update(struct ahash_request *req) { struct ahash_request *cryptd_req = ahash_request_ctx(req); @@ -198,8 +212,7 @@ static int ghash_async_update(struct ahash_request *req) if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { - memcpy(cryptd_req, req, sizeof(*req)); - ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + ghash_init_cryptd_req(req); return crypto_ahash_update(cryptd_req); } else { struct shash_desc *desc = cryptd_shash_desc(cryptd_req); @@ -216,8 +229,7 @@ static int ghash_async_final(struct ahash_request *req) if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { - memcpy(cryptd_req, req, sizeof(*req)); - ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + ghash_init_cryptd_req(req); return crypto_ahash_final(cryptd_req); } else { struct shash_desc *desc = cryptd_shash_desc(cryptd_req); @@ -257,8 +269,7 @@ static int ghash_async_digest(struct ahash_request *req) if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { - memcpy(cryptd_req, req, sizeof(*req)); - ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + ghash_init_cryptd_req(req); return crypto_ahash_digest(cryptd_req); } else { struct shash_desc *desc = cryptd_shash_desc(cryptd_req); diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S index ef73a3ab8726..791386d9a83a 100644 --- a/arch/x86/crypto/nh-avx2-x86_64.S +++ b/arch/x86/crypto/nh-avx2-x86_64.S @@ -154,5 +154,6 @@ SYM_TYPED_FUNC_START(nh_avx2) vpaddq T1, T0, T0 vpaddq T4, T0, T0 vmovdqu T0, (HASH) + vzeroupper RET SYM_FUNC_END(nh_avx2) diff --git 
a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 1dfb8af48a3c..08ff4b489f7e 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -12,7 +12,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/sizes.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/simd.h> asmlinkage void poly1305_init_x86_64(void *ctx, @@ -269,7 +269,7 @@ static int __init poly1305_simd_mod_init(void) boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ - boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X) + boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X) static_branch_enable(&poly1305_use_avx512); return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0; } diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 97e283621851..84e47f7f6188 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -9,6 +9,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/frame.h> #include "glue_helper-asm-avx.S" @@ -656,7 +657,7 @@ SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx) RET; SYM_FUNC_END(__serpent_dec_blk8_avx) -SYM_FUNC_START(serpent_ecb_enc_8way_avx) +SYM_TYPED_FUNC_START(serpent_ecb_enc_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -674,7 +675,7 @@ SYM_FUNC_START(serpent_ecb_enc_8way_avx) RET; SYM_FUNC_END(serpent_ecb_enc_8way_avx) -SYM_FUNC_START(serpent_ecb_dec_8way_avx) +SYM_TYPED_FUNC_START(serpent_ecb_dec_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -692,7 +693,7 @@ SYM_FUNC_START(serpent_ecb_dec_8way_avx) RET; SYM_FUNC_END(serpent_ecb_dec_8way_avx) -SYM_FUNC_START(serpent_cbc_dec_8way_avx) +SYM_TYPED_FUNC_START(serpent_cbc_dec_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 9918212faf91..0bbec1c75cd0 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -592,22 +592,22 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx) leaq K256+0*32(%rip), INP ## reuse INP as scratch reg vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 0*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED _XFER + 0*32 + FOUR_ROUNDS_AND_SCHED (_XFER + 0*32) leaq K256+1*32(%rip), INP vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 1*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED _XFER + 1*32 + FOUR_ROUNDS_AND_SCHED (_XFER + 1*32) leaq K256+2*32(%rip), INP vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 2*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED _XFER + 2*32 + FOUR_ROUNDS_AND_SCHED (_XFER + 2*32) leaq K256+3*32(%rip), INP vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 3*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED _XFER + 3*32 + FOUR_ROUNDS_AND_SCHED (_XFER + 3*32) add $4*32, SRND cmp $3*4*32, SRND @@ -618,12 +618,12 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx) leaq K256+0*32(%rip), INP vpaddd (INP, SRND), X0, XFER vmovdqa XFER, 0*32+_XFER(%rsp, SRND) - DO_4ROUNDS _XFER + 0*32 + DO_4ROUNDS (_XFER + 0*32) leaq K256+1*32(%rip), INP vpaddd (INP, SRND), X1, XFER vmovdqa XFER, 1*32+_XFER(%rsp, SRND) - DO_4ROUNDS _XFER + 1*32 + DO_4ROUNDS (_XFER + 1*32) add $2*32, SRND vmovdqa X2, X0 @@ -651,8 +651,8 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx) xor SRND, SRND .align 16 .Lloop3: - DO_4ROUNDS _XFER + 0*32 + 16 - 
DO_4ROUNDS _XFER + 1*32 + 16 + DO_4ROUNDS (_XFER + 0*32 + 16) + DO_4ROUNDS (_XFER + 1*32 + 16) add $2*32, SRND cmp $4*4*32, SRND jb .Lloop3 @@ -716,6 +716,7 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx) popq %r13 popq %r12 popq %rbx + vzeroupper RET SYM_FUNC_END(sha256_transform_rorx) diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S index 537b6dcd7ed8..d515a55a3bc1 100644 --- a/arch/x86/crypto/sha256_ni_asm.S +++ b/arch/x86/crypto/sha256_ni_asm.S @@ -62,20 +62,41 @@ #define SHA256CONSTANTS %rax -#define MSG %xmm0 +#define MSG %xmm0 /* sha256rnds2 implicit operand */ #define STATE0 %xmm1 #define STATE1 %xmm2 -#define MSGTMP0 %xmm3 -#define MSGTMP1 %xmm4 -#define MSGTMP2 %xmm5 -#define MSGTMP3 %xmm6 -#define MSGTMP4 %xmm7 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 +#define TMP %xmm7 #define SHUF_MASK %xmm8 #define ABEF_SAVE %xmm9 #define CDGH_SAVE %xmm10 +.macro do_4rounds i, m0, m1, m2, m3 +.if \i < 16 + movdqu \i*4(DATA_PTR), \m0 + pshufb SHUF_MASK, \m0 +.endif + movdqa (\i-32)*4(SHA256CONSTANTS), MSG + paddd \m0, MSG + sha256rnds2 STATE0, STATE1 +.if \i >= 12 && \i < 60 + movdqa \m0, TMP + palignr $4, \m3, TMP + paddd TMP, \m1 + sha256msg2 \m0, \m1 +.endif + punpckhqdq MSG, MSG + sha256rnds2 STATE1, STATE0 +.if \i >= 4 && \i < 52 + sha256msg1 \m0, \m3 +.endif +.endm + /* * Intel SHA Extensions optimized implementation of a SHA-256 update function * @@ -86,9 +107,6 @@ * store partial blocks. All message padding and hash value initialization must * be done outside the update function. * - * The indented lines in the loop are instructions related to rounds processing. - * The non-indented lines are instructions related to the message schedule. - * * void sha256_ni_transform(uint32_t *digest, const void *data, uint32_t numBlocks); * digest : pointer to digest @@ -108,202 +126,29 @@ SYM_TYPED_FUNC_START(sha256_ni_transform) * Need to reorder these appropriately * DCBA, HGFE -> ABEF, CDGH */ - movdqu 0*16(DIGEST_PTR), STATE0 - movdqu 1*16(DIGEST_PTR), STATE1 + movdqu 0*16(DIGEST_PTR), STATE0 /* DCBA */ + movdqu 1*16(DIGEST_PTR), STATE1 /* HGFE */ - pshufd $0xB1, STATE0, STATE0 /* CDAB */ - pshufd $0x1B, STATE1, STATE1 /* EFGH */ - movdqa STATE0, MSGTMP4 - palignr $8, STATE1, STATE0 /* ABEF */ - pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ + movdqa STATE0, TMP + punpcklqdq STATE1, STATE0 /* FEBA */ + punpckhqdq TMP, STATE1 /* DCHG */ + pshufd $0x1B, STATE0, STATE0 /* ABEF */ + pshufd $0xB1, STATE1, STATE1 /* CDGH */ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK - lea K256(%rip), SHA256CONSTANTS + lea K256+32*4(%rip), SHA256CONSTANTS .Lloop0: /* Save hash values for addition after rounds */ movdqa STATE0, ABEF_SAVE movdqa STATE1, CDGH_SAVE - /* Rounds 0-3 */ - movdqu 0*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP0 - paddd 0*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Rounds 4-7 */ - movdqu 1*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP1 - paddd 1*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 8-11 */ - movdqu 2*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP2 - paddd 2*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 12-15 */ - movdqu 3*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG - movdqa MSG, MSGTMP3 - paddd 
3*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 16-19 */ - movdqa MSGTMP0, MSG - paddd 4*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 20-23 */ - movdqa MSGTMP1, MSG - paddd 5*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 24-27 */ - movdqa MSGTMP2, MSG - paddd 6*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 28-31 */ - movdqa MSGTMP3, MSG - paddd 7*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 32-35 */ - movdqa MSGTMP0, MSG - paddd 8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 36-39 */ - movdqa MSGTMP1, MSG - paddd 9*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 40-43 */ - movdqa MSGTMP2, MSG - paddd 10*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 44-47 */ - movdqa MSGTMP3, MSG - paddd 11*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 48-51 */ - movdqa MSGTMP0, MSG - paddd 12*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 52-55 */ - movdqa MSGTMP1, MSG - paddd 13*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - - /* Rounds 56-59 */ - movdqa MSGTMP2, MSG - paddd 14*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - movdqa MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 - 
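The unrolled rounds being deleted here and the new do_4rounds macro that replaces them implement the same FIPS 180-4 message schedule; sha256msg1/sha256msg2 (plus the palignr/paddd glue) advance it four words at a time. In scalar C the recurrence they compute is (illustrative sketch, not kernel code):

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n)
{
	return (x >> n) | (x << (32 - n));
}

/* Expand W[0..15] (the 16 big-endian words of the input block) to W[0..63]. */
static void sha256_schedule(uint32_t W[64])
{
	for (int t = 16; t < 64; t++) {
		uint32_t s0 = ror32(W[t - 15], 7) ^ ror32(W[t - 15], 18) ^ (W[t - 15] >> 3);
		uint32_t s1 = ror32(W[t - 2], 17) ^ ror32(W[t - 2], 19) ^ (W[t - 2] >> 10);

		W[t] = W[t - 16] + s0 + W[t - 7] + s1;
	}
}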
- /* Rounds 60-63 */ - movdqa MSGTMP3, MSG - paddd 15*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 - pshufd $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 +.irp i, 0, 16, 32, 48 + do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3 + do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0 + do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1 + do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2 +.endr /* Add current hash values with previously saved */ paddd ABEF_SAVE, STATE0 @@ -315,14 +160,14 @@ SYM_TYPED_FUNC_START(sha256_ni_transform) jne .Lloop0 /* Write hash values back in the correct order */ - pshufd $0x1B, STATE0, STATE0 /* FEBA */ - pshufd $0xB1, STATE1, STATE1 /* DCHG */ - movdqa STATE0, MSGTMP4 - pblendw $0xF0, STATE1, STATE0 /* DCBA */ - palignr $8, MSGTMP4, STATE1 /* HGFE */ - - movdqu STATE0, 0*16(DIGEST_PTR) - movdqu STATE1, 1*16(DIGEST_PTR) + movdqa STATE0, TMP + punpcklqdq STATE1, STATE0 /* GHEF */ + punpckhqdq TMP, STATE1 /* ABCD */ + pshufd $0xB1, STATE0, STATE0 /* HGFE */ + pshufd $0x1B, STATE1, STATE1 /* DCBA */ + + movdqu STATE1, 0*16(DIGEST_PTR) + movdqu STATE0, 1*16(DIGEST_PTR) .Ldone_hash: diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index f08496cd6870..24973f42c43f 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -680,6 +680,7 @@ SYM_TYPED_FUNC_START(sha512_transform_rorx) pop %r12 pop %rbx + vzeroupper RET SYM_FUNC_END(sha512_transform_rorx) diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S index d2288bf38a8a..071e90e7f0d8 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S @@ -6,6 +6,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> .file "twofish-x86_64-asm-3way.S" .text @@ -220,7 +221,7 @@ rorq $32, RAB2; \ outunpack3(mov, RIO, 2, RAB, 2); -SYM_FUNC_START(__twofish_enc_blk_3way) +SYM_TYPED_FUNC_START(__twofish_enc_blk_3way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -269,7 +270,7 @@ SYM_FUNC_START(__twofish_enc_blk_3way) RET; SYM_FUNC_END(__twofish_enc_blk_3way) -SYM_FUNC_START(twofish_dec_blk_3way) +SYM_TYPED_FUNC_START(twofish_dec_blk_3way) /* input: * %rdi: ctx, CTX * %rsi: dst diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S index 775af290cd19..e08b4ba07b93 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64.S @@ -8,6 +8,7 @@ .text #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/asm-offsets.h> #define a_offset 0 @@ -202,7 +203,7 @@ xor %r8d, d ## D;\ ror $1, d ## D; -SYM_FUNC_START(twofish_enc_blk) +SYM_TYPED_FUNC_START(twofish_enc_blk) pushq R1 /* %rdi contains the ctx address */ @@ -255,7 +256,7 @@ SYM_FUNC_START(twofish_enc_blk) RET SYM_FUNC_END(twofish_enc_blk) -SYM_FUNC_START(twofish_dec_blk) +SYM_TYPED_FUNC_START(twofish_dec_blk) pushq R1 /* %rdi contains the ctx address */ diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c index 0614beece279..4c67184dc573 100644 --- a/arch/x86/crypto/twofish_glue.c +++ b/arch/x86/crypto/twofish_glue.c @@ -68,7 +68,6 @@ static struct crypto_alg alg = { .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = TF_BLOCK_SIZE, .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 0, .cra_module = THIS_MODULE, .cra_u = { .cipher = { diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 90454cf18e0d..1a1ecfa7f72a 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ 
b/arch/x86/crypto/twofish_glue_3way.c @@ -5,6 +5,7 @@ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> */ +#include <asm/cpu_device_id.h> #include <crypto/algapi.h> #include <crypto/twofish.h> #include <linux/crypto.h> @@ -107,10 +108,10 @@ static bool is_blacklisted_cpu(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return false; - if (boot_cpu_data.x86 == 0x06 && - (boot_cpu_data.x86_model == 0x1c || - boot_cpu_data.x86_model == 0x26 || - boot_cpu_data.x86_model == 0x36)) { + switch (boot_cpu_data.x86_vfm) { + case INTEL_ATOM_BONNELL: + case INTEL_ATOM_BONNELL_MID: + case INTEL_ATOM_SALTWELL: /* * On Atom, twofish-3way is slower than original assembler * implementation. Twofish-3way trades off some performance in |