diff options
Diffstat (limited to 'arch/x86/crypto/aes-xts-avx-x86_64.S')
-rw-r--r-- | arch/x86/crypto/aes-xts-avx-x86_64.S | 329 |
1 files changed, 168 insertions, 161 deletions
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S index 48f97b79f7a9..8a3e23fbcf85 100644 --- a/arch/x86/crypto/aes-xts-avx-x86_64.S +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -80,22 +80,6 @@ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 .text -// Function parameters -.set KEY, %rdi // Initially points to crypto_aes_ctx, then is - // advanced to point to 7th-from-last round key -.set SRC, %rsi // Pointer to next source data -.set DST, %rdx // Pointer to next destination data -.set LEN, %ecx // Remaining length in bytes -.set LEN8, %cl -.set LEN64, %rcx -.set TWEAK, %r8 // Pointer to next tweak - -// %rax holds the AES key length in bytes. -.set KEYLEN, %eax -.set KEYLEN64, %rax - -// %r9-r11 are available as temporaries. - .macro _define_Vi i .if VL == 16 .set V\i, %xmm\i @@ -112,41 +96,31 @@ // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers // are available, that map to the xmm, ymm, or zmm registers according // to the selected Vector Length (VL). - _define_Vi 0 - _define_Vi 1 - _define_Vi 2 - _define_Vi 3 - _define_Vi 4 - _define_Vi 5 - _define_Vi 6 - _define_Vi 7 - _define_Vi 8 - _define_Vi 9 - _define_Vi 10 - _define_Vi 11 - _define_Vi 12 - _define_Vi 13 - _define_Vi 14 - _define_Vi 15 +.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + _define_Vi \i +.endr .if USE_AVX10 - _define_Vi 16 - _define_Vi 17 - _define_Vi 18 - _define_Vi 19 - _define_Vi 20 - _define_Vi 21 - _define_Vi 22 - _define_Vi 23 - _define_Vi 24 - _define_Vi 25 - _define_Vi 26 - _define_Vi 27 - _define_Vi 28 - _define_Vi 29 - _define_Vi 30 - _define_Vi 31 +.irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 + _define_Vi \i +.endr .endif + // Function parameters + .set KEY, %rdi // Initially points to crypto_aes_ctx, then is + // advanced to point to 7th-from-last round key + .set SRC, %rsi // Pointer to next source data + .set DST, %rdx // Pointer to next destination data + .set LEN, %ecx // Remaining length in bytes + .set LEN8, %cl + .set LEN64, %rcx + .set TWEAK, %r8 // Pointer to next tweak + + // %rax holds the AES key length in bytes. + .set KEYLEN, %eax + .set KEYLEN64, %rax + + // %r9-r11 are available as temporaries. + // V0-V3 hold the data blocks during the main loop, or temporary values // otherwise. V4-V5 hold temporary values. @@ -214,6 +188,7 @@ .endm // Move a vector between memory and a register. +// The register operand must be in the first 16 vector registers. .macro _vmovdqu src, dst .if VL < 64 vmovdqu \src, \dst @@ -234,11 +209,12 @@ .endm // XOR two vectors together. +// Any register operands must be in the first 16 vector registers. .macro _vpxor src1, src2, dst -.if USE_AVX10 - vpxord \src1, \src2, \dst -.else +.if VL < 64 vpxor \src1, \src2, \dst +.else + vpxord \src1, \src2, \dst .endif .endm @@ -259,8 +235,12 @@ vpshufd $0x13, \src, \tmp vpaddq \src, \src, \dst vpsrad $31, \tmp, \tmp +.if USE_AVX10 + vpternlogd $0x78, GF_POLY_XMM, \tmp, \dst +.else vpand GF_POLY_XMM, \tmp, \tmp vpxor \tmp, \dst, \dst +.endif .endm // Given the XTS tweak(s) in the vector \src, compute the next vector of @@ -369,9 +349,14 @@ // Do one step in computing the next set of tweaks using the VPCLMULQDQ method // (the same method _next_tweakvec uses for VL > 16). This means multiplying -// each tweak by x^(4*VL/16) independently. Since 4*VL/16 is a multiple of 8 -// when VL > 16 (which it is here), the needed shift amounts are byte-aligned, -// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts. +// each tweak by x^(4*VL/16) independently. +// +// Since 4*VL/16 is a multiple of 8 when VL > 16 (which it is here), the needed +// shift amounts are byte-aligned, which allows the use of vpsrldq and vpslldq +// to do 128-bit wide shifts. The 128-bit left shift (vpslldq) saves +// instructions directly. The 128-bit right shift (vpsrldq) performs better +// than a 64-bit right shift on Intel CPUs in the context where it is used here, +// because it runs on a different execution port from the AES instructions. .macro _tweak_step_pclmul i .if \i == 0 vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0 @@ -406,7 +391,7 @@ // \i that include at least 0 through 19, then 1000 which signals the last step. // // This is used to interleave the computation of the next set of tweaks with the -// AES en/decryptions, which increases performance in some cases. +// AES en/decryptions, which increases performance in some cases. Clobbers V5. .macro _tweak_step i .if VL == 16 _tweak_step_mulx \i @@ -443,9 +428,10 @@ // the last round needs different instructions. // // An alternative approach would be to roll up all the round loops. We - // don't do that because it isn't compatible with caching the round keys - // in registers which we do when possible (see below), and also because - // it seems unwise to rely *too* heavily on the CPU's branch predictor. + // don't do that because (a) it isn't compatible with caching the round + // keys in registers which we do when possible (see below), (b) we + // interleave the AES rounds with the XTS tweak computation, and (c) it + // seems unwise to rely *too* heavily on the CPU's branch predictor. lea OFFS-16(KEY, KEYLEN64, 4), KEY // If all 32 SIMD registers are available, cache all the round keys. @@ -472,90 +458,94 @@ .endif .endm -// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0) -// on the block(s) in \data using the round key(s) in \key. The register length -// determines the number of AES blocks en/decrypted. -.macro _vaes enc, last, key, data +// Do a single non-last round of AES encryption (if \enc==1) or decryption (if +// \enc==0) on the block(s) in \data using the round key(s) in \key. The +// register length determines the number of AES blocks en/decrypted. +.macro _vaes enc, key, data .if \enc -.if \last - vaesenclast \key, \data, \data -.else vaesenc \key, \data, \data -.endif -.else -.if \last - vaesdeclast \key, \data, \data .else vaesdec \key, \data, \data .endif +.endm + +// Same as _vaes, but does the last round. +.macro _vaeslast enc, key, data +.if \enc + vaesenclast \key, \data, \data +.else + vaesdeclast \key, \data, \data .endif .endm -// Do a single round of AES en/decryption on the block(s) in \data, using the -// same key for all block(s). The round key is loaded from the appropriate -// register or memory location for round \i. May clobber V4. -.macro _vaes_1x enc, last, i, xmm_suffix, data +// Do a single non-last round of AES en/decryption on the block(s) in \data, +// using the same key for all block(s). The round key is loaded from the +// appropriate register or memory location for round \i. May clobber \tmp. +.macro _vaes_1x enc, i, xmm_suffix, data, tmp .if USE_AVX10 - _vaes \enc, \last, KEY\i\xmm_suffix, \data + _vaes \enc, KEY\i\xmm_suffix, \data .else .ifnb \xmm_suffix - _vaes \enc, \last, (\i-7)*16(KEY), \data + _vaes \enc, (\i-7)*16(KEY), \data .else - _vbroadcast128 (\i-7)*16(KEY), V4 - _vaes \enc, \last, V4, \data + _vbroadcast128 (\i-7)*16(KEY), \tmp + _vaes \enc, \tmp, \data .endif .endif .endm -// Do a single round of AES en/decryption on the blocks in registers V0-V3, -// using the same key for all blocks. The round key is loaded from the +// Do a single non-last round of AES en/decryption on the blocks in registers +// V0-V3, using the same key for all blocks. The round key is loaded from the // appropriate register or memory location for round \i. In addition, does two -// steps of the computation of the next set of tweaks. May clobber V4. -.macro _vaes_4x enc, last, i +// steps of the computation of the next set of tweaks. May clobber V4 and V5. +.macro _vaes_4x enc, i .if USE_AVX10 _tweak_step (2*(\i-5)) - _vaes \enc, \last, KEY\i, V0 - _vaes \enc, \last, KEY\i, V1 + _vaes \enc, KEY\i, V0 + _vaes \enc, KEY\i, V1 _tweak_step (2*(\i-5) + 1) - _vaes \enc, \last, KEY\i, V2 - _vaes \enc, \last, KEY\i, V3 + _vaes \enc, KEY\i, V2 + _vaes \enc, KEY\i, V3 .else _vbroadcast128 (\i-7)*16(KEY), V4 _tweak_step (2*(\i-5)) - _vaes \enc, \last, V4, V0 - _vaes \enc, \last, V4, V1 + _vaes \enc, V4, V0 + _vaes \enc, V4, V1 _tweak_step (2*(\i-5) + 1) - _vaes \enc, \last, V4, V2 - _vaes \enc, \last, V4, V3 + _vaes \enc, V4, V2 + _vaes \enc, V4, V3 .endif .endm // Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt, // then XOR with \tweak again) of the block(s) in \data. To process a single // block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of -// length VL, use V* registers and leave \xmm_suffix empty. May clobber V4. -.macro _aes_crypt enc, xmm_suffix, tweak, data +// length VL, use V* registers and leave \xmm_suffix empty. Clobbers \tmp. +.macro _aes_crypt enc, xmm_suffix, tweak, data, tmp _xor3 KEY0\xmm_suffix, \tweak, \data cmp $24, KEYLEN jl .Laes128\@ je .Laes192\@ - _vaes_1x \enc, 0, 1, \xmm_suffix, \data - _vaes_1x \enc, 0, 2, \xmm_suffix, \data + _vaes_1x \enc, 1, \xmm_suffix, \data, tmp=\tmp + _vaes_1x \enc, 2, \xmm_suffix, \data, tmp=\tmp .Laes192\@: - _vaes_1x \enc, 0, 3, \xmm_suffix, \data - _vaes_1x \enc, 0, 4, \xmm_suffix, \data + _vaes_1x \enc, 3, \xmm_suffix, \data, tmp=\tmp + _vaes_1x \enc, 4, \xmm_suffix, \data, tmp=\tmp .Laes128\@: - _vaes_1x \enc, 0, 5, \xmm_suffix, \data - _vaes_1x \enc, 0, 6, \xmm_suffix, \data - _vaes_1x \enc, 0, 7, \xmm_suffix, \data - _vaes_1x \enc, 0, 8, \xmm_suffix, \data - _vaes_1x \enc, 0, 9, \xmm_suffix, \data - _vaes_1x \enc, 0, 10, \xmm_suffix, \data - _vaes_1x \enc, 0, 11, \xmm_suffix, \data - _vaes_1x \enc, 0, 12, \xmm_suffix, \data - _vaes_1x \enc, 0, 13, \xmm_suffix, \data - _vaes_1x \enc, 1, 14, \xmm_suffix, \data - _vpxor \tweak, \data, \data +.irp i, 5,6,7,8,9,10,11,12,13 + _vaes_1x \enc, \i, \xmm_suffix, \data, tmp=\tmp +.endr +.if USE_AVX10 + vpxord KEY14\xmm_suffix, \tweak, \tmp +.else +.ifnb \xmm_suffix + vpxor 7*16(KEY), \tweak, \tmp +.else + _vbroadcast128 7*16(KEY), \tmp + vpxor \tweak, \tmp, \tmp +.endif +.endif + _vaeslast \enc, \tmp, \data .endm .macro _aes_xts_crypt enc @@ -581,7 +571,7 @@ // Compute the first set of tweaks TWEAK[0-3]. _compute_first_set_of_tweaks - sub $4*VL, LEN + add $-4*VL, LEN // shorter than 'sub 4*VL' when VL=32 jl .Lhandle_remainder\@ .Lmain_loop\@: @@ -589,10 +579,10 @@ // XOR each source block with its tweak and the zero-th round key. .if USE_AVX10 - vmovdqu8 0*VL(SRC), V0 - vmovdqu8 1*VL(SRC), V1 - vmovdqu8 2*VL(SRC), V2 - vmovdqu8 3*VL(SRC), V3 + _vmovdqu 0*VL(SRC), V0 + _vmovdqu 1*VL(SRC), V1 + _vmovdqu 2*VL(SRC), V2 + _vmovdqu 3*VL(SRC), V3 vpternlogd $0x96, TWEAK0, KEY0, V0 vpternlogd $0x96, TWEAK1, KEY0, V1 vpternlogd $0x96, TWEAK2, KEY0, V2 @@ -612,28 +602,43 @@ je .Laes192\@ // Do all the AES rounds on the data blocks, interleaved with // the computation of the next set of tweaks. - _vaes_4x \enc, 0, 1 - _vaes_4x \enc, 0, 2 + _vaes_4x \enc, 1 + _vaes_4x \enc, 2 .Laes192\@: - _vaes_4x \enc, 0, 3 - _vaes_4x \enc, 0, 4 + _vaes_4x \enc, 3 + _vaes_4x \enc, 4 .Laes128\@: - _vaes_4x \enc, 0, 5 - _vaes_4x \enc, 0, 6 - _vaes_4x \enc, 0, 7 - _vaes_4x \enc, 0, 8 - _vaes_4x \enc, 0, 9 - _vaes_4x \enc, 0, 10 - _vaes_4x \enc, 0, 11 - _vaes_4x \enc, 0, 12 - _vaes_4x \enc, 0, 13 - _vaes_4x \enc, 1, 14 - - // XOR in the tweaks again. - _vpxor TWEAK0, V0, V0 - _vpxor TWEAK1, V1, V1 - _vpxor TWEAK2, V2, V2 - _vpxor TWEAK3, V3, V3 +.irp i, 5,6,7,8,9,10,11,12,13 + _vaes_4x \enc, \i +.endr + // Do the last AES round, then XOR the results with the tweaks again. + // Reduce latency by doing the XOR before the vaesenclast, utilizing the + // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) + // (and likewise for vaesdeclast). +.if USE_AVX10 + _tweak_step 18 + _tweak_step 19 + vpxord TWEAK0, KEY14, V4 + vpxord TWEAK1, KEY14, V5 + _vaeslast \enc, V4, V0 + _vaeslast \enc, V5, V1 + vpxord TWEAK2, KEY14, V4 + vpxord TWEAK3, KEY14, V5 + _vaeslast \enc, V4, V2 + _vaeslast \enc, V5, V3 +.else + _vbroadcast128 7*16(KEY), V4 + _tweak_step 18 // uses V5 + _tweak_step 19 // uses V5 + vpxor TWEAK0, V4, V5 + _vaeslast \enc, V5, V0 + vpxor TWEAK1, V4, V5 + _vaeslast \enc, V5, V1 + vpxor TWEAK2, V4, V5 + vpxor TWEAK3, V4, V4 + _vaeslast \enc, V5, V2 + _vaeslast \enc, V4, V3 +.endif // Store the destination blocks. _vmovdqu V0, 0*VL(DST) @@ -644,9 +649,9 @@ // Finish computing the next set of tweaks. _tweak_step 1000 - add $4*VL, SRC - add $4*VL, DST - sub $4*VL, LEN + sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 + sub $-4*VL, DST + add $-4*VL, LEN jge .Lmain_loop\@ // Check for the uncommon case where the data length isn't a multiple of @@ -670,7 +675,7 @@ jl .Lvec_at_a_time_done\@ .Lvec_at_a_time\@: _vmovdqu (SRC), V0 - _aes_crypt \enc, , TWEAK0, V0 + _aes_crypt \enc, , TWEAK0, V0, tmp=V1 _vmovdqu V0, (DST) _next_tweakvec TWEAK0, V0, V1, TWEAK0 add $VL, SRC @@ -687,7 +692,7 @@ jl .Lblock_at_a_time_done\@ .Lblock_at_a_time\@: vmovdqu (SRC), %xmm0 - _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0 + _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1 vmovdqu %xmm0, (DST) _next_tweak TWEAK0_XMM, %xmm0, TWEAK0_XMM add $16, SRC @@ -715,7 +720,7 @@ // Do it now by advancing the tweak and decrypting the last full block. _next_tweak TWEAK0_XMM, %xmm0, TWEAK1_XMM vmovdqu (SRC), %xmm0 - _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0 + _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1 .endif .if USE_AVX10 @@ -758,47 +763,49 @@ vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 .endif // En/decrypt again and store the last full block. - _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0 + _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1 vmovdqu %xmm0, (DST) jmp .Ldone\@ .endm // void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, // u8 iv[AES_BLOCK_SIZE]); +// +// Encrypt |iv| using the AES key |tweak_key| to get the first tweak. Assumes +// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10. SYM_TYPED_FUNC_START(aes_xts_encrypt_iv) - vmovdqu (%rsi), %xmm0 - vpxor (%rdi), %xmm0, %xmm0 - movl 480(%rdi), %eax // AES key length - lea -16(%rdi, %rax, 4), %rdi - cmp $24, %eax + .set TWEAK_KEY, %rdi + .set IV, %rsi + .set KEYLEN, %eax + .set KEYLEN64, %rax + + vmovdqu (IV), %xmm0 + vpxor (TWEAK_KEY), %xmm0, %xmm0 + movl 480(TWEAK_KEY), KEYLEN + lea -16(TWEAK_KEY, KEYLEN64, 4), TWEAK_KEY + cmp $24, KEYLEN jl .Lencrypt_iv_aes128 je .Lencrypt_iv_aes192 - vaesenc -6*16(%rdi), %xmm0, %xmm0 - vaesenc -5*16(%rdi), %xmm0, %xmm0 + vaesenc -6*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc -5*16(TWEAK_KEY), %xmm0, %xmm0 .Lencrypt_iv_aes192: - vaesenc -4*16(%rdi), %xmm0, %xmm0 - vaesenc -3*16(%rdi), %xmm0, %xmm0 + vaesenc -4*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc -3*16(TWEAK_KEY), %xmm0, %xmm0 .Lencrypt_iv_aes128: - vaesenc -2*16(%rdi), %xmm0, %xmm0 - vaesenc -1*16(%rdi), %xmm0, %xmm0 - vaesenc 0*16(%rdi), %xmm0, %xmm0 - vaesenc 1*16(%rdi), %xmm0, %xmm0 - vaesenc 2*16(%rdi), %xmm0, %xmm0 - vaesenc 3*16(%rdi), %xmm0, %xmm0 - vaesenc 4*16(%rdi), %xmm0, %xmm0 - vaesenc 5*16(%rdi), %xmm0, %xmm0 - vaesenc 6*16(%rdi), %xmm0, %xmm0 - vaesenclast 7*16(%rdi), %xmm0, %xmm0 - vmovdqu %xmm0, (%rsi) +.irp i, -2,-1,0,1,2,3,4,5,6 + vaesenc \i*16(TWEAK_KEY), %xmm0, %xmm0 +.endr + vaesenclast 7*16(TWEAK_KEY), %xmm0, %xmm0 + vmovdqu %xmm0, (IV) RET SYM_FUNC_END(aes_xts_encrypt_iv) // Below are the actual AES-XTS encryption and decryption functions, // instantiated from the above macro. They all have the following prototype: // -// void (*xts_asm_func)(const struct crypto_aes_ctx *key, -// const u8 *src, u8 *dst, unsigned int len, -// u8 tweak[AES_BLOCK_SIZE]); +// void (*xts_crypt_func)(const struct crypto_aes_ctx *key, +// const u8 *src, u8 *dst, int len, +// u8 tweak[AES_BLOCK_SIZE]); // // |key| is the data key. |tweak| contains the next tweak; the encryption of // the original IV with the tweak key was already done. This function supports |