diff options
Diffstat (limited to 'arch/x86/crypto/aes-xts-avx-x86_64.S')
-rw-r--r-- | arch/x86/crypto/aes-xts-avx-x86_64.S | 562 |
1 files changed, 312 insertions, 250 deletions
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S index 48f97b79f7a9..db79cdf81588 100644 --- a/arch/x86/crypto/aes-xts-avx-x86_64.S +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -1,11 +1,50 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * AES-XTS for modern x86_64 CPUs - * - * Copyright 2024 Google LLC - * - * Author: Eric Biggers <ebiggers@google.com> - */ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// AES-XTS for modern x86_64 CPUs +// +// Copyright 2024 Google LLC +// +// Author: Eric Biggers <ebiggers@google.com> +// +//------------------------------------------------------------------------------ +// +// This file is dual-licensed, meaning that you can use it under your choice of +// either of the following two licenses: +// +// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// or +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. /* * This file implements AES-XTS for modern x86_64 CPUs. To handle the @@ -13,32 +52,25 @@ * different code, it uses a macro to generate several implementations that * share similar source code but are targeted at different CPUs, listed below: * - * AES-NI + AVX + * AES-NI && AVX * - 128-bit vectors (1 AES block per vector) * - VEX-coded instructions * - xmm0-xmm15 * - This is for older CPUs that lack VAES but do have AVX. * - * VAES + VPCLMULQDQ + AVX2 + * VAES && VPCLMULQDQ && AVX2 * - 256-bit vectors (2 AES blocks per vector) * - VEX-coded instructions * - ymm0-ymm15 - * - This is for CPUs that have VAES but lack AVX512 or AVX10, - * e.g. Intel's Alder Lake and AMD's Zen 3. + * - This is for CPUs that have VAES but either lack AVX512 (e.g. Intel's + * Alder Lake and AMD's Zen 3) or downclock too eagerly when using zmm + * registers (e.g. Intel's Ice Lake). * - * VAES + VPCLMULQDQ + AVX10/256 + BMI2 - * - 256-bit vectors (2 AES blocks per vector) + * VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2 + * - 512-bit vectors (4 AES blocks per vector) * - EVEX-coded instructions - * - ymm0-ymm31 - * - This is for CPUs that have AVX512 but where using zmm registers causes - * downclocking, and for CPUs that have AVX10/256 but not AVX10/512. - * - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256. - * To avoid confusion with 512-bit, we just write AVX10/256. - * - * VAES + VPCLMULQDQ + AVX10/512 + BMI2 - * - Same as the previous one, but upgrades to 512-bit vectors - * (4 AES blocks per vector) in zmm0-zmm31. - * - This is for CPUs that have good AVX512 or AVX10/512 support. + * - zmm0-zmm31 + * - This is for CPUs that have good AVX512 support. * * This file doesn't have an implementation for AES-NI alone (without AVX), as * the lack of VEX would make all the assembly code different. @@ -68,9 +100,20 @@ // exists when there's a carry out of the low 64 bits of the tweak. .quad 0x87, 1 + // These are the shift amounts that are needed when multiplying by [x^0, + // x^1, x^2, x^3] to compute the first vector of tweaks when VL=64. + // + // The right shifts by 64 are expected to zeroize the destination. + // 'vpsrlvq' is indeed defined to do that; i.e. it doesn't truncate the + // amount to 64 & 63 = 0 like the 'shr' scalar shift instruction would. +.Lrshift_amounts: + .byte 64, 64, 63, 63, 62, 62, 61, 61 +.Llshift_amounts: + .byte 0, 0, 1, 1, 2, 2, 3, 3 + // This table contains constants for vpshufb and vpblendvb, used to // handle variable byte shifts and blending during ciphertext stealing - // on CPUs that don't support AVX10-style masking. + // on CPUs that don't support AVX512-style masking. .Lcts_permute_table: .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 @@ -80,22 +123,6 @@ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 .text -// Function parameters -.set KEY, %rdi // Initially points to crypto_aes_ctx, then is - // advanced to point to 7th-from-last round key -.set SRC, %rsi // Pointer to next source data -.set DST, %rdx // Pointer to next destination data -.set LEN, %ecx // Remaining length in bytes -.set LEN8, %cl -.set LEN64, %rcx -.set TWEAK, %r8 // Pointer to next tweak - -// %rax holds the AES key length in bytes. -.set KEYLEN, %eax -.set KEYLEN64, %rax - -// %r9-r11 are available as temporaries. - .macro _define_Vi i .if VL == 16 .set V\i, %xmm\i @@ -112,41 +139,31 @@ // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers // are available, that map to the xmm, ymm, or zmm registers according // to the selected Vector Length (VL). - _define_Vi 0 - _define_Vi 1 - _define_Vi 2 - _define_Vi 3 - _define_Vi 4 - _define_Vi 5 - _define_Vi 6 - _define_Vi 7 - _define_Vi 8 - _define_Vi 9 - _define_Vi 10 - _define_Vi 11 - _define_Vi 12 - _define_Vi 13 - _define_Vi 14 - _define_Vi 15 -.if USE_AVX10 - _define_Vi 16 - _define_Vi 17 - _define_Vi 18 - _define_Vi 19 - _define_Vi 20 - _define_Vi 21 - _define_Vi 22 - _define_Vi 23 - _define_Vi 24 - _define_Vi 25 - _define_Vi 26 - _define_Vi 27 - _define_Vi 28 - _define_Vi 29 - _define_Vi 30 - _define_Vi 31 +.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + _define_Vi \i +.endr +.if USE_AVX512 +.irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 + _define_Vi \i +.endr .endif + // Function parameters + .set KEY, %rdi // Initially points to crypto_aes_ctx, then is + // advanced to point to 7th-from-last round key + .set SRC, %rsi // Pointer to next source data + .set DST, %rdx // Pointer to next destination data + .set LEN, %ecx // Remaining length in bytes + .set LEN8, %cl + .set LEN64, %rcx + .set TWEAK, %r8 // Pointer to next tweak + + // %rax holds the AES key length in bytes. + .set KEYLEN, %eax + .set KEYLEN64, %rax + + // %r9-r11 are available as temporaries. + // V0-V3 hold the data blocks during the main loop, or temporary values // otherwise. V4-V5 hold temporary values. @@ -180,7 +197,7 @@ // keys to the *end* of this register range. I.e., AES-128 uses // KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14. // (All also use KEY0 for the XOR-only "round" at the beginning.) -.if USE_AVX10 +.if USE_AVX512 .set KEY1_XMM, %xmm16 .set KEY1, V16 .set KEY2_XMM, %xmm17 @@ -224,9 +241,9 @@ // Broadcast a 128-bit value into a vector. .macro _vbroadcast128 src, dst -.if VL == 16 && !USE_AVX10 +.if VL == 16 vmovdqu \src, \dst -.elseif VL == 32 && !USE_AVX10 +.elseif VL == 32 vbroadcasti128 \src, \dst .else vbroadcasti32x4 \src, \dst @@ -235,16 +252,16 @@ // XOR two vectors together. .macro _vpxor src1, src2, dst -.if USE_AVX10 - vpxord \src1, \src2, \dst -.else +.if VL < 64 vpxor \src1, \src2, \dst +.else + vpxord \src1, \src2, \dst .endif .endm // XOR three vectors together. .macro _xor3 src1, src2, src3_and_dst -.if USE_AVX10 +.if USE_AVX512 // vpternlogd with immediate 0x96 is a three-argument XOR. vpternlogd $0x96, \src1, \src2, \src3_and_dst .else @@ -259,8 +276,12 @@ vpshufd $0x13, \src, \tmp vpaddq \src, \src, \dst vpsrad $31, \tmp, \tmp +.if USE_AVX512 + vpternlogd $0x78, GF_POLY_XMM, \tmp, \dst +.else vpand GF_POLY_XMM, \tmp, \tmp vpxor \tmp, \dst, \dst +.endif .endm // Given the XTS tweak(s) in the vector \src, compute the next vector of @@ -284,52 +305,75 @@ // Given the first XTS tweak at (TWEAK), compute the first set of tweaks and // store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5. .macro _compute_first_set_of_tweaks - vmovdqu (TWEAK), TWEAK0_XMM - _vbroadcast128 .Lgf_poly(%rip), GF_POLY .if VL == 16 - // With VL=16, multiplying by x serially is fastest. + vmovdqu (TWEAK), TWEAK0_XMM + vmovdqu .Lgf_poly(%rip), GF_POLY _next_tweak TWEAK0, %xmm0, TWEAK1 _next_tweak TWEAK1, %xmm0, TWEAK2 _next_tweak TWEAK2, %xmm0, TWEAK3 -.else -.if VL == 32 - // Compute the second block of TWEAK0. +.elseif VL == 32 + vmovdqu (TWEAK), TWEAK0_XMM + vbroadcasti128 .Lgf_poly(%rip), GF_POLY + + // Compute the first vector of tweaks. _next_tweak TWEAK0_XMM, %xmm0, %xmm1 vinserti128 $1, %xmm1, TWEAK0, TWEAK0 -.elseif VL == 64 - // Compute the remaining blocks of TWEAK0. - _next_tweak TWEAK0_XMM, %xmm0, %xmm1 - _next_tweak %xmm1, %xmm0, %xmm2 - _next_tweak %xmm2, %xmm0, %xmm3 - vinserti32x4 $1, %xmm1, TWEAK0, TWEAK0 - vinserti32x4 $2, %xmm2, TWEAK0, TWEAK0 - vinserti32x4 $3, %xmm3, TWEAK0, TWEAK0 -.endif - // Compute TWEAK[1-3] from TWEAK0. - vpsrlq $64 - 1*VL/16, TWEAK0, V0 - vpsrlq $64 - 2*VL/16, TWEAK0, V2 - vpsrlq $64 - 3*VL/16, TWEAK0, V4 + + // Compute the next three vectors of tweaks: + // TWEAK1 = TWEAK0 * [x^2, x^2] + // TWEAK2 = TWEAK0 * [x^4, x^4] + // TWEAK3 = TWEAK0 * [x^6, x^6] + vpsrlq $64 - 2, TWEAK0, V0 + vpsrlq $64 - 4, TWEAK0, V2 + vpsrlq $64 - 6, TWEAK0, V4 vpclmulqdq $0x01, GF_POLY, V0, V1 vpclmulqdq $0x01, GF_POLY, V2, V3 vpclmulqdq $0x01, GF_POLY, V4, V5 vpslldq $8, V0, V0 vpslldq $8, V2, V2 vpslldq $8, V4, V4 - vpsllq $1*VL/16, TWEAK0, TWEAK1 - vpsllq $2*VL/16, TWEAK0, TWEAK2 - vpsllq $3*VL/16, TWEAK0, TWEAK3 -.if USE_AVX10 - vpternlogd $0x96, V0, V1, TWEAK1 - vpternlogd $0x96, V2, V3, TWEAK2 - vpternlogd $0x96, V4, V5, TWEAK3 -.else + vpsllq $2, TWEAK0, TWEAK1 + vpsllq $4, TWEAK0, TWEAK2 + vpsllq $6, TWEAK0, TWEAK3 vpxor V0, TWEAK1, TWEAK1 vpxor V2, TWEAK2, TWEAK2 vpxor V4, TWEAK3, TWEAK3 vpxor V1, TWEAK1, TWEAK1 vpxor V3, TWEAK2, TWEAK2 vpxor V5, TWEAK3, TWEAK3 -.endif +.else + vbroadcasti32x4 (TWEAK), TWEAK0 + vbroadcasti32x4 .Lgf_poly(%rip), GF_POLY + + // Compute the first vector of tweaks: + // TWEAK0 = broadcast128(TWEAK) * [x^0, x^1, x^2, x^3] + vpmovzxbq .Lrshift_amounts(%rip), V4 + vpsrlvq V4, TWEAK0, V0 + vpclmulqdq $0x01, GF_POLY, V0, V1 + vpmovzxbq .Llshift_amounts(%rip), V4 + vpslldq $8, V0, V0 + vpsllvq V4, TWEAK0, TWEAK0 + vpternlogd $0x96, V0, V1, TWEAK0 + + // Compute the next three vectors of tweaks: + // TWEAK1 = TWEAK0 * [x^4, x^4, x^4, x^4] + // TWEAK2 = TWEAK0 * [x^8, x^8, x^8, x^8] + // TWEAK3 = TWEAK0 * [x^12, x^12, x^12, x^12] + // x^8 only needs byte-aligned shifts, so optimize accordingly. + vpsrlq $64 - 4, TWEAK0, V0 + vpsrldq $(64 - 8) / 8, TWEAK0, V2 + vpsrlq $64 - 12, TWEAK0, V4 + vpclmulqdq $0x01, GF_POLY, V0, V1 + vpclmulqdq $0x01, GF_POLY, V2, V3 + vpclmulqdq $0x01, GF_POLY, V4, V5 + vpslldq $8, V0, V0 + vpslldq $8, V4, V4 + vpsllq $4, TWEAK0, TWEAK1 + vpslldq $8 / 8, TWEAK0, TWEAK2 + vpsllq $12, TWEAK0, TWEAK3 + vpternlogd $0x96, V0, V1, TWEAK1 + vpxord V3, TWEAK2, TWEAK2 + vpternlogd $0x96, V4, V5, TWEAK3 .endif .endm @@ -369,9 +413,14 @@ // Do one step in computing the next set of tweaks using the VPCLMULQDQ method // (the same method _next_tweakvec uses for VL > 16). This means multiplying -// each tweak by x^(4*VL/16) independently. Since 4*VL/16 is a multiple of 8 -// when VL > 16 (which it is here), the needed shift amounts are byte-aligned, -// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts. +// each tweak by x^(4*VL/16) independently. +// +// Since 4*VL/16 is a multiple of 8 when VL > 16 (which it is here), the needed +// shift amounts are byte-aligned, which allows the use of vpsrldq and vpslldq +// to do 128-bit wide shifts. The 128-bit left shift (vpslldq) saves +// instructions directly. The 128-bit right shift (vpsrldq) performs better +// than a 64-bit right shift on Intel CPUs in the context where it is used here, +// because it runs on a different execution port from the AES instructions. .macro _tweak_step_pclmul i .if \i == 0 vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0 @@ -406,7 +455,7 @@ // \i that include at least 0 through 19, then 1000 which signals the last step. // // This is used to interleave the computation of the next set of tweaks with the -// AES en/decryptions, which increases performance in some cases. +// AES en/decryptions, which increases performance in some cases. Clobbers V5. .macro _tweak_step i .if VL == 16 _tweak_step_mulx \i @@ -443,119 +492,124 @@ // the last round needs different instructions. // // An alternative approach would be to roll up all the round loops. We - // don't do that because it isn't compatible with caching the round keys - // in registers which we do when possible (see below), and also because - // it seems unwise to rely *too* heavily on the CPU's branch predictor. + // don't do that because (a) it isn't compatible with caching the round + // keys in registers which we do when possible (see below), (b) we + // interleave the AES rounds with the XTS tweak computation, and (c) it + // seems unwise to rely *too* heavily on the CPU's branch predictor. lea OFFS-16(KEY, KEYLEN64, 4), KEY // If all 32 SIMD registers are available, cache all the round keys. -.if USE_AVX10 +.if USE_AVX512 cmp $24, KEYLEN jl .Laes128\@ je .Laes192\@ - _vbroadcast128 -6*16(KEY), KEY1 - _vbroadcast128 -5*16(KEY), KEY2 + vbroadcasti32x4 -6*16(KEY), KEY1 + vbroadcasti32x4 -5*16(KEY), KEY2 .Laes192\@: - _vbroadcast128 -4*16(KEY), KEY3 - _vbroadcast128 -3*16(KEY), KEY4 + vbroadcasti32x4 -4*16(KEY), KEY3 + vbroadcasti32x4 -3*16(KEY), KEY4 .Laes128\@: - _vbroadcast128 -2*16(KEY), KEY5 - _vbroadcast128 -1*16(KEY), KEY6 - _vbroadcast128 0*16(KEY), KEY7 - _vbroadcast128 1*16(KEY), KEY8 - _vbroadcast128 2*16(KEY), KEY9 - _vbroadcast128 3*16(KEY), KEY10 - _vbroadcast128 4*16(KEY), KEY11 - _vbroadcast128 5*16(KEY), KEY12 - _vbroadcast128 6*16(KEY), KEY13 - _vbroadcast128 7*16(KEY), KEY14 + vbroadcasti32x4 -2*16(KEY), KEY5 + vbroadcasti32x4 -1*16(KEY), KEY6 + vbroadcasti32x4 0*16(KEY), KEY7 + vbroadcasti32x4 1*16(KEY), KEY8 + vbroadcasti32x4 2*16(KEY), KEY9 + vbroadcasti32x4 3*16(KEY), KEY10 + vbroadcasti32x4 4*16(KEY), KEY11 + vbroadcasti32x4 5*16(KEY), KEY12 + vbroadcasti32x4 6*16(KEY), KEY13 + vbroadcasti32x4 7*16(KEY), KEY14 .endif .endm -// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0) -// on the block(s) in \data using the round key(s) in \key. The register length -// determines the number of AES blocks en/decrypted. -.macro _vaes enc, last, key, data +// Do a single non-last round of AES encryption (if \enc==1) or decryption (if +// \enc==0) on the block(s) in \data using the round key(s) in \key. The +// register length determines the number of AES blocks en/decrypted. +.macro _vaes enc, key, data .if \enc -.if \last - vaesenclast \key, \data, \data -.else vaesenc \key, \data, \data -.endif -.else -.if \last - vaesdeclast \key, \data, \data .else vaesdec \key, \data, \data .endif +.endm + +// Same as _vaes, but does the last round. +.macro _vaeslast enc, key, data +.if \enc + vaesenclast \key, \data, \data +.else + vaesdeclast \key, \data, \data .endif .endm -// Do a single round of AES en/decryption on the block(s) in \data, using the -// same key for all block(s). The round key is loaded from the appropriate -// register or memory location for round \i. May clobber V4. -.macro _vaes_1x enc, last, i, xmm_suffix, data -.if USE_AVX10 - _vaes \enc, \last, KEY\i\xmm_suffix, \data +// Do a single non-last round of AES en/decryption on the block(s) in \data, +// using the same key for all block(s). The round key is loaded from the +// appropriate register or memory location for round \i. May clobber \tmp. +.macro _vaes_1x enc, i, xmm_suffix, data, tmp +.if USE_AVX512 + _vaes \enc, KEY\i\xmm_suffix, \data .else .ifnb \xmm_suffix - _vaes \enc, \last, (\i-7)*16(KEY), \data + _vaes \enc, (\i-7)*16(KEY), \data .else - _vbroadcast128 (\i-7)*16(KEY), V4 - _vaes \enc, \last, V4, \data + _vbroadcast128 (\i-7)*16(KEY), \tmp + _vaes \enc, \tmp, \data .endif .endif .endm -// Do a single round of AES en/decryption on the blocks in registers V0-V3, -// using the same key for all blocks. The round key is loaded from the +// Do a single non-last round of AES en/decryption on the blocks in registers +// V0-V3, using the same key for all blocks. The round key is loaded from the // appropriate register or memory location for round \i. In addition, does two -// steps of the computation of the next set of tweaks. May clobber V4. -.macro _vaes_4x enc, last, i -.if USE_AVX10 +// steps of the computation of the next set of tweaks. May clobber V4 and V5. +.macro _vaes_4x enc, i +.if USE_AVX512 _tweak_step (2*(\i-5)) - _vaes \enc, \last, KEY\i, V0 - _vaes \enc, \last, KEY\i, V1 + _vaes \enc, KEY\i, V0 + _vaes \enc, KEY\i, V1 _tweak_step (2*(\i-5) + 1) - _vaes \enc, \last, KEY\i, V2 - _vaes \enc, \last, KEY\i, V3 + _vaes \enc, KEY\i, V2 + _vaes \enc, KEY\i, V3 .else _vbroadcast128 (\i-7)*16(KEY), V4 _tweak_step (2*(\i-5)) - _vaes \enc, \last, V4, V0 - _vaes \enc, \last, V4, V1 + _vaes \enc, V4, V0 + _vaes \enc, V4, V1 _tweak_step (2*(\i-5) + 1) - _vaes \enc, \last, V4, V2 - _vaes \enc, \last, V4, V3 + _vaes \enc, V4, V2 + _vaes \enc, V4, V3 .endif .endm // Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt, // then XOR with \tweak again) of the block(s) in \data. To process a single // block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of -// length VL, use V* registers and leave \xmm_suffix empty. May clobber V4. -.macro _aes_crypt enc, xmm_suffix, tweak, data +// length VL, use V* registers and leave \xmm_suffix empty. Clobbers \tmp. +.macro _aes_crypt enc, xmm_suffix, tweak, data, tmp _xor3 KEY0\xmm_suffix, \tweak, \data cmp $24, KEYLEN jl .Laes128\@ je .Laes192\@ - _vaes_1x \enc, 0, 1, \xmm_suffix, \data - _vaes_1x \enc, 0, 2, \xmm_suffix, \data + _vaes_1x \enc, 1, \xmm_suffix, \data, tmp=\tmp + _vaes_1x \enc, 2, \xmm_suffix, \data, tmp=\tmp .Laes192\@: - _vaes_1x \enc, 0, 3, \xmm_suffix, \data - _vaes_1x \enc, 0, 4, \xmm_suffix, \data + _vaes_1x \enc, 3, \xmm_suffix, \data, tmp=\tmp + _vaes_1x \enc, 4, \xmm_suffix, \data, tmp=\tmp .Laes128\@: - _vaes_1x \enc, 0, 5, \xmm_suffix, \data - _vaes_1x \enc, 0, 6, \xmm_suffix, \data - _vaes_1x \enc, 0, 7, \xmm_suffix, \data - _vaes_1x \enc, 0, 8, \xmm_suffix, \data - _vaes_1x \enc, 0, 9, \xmm_suffix, \data - _vaes_1x \enc, 0, 10, \xmm_suffix, \data - _vaes_1x \enc, 0, 11, \xmm_suffix, \data - _vaes_1x \enc, 0, 12, \xmm_suffix, \data - _vaes_1x \enc, 0, 13, \xmm_suffix, \data - _vaes_1x \enc, 1, 14, \xmm_suffix, \data - _vpxor \tweak, \data, \data +.irp i, 5,6,7,8,9,10,11,12,13 + _vaes_1x \enc, \i, \xmm_suffix, \data, tmp=\tmp +.endr +.if USE_AVX512 + vpxord KEY14\xmm_suffix, \tweak, \tmp +.else +.ifnb \xmm_suffix + vpxor 7*16(KEY), \tweak, \tmp +.else + _vbroadcast128 7*16(KEY), \tmp + vpxor \tweak, \tmp, \tmp +.endif +.endif + _vaeslast \enc, \tmp, \data .endm .macro _aes_xts_crypt enc @@ -581,14 +635,14 @@ // Compute the first set of tweaks TWEAK[0-3]. _compute_first_set_of_tweaks - sub $4*VL, LEN + add $-4*VL, LEN // shorter than 'sub 4*VL' when VL=32 jl .Lhandle_remainder\@ .Lmain_loop\@: // This is the main loop, en/decrypting 4*VL bytes per iteration. // XOR each source block with its tweak and the zero-th round key. -.if USE_AVX10 +.if USE_AVX512 vmovdqu8 0*VL(SRC), V0 vmovdqu8 1*VL(SRC), V1 vmovdqu8 2*VL(SRC), V2 @@ -612,28 +666,43 @@ je .Laes192\@ // Do all the AES rounds on the data blocks, interleaved with // the computation of the next set of tweaks. - _vaes_4x \enc, 0, 1 - _vaes_4x \enc, 0, 2 + _vaes_4x \enc, 1 + _vaes_4x \enc, 2 .Laes192\@: - _vaes_4x \enc, 0, 3 - _vaes_4x \enc, 0, 4 + _vaes_4x \enc, 3 + _vaes_4x \enc, 4 .Laes128\@: - _vaes_4x \enc, 0, 5 - _vaes_4x \enc, 0, 6 - _vaes_4x \enc, 0, 7 - _vaes_4x \enc, 0, 8 - _vaes_4x \enc, 0, 9 - _vaes_4x \enc, 0, 10 - _vaes_4x \enc, 0, 11 - _vaes_4x \enc, 0, 12 - _vaes_4x \enc, 0, 13 - _vaes_4x \enc, 1, 14 - - // XOR in the tweaks again. - _vpxor TWEAK0, V0, V0 - _vpxor TWEAK1, V1, V1 - _vpxor TWEAK2, V2, V2 - _vpxor TWEAK3, V3, V3 +.irp i, 5,6,7,8,9,10,11,12,13 + _vaes_4x \enc, \i +.endr + // Do the last AES round, then XOR the results with the tweaks again. + // Reduce latency by doing the XOR before the vaesenclast, utilizing the + // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) + // (and likewise for vaesdeclast). +.if USE_AVX512 + _tweak_step 18 + _tweak_step 19 + vpxord TWEAK0, KEY14, V4 + vpxord TWEAK1, KEY14, V5 + _vaeslast \enc, V4, V0 + _vaeslast \enc, V5, V1 + vpxord TWEAK2, KEY14, V4 + vpxord TWEAK3, KEY14, V5 + _vaeslast \enc, V4, V2 + _vaeslast \enc, V5, V3 +.else + _vbroadcast128 7*16(KEY), V4 + _tweak_step 18 // uses V5 + _tweak_step 19 // uses V5 + vpxor TWEAK0, V4, V5 + _vaeslast \enc, V5, V0 + vpxor TWEAK1, V4, V5 + _vaeslast \enc, V5, V1 + vpxor TWEAK2, V4, V5 + vpxor TWEAK3, V4, V4 + _vaeslast \enc, V5, V2 + _vaeslast \enc, V4, V3 +.endif // Store the destination blocks. _vmovdqu V0, 0*VL(DST) @@ -644,9 +713,9 @@ // Finish computing the next set of tweaks. _tweak_step 1000 - add $4*VL, SRC - add $4*VL, DST - sub $4*VL, LEN + sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 + sub $-4*VL, DST + add $-4*VL, LEN jge .Lmain_loop\@ // Check for the uncommon case where the data length isn't a multiple of @@ -670,7 +739,7 @@ jl .Lvec_at_a_time_done\@ .Lvec_at_a_time\@: _vmovdqu (SRC), V0 - _aes_crypt \enc, , TWEAK0, V0 + _aes_crypt \enc, , TWEAK0, V0, tmp=V1 _vmovdqu V0, (DST) _next_tweakvec TWEAK0, V0, V1, TWEAK0 add $VL, SRC @@ -687,7 +756,7 @@ jl .Lblock_at_a_time_done\@ .Lblock_at_a_time\@: vmovdqu (SRC), %xmm0 - _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0 + _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1 vmovdqu %xmm0, (DST) _next_tweak TWEAK0_XMM, %xmm0, TWEAK0_XMM add $16, SRC @@ -715,10 +784,10 @@ // Do it now by advancing the tweak and decrypting the last full block. _next_tweak TWEAK0_XMM, %xmm0, TWEAK1_XMM vmovdqu (SRC), %xmm0 - _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0 + _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1 .endif -.if USE_AVX10 +.if USE_AVX512 // Create a mask that has the first LEN bits set. mov $-1, %r9d bzhi LEN, %r9d, %r9d @@ -758,47 +827,49 @@ vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 .endif // En/decrypt again and store the last full block. - _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0 + _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1 vmovdqu %xmm0, (DST) jmp .Ldone\@ .endm // void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, // u8 iv[AES_BLOCK_SIZE]); +// +// Encrypt |iv| using the AES key |tweak_key| to get the first tweak. Assumes +// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX512. SYM_TYPED_FUNC_START(aes_xts_encrypt_iv) - vmovdqu (%rsi), %xmm0 - vpxor (%rdi), %xmm0, %xmm0 - movl 480(%rdi), %eax // AES key length - lea -16(%rdi, %rax, 4), %rdi - cmp $24, %eax + .set TWEAK_KEY, %rdi + .set IV, %rsi + .set KEYLEN, %eax + .set KEYLEN64, %rax + + vmovdqu (IV), %xmm0 + vpxor (TWEAK_KEY), %xmm0, %xmm0 + movl 480(TWEAK_KEY), KEYLEN + lea -16(TWEAK_KEY, KEYLEN64, 4), TWEAK_KEY + cmp $24, KEYLEN jl .Lencrypt_iv_aes128 je .Lencrypt_iv_aes192 - vaesenc -6*16(%rdi), %xmm0, %xmm0 - vaesenc -5*16(%rdi), %xmm0, %xmm0 + vaesenc -6*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc -5*16(TWEAK_KEY), %xmm0, %xmm0 .Lencrypt_iv_aes192: - vaesenc -4*16(%rdi), %xmm0, %xmm0 - vaesenc -3*16(%rdi), %xmm0, %xmm0 + vaesenc -4*16(TWEAK_KEY), %xmm0, %xmm0 + vaesenc -3*16(TWEAK_KEY), %xmm0, %xmm0 .Lencrypt_iv_aes128: - vaesenc -2*16(%rdi), %xmm0, %xmm0 - vaesenc -1*16(%rdi), %xmm0, %xmm0 - vaesenc 0*16(%rdi), %xmm0, %xmm0 - vaesenc 1*16(%rdi), %xmm0, %xmm0 - vaesenc 2*16(%rdi), %xmm0, %xmm0 - vaesenc 3*16(%rdi), %xmm0, %xmm0 - vaesenc 4*16(%rdi), %xmm0, %xmm0 - vaesenc 5*16(%rdi), %xmm0, %xmm0 - vaesenc 6*16(%rdi), %xmm0, %xmm0 - vaesenclast 7*16(%rdi), %xmm0, %xmm0 - vmovdqu %xmm0, (%rsi) +.irp i, -2,-1,0,1,2,3,4,5,6 + vaesenc \i*16(TWEAK_KEY), %xmm0, %xmm0 +.endr + vaesenclast 7*16(TWEAK_KEY), %xmm0, %xmm0 + vmovdqu %xmm0, (IV) RET SYM_FUNC_END(aes_xts_encrypt_iv) // Below are the actual AES-XTS encryption and decryption functions, // instantiated from the above macro. They all have the following prototype: // -// void (*xts_asm_func)(const struct crypto_aes_ctx *key, -// const u8 *src, u8 *dst, unsigned int len, -// u8 tweak[AES_BLOCK_SIZE]); +// void (*xts_crypt_func)(const struct crypto_aes_ctx *key, +// const u8 *src, u8 *dst, int len, +// u8 tweak[AES_BLOCK_SIZE]); // // |key| is the data key. |tweak| contains the next tweak; the encryption of // the original IV with the tweak key was already done. This function supports @@ -807,7 +878,7 @@ SYM_FUNC_END(aes_xts_encrypt_iv) // multiple of 16, then this function updates |tweak| to contain the next tweak. .set VL, 16 -.set USE_AVX10, 0 +.set USE_AVX512, 0 SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx) _aes_xts_crypt 1 SYM_FUNC_END(aes_xts_encrypt_aesni_avx) @@ -817,7 +888,7 @@ SYM_FUNC_END(aes_xts_decrypt_aesni_avx) #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) .set VL, 32 -.set USE_AVX10, 0 +.set USE_AVX512, 0 SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2) _aes_xts_crypt 1 SYM_FUNC_END(aes_xts_encrypt_vaes_avx2) @@ -825,21 +896,12 @@ SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2) _aes_xts_crypt 0 SYM_FUNC_END(aes_xts_decrypt_vaes_avx2) -.set VL, 32 -.set USE_AVX10, 1 -SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256) - _aes_xts_crypt 1 -SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256) -SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256) - _aes_xts_crypt 0 -SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256) - .set VL, 64 -.set USE_AVX10, 1 -SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512) +.set USE_AVX512, 1 +SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx512) _aes_xts_crypt 1 -SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512) -SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512) +SYM_FUNC_END(aes_xts_encrypt_vaes_avx512) +SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx512) _aes_xts_crypt 0 -SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512) +SYM_FUNC_END(aes_xts_decrypt_vaes_avx512) #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ |