author     Linus Torvalds <torvalds@linux-foundation.org>  2025-01-24 07:48:10 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>  2025-01-24 07:48:10 -0800
commit     454cb97726fe62a04b187a0d631ec0a69f6b713a
tree       16dae08192b97960398b9cb9cc92439fb1b7e4a6 /arch/x86
parent     ae2d4fc540cd27d667d10597b6ad8cc4c6ce622a
parent     9d4f8e54cef2c42e23ef258833dbd06a1eaff89b
Merge tag 'v6.14-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu:
"API:
- Remove physical address skcipher walking
- Fix boot-up self-test race
Algorithms:
- Optimisations for x86/aes-gcm
- Optimisations for x86/aes-xts
- Remove VMAC
- Remove keywrap
Drivers:
- Remove n2
Others:
- Fixes for padata UAF
- Fix potential rhashtable deadlock by moving schedule_work outside
lock"
* tag 'v6.14-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (75 commits)
rhashtable: Fix rhashtable_try_insert test
dt-bindings: crypto: qcom,inline-crypto-engine: Document the SM8750 ICE
dt-bindings: crypto: qcom,prng: Document SM8750 RNG
dt-bindings: crypto: qcom-qce: Document the SM8750 crypto engine
crypto: asymmetric_keys - Remove unused key_being_used_for[]
padata: avoid UAF for reorder_work
padata: fix UAF in padata_reorder
padata: add pd get/put refcnt helper
crypto: skcipher - call cond_resched() directly
crypto: skcipher - optimize initializing skcipher_walk fields
crypto: skcipher - clean up initialization of skcipher_walk::flags
crypto: skcipher - fold skcipher_walk_skcipher() into skcipher_walk_virt()
crypto: skcipher - remove redundant check for SKCIPHER_WALK_SLOW
crypto: skcipher - remove redundant clamping to page size
crypto: skcipher - remove unnecessary page alignment of bounce buffer
crypto: skcipher - document skcipher_walk_done() and rename some vars
crypto: omap - switch from scatter_walk to plain offset
crypto: powerpc/p10-aes-gcm - simplify handling of linear associated data
crypto: bcm - Drop unused setting of local 'ptr' variable
crypto: hisilicon/qm - support new function communication
...
Diffstat (limited to 'arch/x86')
 arch/x86/crypto/aegis128-aesni-glue.c   |   1
 arch/x86/crypto/aes-gcm-avx10-x86_64.S  | 119
 arch/x86/crypto/aes-xts-avx-x86_64.S    | 329
 arch/x86/crypto/aesni-intel_glue.c      |  10
 arch/x86/crypto/blowfish_glue.c         |   1
 arch/x86/crypto/camellia_glue.c         |   1
 arch/x86/crypto/des3_ede_glue.c         |   1
 arch/x86/crypto/twofish_glue.c          |   1
 8 files changed, 221 insertions(+), 242 deletions(-)
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index c19d8e3d96a3..01fa568dc5fc 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -240,7 +240,6 @@ static struct aead_alg crypto_aegis128_aesni_alg = {
 		.cra_blocksize = 1,
 		.cra_ctxsize = sizeof(struct aegis_ctx) +
 			       __alignof__(struct aegis_ctx),
-		.cra_alignmask = 0,
 		.cra_priority = 400,
 
 		.cra_name = "__aegis128",
diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-avx10-x86_64.S
index 97e0ee515fc5..02ee11083d4f 100644
--- a/arch/x86/crypto/aes-gcm-avx10-x86_64.S
+++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S
@@ -88,7 +88,7 @@
 
 	// A shuffle mask that reflects the bytes of 16-byte blocks
 .Lbswap_mask:
-	.octa   0x000102030405060708090a0b0c0d0e0f
+	.octa	0x000102030405060708090a0b0c0d0e0f
 
 	// This is the GHASH reducing polynomial without its constant term, i.e.
 	// x^128 + x^7 + x^2 + x, represented using the backwards mapping
@@ -384,8 +384,8 @@
 	vpshufd		$0xd3, H_CUR_XMM, %xmm0
 	vpsrad		$31, %xmm0, %xmm0
 	vpaddq		H_CUR_XMM, H_CUR_XMM, H_CUR_XMM
-	vpand		.Lgfpoly_and_internal_carrybit(%rip), %xmm0, %xmm0
-	vpxor		%xmm0, H_CUR_XMM, H_CUR_XMM
+	// H_CUR_XMM ^= xmm0 & gfpoly_and_internal_carrybit
+	vpternlogd	$0x78, .Lgfpoly_and_internal_carrybit(%rip), %xmm0, H_CUR_XMM
 
 	// Load the gfpoly constant.
 	vbroadcasti32x4	.Lgfpoly(%rip), GFPOLY
@@ -562,6 +562,32 @@
 	vpxord		RNDKEY0, V3, V3
 .endm
 
+// Do the last AES round for four vectors of counter blocks V0-V3, XOR source
+// data with the resulting keystream, and write the result to DST and
+// GHASHDATA[0-3].  (Implementation differs slightly, but has the same effect.)
+.macro	_aesenclast_and_xor_4x
+	// XOR the source data with the last round key, saving the result in
+	// GHASHDATA[0-3].  This reduces latency by taking advantage of the
+	// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
+	vpxord		0*VL(SRC), RNDKEYLAST, GHASHDATA0
+	vpxord		1*VL(SRC), RNDKEYLAST, GHASHDATA1
+	vpxord		2*VL(SRC), RNDKEYLAST, GHASHDATA2
+	vpxord		3*VL(SRC), RNDKEYLAST, GHASHDATA3
+
+	// Do the last AES round.  This handles the XOR with the source data
+	// too, as per the optimization described above.
+	vaesenclast	GHASHDATA0, V0, GHASHDATA0
+	vaesenclast	GHASHDATA1, V1, GHASHDATA1
+	vaesenclast	GHASHDATA2, V2, GHASHDATA2
+	vaesenclast	GHASHDATA3, V3, GHASHDATA3
+
+	// Store the en/decrypted data to DST.
+	vmovdqu8	GHASHDATA0, 0*VL(DST)
+	vmovdqu8	GHASHDATA1, 1*VL(DST)
+	vmovdqu8	GHASHDATA2, 2*VL(DST)
+	vmovdqu8	GHASHDATA3, 3*VL(DST)
+.endm
+
 // void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key,
 //					   const u32 le_ctr[4], u8 ghash_acc[16],
 //					   const u8 *src, u8 *dst, int datalen);
@@ -640,7 +666,7 @@
 	// LE_CTR contains the next set of little-endian counter blocks.
 	.set	LE_CTR,		V12
 
-	// RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys,
+	// RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys,
 	// copied to all 128-bit lanes.  RNDKEY0 is the zero-th round key,
 	// RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
 	.set	RNDKEY0,	V13
@@ -650,15 +676,10 @@
 	.set	RNDKEY_M7,	V17
 	.set	RNDKEY_M6,	V18
 	.set	RNDKEY_M5,	V19
-
-	// RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with
-	// the corresponding block of source data.  This is useful because
-	// vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), and key ^ b can
-	// be computed in parallel with the AES rounds.
-	.set	RNDKEYLAST0,	V20
-	.set	RNDKEYLAST1,	V21
-	.set	RNDKEYLAST2,	V22
-	.set	RNDKEYLAST3,	V23
+	.set	RNDKEY_M4,	V20
+	.set	RNDKEY_M3,	V21
+	.set	RNDKEY_M2,	V22
+	.set	RNDKEY_M1,	V23
 
 	// GHASHTMP[0-2] are temporary variables used by _ghash_step_4x.  These
 	// cannot coincide with anything used for AES encryption, since for
@@ -713,7 +734,7 @@
 	// Pre-subtracting 4*VL from DATALEN saves an instruction from the main
 	// loop and also ensures that at least one write always occurs to
 	// DATALEN, zero-extending it and allowing DATALEN64 to be used later.
-	sub		$4*VL, DATALEN
+	add		$-4*VL, DATALEN  // shorter than 'sub 4*VL' when VL=32
 	jl		.Lcrypt_loop_4x_done\@
 
 	// Load powers of the hash key.
@@ -748,26 +769,15 @@
 	add		$16, %rax
 	cmp		%rax, RNDKEYLAST_PTR
 	jne		1b
-	vpxord		0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
-	vpxord		1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
-	vpxord		2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
-	vpxord		3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
-	vaesenclast	RNDKEYLAST0, V0, GHASHDATA0
-	vaesenclast	RNDKEYLAST1, V1, GHASHDATA1
-	vaesenclast	RNDKEYLAST2, V2, GHASHDATA2
-	vaesenclast	RNDKEYLAST3, V3, GHASHDATA3
-	vmovdqu8	GHASHDATA0, 0*VL(DST)
-	vmovdqu8	GHASHDATA1, 1*VL(DST)
-	vmovdqu8	GHASHDATA2, 2*VL(DST)
-	vmovdqu8	GHASHDATA3, 3*VL(DST)
-	add		$4*VL, SRC
-	add		$4*VL, DST
-	sub		$4*VL, DATALEN
+	_aesenclast_and_xor_4x
+	sub		$-4*VL, SRC  // shorter than 'add 4*VL' when VL=32
+	sub		$-4*VL, DST
+	add		$-4*VL, DATALEN
 	jl		.Lghash_last_ciphertext_4x\@
 .endif
 
 	// Cache as many additional AES round keys as possible.
-.irp i, 9,8,7,6,5
+.irp i, 9,8,7,6,5,4,3,2,1
 	vbroadcasti32x4	-\i*16(RNDKEYLAST_PTR), RNDKEY_M\i
 .endr
 
@@ -799,50 +809,17 @@
 	_vaesenc_4x	RNDKEY
 128:
 
-	// XOR the source data with the last round key, saving the result in
-	// RNDKEYLAST[0-3].  This reduces latency by taking advantage of the
-	// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
-.if \enc
-	vpxord		0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
-	vpxord		1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
-	vpxord		2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
-	vpxord		3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
-.else
-	vpxord		GHASHDATA0, RNDKEYLAST, RNDKEYLAST0
-	vpxord		GHASHDATA1, RNDKEYLAST, RNDKEYLAST1
-	vpxord		GHASHDATA2, RNDKEYLAST, RNDKEYLAST2
-	vpxord		GHASHDATA3, RNDKEYLAST, RNDKEYLAST3
-.endif
-
 	// Finish the AES encryption of the counter blocks in V0-V3, interleaved
 	// with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
-.irp i, 9,8,7,6,5
+.irp i, 9,8,7,6,5,4,3,2,1
+	_ghash_step_4x	(9 - \i)
 	_vaesenc_4x	RNDKEY_M\i
-	_ghash_step_4x	(9 - \i)
-.endr
-.irp i, 4,3,2,1
-	vbroadcasti32x4	-\i*16(RNDKEYLAST_PTR), RNDKEY
-	_vaesenc_4x	RNDKEY
-	_ghash_step_4x	(9 - \i)
 .endr
 	_ghash_step_4x	9
-
-	// Do the last AES round.  This handles the XOR with the source data
-	// too, as per the optimization described above.
-	vaesenclast	RNDKEYLAST0, V0, GHASHDATA0
-	vaesenclast	RNDKEYLAST1, V1, GHASHDATA1
-	vaesenclast	RNDKEYLAST2, V2, GHASHDATA2
-	vaesenclast	RNDKEYLAST3, V3, GHASHDATA3
-
-	// Store the en/decrypted data to DST.
-	vmovdqu8	GHASHDATA0, 0*VL(DST)
-	vmovdqu8	GHASHDATA1, 1*VL(DST)
-	vmovdqu8	GHASHDATA2, 2*VL(DST)
-	vmovdqu8	GHASHDATA3, 3*VL(DST)
-
-	add		$4*VL, SRC
-	add		$4*VL, DST
-	sub		$4*VL, DATALEN
+	_aesenclast_and_xor_4x
+	sub		$-4*VL, SRC  // shorter than 'add 4*VL' when VL=32
+	sub		$-4*VL, DST
+	add		$-4*VL, DATALEN
 	jge		.Lcrypt_loop_4x\@
 
 .if \enc
@@ -856,7 +833,7 @@
 
 .Lcrypt_loop_4x_done\@:
 	// Undo the extra subtraction by 4*VL and check whether data remains.
-	add		$4*VL, DATALEN
+	sub		$-4*VL, DATALEN  // shorter than 'add 4*VL' when VL=32
 	jz		.Ldone\@
 
 	// The data length isn't a multiple of 4*VL.  Process the remaining data
@@ -940,7 +917,7 @@
 	// GHASH.  However, any such blocks are all-zeroes, and the values that
 	// they're multiplied with are also all-zeroes.  Therefore they just add
 	// 0 * 0 = 0 to the final GHASH result, which makes no difference.
-	vmovdqu8        (POWERS_PTR), H_POW1
+	vmovdqu8	(POWERS_PTR), H_POW1
 .if \enc
 	vmovdqu8	V0, V1{%k1}{z}
 .endif
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
index 48f97b79f7a9..8a3e23fbcf85 100644
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -80,22 +80,6 @@
 	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
 .text
 
-// Function parameters
-.set	KEY,		%rdi	// Initially points to crypto_aes_ctx, then is
-				// advanced to point to 7th-from-last round key
-.set	SRC,		%rsi	// Pointer to next source data
-.set	DST,		%rdx	// Pointer to next destination data
-.set	LEN,		%ecx	// Remaining length in bytes
-.set	LEN8,		%cl
-.set	LEN64,		%rcx
-.set	TWEAK,		%r8	// Pointer to next tweak
-
-// %rax holds the AES key length in bytes.
-.set	KEYLEN,		%eax
-.set	KEYLEN64,	%rax
-
-// %r9-r11 are available as temporaries.
-
 .macro	_define_Vi	i
 .if VL == 16
 	.set	V\i,		%xmm\i
@@ -112,41 +96,31 @@
 
 	// Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
 	// are available, that map to the xmm, ymm, or zmm registers according
 	// to the selected Vector Length (VL).
-	_define_Vi	0
-	_define_Vi	1
-	_define_Vi	2
-	_define_Vi	3
-	_define_Vi	4
-	_define_Vi	5
-	_define_Vi	6
-	_define_Vi	7
-	_define_Vi	8
-	_define_Vi	9
-	_define_Vi	10
-	_define_Vi	11
-	_define_Vi	12
-	_define_Vi	13
-	_define_Vi	14
-	_define_Vi	15
+.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+	_define_Vi	\i
+.endr
 .if USE_AVX10
-	_define_Vi	16
-	_define_Vi	17
-	_define_Vi	18
-	_define_Vi	19
-	_define_Vi	20
-	_define_Vi	21
-	_define_Vi	22
-	_define_Vi	23
-	_define_Vi	24
-	_define_Vi	25
-	_define_Vi	26
-	_define_Vi	27
-	_define_Vi	28
-	_define_Vi	29
-	_define_Vi	30
-	_define_Vi	31
+.irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+	_define_Vi	\i
+.endr
 .endif
 
+	// Function parameters
+	.set	KEY,		%rdi	// Initially points to crypto_aes_ctx, then is
+					// advanced to point to 7th-from-last round key
+	.set	SRC,		%rsi	// Pointer to next source data
+	.set	DST,		%rdx	// Pointer to next destination data
+	.set	LEN,		%ecx	// Remaining length in bytes
+	.set	LEN8,		%cl
+	.set	LEN64,		%rcx
+	.set	TWEAK,		%r8	// Pointer to next tweak
+
+	// %rax holds the AES key length in bytes.
+	.set	KEYLEN,		%eax
+	.set	KEYLEN64,	%rax
+
+	// %r9-r11 are available as temporaries.
+
 	// V0-V3 hold the data blocks during the main loop, or temporary values
 	// otherwise.  V4-V5 hold temporary values.
@@ -214,6 +188,7 @@
 .endm
 
 // Move a vector between memory and a register.
+// The register operand must be in the first 16 vector registers.
 .macro	_vmovdqu	src, dst
 .if VL < 64
 	vmovdqu		\src, \dst
@@ -234,11 +209,12 @@
 .endm
 
 // XOR two vectors together.
+// Any register operands must be in the first 16 vector registers.
 .macro	_vpxor	src1, src2, dst
-.if USE_AVX10
-	vpxord		\src1, \src2, \dst
-.else
+.if VL < 64
 	vpxor		\src1, \src2, \dst
+.else
+	vpxord		\src1, \src2, \dst
 .endif
 .endm
 
@@ -259,8 +235,12 @@
 	vpshufd		$0x13, \src, \tmp
 	vpaddq		\src, \src, \dst
 	vpsrad		$31, \tmp, \tmp
+.if USE_AVX10
+	vpternlogd	$0x78, GF_POLY_XMM, \tmp, \dst
+.else
 	vpand		GF_POLY_XMM, \tmp, \tmp
 	vpxor		\tmp, \dst, \dst
+.endif
 .endm
 
 // Given the XTS tweak(s) in the vector \src, compute the next vector of
@@ -369,9 +349,14 @@
 
 // Do one step in computing the next set of tweaks using the VPCLMULQDQ method
 // (the same method _next_tweakvec uses for VL > 16).  This means multiplying
-// each tweak by x^(4*VL/16) independently.  Since 4*VL/16 is a multiple of 8
-// when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
-// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
+// each tweak by x^(4*VL/16) independently.
+//
+// Since 4*VL/16 is a multiple of 8 when VL > 16 (which it is here), the needed
+// shift amounts are byte-aligned, which allows the use of vpsrldq and vpslldq
+// to do 128-bit wide shifts.  The 128-bit left shift (vpslldq) saves
+// instructions directly.  The 128-bit right shift (vpsrldq) performs better
+// than a 64-bit right shift on Intel CPUs in the context where it is used here,
+// because it runs on a different execution port from the AES instructions.
 .macro	_tweak_step_pclmul	i
 .if \i == 0
 	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
@@ -406,7 +391,7 @@
 // \i that include at least 0 through 19, then 1000 which signals the last step.
 //
 // This is used to interleave the computation of the next set of tweaks with the
-// AES en/decryptions, which increases performance in some cases.
+// AES en/decryptions, which increases performance in some cases.  Clobbers V5.
 .macro	_tweak_step	i
 .if VL == 16
 	_tweak_step_mulx	\i
@@ -443,9 +428,10 @@
 	// the last round needs different instructions.
 	//
 	// An alternative approach would be to roll up all the round loops.  We
-	// don't do that because it isn't compatible with caching the round keys
-	// in registers which we do when possible (see below), and also because
-	// it seems unwise to rely *too* heavily on the CPU's branch predictor.
+	// don't do that because (a) it isn't compatible with caching the round
+	// keys in registers which we do when possible (see below), (b) we
+	// interleave the AES rounds with the XTS tweak computation, and (c) it
+	// seems unwise to rely *too* heavily on the CPU's branch predictor.
 	lea		OFFS-16(KEY, KEYLEN64, 4), KEY
 
 	// If all 32 SIMD registers are available, cache all the round keys.
@@ -472,90 +458,94 @@
 .endif
 .endm
 
-// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
-// on the block(s) in \data using the round key(s) in \key.  The register length
-// determines the number of AES blocks en/decrypted.
-.macro	_vaes	enc, last, key, data
+// Do a single non-last round of AES encryption (if \enc==1) or decryption (if
+// \enc==0) on the block(s) in \data using the round key(s) in \key.  The
+// register length determines the number of AES blocks en/decrypted.
+.macro	_vaes	enc, key, data
 .if \enc
-.if \last
-	vaesenclast	\key, \data, \data
-.else
 	vaesenc		\key, \data, \data
-.endif
-.else
-.if \last
-	vaesdeclast	\key, \data, \data
 .else
 	vaesdec		\key, \data, \data
 .endif
+.endm
+
+// Same as _vaes, but does the last round.
+.macro	_vaeslast	enc, key, data
+.if \enc
+	vaesenclast	\key, \data, \data
+.else
+	vaesdeclast	\key, \data, \data
 .endif
 .endm
 
-// Do a single round of AES en/decryption on the block(s) in \data, using the
-// same key for all block(s).  The round key is loaded from the appropriate
-// register or memory location for round \i.  May clobber V4.
-.macro	_vaes_1x	enc, last, i, xmm_suffix, data
+// Do a single non-last round of AES en/decryption on the block(s) in \data,
+// using the same key for all block(s).  The round key is loaded from the
+// appropriate register or memory location for round \i.  May clobber \tmp.
+.macro	_vaes_1x	enc, i, xmm_suffix, data, tmp
 .if USE_AVX10
-	_vaes	\enc, \last, KEY\i\xmm_suffix, \data
+	_vaes	\enc, KEY\i\xmm_suffix, \data
 .else
 .ifnb \xmm_suffix
-	_vaes	\enc, \last, (\i-7)*16(KEY), \data
+	_vaes	\enc, (\i-7)*16(KEY), \data
 .else
-	_vbroadcast128	(\i-7)*16(KEY), V4
-	_vaes	\enc, \last, V4, \data
+	_vbroadcast128	(\i-7)*16(KEY), \tmp
+	_vaes	\enc, \tmp, \data
 .endif
 .endif
 .endm
 
-// Do a single round of AES en/decryption on the blocks in registers V0-V3,
-// using the same key for all blocks.  The round key is loaded from the
+// Do a single non-last round of AES en/decryption on the blocks in registers
+// V0-V3, using the same key for all blocks.  The round key is loaded from the
 // appropriate register or memory location for round \i.  In addition, does two
-// steps of the computation of the next set of tweaks.  May clobber V4.
-.macro	_vaes_4x	enc, last, i
+// steps of the computation of the next set of tweaks.  May clobber V4 and V5.
+.macro	_vaes_4x	enc, i
 .if USE_AVX10
 	_tweak_step	(2*(\i-5))
-	_vaes	\enc, \last, KEY\i, V0
-	_vaes	\enc, \last, KEY\i, V1
+	_vaes	\enc, KEY\i, V0
+	_vaes	\enc, KEY\i, V1
 	_tweak_step	(2*(\i-5) + 1)
-	_vaes	\enc, \last, KEY\i, V2
-	_vaes	\enc, \last, KEY\i, V3
+	_vaes	\enc, KEY\i, V2
+	_vaes	\enc, KEY\i, V3
 .else
 	_vbroadcast128	(\i-7)*16(KEY), V4
 	_tweak_step	(2*(\i-5))
-	_vaes	\enc, \last, V4, V0
-	_vaes	\enc, \last, V4, V1
+	_vaes	\enc, V4, V0
+	_vaes	\enc, V4, V1
 	_tweak_step	(2*(\i-5) + 1)
-	_vaes	\enc, \last, V4, V2
-	_vaes	\enc, \last, V4, V3
+	_vaes	\enc, V4, V2
+	_vaes	\enc, V4, V3
 .endif
 .endm
 
 // Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
 // then XOR with \tweak again) of the block(s) in \data.  To process a single
 // block, use xmm registers and set \xmm_suffix=_XMM.  To process a vector of
-// length VL, use V* registers and leave \xmm_suffix empty.  May clobber V4.
-.macro	_aes_crypt	enc, xmm_suffix, tweak, data
+// length VL, use V* registers and leave \xmm_suffix empty.  Clobbers \tmp.
+.macro	_aes_crypt	enc, xmm_suffix, tweak, data, tmp
 	_xor3		KEY0\xmm_suffix, \tweak, \data
 	cmp		$24, KEYLEN
 	jl		.Laes128\@
 	je		.Laes192\@
-	_vaes_1x	\enc, 0, 1, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 2, \xmm_suffix, \data
+	_vaes_1x	\enc, 1, \xmm_suffix, \data, tmp=\tmp
+	_vaes_1x	\enc, 2, \xmm_suffix, \data, tmp=\tmp
 .Laes192\@:
-	_vaes_1x	\enc, 0, 3, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 4, \xmm_suffix, \data
+	_vaes_1x	\enc, 3, \xmm_suffix, \data, tmp=\tmp
+	_vaes_1x	\enc, 4, \xmm_suffix, \data, tmp=\tmp
 .Laes128\@:
-	_vaes_1x	\enc, 0, 5, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 6, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 7, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 8, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 9, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 10, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 11, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 12, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 13, \xmm_suffix, \data
-	_vaes_1x	\enc, 1, 14, \xmm_suffix, \data
-	_vpxor		\tweak, \data, \data
+.irp i, 5,6,7,8,9,10,11,12,13
+	_vaes_1x	\enc, \i, \xmm_suffix, \data, tmp=\tmp
+.endr
+.if USE_AVX10
+	vpxord		KEY14\xmm_suffix, \tweak, \tmp
+.else
+.ifnb \xmm_suffix
+	vpxor		7*16(KEY), \tweak, \tmp
+.else
+	_vbroadcast128	7*16(KEY), \tmp
+	vpxor		\tweak, \tmp, \tmp
+.endif
+.endif
+	_vaeslast	\enc, \tmp, \data
 .endm
 
 .macro	_aes_xts_crypt	enc
@@ -581,7 +571,7 @@
 	// Compute the first set of tweaks TWEAK[0-3].
 	_compute_first_set_of_tweaks
 
-	sub		$4*VL, LEN
+	add		$-4*VL, LEN  // shorter than 'sub 4*VL' when VL=32
 	jl		.Lhandle_remainder\@
 
 .Lmain_loop\@:
@@ -589,10 +579,10 @@
 
 	// XOR each source block with its tweak and the zero-th round key.
 .if USE_AVX10
-	vmovdqu8	0*VL(SRC), V0
-	vmovdqu8	1*VL(SRC), V1
-	vmovdqu8	2*VL(SRC), V2
-	vmovdqu8	3*VL(SRC), V3
+	_vmovdqu	0*VL(SRC), V0
+	_vmovdqu	1*VL(SRC), V1
+	_vmovdqu	2*VL(SRC), V2
+	_vmovdqu	3*VL(SRC), V3
 	vpternlogd	$0x96, TWEAK0, KEY0, V0
 	vpternlogd	$0x96, TWEAK1, KEY0, V1
 	vpternlogd	$0x96, TWEAK2, KEY0, V2
@@ -612,28 +602,43 @@
 	je		.Laes192\@
 	// Do all the AES rounds on the data blocks, interleaved with
 	// the computation of the next set of tweaks.
-	_vaes_4x	\enc, 0, 1
-	_vaes_4x	\enc, 0, 2
+	_vaes_4x	\enc, 1
+	_vaes_4x	\enc, 2
 .Laes192\@:
-	_vaes_4x	\enc, 0, 3
-	_vaes_4x	\enc, 0, 4
+	_vaes_4x	\enc, 3
+	_vaes_4x	\enc, 4
 .Laes128\@:
-	_vaes_4x	\enc, 0, 5
-	_vaes_4x	\enc, 0, 6
-	_vaes_4x	\enc, 0, 7
-	_vaes_4x	\enc, 0, 8
-	_vaes_4x	\enc, 0, 9
-	_vaes_4x	\enc, 0, 10
-	_vaes_4x	\enc, 0, 11
-	_vaes_4x	\enc, 0, 12
-	_vaes_4x	\enc, 0, 13
-	_vaes_4x	\enc, 1, 14
-
-	// XOR in the tweaks again.
-	_vpxor		TWEAK0, V0, V0
-	_vpxor		TWEAK1, V1, V1
-	_vpxor		TWEAK2, V2, V2
-	_vpxor		TWEAK3, V3, V3
+.irp i, 5,6,7,8,9,10,11,12,13
+	_vaes_4x	\enc, \i
+.endr
+	// Do the last AES round, then XOR the results with the tweaks again.
+	// Reduce latency by doing the XOR before the vaesenclast, utilizing the
+	// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a)
+	// (and likewise for vaesdeclast).
+.if USE_AVX10
+	_tweak_step	18
+	_tweak_step	19
+	vpxord		TWEAK0, KEY14, V4
+	vpxord		TWEAK1, KEY14, V5
+	_vaeslast	\enc, V4, V0
+	_vaeslast	\enc, V5, V1
+	vpxord		TWEAK2, KEY14, V4
+	vpxord		TWEAK3, KEY14, V5
+	_vaeslast	\enc, V4, V2
+	_vaeslast	\enc, V5, V3
+.else
+	_vbroadcast128	7*16(KEY), V4
+	_tweak_step	18	// uses V5
+	_tweak_step	19	// uses V5
+	vpxor		TWEAK0, V4, V5
+	_vaeslast	\enc, V5, V0
+	vpxor		TWEAK1, V4, V5
+	_vaeslast	\enc, V5, V1
+	vpxor		TWEAK2, V4, V5
+	vpxor		TWEAK3, V4, V4
+	_vaeslast	\enc, V5, V2
+	_vaeslast	\enc, V4, V3
+.endif
 
 	// Store the destination blocks.
 	_vmovdqu	V0, 0*VL(DST)
@@ -644,9 +649,9 @@
 	// Finish computing the next set of tweaks.
 	_tweak_step	1000
 
-	add		$4*VL, SRC
-	add		$4*VL, DST
-	sub		$4*VL, LEN
+	sub		$-4*VL, SRC  // shorter than 'add 4*VL' when VL=32
+	sub		$-4*VL, DST
+	add		$-4*VL, LEN
 	jge		.Lmain_loop\@
 
 	// Check for the uncommon case where the data length isn't a multiple of
@@ -670,7 +675,7 @@
 	jl		.Lvec_at_a_time_done\@
 .Lvec_at_a_time\@:
 	_vmovdqu	(SRC), V0
-	_aes_crypt	\enc, , TWEAK0, V0
+	_aes_crypt	\enc, , TWEAK0, V0, tmp=V1
 	_vmovdqu	V0, (DST)
 	_next_tweakvec	TWEAK0, V0, V1, TWEAK0
 	add		$VL, SRC
@@ -687,7 +692,7 @@
 	jl		.Lblock_at_a_time_done\@
 .Lblock_at_a_time\@:
 	vmovdqu		(SRC), %xmm0
-	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0
+	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1
 	vmovdqu		%xmm0, (DST)
 	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK0_XMM
 	add		$16, SRC
@@ -715,7 +720,7 @@
 	// Do it now by advancing the tweak and decrypting the last full block.
 	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK1_XMM
 	vmovdqu		(SRC), %xmm0
-	_aes_crypt	\enc, _XMM, TWEAK1_XMM, %xmm0
+	_aes_crypt	\enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1
 .endif
 
 .if USE_AVX10
@@ -758,47 +763,49 @@
 	vpblendvb	%xmm3, %xmm0, %xmm1, %xmm0
 .endif
 	// En/decrypt again and store the last full block.
-	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0
+	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1
 	vmovdqu		%xmm0, (DST)
 	jmp		.Ldone\@
 .endm
 
 // void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
 //			   u8 iv[AES_BLOCK_SIZE]);
+//
+// Encrypt |iv| using the AES key |tweak_key| to get the first tweak.  Assumes
+// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10.
 SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
-	vmovdqu		(%rsi), %xmm0
-	vpxor		(%rdi), %xmm0, %xmm0
-	movl		480(%rdi), %eax		// AES key length
-	lea		-16(%rdi, %rax, 4), %rdi
-	cmp		$24, %eax
+	.set	TWEAK_KEY,	%rdi
+	.set	IV,		%rsi
+	.set	KEYLEN,		%eax
+	.set	KEYLEN64,	%rax
+
+	vmovdqu		(IV), %xmm0
+	vpxor		(TWEAK_KEY), %xmm0, %xmm0
+	movl		480(TWEAK_KEY), KEYLEN
+	lea		-16(TWEAK_KEY, KEYLEN64, 4), TWEAK_KEY
+	cmp		$24, KEYLEN
 	jl		.Lencrypt_iv_aes128
 	je		.Lencrypt_iv_aes192
-	vaesenc		-6*16(%rdi), %xmm0, %xmm0
-	vaesenc		-5*16(%rdi), %xmm0, %xmm0
+	vaesenc		-6*16(TWEAK_KEY), %xmm0, %xmm0
+	vaesenc		-5*16(TWEAK_KEY), %xmm0, %xmm0
 .Lencrypt_iv_aes192:
-	vaesenc		-4*16(%rdi), %xmm0, %xmm0
-	vaesenc		-3*16(%rdi), %xmm0, %xmm0
+	vaesenc		-4*16(TWEAK_KEY), %xmm0, %xmm0
+	vaesenc		-3*16(TWEAK_KEY), %xmm0, %xmm0
.Lencrypt_iv_aes128:
-	vaesenc		-2*16(%rdi), %xmm0, %xmm0
-	vaesenc		-1*16(%rdi), %xmm0, %xmm0
-	vaesenc		0*16(%rdi), %xmm0, %xmm0
-	vaesenc		1*16(%rdi), %xmm0, %xmm0
-	vaesenc		2*16(%rdi), %xmm0, %xmm0
-	vaesenc		3*16(%rdi), %xmm0, %xmm0
-	vaesenc		4*16(%rdi), %xmm0, %xmm0
-	vaesenc		5*16(%rdi), %xmm0, %xmm0
-	vaesenc		6*16(%rdi), %xmm0, %xmm0
-	vaesenclast	7*16(%rdi), %xmm0, %xmm0
-	vmovdqu		%xmm0, (%rsi)
+.irp i, -2,-1,0,1,2,3,4,5,6
+	vaesenc		\i*16(TWEAK_KEY), %xmm0, %xmm0
+.endr
+	vaesenclast	7*16(TWEAK_KEY), %xmm0, %xmm0
+	vmovdqu		%xmm0, (IV)
 	RET
 SYM_FUNC_END(aes_xts_encrypt_iv)
 
 // Below are the actual AES-XTS encryption and decryption functions,
 // instantiated from the above macro.  They all have the following prototype:
 //
-// void (*xts_asm_func)(const struct crypto_aes_ctx *key,
-//			const u8 *src, u8 *dst, unsigned int len,
-//			u8 tweak[AES_BLOCK_SIZE]);
+// void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
+//			  const u8 *src, u8 *dst, int len,
+//			  u8 tweak[AES_BLOCK_SIZE]);
 //
 // |key| is the data key.  |tweak| contains the next tweak; the encryption of
 // the original IV with the tweak key was already done.  This function supports
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index fbf43482e1f5..11e95fc62636 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -505,7 +505,7 @@ static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key,
 typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key,
 				    u8 iv[AES_BLOCK_SIZE]);
 typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
-			       const u8 *src, u8 *dst, unsigned int len,
+			       const u8 *src, u8 *dst, int len,
 			       u8 tweak[AES_BLOCK_SIZE]);
 
 /* This handles cases where the source and/or destination span pages. */
@@ -624,14 +624,14 @@ static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
 }
 
 static void aesni_xts_encrypt(const struct crypto_aes_ctx *key,
-			      const u8 *src, u8 *dst, unsigned int len,
+			      const u8 *src, u8 *dst, int len,
 			      u8 tweak[AES_BLOCK_SIZE])
 {
 	aesni_xts_enc(key, dst, src, len, tweak);
 }
 
 static void aesni_xts_decrypt(const struct crypto_aes_ctx *key,
-			      const u8 *src, u8 *dst, unsigned int len,
+			      const u8 *src, u8 *dst, int len,
 			      u8 tweak[AES_BLOCK_SIZE])
 {
 	aesni_xts_dec(key, dst, src, len, tweak);
@@ -790,10 +790,10 @@ asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
 									       \
 asmlinkage void								       \
 aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src,     \
-			 u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
+			 u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]);	       \
 asmlinkage void								       \
 aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src,     \
-			 u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
+			 u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]);	       \
 									       \
 static int xts_encrypt_##suffix(struct skcipher_request *req)		       \
 {									       \
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index 552f2df0643f..26c5f2ee5d10 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -94,7 +94,6 @@ static struct crypto_alg bf_cipher_alg = {
 	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
 	.cra_blocksize		= BF_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct bf_ctx),
-	.cra_alignmask		= 0,
 	.cra_module		= THIS_MODULE,
 	.cra_u = {
 		.cipher = {
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index f110708c8038..3bd37d664121 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -1313,7 +1313,6 @@ static struct crypto_alg camellia_cipher_alg = {
 	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
 	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct camellia_ctx),
-	.cra_alignmask		= 0,
 	.cra_module		= THIS_MODULE,
 	.cra_u = {
 		.cipher = {
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
index abb8b1fe123b..e88439d3828e 100644
--- a/arch/x86/crypto/des3_ede_glue.c
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -291,7 +291,6 @@ static struct crypto_alg des3_ede_cipher = {
 	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
 	.cra_blocksize		= DES3_EDE_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct des3_ede_x86_ctx),
-	.cra_alignmask		= 0,
 	.cra_module		= THIS_MODULE,
 	.cra_u = {
 		.cipher = {
diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c
index 0614beece279..4c67184dc573 100644
--- a/arch/x86/crypto/twofish_glue.c
+++ b/arch/x86/crypto/twofish_glue.c
@@ -68,7 +68,6 @@ static struct crypto_alg alg = {
 	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
 	.cra_blocksize		= TF_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct twofish_ctx),
-	.cra_alignmask		= 0,
 	.cra_module		= THIS_MODULE,
 	.cra_u = {
 		.cipher = {