author    Linus Torvalds <torvalds@linux-foundation.org>  2025-01-24 07:48:10 -0800
committer Linus Torvalds <torvalds@linux-foundation.org>  2025-01-24 07:48:10 -0800
commit    454cb97726fe62a04b187a0d631ec0a69f6b713a
tree      16dae08192b97960398b9cb9cc92439fb1b7e4a6 /arch/x86
parent    ae2d4fc540cd27d667d10597b6ad8cc4c6ce622a
parent    9d4f8e54cef2c42e23ef258833dbd06a1eaff89b
Merge tag 'v6.14-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu:
 "API:
   - Remove physical address skcipher walking
   - Fix boot-up self-test race

  Algorithms:
   - Optimisations for x86/aes-gcm
   - Optimisations for x86/aes-xts
   - Remove VMAC
   - Remove keywrap

  Drivers:
   - Remove n2

  Others:
   - Fixes for padata UAF
   - Fix potential rhashtable deadlock by moving schedule_work outside lock"

* tag 'v6.14-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (75 commits)
  rhashtable: Fix rhashtable_try_insert test
  dt-bindings: crypto: qcom,inline-crypto-engine: Document the SM8750 ICE
  dt-bindings: crypto: qcom,prng: Document SM8750 RNG
  dt-bindings: crypto: qcom-qce: Document the SM8750 crypto engine
  crypto: asymmetric_keys - Remove unused key_being_used_for[]
  padata: avoid UAF for reorder_work
  padata: fix UAF in padata_reorder
  padata: add pd get/put refcnt helper
  crypto: skcipher - call cond_resched() directly
  crypto: skcipher - optimize initializing skcipher_walk fields
  crypto: skcipher - clean up initialization of skcipher_walk::flags
  crypto: skcipher - fold skcipher_walk_skcipher() into skcipher_walk_virt()
  crypto: skcipher - remove redundant check for SKCIPHER_WALK_SLOW
  crypto: skcipher - remove redundant clamping to page size
  crypto: skcipher - remove unnecessary page alignment of bounce buffer
  crypto: skcipher - document skcipher_walk_done() and rename some vars
  crypto: omap - switch from scatter_walk to plain offset
  crypto: powerpc/p10-aes-gcm - simplify handling of linear associated data
  crypto: bcm - Drop unused setting of local 'ptr' variable
  crypto: hisilicon/qm - support new function communication
  ...
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/crypto/aegis128-aesni-glue.c      1
-rw-r--r--  arch/x86/crypto/aes-gcm-avx10-x86_64.S   119
-rw-r--r--  arch/x86/crypto/aes-xts-avx-x86_64.S     329
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c        10
-rw-r--r--  arch/x86/crypto/blowfish_glue.c            1
-rw-r--r--  arch/x86/crypto/camellia_glue.c            1
-rw-r--r--  arch/x86/crypto/des3_ede_glue.c            1
-rw-r--r--  arch/x86/crypto/twofish_glue.c             1
8 files changed, 221 insertions(+), 242 deletions(-)
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index c19d8e3d96a3..01fa568dc5fc 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -240,7 +240,6 @@ static struct aead_alg crypto_aegis128_aesni_alg = {
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct aegis_ctx) +
__alignof__(struct aegis_ctx),
- .cra_alignmask = 0,
.cra_priority = 400,
.cra_name = "__aegis128",
diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-avx10-x86_64.S
index 97e0ee515fc5..02ee11083d4f 100644
--- a/arch/x86/crypto/aes-gcm-avx10-x86_64.S
+++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S
@@ -88,7 +88,7 @@
// A shuffle mask that reflects the bytes of 16-byte blocks
.Lbswap_mask:
- .octa 0x000102030405060708090a0b0c0d0e0f
+ .octa 0x000102030405060708090a0b0c0d0e0f
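As an aside, the "reflection" that .Lbswap_mask implements is a plain byte
reversal: a pshufb/vpshufb control whose byte i holds 15 - i.  A small
illustrative C program, not kernel code (assumes SSSE3; compile with -mssse3):

	#include <tmmintrin.h>
	#include <stdio.h>

	int main(void)
	{
		/* Control byte i = 15 - i, i.e. select the mirrored byte. */
		__m128i mask  = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
					     8, 9, 10, 11, 12, 13, 14, 15);
		/* Input block whose byte i (in memory order) is simply i. */
		__m128i block = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
					     7, 6, 5, 4, 3, 2, 1, 0);
		unsigned char out[16];

		_mm_storeu_si128((__m128i *)out, _mm_shuffle_epi8(block, mask));
		for (int i = 0; i < 16; i++)
			printf("%02x ", out[i]);   /* 0f 0e ... 01 00 */
		printf("\n");
		return 0;
	}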
// This is the GHASH reducing polynomial without its constant term, i.e.
// x^128 + x^7 + x^2 + x, represented using the backwards mapping
@@ -384,8 +384,8 @@
vpshufd $0xd3, H_CUR_XMM, %xmm0
vpsrad $31, %xmm0, %xmm0
vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM
- vpand .Lgfpoly_and_internal_carrybit(%rip), %xmm0, %xmm0
- vpxor %xmm0, H_CUR_XMM, H_CUR_XMM
+ // H_CUR_XMM ^= xmm0 & gfpoly_and_internal_carrybit
+ vpternlogd $0x78, .Lgfpoly_and_internal_carrybit(%rip), %xmm0, H_CUR_XMM
// Load the gfpoly constant.
vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY
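A note on the vpternlogd change a few lines up: it folds the old vpand +
vpxor pair into a single instruction.  The $0x78 immediate is simply the
truth table of the three-input function dst ^ (src1 & src2), indexed by
(dst_bit << 2) | (src1_bit << 1) | src2_bit.  A standalone C sketch (not
kernel code) that derives the constant:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint8_t imm = 0;

		/* Build vpternlogd's imm8 from the truth table of a ^ (b & c),
		 * where a = dst, b = src1, c = src2.
		 */
		for (int a = 0; a < 2; a++)
			for (int b = 0; b < 2; b++)
				for (int c = 0; c < 2; c++)
					if (a ^ (b & c))
						imm |= 1u << ((a << 2) | (b << 1) | c);

		printf("imm8 = 0x%02x\n", imm);	/* prints imm8 = 0x78 */
		return 0;
	}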
@@ -562,6 +562,32 @@
vpxord RNDKEY0, V3, V3
.endm
+// Do the last AES round for four vectors of counter blocks V0-V3, XOR source
+// data with the resulting keystream, and write the result to DST and
+// GHASHDATA[0-3]. (Implementation differs slightly, but has the same effect.)
+.macro _aesenclast_and_xor_4x
+ // XOR the source data with the last round key, saving the result in
+ // GHASHDATA[0-3]. This reduces latency by taking advantage of the
+ // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
+ vpxord 0*VL(SRC), RNDKEYLAST, GHASHDATA0
+ vpxord 1*VL(SRC), RNDKEYLAST, GHASHDATA1
+ vpxord 2*VL(SRC), RNDKEYLAST, GHASHDATA2
+ vpxord 3*VL(SRC), RNDKEYLAST, GHASHDATA3
+
+ // Do the last AES round. This handles the XOR with the source data
+ // too, as per the optimization described above.
+ vaesenclast GHASHDATA0, V0, GHASHDATA0
+ vaesenclast GHASHDATA1, V1, GHASHDATA1
+ vaesenclast GHASHDATA2, V2, GHASHDATA2
+ vaesenclast GHASHDATA3, V3, GHASHDATA3
+
+ // Store the en/decrypted data to DST.
+ vmovdqu8 GHASHDATA0, 0*VL(DST)
+ vmovdqu8 GHASHDATA1, 1*VL(DST)
+ vmovdqu8 GHASHDATA2, 2*VL(DST)
+ vmovdqu8 GHASHDATA3, 3*VL(DST)
+.endm
+
// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key,
// const u32 le_ctr[4], u8 ghash_acc[16],
// const u8 *src, u8 *dst, int datalen);
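The optimization that _aesenclast_and_xor_4x (and the main loop below) relies
on follows from AESENCLAST being ShiftRows + SubBytes followed by an XOR with
the round key, so an extra XOR operand can be folded into the key ahead of
time.  A minimal user-space check with AES-NI intrinsics, not kernel code
(intrinsic operand order is (block, key); compile with -maes):

	#include <immintrin.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		__m128i a   = _mm_set_epi32(0x00112233, 0x44556677,
					    0x1899aabb, 0x4cddeeff);
		__m128i key = _mm_set_epi32(0x0f1e2d3c, 0x4b5a6978,
					    0x07a6c5e4, 0x03f2d1c0);
		__m128i b   = _mm_set_epi32(0x13579bdf, 0x02468ace,
					    0x5eadbeef, 0x4afef00d);

		/* aesenclast(a, key) ^ b ... */
		__m128i lhs = _mm_xor_si128(_mm_aesenclast_si128(a, key), b);
		/* ... equals aesenclast(a, key ^ b). */
		__m128i rhs = _mm_aesenclast_si128(a, _mm_xor_si128(key, b));

		printf("%s\n", memcmp(&lhs, &rhs, 16) == 0 ? "equal" : "different");
		return 0;
	}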
@@ -640,7 +666,7 @@
// LE_CTR contains the next set of little-endian counter blocks.
.set LE_CTR, V12
- // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys,
+ // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys,
// copied to all 128-bit lanes. RNDKEY0 is the zero-th round key,
// RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
.set RNDKEY0, V13
@@ -650,15 +676,10 @@
.set RNDKEY_M7, V17
.set RNDKEY_M6, V18
.set RNDKEY_M5, V19
-
- // RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with
- // the corresponding block of source data. This is useful because
- // vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), and key ^ b can
- // be computed in parallel with the AES rounds.
- .set RNDKEYLAST0, V20
- .set RNDKEYLAST1, V21
- .set RNDKEYLAST2, V22
- .set RNDKEYLAST3, V23
+ .set RNDKEY_M4, V20
+ .set RNDKEY_M3, V21
+ .set RNDKEY_M2, V22
+ .set RNDKEY_M1, V23
// GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These
// cannot coincide with anything used for AES encryption, since for
@@ -713,7 +734,7 @@
// Pre-subtracting 4*VL from DATALEN saves an instruction from the main
// loop and also ensures that at least one write always occurs to
// DATALEN, zero-extending it and allowing DATALEN64 to be used later.
- sub $4*VL, DATALEN
+ add $-4*VL, DATALEN // shorter than 'sub 4*VL' when VL=32
jl .Lcrypt_loop_4x_done\@
// Load powers of the hash key.
@@ -748,26 +769,15 @@
add $16, %rax
cmp %rax, RNDKEYLAST_PTR
jne 1b
- vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
- vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
- vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
- vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
- vaesenclast RNDKEYLAST0, V0, GHASHDATA0
- vaesenclast RNDKEYLAST1, V1, GHASHDATA1
- vaesenclast RNDKEYLAST2, V2, GHASHDATA2
- vaesenclast RNDKEYLAST3, V3, GHASHDATA3
- vmovdqu8 GHASHDATA0, 0*VL(DST)
- vmovdqu8 GHASHDATA1, 1*VL(DST)
- vmovdqu8 GHASHDATA2, 2*VL(DST)
- vmovdqu8 GHASHDATA3, 3*VL(DST)
- add $4*VL, SRC
- add $4*VL, DST
- sub $4*VL, DATALEN
+ _aesenclast_and_xor_4x
+ sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32
+ sub $-4*VL, DST
+ add $-4*VL, DATALEN
jl .Lghash_last_ciphertext_4x\@
.endif
// Cache as many additional AES round keys as possible.
-.irp i, 9,8,7,6,5
+.irp i, 9,8,7,6,5,4,3,2,1
vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY_M\i
.endr
@@ -799,50 +809,17 @@
_vaesenc_4x RNDKEY
128:
- // XOR the source data with the last round key, saving the result in
- // RNDKEYLAST[0-3]. This reduces latency by taking advantage of the
- // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
-.if \enc
- vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
- vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
- vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
- vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
-.else
- vpxord GHASHDATA0, RNDKEYLAST, RNDKEYLAST0
- vpxord GHASHDATA1, RNDKEYLAST, RNDKEYLAST1
- vpxord GHASHDATA2, RNDKEYLAST, RNDKEYLAST2
- vpxord GHASHDATA3, RNDKEYLAST, RNDKEYLAST3
-.endif
-
// Finish the AES encryption of the counter blocks in V0-V3, interleaved
// with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
-.irp i, 9,8,7,6,5
+.irp i, 9,8,7,6,5,4,3,2,1
+ _ghash_step_4x (9 - \i)
_vaesenc_4x RNDKEY_M\i
- _ghash_step_4x (9 - \i)
-.endr
-.irp i, 4,3,2,1
- vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY
- _vaesenc_4x RNDKEY
- _ghash_step_4x (9 - \i)
.endr
_ghash_step_4x 9
-
- // Do the last AES round. This handles the XOR with the source data
- // too, as per the optimization described above.
- vaesenclast RNDKEYLAST0, V0, GHASHDATA0
- vaesenclast RNDKEYLAST1, V1, GHASHDATA1
- vaesenclast RNDKEYLAST2, V2, GHASHDATA2
- vaesenclast RNDKEYLAST3, V3, GHASHDATA3
-
- // Store the en/decrypted data to DST.
- vmovdqu8 GHASHDATA0, 0*VL(DST)
- vmovdqu8 GHASHDATA1, 1*VL(DST)
- vmovdqu8 GHASHDATA2, 2*VL(DST)
- vmovdqu8 GHASHDATA3, 3*VL(DST)
-
- add $4*VL, SRC
- add $4*VL, DST
- sub $4*VL, DATALEN
+ _aesenclast_and_xor_4x
+ sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32
+ sub $-4*VL, DST
+ add $-4*VL, DATALEN
jge .Lcrypt_loop_4x\@
.if \enc
@@ -856,7 +833,7 @@
.Lcrypt_loop_4x_done\@:
// Undo the extra subtraction by 4*VL and check whether data remains.
- add $4*VL, DATALEN
+ sub $-4*VL, DATALEN // shorter than 'add 4*VL' when VL=32
jz .Ldone\@
// The data length isn't a multiple of 4*VL. Process the remaining data
@@ -940,7 +917,7 @@
// GHASH. However, any such blocks are all-zeroes, and the values that
// they're multiplied with are also all-zeroes. Therefore they just add
// 0 * 0 = 0 to the final GHASH result, which makes no difference.
- vmovdqu8 (POWERS_PTR), H_POW1
+ vmovdqu8 (POWERS_PTR), H_POW1
.if \enc
vmovdqu8 V0, V1{%k1}{z}
.endif
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
index 48f97b79f7a9..8a3e23fbcf85 100644
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -80,22 +80,6 @@
.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
.text
-// Function parameters
-.set KEY, %rdi // Initially points to crypto_aes_ctx, then is
- // advanced to point to 7th-from-last round key
-.set SRC, %rsi // Pointer to next source data
-.set DST, %rdx // Pointer to next destination data
-.set LEN, %ecx // Remaining length in bytes
-.set LEN8, %cl
-.set LEN64, %rcx
-.set TWEAK, %r8 // Pointer to next tweak
-
-// %rax holds the AES key length in bytes.
-.set KEYLEN, %eax
-.set KEYLEN64, %rax
-
-// %r9-r11 are available as temporaries.
-
.macro _define_Vi i
.if VL == 16
.set V\i, %xmm\i
@@ -112,41 +96,31 @@
// Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
// are available, that map to the xmm, ymm, or zmm registers according
// to the selected Vector Length (VL).
- _define_Vi 0
- _define_Vi 1
- _define_Vi 2
- _define_Vi 3
- _define_Vi 4
- _define_Vi 5
- _define_Vi 6
- _define_Vi 7
- _define_Vi 8
- _define_Vi 9
- _define_Vi 10
- _define_Vi 11
- _define_Vi 12
- _define_Vi 13
- _define_Vi 14
- _define_Vi 15
+.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+ _define_Vi \i
+.endr
.if USE_AVX10
- _define_Vi 16
- _define_Vi 17
- _define_Vi 18
- _define_Vi 19
- _define_Vi 20
- _define_Vi 21
- _define_Vi 22
- _define_Vi 23
- _define_Vi 24
- _define_Vi 25
- _define_Vi 26
- _define_Vi 27
- _define_Vi 28
- _define_Vi 29
- _define_Vi 30
- _define_Vi 31
+.irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+ _define_Vi \i
+.endr
.endif
+ // Function parameters
+ .set KEY, %rdi // Initially points to crypto_aes_ctx, then is
+ // advanced to point to 7th-from-last round key
+ .set SRC, %rsi // Pointer to next source data
+ .set DST, %rdx // Pointer to next destination data
+ .set LEN, %ecx // Remaining length in bytes
+ .set LEN8, %cl
+ .set LEN64, %rcx
+ .set TWEAK, %r8 // Pointer to next tweak
+
+ // %rax holds the AES key length in bytes.
+ .set KEYLEN, %eax
+ .set KEYLEN64, %rax
+
+ // %r9-r11 are available as temporaries.
+
// V0-V3 hold the data blocks during the main loop, or temporary values
// otherwise. V4-V5 hold temporary values.
@@ -214,6 +188,7 @@
.endm
// Move a vector between memory and a register.
+// The register operand must be in the first 16 vector registers.
.macro _vmovdqu src, dst
.if VL < 64
vmovdqu \src, \dst
@@ -234,11 +209,12 @@
.endm
// XOR two vectors together.
+// Any register operands must be in the first 16 vector registers.
.macro _vpxor src1, src2, dst
-.if USE_AVX10
- vpxord \src1, \src2, \dst
-.else
+.if VL < 64
vpxor \src1, \src2, \dst
+.else
+ vpxord \src1, \src2, \dst
.endif
.endm
@@ -259,8 +235,12 @@
vpshufd $0x13, \src, \tmp
vpaddq \src, \src, \dst
vpsrad $31, \tmp, \tmp
+.if USE_AVX10
+ vpternlogd $0x78, GF_POLY_XMM, \tmp, \dst
+.else
vpand GF_POLY_XMM, \tmp, \tmp
vpxor \tmp, \dst, \dst
+.endif
.endm
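For reference, the sequence above (vpshufd of the high words, vpsrad $31 to
spread the carry bits, then AND with the GF polynomial and XOR back in, or
the equivalent single vpternlogd) appears to be the usual XTS multiply-by-x
on each 128-bit lane.  A byte-at-a-time C sketch of the same operation, using
the little-endian byte order XTS specifies (illustrative only, not the kernel
implementation):

	#include <stdint.h>

	/* Multiply an XTS tweak by x in GF(2^128) with the polynomial
	 * x^128 + x^7 + x^2 + x + 1.
	 */
	static void xts_mul_x(uint8_t t[16])
	{
		uint8_t carry = 0;

		for (int i = 0; i < 16; i++) {
			uint8_t next_carry = t[i] >> 7;

			t[i] = (uint8_t)((t[i] << 1) | carry);
			carry = next_carry;
		}
		if (carry)
			t[0] ^= 0x87;	/* fold x^128 back in as x^7 + x^2 + x + 1 */
	}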
// Given the XTS tweak(s) in the vector \src, compute the next vector of
@@ -369,9 +349,14 @@
// Do one step in computing the next set of tweaks using the VPCLMULQDQ method
// (the same method _next_tweakvec uses for VL > 16). This means multiplying
-// each tweak by x^(4*VL/16) independently. Since 4*VL/16 is a multiple of 8
-// when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
-// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
+// each tweak by x^(4*VL/16) independently.
+//
+// Since 4*VL/16 is a multiple of 8 when VL > 16 (which it is here), the needed
+// shift amounts are byte-aligned, which allows the use of vpsrldq and vpslldq
+// to do 128-bit wide shifts. The 128-bit left shift (vpslldq) saves
+// instructions directly. The 128-bit right shift (vpsrldq) performs better
+// than a 64-bit right shift on Intel CPUs in the context where it is used here,
+// because it runs on a different execution port from the AES instructions.
.macro _tweak_step_pclmul i
.if \i == 0
vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
@@ -406,7 +391,7 @@
// \i that include at least 0 through 19, then 1000 which signals the last step.
//
// This is used to interleave the computation of the next set of tweaks with the
-// AES en/decryptions, which increases performance in some cases.
+// AES en/decryptions, which increases performance in some cases. Clobbers V5.
.macro _tweak_step i
.if VL == 16
_tweak_step_mulx \i
@@ -443,9 +428,10 @@
// the last round needs different instructions.
//
// An alternative approach would be to roll up all the round loops. We
- // don't do that because it isn't compatible with caching the round keys
- // in registers which we do when possible (see below), and also because
- // it seems unwise to rely *too* heavily on the CPU's branch predictor.
+ // don't do that because (a) it isn't compatible with caching the round
+ // keys in registers which we do when possible (see below), (b) we
+ // interleave the AES rounds with the XTS tweak computation, and (c) it
+ // seems unwise to rely *too* heavily on the CPU's branch predictor.
lea OFFS-16(KEY, KEYLEN64, 4), KEY
// If all 32 SIMD registers are available, cache all the round keys.
@@ -472,90 +458,94 @@
.endif
.endm
-// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
-// on the block(s) in \data using the round key(s) in \key. The register length
-// determines the number of AES blocks en/decrypted.
-.macro _vaes enc, last, key, data
+// Do a single non-last round of AES encryption (if \enc==1) or decryption (if
+// \enc==0) on the block(s) in \data using the round key(s) in \key. The
+// register length determines the number of AES blocks en/decrypted.
+.macro _vaes enc, key, data
.if \enc
-.if \last
- vaesenclast \key, \data, \data
-.else
vaesenc \key, \data, \data
-.endif
-.else
-.if \last
- vaesdeclast \key, \data, \data
.else
vaesdec \key, \data, \data
.endif
+.endm
+
+// Same as _vaes, but does the last round.
+.macro _vaeslast enc, key, data
+.if \enc
+ vaesenclast \key, \data, \data
+.else
+ vaesdeclast \key, \data, \data
.endif
.endm
-// Do a single round of AES en/decryption on the block(s) in \data, using the
-// same key for all block(s). The round key is loaded from the appropriate
-// register or memory location for round \i. May clobber V4.
-.macro _vaes_1x enc, last, i, xmm_suffix, data
+// Do a single non-last round of AES en/decryption on the block(s) in \data,
+// using the same key for all block(s). The round key is loaded from the
+// appropriate register or memory location for round \i. May clobber \tmp.
+.macro _vaes_1x enc, i, xmm_suffix, data, tmp
.if USE_AVX10
- _vaes \enc, \last, KEY\i\xmm_suffix, \data
+ _vaes \enc, KEY\i\xmm_suffix, \data
.else
.ifnb \xmm_suffix
- _vaes \enc, \last, (\i-7)*16(KEY), \data
+ _vaes \enc, (\i-7)*16(KEY), \data
.else
- _vbroadcast128 (\i-7)*16(KEY), V4
- _vaes \enc, \last, V4, \data
+ _vbroadcast128 (\i-7)*16(KEY), \tmp
+ _vaes \enc, \tmp, \data
.endif
.endif
.endm
-// Do a single round of AES en/decryption on the blocks in registers V0-V3,
-// using the same key for all blocks. The round key is loaded from the
+// Do a single non-last round of AES en/decryption on the blocks in registers
+// V0-V3, using the same key for all blocks. The round key is loaded from the
// appropriate register or memory location for round \i. In addition, does two
-// steps of the computation of the next set of tweaks. May clobber V4.
-.macro _vaes_4x enc, last, i
+// steps of the computation of the next set of tweaks. May clobber V4 and V5.
+.macro _vaes_4x enc, i
.if USE_AVX10
_tweak_step (2*(\i-5))
- _vaes \enc, \last, KEY\i, V0
- _vaes \enc, \last, KEY\i, V1
+ _vaes \enc, KEY\i, V0
+ _vaes \enc, KEY\i, V1
_tweak_step (2*(\i-5) + 1)
- _vaes \enc, \last, KEY\i, V2
- _vaes \enc, \last, KEY\i, V3
+ _vaes \enc, KEY\i, V2
+ _vaes \enc, KEY\i, V3
.else
_vbroadcast128 (\i-7)*16(KEY), V4
_tweak_step (2*(\i-5))
- _vaes \enc, \last, V4, V0
- _vaes \enc, \last, V4, V1
+ _vaes \enc, V4, V0
+ _vaes \enc, V4, V1
_tweak_step (2*(\i-5) + 1)
- _vaes \enc, \last, V4, V2
- _vaes \enc, \last, V4, V3
+ _vaes \enc, V4, V2
+ _vaes \enc, V4, V3
.endif
.endm
// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
// then XOR with \tweak again) of the block(s) in \data. To process a single
// block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of
-// length VL, use V* registers and leave \xmm_suffix empty. May clobber V4.
-.macro _aes_crypt enc, xmm_suffix, tweak, data
+// length VL, use V* registers and leave \xmm_suffix empty. Clobbers \tmp.
+.macro _aes_crypt enc, xmm_suffix, tweak, data, tmp
_xor3 KEY0\xmm_suffix, \tweak, \data
cmp $24, KEYLEN
jl .Laes128\@
je .Laes192\@
- _vaes_1x \enc, 0, 1, \xmm_suffix, \data
- _vaes_1x \enc, 0, 2, \xmm_suffix, \data
+ _vaes_1x \enc, 1, \xmm_suffix, \data, tmp=\tmp
+ _vaes_1x \enc, 2, \xmm_suffix, \data, tmp=\tmp
.Laes192\@:
- _vaes_1x \enc, 0, 3, \xmm_suffix, \data
- _vaes_1x \enc, 0, 4, \xmm_suffix, \data
+ _vaes_1x \enc, 3, \xmm_suffix, \data, tmp=\tmp
+ _vaes_1x \enc, 4, \xmm_suffix, \data, tmp=\tmp
.Laes128\@:
- _vaes_1x \enc, 0, 5, \xmm_suffix, \data
- _vaes_1x \enc, 0, 6, \xmm_suffix, \data
- _vaes_1x \enc, 0, 7, \xmm_suffix, \data
- _vaes_1x \enc, 0, 8, \xmm_suffix, \data
- _vaes_1x \enc, 0, 9, \xmm_suffix, \data
- _vaes_1x \enc, 0, 10, \xmm_suffix, \data
- _vaes_1x \enc, 0, 11, \xmm_suffix, \data
- _vaes_1x \enc, 0, 12, \xmm_suffix, \data
- _vaes_1x \enc, 0, 13, \xmm_suffix, \data
- _vaes_1x \enc, 1, 14, \xmm_suffix, \data
- _vpxor \tweak, \data, \data
+.irp i, 5,6,7,8,9,10,11,12,13
+ _vaes_1x \enc, \i, \xmm_suffix, \data, tmp=\tmp
+.endr
+.if USE_AVX10
+ vpxord KEY14\xmm_suffix, \tweak, \tmp
+.else
+.ifnb \xmm_suffix
+ vpxor 7*16(KEY), \tweak, \tmp
+.else
+ _vbroadcast128 7*16(KEY), \tmp
+ vpxor \tweak, \tmp, \tmp
+.endif
+.endif
+ _vaeslast \enc, \tmp, \data
.endm
.macro _aes_xts_crypt enc
@@ -581,7 +571,7 @@
// Compute the first set of tweaks TWEAK[0-3].
_compute_first_set_of_tweaks
- sub $4*VL, LEN
+ add $-4*VL, LEN // shorter than 'sub 4*VL' when VL=32
jl .Lhandle_remainder\@
.Lmain_loop\@:
@@ -589,10 +579,10 @@
// XOR each source block with its tweak and the zero-th round key.
.if USE_AVX10
- vmovdqu8 0*VL(SRC), V0
- vmovdqu8 1*VL(SRC), V1
- vmovdqu8 2*VL(SRC), V2
- vmovdqu8 3*VL(SRC), V3
+ _vmovdqu 0*VL(SRC), V0
+ _vmovdqu 1*VL(SRC), V1
+ _vmovdqu 2*VL(SRC), V2
+ _vmovdqu 3*VL(SRC), V3
vpternlogd $0x96, TWEAK0, KEY0, V0
vpternlogd $0x96, TWEAK1, KEY0, V1
vpternlogd $0x96, TWEAK2, KEY0, V2
@@ -612,28 +602,43 @@
je .Laes192\@
// Do all the AES rounds on the data blocks, interleaved with
// the computation of the next set of tweaks.
- _vaes_4x \enc, 0, 1
- _vaes_4x \enc, 0, 2
+ _vaes_4x \enc, 1
+ _vaes_4x \enc, 2
.Laes192\@:
- _vaes_4x \enc, 0, 3
- _vaes_4x \enc, 0, 4
+ _vaes_4x \enc, 3
+ _vaes_4x \enc, 4
.Laes128\@:
- _vaes_4x \enc, 0, 5
- _vaes_4x \enc, 0, 6
- _vaes_4x \enc, 0, 7
- _vaes_4x \enc, 0, 8
- _vaes_4x \enc, 0, 9
- _vaes_4x \enc, 0, 10
- _vaes_4x \enc, 0, 11
- _vaes_4x \enc, 0, 12
- _vaes_4x \enc, 0, 13
- _vaes_4x \enc, 1, 14
-
- // XOR in the tweaks again.
- _vpxor TWEAK0, V0, V0
- _vpxor TWEAK1, V1, V1
- _vpxor TWEAK2, V2, V2
- _vpxor TWEAK3, V3, V3
+.irp i, 5,6,7,8,9,10,11,12,13
+ _vaes_4x \enc, \i
+.endr
+ // Do the last AES round, then XOR the results with the tweaks again.
+ // Reduce latency by doing the XOR before the vaesenclast, utilizing the
+ // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a)
+ // (and likewise for vaesdeclast).
+.if USE_AVX10
+ _tweak_step 18
+ _tweak_step 19
+ vpxord TWEAK0, KEY14, V4
+ vpxord TWEAK1, KEY14, V5
+ _vaeslast \enc, V4, V0
+ _vaeslast \enc, V5, V1
+ vpxord TWEAK2, KEY14, V4
+ vpxord TWEAK3, KEY14, V5
+ _vaeslast \enc, V4, V2
+ _vaeslast \enc, V5, V3
+.else
+ _vbroadcast128 7*16(KEY), V4
+ _tweak_step 18 // uses V5
+ _tweak_step 19 // uses V5
+ vpxor TWEAK0, V4, V5
+ _vaeslast \enc, V5, V0
+ vpxor TWEAK1, V4, V5
+ _vaeslast \enc, V5, V1
+ vpxor TWEAK2, V4, V5
+ vpxor TWEAK3, V4, V4
+ _vaeslast \enc, V5, V2
+ _vaeslast \enc, V4, V3
+.endif
// Store the destination blocks.
_vmovdqu V0, 0*VL(DST)
@@ -644,9 +649,9 @@
// Finish computing the next set of tweaks.
_tweak_step 1000
- add $4*VL, SRC
- add $4*VL, DST
- sub $4*VL, LEN
+ sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32
+ sub $-4*VL, DST
+ add $-4*VL, LEN
jge .Lmain_loop\@
// Check for the uncommon case where the data length isn't a multiple of
@@ -670,7 +675,7 @@
jl .Lvec_at_a_time_done\@
.Lvec_at_a_time\@:
_vmovdqu (SRC), V0
- _aes_crypt \enc, , TWEAK0, V0
+ _aes_crypt \enc, , TWEAK0, V0, tmp=V1
_vmovdqu V0, (DST)
_next_tweakvec TWEAK0, V0, V1, TWEAK0
add $VL, SRC
@@ -687,7 +692,7 @@
jl .Lblock_at_a_time_done\@
.Lblock_at_a_time\@:
vmovdqu (SRC), %xmm0
- _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0
+ _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1
vmovdqu %xmm0, (DST)
_next_tweak TWEAK0_XMM, %xmm0, TWEAK0_XMM
add $16, SRC
@@ -715,7 +720,7 @@
// Do it now by advancing the tweak and decrypting the last full block.
_next_tweak TWEAK0_XMM, %xmm0, TWEAK1_XMM
vmovdqu (SRC), %xmm0
- _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0
+ _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1
.endif
.if USE_AVX10
@@ -758,47 +763,49 @@
vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
.endif
// En/decrypt again and store the last full block.
- _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0
+ _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1
vmovdqu %xmm0, (DST)
jmp .Ldone\@
.endm
// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
// u8 iv[AES_BLOCK_SIZE]);
+//
+// Encrypt |iv| using the AES key |tweak_key| to get the first tweak. Assumes
+// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10.
SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
- vmovdqu (%rsi), %xmm0
- vpxor (%rdi), %xmm0, %xmm0
- movl 480(%rdi), %eax // AES key length
- lea -16(%rdi, %rax, 4), %rdi
- cmp $24, %eax
+ .set TWEAK_KEY, %rdi
+ .set IV, %rsi
+ .set KEYLEN, %eax
+ .set KEYLEN64, %rax
+
+ vmovdqu (IV), %xmm0
+ vpxor (TWEAK_KEY), %xmm0, %xmm0
+ movl 480(TWEAK_KEY), KEYLEN
+ lea -16(TWEAK_KEY, KEYLEN64, 4), TWEAK_KEY
+ cmp $24, KEYLEN
jl .Lencrypt_iv_aes128
je .Lencrypt_iv_aes192
- vaesenc -6*16(%rdi), %xmm0, %xmm0
- vaesenc -5*16(%rdi), %xmm0, %xmm0
+ vaesenc -6*16(TWEAK_KEY), %xmm0, %xmm0
+ vaesenc -5*16(TWEAK_KEY), %xmm0, %xmm0
.Lencrypt_iv_aes192:
- vaesenc -4*16(%rdi), %xmm0, %xmm0
- vaesenc -3*16(%rdi), %xmm0, %xmm0
+ vaesenc -4*16(TWEAK_KEY), %xmm0, %xmm0
+ vaesenc -3*16(TWEAK_KEY), %xmm0, %xmm0
.Lencrypt_iv_aes128:
- vaesenc -2*16(%rdi), %xmm0, %xmm0
- vaesenc -1*16(%rdi), %xmm0, %xmm0
- vaesenc 0*16(%rdi), %xmm0, %xmm0
- vaesenc 1*16(%rdi), %xmm0, %xmm0
- vaesenc 2*16(%rdi), %xmm0, %xmm0
- vaesenc 3*16(%rdi), %xmm0, %xmm0
- vaesenc 4*16(%rdi), %xmm0, %xmm0
- vaesenc 5*16(%rdi), %xmm0, %xmm0
- vaesenc 6*16(%rdi), %xmm0, %xmm0
- vaesenclast 7*16(%rdi), %xmm0, %xmm0
- vmovdqu %xmm0, (%rsi)
+.irp i, -2,-1,0,1,2,3,4,5,6
+ vaesenc \i*16(TWEAK_KEY), %xmm0, %xmm0
+.endr
+ vaesenclast 7*16(TWEAK_KEY), %xmm0, %xmm0
+ vmovdqu %xmm0, (IV)
RET
SYM_FUNC_END(aes_xts_encrypt_iv)
// Below are the actual AES-XTS encryption and decryption functions,
// instantiated from the above macro. They all have the following prototype:
//
-// void (*xts_asm_func)(const struct crypto_aes_ctx *key,
-// const u8 *src, u8 *dst, unsigned int len,
-// u8 tweak[AES_BLOCK_SIZE]);
+// void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
+// const u8 *src, u8 *dst, int len,
+// u8 tweak[AES_BLOCK_SIZE]);
//
// |key| is the data key. |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done. This function supports
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index fbf43482e1f5..11e95fc62636 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -505,7 +505,7 @@ static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key,
typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key,
u8 iv[AES_BLOCK_SIZE]);
typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
- const u8 *src, u8 *dst, unsigned int len,
+ const u8 *src, u8 *dst, int len,
u8 tweak[AES_BLOCK_SIZE]);
/* This handles cases where the source and/or destination span pages. */
@@ -624,14 +624,14 @@ static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
}
static void aesni_xts_encrypt(const struct crypto_aes_ctx *key,
- const u8 *src, u8 *dst, unsigned int len,
+ const u8 *src, u8 *dst, int len,
u8 tweak[AES_BLOCK_SIZE])
{
aesni_xts_enc(key, dst, src, len, tweak);
}
static void aesni_xts_decrypt(const struct crypto_aes_ctx *key,
- const u8 *src, u8 *dst, unsigned int len,
+ const u8 *src, u8 *dst, int len,
u8 tweak[AES_BLOCK_SIZE])
{
aesni_xts_dec(key, dst, src, len, tweak);
@@ -790,10 +790,10 @@ asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
\
asmlinkage void \
aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \
- u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
+ u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \
asmlinkage void \
aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \
- u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
+ u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \
\
static int xts_encrypt_##suffix(struct skcipher_request *req) \
{ \
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index 552f2df0643f..26c5f2ee5d10 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -94,7 +94,6 @@ static struct crypto_alg bf_cipher_alg = {
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
.cra_blocksize = BF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct bf_ctx),
- .cra_alignmask = 0,
.cra_module = THIS_MODULE,
.cra_u = {
.cipher = {
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index f110708c8038..3bd37d664121 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -1313,7 +1313,6 @@ static struct crypto_alg camellia_cipher_alg = {
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
.cra_blocksize = CAMELLIA_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct camellia_ctx),
- .cra_alignmask = 0,
.cra_module = THIS_MODULE,
.cra_u = {
.cipher = {
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
index abb8b1fe123b..e88439d3828e 100644
--- a/arch/x86/crypto/des3_ede_glue.c
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -291,7 +291,6 @@ static struct crypto_alg des3_ede_cipher = {
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
.cra_blocksize = DES3_EDE_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
- .cra_alignmask = 0,
.cra_module = THIS_MODULE,
.cra_u = {
.cipher = {
diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c
index 0614beece279..4c67184dc573 100644
--- a/arch/x86/crypto/twofish_glue.c
+++ b/arch/x86/crypto/twofish_glue.c
@@ -68,7 +68,6 @@ static struct crypto_alg alg = {
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct twofish_ctx),
- .cra_alignmask = 0,
.cra_module = THIS_MODULE,
.cra_u = {
.cipher = {