1 files changed, 111 insertions, 95 deletions
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
index 93ba0ddbe009..db79cdf81588 100644
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -52,32 +52,25 @@
  * different code, it uses a macro to generate several implementations that
  * share similar source code but are targeted at different CPUs, listed below:
  *
- * AES-NI + AVX
+ * AES-NI && AVX
  *    - 128-bit vectors (1 AES block per vector)
  *    - VEX-coded instructions
  *    - xmm0-xmm15
  *    - This is for older CPUs that lack VAES but do have AVX.
  *
- * VAES + VPCLMULQDQ + AVX2
+ * VAES && VPCLMULQDQ && AVX2
  *    - 256-bit vectors (2 AES blocks per vector)
  *    - VEX-coded instructions
  *    - ymm0-ymm15
- *    - This is for CPUs that have VAES but lack AVX512 or AVX10,
- *      e.g. Intel's Alder Lake and AMD's Zen 3.
+ *    - This is for CPUs that have VAES but either lack AVX512 (e.g. Intel's
+ *      Alder Lake and AMD's Zen 3) or downclock too eagerly when using zmm
+ *      registers (e.g. Intel's Ice Lake).
  *
- * VAES + VPCLMULQDQ + AVX10/256 + BMI2
- *    - 256-bit vectors (2 AES blocks per vector)
+ * VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2
+ *    - 512-bit vectors (4 AES blocks per vector)
  *    - EVEX-coded instructions
- *    - ymm0-ymm31
- *    - This is for CPUs that have AVX512 but where using zmm registers causes
- *      downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
- *    - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
- *      To avoid confusion with 512-bit, we just write AVX10/256.
- *
- * VAES + VPCLMULQDQ + AVX10/512 + BMI2
- *    - Same as the previous one, but upgrades to 512-bit vectors
- *      (4 AES blocks per vector) in zmm0-zmm31.
- *    - This is for CPUs that have good AVX512 or AVX10/512 support.
+ *    - zmm0-zmm31
+ *    - This is for CPUs that have good AVX512 support.
  *
  * This file doesn't have an implementation for AES-NI alone (without AVX), as
  * the lack of VEX would make all the assembly code different.
@@ -107,9 +100,20 @@
 	// exists when there's a carry out of the low 64 bits of the tweak.
 	.quad	0x87, 1
 
+	// These are the shift amounts that are needed when multiplying by [x^0,
+	// x^1, x^2, x^3] to compute the first vector of tweaks when VL=64.
+	//
+	// The right shifts by 64 are expected to zeroize the destination.
+	// 'vpsrlvq' is indeed defined to do that; i.e. it doesn't truncate the
+	// amount to 64 & 63 = 0 like the 'shr' scalar shift instruction would.
+.Lrshift_amounts:
+	.byte	64, 64, 63, 63, 62, 62, 61, 61
+.Llshift_amounts:
+	.byte	0, 0, 1, 1, 2, 2, 3, 3
+
 	// This table contains constants for vpshufb and vpblendvb, used to
 	// handle variable byte shifts and blending during ciphertext stealing
-	// on CPUs that don't support AVX10-style masking.
+	// on CPUs that don't support AVX512-style masking.
 .Lcts_permute_table:
 	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
 	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
@@ -138,7 +142,7 @@
 .irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
 	_define_Vi	\i
 .endr
-.if USE_AVX10
+.if USE_AVX512
 .irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
 	_define_Vi	\i
 .endr
@@ -193,7 +197,7 @@
 	// keys to the *end* of this register range.  I.e., AES-128 uses
 	// KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
 	// (All also use KEY0 for the XOR-only "round" at the beginning.)
-.if USE_AVX10
+.if USE_AVX512
 	.set	KEY1_XMM,	%xmm16
 	.set	KEY1,		V16
 	.set	KEY2_XMM,	%xmm17
@@ -227,7 +231,6 @@
 .endm
 
 // Move a vector between memory and a register.
-// The register operand must be in the first 16 vector registers.
 .macro	_vmovdqu	src, dst
 .if VL < 64
 	vmovdqu		\src, \dst
@@ -238,9 +241,9 @@
 
 // Broadcast a 128-bit value into a vector.
 .macro	_vbroadcast128	src, dst
-.if VL == 16 && !USE_AVX10
+.if VL == 16
 	vmovdqu		\src, \dst
-.elseif VL == 32 && !USE_AVX10
+.elseif VL == 32
 	vbroadcasti128	\src, \dst
 .else
 	vbroadcasti32x4	\src, \dst
@@ -248,7 +251,6 @@
 .endm
 
 // XOR two vectors together.
-// Any register operands must be in the first 16 vector registers.
 .macro	_vpxor	src1, src2, dst
 .if VL < 64
 	vpxor		\src1, \src2, \dst
@@ -259,7 +261,7 @@
 
 // XOR three vectors together.
 .macro	_xor3	src1, src2, src3_and_dst
-.if USE_AVX10
+.if USE_AVX512
 	// vpternlogd with immediate 0x96 is a three-argument XOR.
 	vpternlogd	$0x96, \src1, \src2, \src3_and_dst
 .else
@@ -274,7 +276,7 @@
 	vpshufd		$0x13, \src, \tmp
 	vpaddq		\src, \src, \dst
 	vpsrad		$31, \tmp, \tmp
-.if USE_AVX10
+.if USE_AVX512
 	vpternlogd	$0x78, GF_POLY_XMM, \tmp, \dst
 .else
 	vpand		GF_POLY_XMM, \tmp, \tmp
@@ -303,52 +305,75 @@
 // Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
 // store them in the vector registers TWEAK0-TWEAK3.  Clobbers V0-V5.
 .macro	_compute_first_set_of_tweaks
-	vmovdqu		(TWEAK), TWEAK0_XMM
-	_vbroadcast128	.Lgf_poly(%rip), GF_POLY
 .if VL == 16
-	// With VL=16, multiplying by x serially is fastest.
+	vmovdqu		(TWEAK), TWEAK0_XMM
+	vmovdqu		.Lgf_poly(%rip), GF_POLY
 	_next_tweak	TWEAK0, %xmm0, TWEAK1
 	_next_tweak	TWEAK1, %xmm0, TWEAK2
 	_next_tweak	TWEAK2, %xmm0, TWEAK3
-.else
-.if VL == 32
-	// Compute the second block of TWEAK0.
+.elseif VL == 32
+	vmovdqu		(TWEAK), TWEAK0_XMM
+	vbroadcasti128	.Lgf_poly(%rip), GF_POLY
+
+	// Compute the first vector of tweaks.
 	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
 	vinserti128	$1, %xmm1, TWEAK0, TWEAK0
-.elseif VL == 64
-	// Compute the remaining blocks of TWEAK0.
-	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
-	_next_tweak	%xmm1, %xmm0, %xmm2
-	_next_tweak	%xmm2, %xmm0, %xmm3
-	vinserti32x4	$1, %xmm1, TWEAK0, TWEAK0
-	vinserti32x4	$2, %xmm2, TWEAK0, TWEAK0
-	vinserti32x4	$3, %xmm3, TWEAK0, TWEAK0
-.endif
-	// Compute TWEAK[1-3] from TWEAK0.
-	vpsrlq		$64 - 1*VL/16, TWEAK0, V0
-	vpsrlq		$64 - 2*VL/16, TWEAK0, V2
-	vpsrlq		$64 - 3*VL/16, TWEAK0, V4
+
+	// Compute the next three vectors of tweaks:
+	//	TWEAK1 = TWEAK0 * [x^2, x^2]
+	//	TWEAK2 = TWEAK0 * [x^4, x^4]
+	//	TWEAK3 = TWEAK0 * [x^6, x^6]
+	vpsrlq		$64 - 2, TWEAK0, V0
+	vpsrlq		$64 - 4, TWEAK0, V2
+	vpsrlq		$64 - 6, TWEAK0, V4
 	vpclmulqdq	$0x01, GF_POLY, V0, V1
 	vpclmulqdq	$0x01, GF_POLY, V2, V3
 	vpclmulqdq	$0x01, GF_POLY, V4, V5
 	vpslldq		$8, V0, V0
 	vpslldq		$8, V2, V2
 	vpslldq		$8, V4, V4
-	vpsllq		$1*VL/16, TWEAK0, TWEAK1
-	vpsllq		$2*VL/16, TWEAK0, TWEAK2
-	vpsllq		$3*VL/16, TWEAK0, TWEAK3
-.if USE_AVX10
-	vpternlogd	$0x96, V0, V1, TWEAK1
-	vpternlogd	$0x96, V2, V3, TWEAK2
-	vpternlogd	$0x96, V4, V5, TWEAK3
-.else
+	vpsllq		$2, TWEAK0, TWEAK1
+	vpsllq		$4, TWEAK0, TWEAK2
+	vpsllq		$6, TWEAK0, TWEAK3
 	vpxor		V0, TWEAK1, TWEAK1
 	vpxor		V2, TWEAK2, TWEAK2
 	vpxor		V4, TWEAK3, TWEAK3
 	vpxor		V1, TWEAK1, TWEAK1
 	vpxor		V3, TWEAK2, TWEAK2
 	vpxor		V5, TWEAK3, TWEAK3
-.endif
+.else
+	vbroadcasti32x4	(TWEAK), TWEAK0
+	vbroadcasti32x4	.Lgf_poly(%rip), GF_POLY
+
+	// Compute the first vector of tweaks:
+	//	TWEAK0 = broadcast128(TWEAK) * [x^0, x^1, x^2, x^3]
+	vpmovzxbq	.Lrshift_amounts(%rip), V4
+	vpsrlvq		V4, TWEAK0, V0
+	vpclmulqdq	$0x01, GF_POLY, V0, V1
+	vpmovzxbq	.Llshift_amounts(%rip), V4
+	vpslldq		$8, V0, V0
+	vpsllvq		V4, TWEAK0, TWEAK0
+	vpternlogd	$0x96, V0, V1, TWEAK0
+
+	// Compute the next three vectors of tweaks:
+	//	TWEAK1 = TWEAK0 * [x^4, x^4, x^4, x^4]
+	//	TWEAK2 = TWEAK0 * [x^8, x^8, x^8, x^8]
+	//	TWEAK3 = TWEAK0 * [x^12, x^12, x^12, x^12]
+	// x^8 only needs byte-aligned shifts, so optimize accordingly.
+	vpsrlq		$64 - 4, TWEAK0, V0
+	vpsrldq		$(64 - 8) / 8, TWEAK0, V2
+	vpsrlq		$64 - 12, TWEAK0, V4
+	vpclmulqdq	$0x01, GF_POLY, V0, V1
+	vpclmulqdq	$0x01, GF_POLY, V2, V3
+	vpclmulqdq	$0x01, GF_POLY, V4, V5
+	vpslldq		$8, V0, V0
+	vpslldq		$8, V4, V4
+	vpsllq		$4, TWEAK0, TWEAK1
+	vpslldq		$8 / 8, TWEAK0, TWEAK2
+	vpsllq		$12, TWEAK0, TWEAK3
+	vpternlogd	$0x96, V0, V1, TWEAK1
+	vpxord		V3, TWEAK2, TWEAK2
+	vpternlogd	$0x96, V4, V5, TWEAK3
 .endif
 .endm
 
@@ -474,26 +499,26 @@
 	lea		OFFS-16(KEY, KEYLEN64, 4), KEY
 
 	// If all 32 SIMD registers are available, cache all the round keys.
-.if USE_AVX10
+.if USE_AVX512
 	cmp		$24, KEYLEN
 	jl		.Laes128\@
 	je		.Laes192\@
-	_vbroadcast128	-6*16(KEY), KEY1
-	_vbroadcast128	-5*16(KEY), KEY2
+	vbroadcasti32x4	-6*16(KEY), KEY1
+	vbroadcasti32x4	-5*16(KEY), KEY2
 .Laes192\@:
-	_vbroadcast128	-4*16(KEY), KEY3
-	_vbroadcast128	-3*16(KEY), KEY4
+	vbroadcasti32x4	-4*16(KEY), KEY3
+	vbroadcasti32x4	-3*16(KEY), KEY4
 .Laes128\@:
-	_vbroadcast128	-2*16(KEY), KEY5
-	_vbroadcast128	-1*16(KEY), KEY6
-	_vbroadcast128	0*16(KEY), KEY7
-	_vbroadcast128	1*16(KEY), KEY8
-	_vbroadcast128	2*16(KEY), KEY9
-	_vbroadcast128	3*16(KEY), KEY10
-	_vbroadcast128	4*16(KEY), KEY11
-	_vbroadcast128	5*16(KEY), KEY12
-	_vbroadcast128	6*16(KEY), KEY13
-	_vbroadcast128	7*16(KEY), KEY14
+	vbroadcasti32x4	-2*16(KEY), KEY5
+	vbroadcasti32x4	-1*16(KEY), KEY6
+	vbroadcasti32x4	0*16(KEY), KEY7
+	vbroadcasti32x4	1*16(KEY), KEY8
+	vbroadcasti32x4	2*16(KEY), KEY9
+	vbroadcasti32x4	3*16(KEY), KEY10
+	vbroadcasti32x4	4*16(KEY), KEY11
+	vbroadcasti32x4	5*16(KEY), KEY12
+	vbroadcasti32x4	6*16(KEY), KEY13
+	vbroadcasti32x4	7*16(KEY), KEY14
 .endif
 .endm
 
@@ -521,7 +546,7 @@
 // using the same key for all block(s).  The round key is loaded from the
 // appropriate register or memory location for round \i.  May clobber \tmp.
 .macro _vaes_1x		enc, i, xmm_suffix, data, tmp
-.if USE_AVX10
+.if USE_AVX512
 	_vaes		\enc, KEY\i\xmm_suffix, \data
 .else
 .ifnb \xmm_suffix
@@ -538,7 +563,7 @@
 // appropriate register or memory location for round \i.  In addition, does two
 // steps of the computation of the next set of tweaks.  May clobber V4 and V5.
 .macro	_vaes_4x	enc, i
-.if USE_AVX10
+.if USE_AVX512
 	_tweak_step	(2*(\i-5))
 	_vaes		\enc, KEY\i, V0
 	_vaes		\enc, KEY\i, V1
@@ -574,7 +599,7 @@
 .irp i, 5,6,7,8,9,10,11,12,13
 	_vaes_1x	\enc, \i, \xmm_suffix, \data, tmp=\tmp
 .endr
-.if USE_AVX10
+.if USE_AVX512
 	vpxord		KEY14\xmm_suffix, \tweak, \tmp
 .else
 .ifnb \xmm_suffix
@@ -617,11 +642,11 @@
 	// This is the main loop, en/decrypting 4*VL bytes per iteration.
 
 	// XOR each source block with its tweak and the zero-th round key.
-.if USE_AVX10
-	_vmovdqu	0*VL(SRC), V0
-	_vmovdqu	1*VL(SRC), V1
-	_vmovdqu	2*VL(SRC), V2
-	_vmovdqu	3*VL(SRC), V3
+.if USE_AVX512
+	vmovdqu8	0*VL(SRC), V0
+	vmovdqu8	1*VL(SRC), V1
+	vmovdqu8	2*VL(SRC), V2
+	vmovdqu8	3*VL(SRC), V3
 	vpternlogd	$0x96, TWEAK0, KEY0, V0
 	vpternlogd	$0x96, TWEAK1, KEY0, V1
 	vpternlogd	$0x96, TWEAK2, KEY0, V2
@@ -654,7 +679,7 @@
 	// Reduce latency by doing the XOR before the vaesenclast, utilizing the
 	// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a)
 	// (and likewise for vaesdeclast).
-.if USE_AVX10
+.if USE_AVX512
 	_tweak_step	18
 	_tweak_step	19
 	vpxord		TWEAK0, KEY14, V4
@@ -762,7 +787,7 @@
 	_aes_crypt	\enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1
 .endif
 
-.if USE_AVX10
+.if USE_AVX512
 	// Create a mask that has the first LEN bits set.
 	mov		$-1, %r9d
 	bzhi		LEN, %r9d, %r9d
@@ -811,7 +836,7 @@
 //			   u8 iv[AES_BLOCK_SIZE]);
 //
 // Encrypt |iv| using the AES key |tweak_key| to get the first tweak.  Assumes
-// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10.
+// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX512.
 SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
 	.set	TWEAK_KEY,	%rdi
 	.set	IV,		%rsi
@@ -853,7 +878,7 @@ SYM_FUNC_END(aes_xts_encrypt_iv)
 // multiple of 16, then this function updates |tweak| to contain the next tweak.
 
 .set	VL, 16
-.set	USE_AVX10, 0
+.set	USE_AVX512, 0
 SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
 	_aes_xts_crypt	1
 SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
@@ -863,7 +888,7 @@ SYM_FUNC_END(aes_xts_decrypt_aesni_avx)
 
 #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
 .set	VL, 32
-.set	USE_AVX10, 0
+.set	USE_AVX512, 0
 SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
 	_aes_xts_crypt	1
 SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
@@ -871,21 +896,12 @@ SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
 	_aes_xts_crypt	0
 SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)
 
-.set	VL, 32
-.set	USE_AVX10, 1
-SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
-	_aes_xts_crypt	1
-SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
-SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
-	_aes_xts_crypt	0
-SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)
-
 .set	VL, 64
-.set	USE_AVX10, 1
-SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
+.set	USE_AVX512, 1
+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx512)
 	_aes_xts_crypt	1
-SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
-SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
+SYM_FUNC_END(aes_xts_encrypt_vaes_avx512)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx512)
 	_aes_xts_crypt	0
-SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
+SYM_FUNC_END(aes_xts_decrypt_vaes_avx512)
 #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */