1 files changed, 168 insertions, 161 deletions
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
index 48f97b79f7a9..8a3e23fbcf85 100644
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -80,22 +80,6 @@
 	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
 .text
 
-// Function parameters
-.set	KEY,		%rdi	// Initially points to crypto_aes_ctx, then is
-				// advanced to point to 7th-from-last round key
-.set	SRC,		%rsi	// Pointer to next source data
-.set	DST,		%rdx	// Pointer to next destination data
-.set	LEN,		%ecx	// Remaining length in bytes
-.set	LEN8,		%cl
-.set	LEN64,		%rcx
-.set	TWEAK,		%r8	// Pointer to next tweak
-
-// %rax holds the AES key length in bytes.
-.set	KEYLEN,		%eax
-.set	KEYLEN64,	%rax
-
-// %r9-r11 are available as temporaries.
-
 .macro	_define_Vi	i
 .if VL == 16
 	.set	V\i,		%xmm\i
@@ -112,41 +96,31 @@
 	// Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
 	// are available, that map to the xmm, ymm, or zmm registers according
 	// to the selected Vector Length (VL).
-	_define_Vi	0
-	_define_Vi	1
-	_define_Vi	2
-	_define_Vi	3
-	_define_Vi	4
-	_define_Vi	5
-	_define_Vi	6
-	_define_Vi	7
-	_define_Vi	8
-	_define_Vi	9
-	_define_Vi	10
-	_define_Vi	11
-	_define_Vi	12
-	_define_Vi	13
-	_define_Vi	14
-	_define_Vi	15
+.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+	_define_Vi	\i
+.endr
 .if USE_AVX10
-	_define_Vi	16
-	_define_Vi	17
-	_define_Vi	18
-	_define_Vi	19
-	_define_Vi	20
-	_define_Vi	21
-	_define_Vi	22
-	_define_Vi	23
-	_define_Vi	24
-	_define_Vi	25
-	_define_Vi	26
-	_define_Vi	27
-	_define_Vi	28
-	_define_Vi	29
-	_define_Vi	30
-	_define_Vi	31
+.irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+	_define_Vi	\i
+.endr
 .endif
 
+	// Function parameters
+	.set	KEY,		%rdi	// Initially points to crypto_aes_ctx, then is
+					// advanced to point to 7th-from-last round key
+	.set	SRC,		%rsi	// Pointer to next source data
+	.set	DST,		%rdx	// Pointer to next destination data
+	.set	LEN,		%ecx	// Remaining length in bytes
+	.set	LEN8,		%cl
+	.set	LEN64,		%rcx
+	.set	TWEAK,		%r8	// Pointer to next tweak
+
+	// %rax holds the AES key length in bytes.
+	.set	KEYLEN,		%eax
+	.set	KEYLEN64,	%rax
+
+	// %r9-r11 are available as temporaries.
+
 	// V0-V3 hold the data blocks during the main loop, or temporary values
 	// otherwise.  V4-V5 hold temporary values.
 
@@ -214,6 +188,7 @@
 .endm
 
 // Move a vector between memory and a register.
+// The register operand must be in the first 16 vector registers.
 .macro	_vmovdqu	src, dst
 .if VL < 64
 	vmovdqu		\src, \dst
@@ -234,11 +209,12 @@
 .endm
 
 // XOR two vectors together.
+// Any register operands must be in the first 16 vector registers.
 .macro	_vpxor	src1, src2, dst
-.if USE_AVX10
-	vpxord		\src1, \src2, \dst
-.else
+.if VL < 64
 	vpxor		\src1, \src2, \dst
+.else
+	vpxord		\src1, \src2, \dst
 .endif
 .endm
 
@@ -259,8 +235,12 @@
 	vpshufd		$0x13, \src, \tmp
 	vpaddq		\src, \src, \dst
 	vpsrad		$31, \tmp, \tmp
+.if USE_AVX10
+	vpternlogd	$0x78, GF_POLY_XMM, \tmp, \dst
+.else
 	vpand		GF_POLY_XMM, \tmp, \tmp
 	vpxor		\tmp, \dst, \dst
+.endif
 .endm
 
 // Given the XTS tweak(s) in the vector \src, compute the next vector of
@@ -369,9 +349,14 @@
 
 // Do one step in computing the next set of tweaks using the VPCLMULQDQ method
 // (the same method _next_tweakvec uses for VL > 16).  This means multiplying
-// each tweak by x^(4*VL/16) independently.  Since 4*VL/16 is a multiple of 8
-// when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
-// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
+// each tweak by x^(4*VL/16) independently.
+//
+// Since 4*VL/16 is a multiple of 8 when VL > 16 (which it is here), the needed
+// shift amounts are byte-aligned, which allows the use of vpsrldq and vpslldq
+// to do 128-bit wide shifts.  The 128-bit left shift (vpslldq) saves
+// instructions directly.  The 128-bit right shift (vpsrldq) performs better
+// than a 64-bit right shift on Intel CPUs in the context where it is used here,
+// because it runs on a different execution port from the AES instructions.
 .macro	_tweak_step_pclmul	i
 .if \i == 0
 	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
@@ -406,7 +391,7 @@
 // \i that include at least 0 through 19, then 1000 which signals the last step.
 //
 // This is used to interleave the computation of the next set of tweaks with the
-// AES en/decryptions, which increases performance in some cases.
+// AES en/decryptions, which increases performance in some cases.  Clobbers V5.
 .macro	_tweak_step	i
 .if VL == 16
 	_tweak_step_mulx	\i
@@ -443,9 +428,10 @@
 	// the last round needs different instructions.
 	//
 	// An alternative approach would be to roll up all the round loops.  We
-	// don't do that because it isn't compatible with caching the round keys
-	// in registers which we do when possible (see below), and also because
-	// it seems unwise to rely *too* heavily on the CPU's branch predictor.
+	// don't do that because (a) it isn't compatible with caching the round
+	// keys in registers which we do when possible (see below), (b) we
+	// interleave the AES rounds with the XTS tweak computation, and (c) it
+	// seems unwise to rely *too* heavily on the CPU's branch predictor.
 	lea		OFFS-16(KEY, KEYLEN64, 4), KEY
 
 	// If all 32 SIMD registers are available, cache all the round keys.
@@ -472,90 +458,94 @@
 .endif
 .endm
 
-// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
-// on the block(s) in \data using the round key(s) in \key.  The register length
-// determines the number of AES blocks en/decrypted.
-.macro	_vaes	enc, last, key, data
+// Do a single non-last round of AES encryption (if \enc==1) or decryption (if
+// \enc==0) on the block(s) in \data using the round key(s) in \key.  The
+// register length determines the number of AES blocks en/decrypted.
+.macro	_vaes	enc, key, data
 .if \enc
-.if \last
-	vaesenclast	\key, \data, \data
-.else
 	vaesenc		\key, \data, \data
-.endif
-.else
-.if \last
-	vaesdeclast	\key, \data, \data
 .else
 	vaesdec		\key, \data, \data
 .endif
+.endm
+
+// Same as _vaes, but does the last round.
+.macro	_vaeslast	enc, key, data
+.if \enc
+	vaesenclast	\key, \data, \data
+.else
+	vaesdeclast	\key, \data, \data
 .endif
 .endm
 
-// Do a single round of AES en/decryption on the block(s) in \data, using the
-// same key for all block(s).  The round key is loaded from the appropriate
-// register or memory location for round \i.  May clobber V4.
-.macro _vaes_1x		enc, last, i, xmm_suffix, data
+// Do a single non-last round of AES en/decryption on the block(s) in \data,
+// using the same key for all block(s).  The round key is loaded from the
+// appropriate register or memory location for round \i.  May clobber \tmp.
+.macro _vaes_1x		enc, i, xmm_suffix, data, tmp
 .if USE_AVX10
-	_vaes		\enc, \last, KEY\i\xmm_suffix, \data
+	_vaes		\enc, KEY\i\xmm_suffix, \data
 .else
 .ifnb \xmm_suffix
-	_vaes		\enc, \last, (\i-7)*16(KEY), \data
+	_vaes		\enc, (\i-7)*16(KEY), \data
 .else
-	_vbroadcast128	(\i-7)*16(KEY), V4
-	_vaes		\enc, \last, V4, \data
+	_vbroadcast128	(\i-7)*16(KEY), \tmp
+	_vaes		\enc, \tmp, \data
 .endif
 .endif
 .endm
 
-// Do a single round of AES en/decryption on the blocks in registers V0-V3,
-// using the same key for all blocks.  The round key is loaded from the
+// Do a single non-last round of AES en/decryption on the blocks in registers
+// V0-V3, using the same key for all blocks.  The round key is loaded from the
 // appropriate register or memory location for round \i.  In addition, does two
-// steps of the computation of the next set of tweaks.  May clobber V4.
-.macro	_vaes_4x	enc, last, i
+// steps of the computation of the next set of tweaks.  May clobber V4 and V5.
+.macro	_vaes_4x	enc, i
 .if USE_AVX10
 	_tweak_step	(2*(\i-5))
-	_vaes		\enc, \last, KEY\i, V0
-	_vaes		\enc, \last, KEY\i, V1
+	_vaes		\enc, KEY\i, V0
+	_vaes		\enc, KEY\i, V1
 	_tweak_step	(2*(\i-5) + 1)
-	_vaes		\enc, \last, KEY\i, V2
-	_vaes		\enc, \last, KEY\i, V3
+	_vaes		\enc, KEY\i, V2
+	_vaes		\enc, KEY\i, V3
 .else
 	_vbroadcast128	(\i-7)*16(KEY), V4
 	_tweak_step	(2*(\i-5))
-	_vaes		\enc, \last, V4, V0
-	_vaes		\enc, \last, V4, V1
+	_vaes		\enc, V4, V0
+	_vaes		\enc, V4, V1
 	_tweak_step	(2*(\i-5) + 1)
-	_vaes		\enc, \last, V4, V2
-	_vaes		\enc, \last, V4, V3
+	_vaes		\enc, V4, V2
+	_vaes		\enc, V4, V3
 .endif
 .endm
 
 // Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
 // then XOR with \tweak again) of the block(s) in \data.  To process a single
 // block, use xmm registers and set \xmm_suffix=_XMM.  To process a vector of
-// length VL, use V* registers and leave \xmm_suffix empty.  May clobber V4.
-.macro	_aes_crypt	enc, xmm_suffix, tweak, data
+// length VL, use V* registers and leave \xmm_suffix empty.  Clobbers \tmp.
+.macro	_aes_crypt	enc, xmm_suffix, tweak, data, tmp
 	_xor3		KEY0\xmm_suffix, \tweak, \data
 	cmp		$24, KEYLEN
 	jl		.Laes128\@
 	je		.Laes192\@
-	_vaes_1x	\enc, 0, 1, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 2, \xmm_suffix, \data
+	_vaes_1x	\enc, 1, \xmm_suffix, \data, tmp=\tmp
+	_vaes_1x	\enc, 2, \xmm_suffix, \data, tmp=\tmp
 .Laes192\@:
-	_vaes_1x	\enc, 0, 3, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 4, \xmm_suffix, \data
+	_vaes_1x	\enc, 3, \xmm_suffix, \data, tmp=\tmp
+	_vaes_1x	\enc, 4, \xmm_suffix, \data, tmp=\tmp
 .Laes128\@:
-	_vaes_1x	\enc, 0, 5, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 6, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 7, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 8, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 9, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 10, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 11, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 12, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 13, \xmm_suffix, \data
-	_vaes_1x	\enc, 1, 14, \xmm_suffix, \data
-	_vpxor		\tweak, \data, \data
+.irp i, 5,6,7,8,9,10,11,12,13
+	_vaes_1x	\enc, \i, \xmm_suffix, \data, tmp=\tmp
+.endr
+.if USE_AVX10
+	vpxord		KEY14\xmm_suffix, \tweak, \tmp
+.else
+.ifnb \xmm_suffix
+	vpxor		7*16(KEY), \tweak, \tmp
+.else
+	_vbroadcast128	7*16(KEY), \tmp
+	vpxor		\tweak, \tmp, \tmp
+.endif
+.endif
+	_vaeslast	\enc, \tmp, \data
 .endm
 
 .macro	_aes_xts_crypt	enc
@@ -581,7 +571,7 @@
 	// Compute the first set of tweaks TWEAK[0-3].
 	_compute_first_set_of_tweaks
 
-	sub		$4*VL, LEN
+	add		$-4*VL, LEN  // shorter than 'sub 4*VL' when VL=32
 	jl		.Lhandle_remainder\@
 
 .Lmain_loop\@:
@@ -589,10 +579,10 @@
 
 	// XOR each source block with its tweak and the zero-th round key.
 .if USE_AVX10
-	vmovdqu8	0*VL(SRC), V0
-	vmovdqu8	1*VL(SRC), V1
-	vmovdqu8	2*VL(SRC), V2
-	vmovdqu8	3*VL(SRC), V3
+	_vmovdqu	0*VL(SRC), V0
+	_vmovdqu	1*VL(SRC), V1
+	_vmovdqu	2*VL(SRC), V2
+	_vmovdqu	3*VL(SRC), V3
 	vpternlogd	$0x96, TWEAK0, KEY0, V0
 	vpternlogd	$0x96, TWEAK1, KEY0, V1
 	vpternlogd	$0x96, TWEAK2, KEY0, V2
@@ -612,28 +602,43 @@
 	je		.Laes192\@
 	// Do all the AES rounds on the data blocks, interleaved with
 	// the computation of the next set of tweaks.
-	_vaes_4x	\enc, 0, 1
-	_vaes_4x	\enc, 0, 2
+	_vaes_4x	\enc, 1
+	_vaes_4x	\enc, 2
 .Laes192\@:
-	_vaes_4x	\enc, 0, 3
-	_vaes_4x	\enc, 0, 4
+	_vaes_4x	\enc, 3
+	_vaes_4x	\enc, 4
 .Laes128\@:
-	_vaes_4x	\enc, 0, 5
-	_vaes_4x	\enc, 0, 6
-	_vaes_4x	\enc, 0, 7
-	_vaes_4x	\enc, 0, 8
-	_vaes_4x	\enc, 0, 9
-	_vaes_4x	\enc, 0, 10
-	_vaes_4x	\enc, 0, 11
-	_vaes_4x	\enc, 0, 12
-	_vaes_4x	\enc, 0, 13
-	_vaes_4x	\enc, 1, 14
-
-	// XOR in the tweaks again.
-	_vpxor		TWEAK0, V0, V0
-	_vpxor		TWEAK1, V1, V1
-	_vpxor		TWEAK2, V2, V2
-	_vpxor		TWEAK3, V3, V3
+.irp i, 5,6,7,8,9,10,11,12,13
+	_vaes_4x	\enc, \i
+.endr
+	// Do the last AES round, then XOR the results with the tweaks again.
+	// Reduce latency by doing the XOR before the vaesenclast, utilizing the
+	// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a)
+	// (and likewise for vaesdeclast).
+.if USE_AVX10
+	_tweak_step	18
+	_tweak_step	19
+	vpxord		TWEAK0, KEY14, V4
+	vpxord		TWEAK1, KEY14, V5
+	_vaeslast	\enc, V4, V0
+	_vaeslast	\enc, V5, V1
+	vpxord		TWEAK2, KEY14, V4
+	vpxord		TWEAK3, KEY14, V5
+	_vaeslast	\enc, V4, V2
+	_vaeslast	\enc, V5, V3
+.else
+	_vbroadcast128	7*16(KEY), V4
+	_tweak_step	18 // uses V5
+	_tweak_step	19 // uses V5
+	vpxor		TWEAK0, V4, V5
+	_vaeslast	\enc, V5, V0
+	vpxor		TWEAK1, V4, V5
+	_vaeslast	\enc, V5, V1
+	vpxor		TWEAK2, V4, V5
+	vpxor		TWEAK3, V4, V4
+	_vaeslast	\enc, V5, V2
+	_vaeslast	\enc, V4, V3
+.endif
 
 	// Store the destination blocks.
 	_vmovdqu	V0, 0*VL(DST)
@@ -644,9 +649,9 @@
 	// Finish computing the next set of tweaks.
 	_tweak_step	1000
 
-	add		$4*VL, SRC
-	add		$4*VL, DST
-	sub		$4*VL, LEN
+	sub		$-4*VL, SRC  // shorter than 'add 4*VL' when VL=32
+	sub		$-4*VL, DST
+	add		$-4*VL, LEN
 	jge		.Lmain_loop\@
 
 	// Check for the uncommon case where the data length isn't a multiple of
@@ -670,7 +675,7 @@
 	jl		.Lvec_at_a_time_done\@
 .Lvec_at_a_time\@:
 	_vmovdqu	(SRC), V0
-	_aes_crypt	\enc, , TWEAK0, V0
+	_aes_crypt	\enc, , TWEAK0, V0, tmp=V1
 	_vmovdqu	V0, (DST)
 	_next_tweakvec	TWEAK0, V0, V1, TWEAK0
 	add		$VL, SRC
@@ -687,7 +692,7 @@
 	jl		.Lblock_at_a_time_done\@
 .Lblock_at_a_time\@:
 	vmovdqu		(SRC), %xmm0
-	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0
+	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1
 	vmovdqu		%xmm0, (DST)
 	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK0_XMM
 	add		$16, SRC
@@ -715,7 +720,7 @@
 	// Do it now by advancing the tweak and decrypting the last full block.
 	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK1_XMM
 	vmovdqu		(SRC), %xmm0
-	_aes_crypt	\enc, _XMM, TWEAK1_XMM, %xmm0
+	_aes_crypt	\enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1
 .endif
 
 .if USE_AVX10
@@ -758,47 +763,49 @@
 	vpblendvb	%xmm3, %xmm0, %xmm1, %xmm0
 .endif
 	// En/decrypt again and store the last full block.
-	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0
+	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1
 	vmovdqu		%xmm0, (DST)
 	jmp		.Ldone\@
 .endm
 
 // void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
 //			   u8 iv[AES_BLOCK_SIZE]);
+//
+// Encrypt |iv| using the AES key |tweak_key| to get the first tweak.  Assumes
+// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10.
 SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
-	vmovdqu		(%rsi), %xmm0
-	vpxor		(%rdi), %xmm0, %xmm0
-	movl		480(%rdi), %eax		// AES key length
-	lea		-16(%rdi, %rax, 4), %rdi
-	cmp		$24, %eax
+	.set	TWEAK_KEY,	%rdi
+	.set	IV,		%rsi
+	.set	KEYLEN,		%eax
+	.set	KEYLEN64,	%rax
+
+	vmovdqu		(IV), %xmm0
+	vpxor		(TWEAK_KEY), %xmm0, %xmm0
+	movl		480(TWEAK_KEY), KEYLEN
+	lea		-16(TWEAK_KEY, KEYLEN64, 4), TWEAK_KEY
+	cmp		$24, KEYLEN
 	jl		.Lencrypt_iv_aes128
 	je		.Lencrypt_iv_aes192
-	vaesenc		-6*16(%rdi), %xmm0, %xmm0
-	vaesenc		-5*16(%rdi), %xmm0, %xmm0
+	vaesenc		-6*16(TWEAK_KEY), %xmm0, %xmm0
+	vaesenc		-5*16(TWEAK_KEY), %xmm0, %xmm0
 .Lencrypt_iv_aes192:
-	vaesenc		-4*16(%rdi), %xmm0, %xmm0
-	vaesenc		-3*16(%rdi), %xmm0, %xmm0
+	vaesenc		-4*16(TWEAK_KEY), %xmm0, %xmm0
+	vaesenc		-3*16(TWEAK_KEY), %xmm0, %xmm0
 .Lencrypt_iv_aes128:
-	vaesenc		-2*16(%rdi), %xmm0, %xmm0
-	vaesenc		-1*16(%rdi), %xmm0, %xmm0
-	vaesenc		0*16(%rdi), %xmm0, %xmm0
-	vaesenc		1*16(%rdi), %xmm0, %xmm0
-	vaesenc		2*16(%rdi), %xmm0, %xmm0
-	vaesenc		3*16(%rdi), %xmm0, %xmm0
-	vaesenc		4*16(%rdi), %xmm0, %xmm0
-	vaesenc		5*16(%rdi), %xmm0, %xmm0
-	vaesenc		6*16(%rdi), %xmm0, %xmm0
-	vaesenclast	7*16(%rdi), %xmm0, %xmm0
-	vmovdqu		%xmm0, (%rsi)
+.irp i, -2,-1,0,1,2,3,4,5,6
+	vaesenc		\i*16(TWEAK_KEY), %xmm0, %xmm0
+.endr
+	vaesenclast	7*16(TWEAK_KEY), %xmm0, %xmm0
+	vmovdqu		%xmm0, (IV)
 	RET
 SYM_FUNC_END(aes_xts_encrypt_iv)
 
 // Below are the actual AES-XTS encryption and decryption functions,
 // instantiated from the above macro.  They all have the following prototype:
 //
-// void (*xts_asm_func)(const struct crypto_aes_ctx *key,
-//			const u8 *src, u8 *dst, unsigned int len,
-//			u8 tweak[AES_BLOCK_SIZE]);
+// void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
+//			  const u8 *src, u8 *dst, int len,
+//			  u8 tweak[AES_BLOCK_SIZE]);
 //
 // |key| is the data key.  |tweak| contains the next tweak; the encryption of
 // the original IV with the tweak key was already done.  This function supports