crypto: arm/chacha20 - add XChaCha20 support

Add an XChaCha20 implementation that is hooked up to the ARM NEON implementation of ChaCha20. This is needed for use in the Adiantum encryption mode; see the generic code patch, "crypto: chacha20-generic - add XChaCha20 support", for more details. We also update the NEON code to support HChaCha20 on one block, so we can use that in XChaCha20 rather than calling the generic HChaCha20. This required factoring the permutation out into its own macro. Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
author: Eric Biggers <ebiggers@google.com> 2018-11-16 17:26:24 -0800
committer: Herbert Xu <herbert@gondor.apana.org.au> 2018-11-20 14:26:56 +0800
commit: d97a94309d764ed907d4281da6246f5d935166f8 (patch)
tree: 6b17f507f905fc8240c49bac6b54c508aef6c79b /arch/arm/crypto/chacha20-neon-core.S
parent: be2830b15b60011845ad701076511e8b93b2fd76 (diff)
1 files changed, 48 insertions, 22 deletions
diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha20-neon-core.S
index 50e7b9896818..2335e5055d2b 100644
--- a/arch/arm/crypto/chacha20-neon-core.S
+++ b/arch/arm/crypto/chacha20-neon-core.S
@@ -52,27 +52,16 @@
 	.fpu		neon
 	.align		5
 
-ENTRY(chacha20_block_xor_neon)
-	// r0: Input state matrix, s
-	// r1: 1 data block output, o
-	// r2: 1 data block input, i
-
-	//
-	// This function encrypts one ChaCha20 block by loading the state matrix
-	// in four NEON registers. It performs matrix operation on four words in
-	// parallel, but requireds shuffling to rearrange the words after each
-	// round.
-	//
-
-	// x0..3 = s0..3
-	add		ip, r0, #0x20
-	vld1.32		{q0-q1}, [r0]
-	vld1.32		{q2-q3}, [ip]
-
-	vmov		q8, q0
-	vmov		q9, q1
-	vmov		q10, q2
-	vmov		q11, q3
+/*
+ * chacha20_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers q0-q3.  It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * Clobbers: r3, ip, q4-q5
+ */
+chacha20_permute:
 
 	adr		ip, .Lrol8_table
 	mov		r3, #10
@@ -142,6 +131,27 @@ ENTRY(chacha20_block_xor_neon)
 	subs		r3, r3, #1
 	bne		.Ldoubleround
 
+	bx		lr
+ENDPROC(chacha20_permute)
+
+ENTRY(chacha20_block_xor_neon)
+	// r0: Input state matrix, s
+	// r1: 1 data block output, o
+	// r2: 1 data block input, i
+	push		{lr}
+
+	// x0..3 = s0..3
+	add		ip, r0, #0x20
+	vld1.32		{q0-q1}, [r0]
+	vld1.32		{q2-q3}, [ip]
+
+	vmov		q8, q0
+	vmov		q9, q1
+	vmov		q10, q2
+	vmov		q11, q3
+
+	bl		chacha20_permute
+
 	add		ip, r2, #0x20
 	vld1.8		{q4-q5}, [r2]
 	vld1.8		{q6-q7}, [ip]
@@ -166,9 +176,25 @@ ENTRY(chacha20_block_xor_neon)
 	vst1.8		{q0-q1}, [r1]
 	vst1.8		{q2-q3}, [ip]
 
-	bx		lr
+	pop		{pc}
 ENDPROC(chacha20_block_xor_neon)
 
+ENTRY(hchacha20_block_neon)
+	// r0: Input state matrix, s
+	// r1: output (8 32-bit words)
+	push		{lr}
+
+	vld1.32		{q0-q1}, [r0]!
+	vld1.32		{q2-q3}, [r0]
+
+	bl		chacha20_permute
+
+	vst1.32		{q0}, [r1]!
+	vst1.32		{q3}, [r1]
+
+	pop		{pc}
+ENDPROC(hchacha20_block_neon)
+
 	.align		4
 .Lctrinc:	.word	0, 1, 2, 3
 .Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
author	Eric Biggers <ebiggers@google.com>	2018-11-16 17:26:24 -0800
committer	Herbert Xu <herbert@gondor.apana.org.au>	2018-11-20 14:26:56 +0800
commit	d97a94309d764ed907d4281da6246f5d935166f8 (patch)
tree	6b17f507f905fc8240c49bac6b54c508aef6c79b /arch/arm/crypto/chacha20-neon-core.S
parent	be2830b15b60011845ad701076511e8b93b2fd76 (diff)