path: root/arch/arm64/crypto/chacha-neon-glue.c
author	Ard Biesheuvel <ard.biesheuvel@linaro.org>	2018-12-04 14:13:32 +0100
committer	Herbert Xu <herbert@gondor.apana.org.au>	2018-12-13 18:24:40 +0800
commit	f2ca1cbd0fb584b5b5e0dbd9bda819f49cf9cdb6 (patch)
tree	13a610a77ea99a6862e68fcacae99ea4b18924cf /arch/arm64/crypto/chacha-neon-glue.c
parent	ee5bbc9fd3a1fb81e9f6103d6c52ab88926a9603 (diff)
crypto: arm64/chacha - optimize for arbitrary length inputs
Update the 4-way NEON ChaCha routine so it can handle input of any length >64 bytes in its entirety, rather than having to call into the 1-way routine and/or memcpy()s via temp buffers to handle the tail of a ChaCha invocation that is not a multiple of 256 bytes.

On inputs that are a multiple of 256 bytes (and thus in tcrypt benchmarks), performance drops by around 1% on Cortex-A57, while performance for inputs drawn randomly from the range [64, 1024) increases by around 30%.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
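In short, the reworked chacha_doneon() bounces any request shorter than one 64-byte block through a stack buffer and feeds everything else to the 4-way routine in chunks of at most four blocks, leaving a partial final chunk to the assembly, which is now told how many bytes remain. The standalone C sketch below models only that dispatch arithmetic; neon_4block_stub() and oneway_stub() are placeholder names standing in for chacha_4block_xor_neon() and chacha_block_xor_neon(), and the keystream XOR itself is omitted.

#include <string.h>

#define CHACHA_BLOCK_SIZE	64

static int min_int(int a, int b)
{
	return a < b ? a : b;
}

/* placeholder for chacha_4block_xor_neon(): consumes up to 4 blocks */
static void neon_4block_stub(unsigned int state[16], unsigned char *dst,
			     const unsigned char *src, int nrounds, int bytes)
{
	(void)state; (void)dst; (void)src; (void)nrounds; (void)bytes;
}

/* placeholder for chacha_block_xor_neon(): consumes exactly one block */
static void oneway_stub(unsigned int state[16], unsigned char *dst,
			const unsigned char *src, int nrounds)
{
	(void)state; (void)dst; (void)src; (void)nrounds;
}

static void doneon_sketch(unsigned int state[16], unsigned char *dst,
			  const unsigned char *src, int bytes, int nrounds)
{
	unsigned char buf[CHACHA_BLOCK_SIZE];

	if (bytes < CHACHA_BLOCK_SIZE) {
		/* sub-block input: pad via a stack buffer, as before */
		memcpy(buf, src, bytes);
		oneway_stub(state, buf, buf, nrounds);
		memcpy(dst, buf, bytes);
		return;
	}

	while (bytes > 0) {
		/* hand over whatever remains, capped at 4 blocks */
		neon_4block_stub(state, dst, src, nrounds,
				 min_int(bytes, CHACHA_BLOCK_SIZE * 4));
		bytes -= CHACHA_BLOCK_SIZE * 4;
		src += CHACHA_BLOCK_SIZE * 4;
		dst += CHACHA_BLOCK_SIZE * 4;
		state[12] += 4;		/* counter always steps 4 blocks */
	}
}

As in the patch itself, the block counter in state[12] advances by four per chunk even when the final chunk covers fewer than four blocks; the loop terminates right afterwards, and the byte count passed down means only the requested output is produced.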
Diffstat (limited to 'arch/arm64/crypto/chacha-neon-glue.c')
-rw-r--r--	arch/arm64/crypto/chacha-neon-glue.c	38
1 file changed, 14 insertions, 24 deletions
diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c
index 346eb85498a1..67f8feb0c717 100644
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -32,41 +32,29 @@
 asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
 				      int nrounds);
 asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
-				       int nrounds);
+				       int nrounds, int bytes);
 asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
 
 static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
-			  unsigned int bytes, int nrounds)
+			  int bytes, int nrounds)
 {
 	u8 buf[CHACHA_BLOCK_SIZE];
 
-	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
-		kernel_neon_begin();
-		chacha_4block_xor_neon(state, dst, src, nrounds);
-		kernel_neon_end();
+	if (bytes < CHACHA_BLOCK_SIZE) {
+		memcpy(buf, src, bytes);
+		chacha_block_xor_neon(state, buf, buf, nrounds);
+		memcpy(dst, buf, bytes);
+		return;
+	}
+
+	while (bytes > 0) {
+		chacha_4block_xor_neon(state, dst, src, nrounds,
+				       min(bytes, CHACHA_BLOCK_SIZE * 4));
 		bytes -= CHACHA_BLOCK_SIZE * 4;
 		src += CHACHA_BLOCK_SIZE * 4;
 		dst += CHACHA_BLOCK_SIZE * 4;
 		state[12] += 4;
 	}
-
-	if (!bytes)
-		return;
-
-	kernel_neon_begin();
-	while (bytes >= CHACHA_BLOCK_SIZE) {
-		chacha_block_xor_neon(state, dst, src, nrounds);
-		bytes -= CHACHA_BLOCK_SIZE;
-		src += CHACHA_BLOCK_SIZE;
-		dst += CHACHA_BLOCK_SIZE;
-		state[12]++;
-	}
-	if (bytes) {
-		memcpy(buf, src, bytes);
-		chacha_block_xor_neon(state, buf, buf, nrounds);
-		memcpy(dst, buf, bytes);
-	}
-	kernel_neon_end();
 }
 
 static int chacha_neon_stream_xor(struct skcipher_request *req,
@@ -86,8 +74,10 @@ static int chacha_neon_stream_xor(struct skcipher_request *req,
 		if (nbytes < walk.total)
 			nbytes = round_down(nbytes, walk.stride);
 
+		kernel_neon_begin();
 		chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
 			      nbytes, ctx->nrounds);
+		kernel_neon_end();
 
 		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
 	}