Diffstat (limited to 'arch/arm64/crypto')
-rw-r--r--  arch/arm64/crypto/Kconfig               |  11
-rw-r--r--  arch/arm64/crypto/Makefile              |   3
-rw-r--r--  arch/arm64/crypto/aes-ce-ccm-core.S     | 265
-rw-r--r--  arch/arm64/crypto/aes-ce-ccm-glue.c     | 156
-rw-r--r--  arch/arm64/crypto/aes-ce-glue.c         |   2
-rw-r--r--  arch/arm64/crypto/aes-ce.S              |  34
-rw-r--r--  arch/arm64/crypto/aes-glue.c            |   1
-rw-r--r--  arch/arm64/crypto/aes-neon.S            |  20
-rw-r--r--  arch/arm64/crypto/aes-neonbs-glue.c     |   1
-rw-r--r--  arch/arm64/crypto/crct10dif-ce-core.S   | 514
-rw-r--r--  arch/arm64/crypto/crct10dif-ce-glue.c   | 143
-rw-r--r--  arch/arm64/crypto/ghash-ce-glue.c       |   2
-rw-r--r--  arch/arm64/crypto/poly1305-armv8.pl     |   6
-rw-r--r--  arch/arm64/crypto/poly1305-glue.c       |   3
-rw-r--r--  arch/arm64/crypto/sha1-ce-glue.c        |   2
-rw-r--r--  arch/arm64/crypto/sha2-ce-glue.c        |   2
-rw-r--r--  arch/arm64/crypto/sha3-ce-glue.c        |   2
-rw-r--r--  arch/arm64/crypto/sha512-ce-glue.c      |   2
-rw-r--r--  arch/arm64/crypto/sm3-ce-glue.c         |   2
-rw-r--r--  arch/arm64/crypto/sm3-neon-glue.c       |   2
20 files changed, 241 insertions(+), 932 deletions(-)
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index eb7b423ba463..5636ab83f22a 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -268,6 +268,7 @@ config CRYPTO_AES_ARM64_CE_CCM depends on ARM64 && KERNEL_MODE_NEON select CRYPTO_ALGAPI select CRYPTO_AES_ARM64_CE + select CRYPTO_AES_ARM64_CE_BLK select CRYPTO_AEAD select CRYPTO_LIB_AES help @@ -311,15 +312,5 @@ config CRYPTO_SM4_ARM64_CE_GCM - PMULL (Polynomial Multiply Long) instructions - NEON (Advanced SIMD) extensions -config CRYPTO_CRCT10DIF_ARM64_CE - tristate "CRCT10DIF (PMULL)" - depends on KERNEL_MODE_NEON && CRC_T10DIF - select CRYPTO_HASH - help - CRC16 CRC algorithm used for the T10 (SCSI) Data Integrity Field (DIF) - - Architecture: arm64 using - - PMULL (Polynomial Multiply Long) instructions - endmenu diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index fbe64dce66e0..e7139c4768ce 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -44,9 +44,6 @@ ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o obj-$(CONFIG_CRYPTO_POLYVAL_ARM64_CE) += polyval-ce.o polyval-ce-y := polyval-ce-glue.o polyval-ce-core.o -obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o -crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o - obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S index b03f7f71f893..f2624238fd95 100644 --- a/arch/arm64/crypto/aes-ce-ccm-core.S +++ b/arch/arm64/crypto/aes-ce-ccm-core.S @@ -1,8 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions + * aes-ce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions * - * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2013 - 2017 Linaro Ltd. + * Copyright (C) 2024 Google LLC + * + * Author: Ard Biesheuvel <ardb@kernel.org> */ #include <linux/linkage.h> @@ -11,211 +14,129 @@ .text .arch armv8-a+crypto - /* - * u32 ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, - * u32 macp, u8 const rk[], u32 rounds); - */ -SYM_FUNC_START(ce_aes_ccm_auth_data) - ld1 {v0.16b}, [x0] /* load mac */ - cbz w3, 1f - sub w3, w3, #16 - eor v1.16b, v1.16b, v1.16b -0: ldrb w7, [x1], #1 /* get 1 byte of input */ - subs w2, w2, #1 - add w3, w3, #1 - ins v1.b[0], w7 - ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */ - beq 8f /* out of input? */ - cbnz w3, 0b - eor v0.16b, v0.16b, v1.16b -1: ld1 {v3.4s}, [x4] /* load first round key */ - prfm pldl1strm, [x1] - cmp w5, #12 /* which key size? */ - add x6, x4, #16 - sub w7, w5, #2 /* modified # of rounds */ - bmi 2f - bne 5f - mov v5.16b, v3.16b - b 4f -2: mov v4.16b, v3.16b - ld1 {v5.4s}, [x6], #16 /* load 2nd round key */ -3: aese v0.16b, v4.16b - aesmc v0.16b, v0.16b -4: ld1 {v3.4s}, [x6], #16 /* load next round key */ - aese v0.16b, v5.16b - aesmc v0.16b, v0.16b -5: ld1 {v4.4s}, [x6], #16 /* load next round key */ - subs w7, w7, #3 - aese v0.16b, v3.16b - aesmc v0.16b, v0.16b - ld1 {v5.4s}, [x6], #16 /* load next round key */ - bpl 3b - aese v0.16b, v4.16b - subs w2, w2, #16 /* last data? 
*/ - eor v0.16b, v0.16b, v5.16b /* final round */ - bmi 6f - ld1 {v1.16b}, [x1], #16 /* load next input block */ - eor v0.16b, v0.16b, v1.16b /* xor with mac */ - bne 1b -6: st1 {v0.16b}, [x0] /* store mac */ - beq 10f - adds w2, w2, #16 - beq 10f - mov w3, w2 -7: ldrb w7, [x1], #1 - umov w6, v0.b[0] - eor w6, w6, w7 - strb w6, [x0], #1 - subs w2, w2, #1 - beq 10f - ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */ - b 7b -8: cbz w3, 91f - mov w7, w3 - add w3, w3, #16 -9: ext v1.16b, v1.16b, v1.16b, #1 - adds w7, w7, #1 - bne 9b -91: eor v0.16b, v0.16b, v1.16b - st1 {v0.16b}, [x0] -10: mov w0, w3 - ret -SYM_FUNC_END(ce_aes_ccm_auth_data) + .macro load_round_keys, rk, nr, tmp + sub w\tmp, \nr, #10 + add \tmp, \rk, w\tmp, sxtw #4 + ld1 {v10.4s-v13.4s}, [\rk] + ld1 {v14.4s-v17.4s}, [\tmp], #64 + ld1 {v18.4s-v21.4s}, [\tmp], #64 + ld1 {v3.4s-v5.4s}, [\tmp] + .endm - /* - * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[], - * u32 rounds); - */ -SYM_FUNC_START(ce_aes_ccm_final) - ld1 {v3.4s}, [x2], #16 /* load first round key */ - ld1 {v0.16b}, [x0] /* load mac */ - cmp w3, #12 /* which key size? */ - sub w3, w3, #2 /* modified # of rounds */ - ld1 {v1.16b}, [x1] /* load 1st ctriv */ - bmi 0f - bne 3f - mov v5.16b, v3.16b - b 2f -0: mov v4.16b, v3.16b -1: ld1 {v5.4s}, [x2], #16 /* load next round key */ - aese v0.16b, v4.16b - aesmc v0.16b, v0.16b - aese v1.16b, v4.16b - aesmc v1.16b, v1.16b -2: ld1 {v3.4s}, [x2], #16 /* load next round key */ - aese v0.16b, v5.16b - aesmc v0.16b, v0.16b - aese v1.16b, v5.16b - aesmc v1.16b, v1.16b -3: ld1 {v4.4s}, [x2], #16 /* load next round key */ - subs w3, w3, #3 - aese v0.16b, v3.16b - aesmc v0.16b, v0.16b - aese v1.16b, v3.16b - aesmc v1.16b, v1.16b - bpl 1b - aese v0.16b, v4.16b - aese v1.16b, v4.16b - /* final round key cancels out */ - eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */ - st1 {v0.16b}, [x0] /* store result */ - ret -SYM_FUNC_END(ce_aes_ccm_final) + .macro dround, va, vb, vk + aese \va\().16b, \vk\().16b + aesmc \va\().16b, \va\().16b + aese \vb\().16b, \vk\().16b + aesmc \vb\().16b, \vb\().16b + .endm + + .macro aes_encrypt, va, vb, nr + tbz \nr, #2, .L\@ + dround \va, \vb, v10 + dround \va, \vb, v11 + tbz \nr, #1, .L\@ + dround \va, \vb, v12 + dround \va, \vb, v13 +.L\@: .irp v, v14, v15, v16, v17, v18, v19, v20, v21, v3 + dround \va, \vb, \v + .endr + aese \va\().16b, v4.16b + aese \vb\().16b, v4.16b + .endm .macro aes_ccm_do_crypt,enc - cbz x2, 5f - ldr x8, [x6, #8] /* load lower ctr */ + load_round_keys x3, w4, x10 + ld1 {v0.16b}, [x5] /* load mac */ + cbz x2, ce_aes_ccm_final + ldr x8, [x6, #8] /* load lower ctr */ CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */ 0: /* outer loop */ ld1 {v1.8b}, [x6] /* load upper ctr */ prfm pldl1strm, [x1] add x8, x8, #1 rev x9, x8 - cmp w4, #12 /* which key size? 
*/ - sub w7, w4, #2 /* get modified # of rounds */ ins v1.d[1], x9 /* no carry in lower ctr */ - ld1 {v3.4s}, [x3] /* load first round key */ - add x10, x3, #16 - bmi 1f - bne 4f - mov v5.16b, v3.16b - b 3f -1: mov v4.16b, v3.16b - ld1 {v5.4s}, [x10], #16 /* load 2nd round key */ -2: /* inner loop: 3 rounds, 2x interleaved */ - aese v0.16b, v4.16b - aesmc v0.16b, v0.16b - aese v1.16b, v4.16b - aesmc v1.16b, v1.16b -3: ld1 {v3.4s}, [x10], #16 /* load next round key */ - aese v0.16b, v5.16b - aesmc v0.16b, v0.16b - aese v1.16b, v5.16b - aesmc v1.16b, v1.16b -4: ld1 {v4.4s}, [x10], #16 /* load next round key */ - subs w7, w7, #3 - aese v0.16b, v3.16b - aesmc v0.16b, v0.16b - aese v1.16b, v3.16b - aesmc v1.16b, v1.16b - ld1 {v5.4s}, [x10], #16 /* load next round key */ - bpl 2b - aese v0.16b, v4.16b - aese v1.16b, v4.16b + + aes_encrypt v0, v1, w4 + subs w2, w2, #16 - bmi 6f /* partial block? */ + bmi ce_aes_ccm_crypt_tail ld1 {v2.16b}, [x1], #16 /* load next input block */ .if \enc == 1 eor v2.16b, v2.16b, v5.16b /* final round enc+mac */ - eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */ + eor v6.16b, v1.16b, v2.16b /* xor with crypted ctr */ .else eor v2.16b, v2.16b, v1.16b /* xor with crypted ctr */ - eor v1.16b, v2.16b, v5.16b /* final round enc */ + eor v6.16b, v2.16b, v5.16b /* final round enc */ .endif eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */ - st1 {v1.16b}, [x0], #16 /* write output block */ + st1 {v6.16b}, [x0], #16 /* write output block */ bne 0b CPU_LE( rev x8, x8 ) - st1 {v0.16b}, [x5] /* store mac */ str x8, [x6, #8] /* store lsb end of ctr (BE) */ -5: ret - -6: eor v0.16b, v0.16b, v5.16b /* final round mac */ - eor v1.16b, v1.16b, v5.16b /* final round enc */ + cbnz x7, ce_aes_ccm_final st1 {v0.16b}, [x5] /* store mac */ - add w2, w2, #16 /* process partial tail block */ -7: ldrb w9, [x1], #1 /* get 1 byte of input */ - umov w6, v1.b[0] /* get top crypted ctr byte */ - umov w7, v0.b[0] /* get top mac byte */ - .if \enc == 1 - eor w7, w7, w9 - eor w9, w9, w6 - .else - eor w9, w9, w6 - eor w7, w7, w9 - .endif - strb w9, [x0], #1 /* store out byte */ - strb w7, [x5], #1 /* store mac byte */ - subs w2, w2, #1 - beq 5b - ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */ - ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */ - b 7b + ret .endm +SYM_FUNC_START_LOCAL(ce_aes_ccm_crypt_tail) + eor v0.16b, v0.16b, v5.16b /* final round mac */ + eor v1.16b, v1.16b, v5.16b /* final round enc */ + + add x1, x1, w2, sxtw /* rewind the input pointer (w2 < 0) */ + add x0, x0, w2, sxtw /* rewind the output pointer */ + + adr_l x8, .Lpermute /* load permute vectors */ + add x9, x8, w2, sxtw + sub x8, x8, w2, sxtw + ld1 {v7.16b-v8.16b}, [x9] + ld1 {v9.16b}, [x8] + + ld1 {v2.16b}, [x1] /* load a full block of input */ + tbl v1.16b, {v1.16b}, v7.16b /* move keystream to end of register */ + eor v7.16b, v2.16b, v1.16b /* encrypt partial input block */ + bif v2.16b, v7.16b, v22.16b /* select plaintext */ + tbx v7.16b, {v6.16b}, v8.16b /* insert output from previous iteration */ + tbl v2.16b, {v2.16b}, v9.16b /* copy plaintext to start of v2 */ + eor v0.16b, v0.16b, v2.16b /* fold plaintext into mac */ + + st1 {v7.16b}, [x0] /* store output block */ + cbz x7, 0f + +SYM_INNER_LABEL(ce_aes_ccm_final, SYM_L_LOCAL) + ld1 {v1.16b}, [x7] /* load 1st ctriv */ + + aes_encrypt v0, v1, w4 + + /* final round key cancels out */ + eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */ +0: st1 {v0.16b}, [x5] /* store result */ + ret +SYM_FUNC_END(ce_aes_ccm_crypt_tail) + /* * void 
ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, * u8 const rk[], u32 rounds, u8 mac[], - * u8 ctr[]); + * u8 ctr[], u8 const final_iv[]); * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, * u8 const rk[], u32 rounds, u8 mac[], - * u8 ctr[]); + * u8 ctr[], u8 const final_iv[]); */ SYM_FUNC_START(ce_aes_ccm_encrypt) + movi v22.16b, #255 aes_ccm_do_crypt 1 SYM_FUNC_END(ce_aes_ccm_encrypt) SYM_FUNC_START(ce_aes_ccm_decrypt) + movi v22.16b, #0 aes_ccm_do_crypt 0 SYM_FUNC_END(ce_aes_ccm_decrypt) + + .section ".rodata", "a" + .align 6 + .fill 15, 1, 0xff +.Lpermute: + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf + .fill 15, 1, 0xff diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c index 25cd3808ecbe..a2b5d6f20f4d 100644 --- a/arch/arm64/crypto/aes-ce-ccm-glue.c +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c @@ -1,12 +1,15 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions + * aes-ce-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions * - * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2013 - 2017 Linaro Ltd. + * Copyright (C) 2024 Google LLC + * + * Author: Ard Biesheuvel <ardb@kernel.org> */ #include <asm/neon.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/aes.h> #include <crypto/scatterwalk.h> #include <crypto/internal/aead.h> @@ -15,6 +18,8 @@ #include "aes-ce-setkey.h" +MODULE_IMPORT_NS("CRYPTO_INTERNAL"); + static int num_rounds(struct crypto_aes_ctx *ctx) { /* @@ -27,19 +32,17 @@ static int num_rounds(struct crypto_aes_ctx *ctx) return 6 + ctx->key_length / 4; } -asmlinkage u32 ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, - u32 macp, u32 const rk[], u32 rounds); +asmlinkage u32 ce_aes_mac_update(u8 const in[], u32 const rk[], int rounds, + int blocks, u8 dg[], int enc_before, + int enc_after); asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, u32 const rk[], u32 rounds, u8 mac[], - u8 ctr[]); + u8 ctr[], u8 const final_iv[]); asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, u32 const rk[], u32 rounds, u8 mac[], - u8 ctr[]); - -asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[], - u32 rounds); + u8 ctr[], u8 const final_iv[]); static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key, unsigned int key_len) @@ -94,6 +97,41 @@ static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen) return 0; } +static u32 ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, + u32 macp, u32 const rk[], u32 rounds) +{ + int enc_after = (macp + abytes) % AES_BLOCK_SIZE; + + do { + u32 blocks = abytes / AES_BLOCK_SIZE; + + if (macp == AES_BLOCK_SIZE || (!macp && blocks > 0)) { + u32 rem = ce_aes_mac_update(in, rk, rounds, blocks, mac, + macp, enc_after); + u32 adv = (blocks - rem) * AES_BLOCK_SIZE; + + macp = enc_after ? 
0 : AES_BLOCK_SIZE; + in += adv; + abytes -= adv; + + if (unlikely(rem)) { + kernel_neon_end(); + kernel_neon_begin(); + macp = 0; + } + } else { + u32 l = min(AES_BLOCK_SIZE - macp, abytes); + + crypto_xor(&mac[macp], in, l); + in += l; + macp += l; + abytes -= l; + } + } while (abytes > 0); + + return macp; +} + static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) { struct crypto_aead *aead = crypto_aead_reqtfm(req); @@ -101,7 +139,7 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) struct __packed { __be16 l; __be32 h; u16 len; } ltag; struct scatter_walk walk; u32 len = req->assoclen; - u32 macp = 0; + u32 macp = AES_BLOCK_SIZE; /* prepend the AAD with a length tag */ if (len < 0xff00) { @@ -125,16 +163,11 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) scatterwalk_start(&walk, sg_next(walk.sg)); n = scatterwalk_clamp(&walk, len); } - n = min_t(u32, n, SZ_4K); /* yield NEON at least every 4k */ p = scatterwalk_map(&walk); macp = ce_aes_ccm_auth_data(mac, p, n, macp, ctx->key_enc, num_rounds(ctx)); - if (len / SZ_4K > (len - n) / SZ_4K) { - kernel_neon_end(); - kernel_neon_begin(); - } len -= n; scatterwalk_unmap(p); @@ -149,7 +182,7 @@ static int ccm_encrypt(struct aead_request *req) struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); struct skcipher_walk walk; u8 __aligned(8) mac[AES_BLOCK_SIZE]; - u8 buf[AES_BLOCK_SIZE]; + u8 orig_iv[AES_BLOCK_SIZE]; u32 len = req->cryptlen; int err; @@ -158,42 +191,55 @@ static int ccm_encrypt(struct aead_request *req) return err; /* preserve the original iv for the final round */ - memcpy(buf, req->iv, AES_BLOCK_SIZE); + memcpy(orig_iv, req->iv, AES_BLOCK_SIZE); err = skcipher_walk_aead_encrypt(&walk, req, false); + if (unlikely(err)) + return err; kernel_neon_begin(); if (req->assoclen) ccm_calculate_auth_mac(req, mac); - while (walk.nbytes) { + do { u32 tail = walk.nbytes % AES_BLOCK_SIZE; - bool final = walk.nbytes == walk.total; + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + u8 buf[AES_BLOCK_SIZE]; + u8 *final_iv = NULL; - if (final) + if (walk.nbytes == walk.total) { tail = 0; + final_iv = orig_iv; + } - ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes - tail, ctx->key_enc, - num_rounds(ctx), mac, walk.iv); + if (unlikely(walk.nbytes < AES_BLOCK_SIZE)) + src = dst = memcpy(&buf[sizeof(buf) - walk.nbytes], + src, walk.nbytes); - if (!final) - kernel_neon_end(); - err = skcipher_walk_done(&walk, tail); - if (!final) - kernel_neon_begin(); - } + ce_aes_ccm_encrypt(dst, src, walk.nbytes - tail, + ctx->key_enc, num_rounds(ctx), + mac, walk.iv, final_iv); + + if (unlikely(walk.nbytes < AES_BLOCK_SIZE)) + memcpy(walk.dst.virt.addr, dst, walk.nbytes); - ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); + if (walk.nbytes) { + err = skcipher_walk_done(&walk, tail); + } + } while (walk.nbytes); kernel_neon_end(); + if (unlikely(err)) + return err; + /* copy authtag to end of dst */ scatterwalk_map_and_copy(mac, req->dst, req->assoclen + req->cryptlen, crypto_aead_authsize(aead), 1); - return err; + return 0; } static int ccm_decrypt(struct aead_request *req) @@ -203,7 +249,7 @@ static int ccm_decrypt(struct aead_request *req) unsigned int authsize = crypto_aead_authsize(aead); struct skcipher_walk walk; u8 __aligned(8) mac[AES_BLOCK_SIZE]; - u8 buf[AES_BLOCK_SIZE]; + u8 orig_iv[AES_BLOCK_SIZE]; u32 len = req->cryptlen - authsize; int err; @@ -212,34 +258,44 @@ static int ccm_decrypt(struct aead_request *req) return err; /* 
preserve the original iv for the final round */ - memcpy(buf, req->iv, AES_BLOCK_SIZE); + memcpy(orig_iv, req->iv, AES_BLOCK_SIZE); err = skcipher_walk_aead_decrypt(&walk, req, false); + if (unlikely(err)) + return err; kernel_neon_begin(); if (req->assoclen) ccm_calculate_auth_mac(req, mac); - while (walk.nbytes) { + do { u32 tail = walk.nbytes % AES_BLOCK_SIZE; - bool final = walk.nbytes == walk.total; + const u8 *src = walk.src.virt.addr; + u8 *dst = walk.dst.virt.addr; + u8 buf[AES_BLOCK_SIZE]; + u8 *final_iv = NULL; - if (final) + if (walk.nbytes == walk.total) { tail = 0; + final_iv = orig_iv; + } - ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes - tail, ctx->key_enc, - num_rounds(ctx), mac, walk.iv); + if (unlikely(walk.nbytes < AES_BLOCK_SIZE)) + src = dst = memcpy(&buf[sizeof(buf) - walk.nbytes], + src, walk.nbytes); - if (!final) - kernel_neon_end(); - err = skcipher_walk_done(&walk, tail); - if (!final) - kernel_neon_begin(); - } + ce_aes_ccm_decrypt(dst, src, walk.nbytes - tail, + ctx->key_enc, num_rounds(ctx), + mac, walk.iv, final_iv); + + if (unlikely(walk.nbytes < AES_BLOCK_SIZE)) + memcpy(walk.dst.virt.addr, dst, walk.nbytes); - ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); + if (walk.nbytes) { + err = skcipher_walk_done(&walk, tail); + } + } while (walk.nbytes); kernel_neon_end(); @@ -247,11 +303,11 @@ static int ccm_decrypt(struct aead_request *req) return err; /* compare calculated auth tag with the stored one */ - scatterwalk_map_and_copy(buf, req->src, + scatterwalk_map_and_copy(orig_iv, req->src, req->assoclen + req->cryptlen - authsize, authsize, 0); - if (crypto_memneq(mac, buf, authsize)) + if (crypto_memneq(mac, orig_iv, authsize)) return -EBADMSG; return 0; } @@ -290,6 +346,6 @@ module_init(aes_mod_init); module_exit(aes_mod_exit); MODULE_DESCRIPTION("Synchronous AES in CCM mode using ARMv8 Crypto Extensions"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_AUTHOR("Ard Biesheuvel <ardb@kernel.org>"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("ccm(aes)"); diff --git a/arch/arm64/crypto/aes-ce-glue.c b/arch/arm64/crypto/aes-ce-glue.c index e921823ca103..00b8749013c5 100644 --- a/arch/arm64/crypto/aes-ce-glue.c +++ b/arch/arm64/crypto/aes-ce-glue.c @@ -7,7 +7,7 @@ #include <asm/neon.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/aes.h> #include <crypto/algapi.h> #include <crypto/internal/simd.h> diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S index 1dc5bbbfeed2..b262eaa9170c 100644 --- a/arch/arm64/crypto/aes-ce.S +++ b/arch/arm64/crypto/aes-ce.S @@ -25,33 +25,28 @@ .endm /* preload all round keys */ - .macro load_round_keys, rounds, rk - cmp \rounds, #12 - blo 2222f /* 128 bits */ - beq 1111f /* 192 bits */ - ld1 {v17.4s-v18.4s}, [\rk], #32 -1111: ld1 {v19.4s-v20.4s}, [\rk], #32 -2222: ld1 {v21.4s-v24.4s}, [\rk], #64 - ld1 {v25.4s-v28.4s}, [\rk], #64 - ld1 {v29.4s-v31.4s}, [\rk] + .macro load_round_keys, rk, nr, tmp + add \tmp, \rk, \nr, sxtw #4 + sub \tmp, \tmp, #160 + ld1 {v17.4s-v20.4s}, [\rk] + ld1 {v21.4s-v24.4s}, [\tmp], #64 + ld1 {v25.4s-v28.4s}, [\tmp], #64 + ld1 {v29.4s-v31.4s}, [\tmp] .endm /* prepare for encryption with key in rk[] */ .macro enc_prepare, rounds, rk, temp - mov \temp, \rk - load_round_keys \rounds, \temp + load_round_keys \rk, \rounds, \temp .endm /* prepare for encryption (again) but with new key in rk[] */ .macro enc_switch_key, rounds, rk, temp - mov \temp, \rk - load_round_keys \rounds, \temp + 
load_round_keys \rk, \rounds, \temp .endm /* prepare for decryption with key in rk[] */ .macro dec_prepare, rounds, rk, temp - mov \temp, \rk - load_round_keys \rounds, \temp + load_round_keys \rk, \rounds, \temp .endm .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3, i4 @@ -110,14 +105,13 @@ /* up to 5 interleaved blocks */ .macro do_block_Nx, enc, rounds, i0, i1, i2, i3, i4 - cmp \rounds, #12 - blo 2222f /* 128 bits */ - beq 1111f /* 192 bits */ + tbz \rounds, #2, .L\@ /* 128 bits */ round_Nx \enc, v17, \i0, \i1, \i2, \i3, \i4 round_Nx \enc, v18, \i0, \i1, \i2, \i3, \i4 -1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3, \i4 + tbz \rounds, #1, .L\@ /* 192 bits */ + round_Nx \enc, v19, \i0, \i1, \i2, \i3, \i4 round_Nx \enc, v20, \i0, \i1, \i2, \i3, \i4 -2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 +.L\@: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 round_Nx \enc, \key, \i0, \i1, \i2, \i3, \i4 .endr fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3, \i4 diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 162787c7aa86..b0150999743f 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -1048,6 +1048,7 @@ unregister_ciphers: #ifdef USE_V8_CRYPTO_EXTENSIONS module_cpu_feature_match(AES, aes_init); +EXPORT_SYMBOL_NS(ce_aes_mac_update, "CRYPTO_INTERNAL"); #else module_init(aes_init); EXPORT_SYMBOL(neon_aes_ecb_encrypt); diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S index 9de7fbc797af..3a8961b6ea51 100644 --- a/arch/arm64/crypto/aes-neon.S +++ b/arch/arm64/crypto/aes-neon.S @@ -99,16 +99,16 @@ ld1 {v15.4s}, [\rk] add \rkp, \rk, #16 mov \i, \rounds -1111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ +.La\@: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ movi v15.16b, #0x40 tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */ sub_bytes \in - subs \i, \i, #1 + sub \i, \i, #1 ld1 {v15.4s}, [\rkp], #16 - beq 2222f + cbz \i, .Lb\@ mix_columns \in, \enc - b 1111b -2222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ + b .La\@ +.Lb\@: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ .endm .macro encrypt_block, in, rounds, rk, rkp, i @@ -206,7 +206,7 @@ ld1 {v15.4s}, [\rk] add \rkp, \rk, #16 mov \i, \rounds -1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ +.La\@: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ @@ -216,13 +216,13 @@ tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */ tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */ sub_bytes_4x \in0, \in1, \in2, \in3 - subs \i, \i, #1 + sub \i, \i, #1 ld1 {v15.4s}, [\rkp], #16 - beq 2222f + cbz \i, .Lb\@ mix_columns_2x \in0, \in1, \enc mix_columns_2x \in2, \in3, \enc - b 1111b -2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ + b .La\@ +.Lb\@: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index 467ac2f768ac..46425e7b9755 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -16,6 +16,7 @@ #include <linux/module.h> MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_DESCRIPTION("Bit sliced AES using NEON instructions"); 
MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("ecb(aes)"); diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S deleted file mode 100644 index 5604de61d06d..000000000000 --- a/arch/arm64/crypto/crct10dif-ce-core.S +++ /dev/null @@ -1,514 +0,0 @@ -// -// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions -// -// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> -// Copyright (C) 2019 Google LLC <ebiggers@google.com> -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License version 2 as -// published by the Free Software Foundation. -// - -// Derived from the x86 version: -// -// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions -// -// Copyright (c) 2013, Intel Corporation -// -// Authors: -// Erdinc Ozturk <erdinc.ozturk@intel.com> -// Vinodh Gopal <vinodh.gopal@intel.com> -// James Guilford <james.guilford@intel.com> -// Tim Chen <tim.c.chen@linux.intel.com> -// -// This software is available to you under a choice of one of two -// licenses. You may choose to be licensed under the terms of the GNU -// General Public License (GPL) Version 2, available from the file -// COPYING in the main directory of this source tree, or the -// OpenIB.org BSD license below: -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// * Neither the name of the Intel Corporation nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// -// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-// -// Reference paper titled "Fast CRC Computation for Generic -// Polynomials Using PCLMULQDQ Instruction" -// URL: http://www.intel.com/content/dam/www/public/us/en/documents -// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf -// - -#include <linux/linkage.h> -#include <asm/assembler.h> - - .text - .arch armv8-a+crypto - - init_crc .req w0 - buf .req x1 - len .req x2 - fold_consts_ptr .req x3 - - fold_consts .req v10 - - ad .req v14 - - k00_16 .req v15 - k32_48 .req v16 - - t3 .req v17 - t4 .req v18 - t5 .req v19 - t6 .req v20 - t7 .req v21 - t8 .req v22 - t9 .req v23 - - perm1 .req v24 - perm2 .req v25 - perm3 .req v26 - perm4 .req v27 - - bd1 .req v28 - bd2 .req v29 - bd3 .req v30 - bd4 .req v31 - - .macro __pmull_init_p64 - .endm - - .macro __pmull_pre_p64, bd - .endm - - .macro __pmull_init_p8 - // k00_16 := 0x0000000000000000_000000000000ffff - // k32_48 := 0x00000000ffffffff_0000ffffffffffff - movi k32_48.2d, #0xffffffff - mov k32_48.h[2], k32_48.h[0] - ushr k00_16.2d, k32_48.2d, #32 - - // prepare the permutation vectors - mov_q x5, 0x080f0e0d0c0b0a09 - movi perm4.8b, #8 - dup perm1.2d, x5 - eor perm1.16b, perm1.16b, perm4.16b - ushr perm2.2d, perm1.2d, #8 - ushr perm3.2d, perm1.2d, #16 - ushr perm4.2d, perm1.2d, #24 - sli perm2.2d, perm1.2d, #56 - sli perm3.2d, perm1.2d, #48 - sli perm4.2d, perm1.2d, #40 - .endm - - .macro __pmull_pre_p8, bd - tbl bd1.16b, {\bd\().16b}, perm1.16b - tbl bd2.16b, {\bd\().16b}, perm2.16b - tbl bd3.16b, {\bd\().16b}, perm3.16b - tbl bd4.16b, {\bd\().16b}, perm4.16b - .endm - -SYM_FUNC_START_LOCAL(__pmull_p8_core) -.L__pmull_p8_core: - ext t4.8b, ad.8b, ad.8b, #1 // A1 - ext t5.8b, ad.8b, ad.8b, #2 // A2 - ext t6.8b, ad.8b, ad.8b, #3 // A3 - - pmull t4.8h, t4.8b, fold_consts.8b // F = A1*B - pmull t8.8h, ad.8b, bd1.8b // E = A*B1 - pmull t5.8h, t5.8b, fold_consts.8b // H = A2*B - pmull t7.8h, ad.8b, bd2.8b // G = A*B2 - pmull t6.8h, t6.8b, fold_consts.8b // J = A3*B - pmull t9.8h, ad.8b, bd3.8b // I = A*B3 - pmull t3.8h, ad.8b, bd4.8b // K = A*B4 - b 0f - -.L__pmull_p8_core2: - tbl t4.16b, {ad.16b}, perm1.16b // A1 - tbl t5.16b, {ad.16b}, perm2.16b // A2 - tbl t6.16b, {ad.16b}, perm3.16b // A3 - - pmull2 t4.8h, t4.16b, fold_consts.16b // F = A1*B - pmull2 t8.8h, ad.16b, bd1.16b // E = A*B1 - pmull2 t5.8h, t5.16b, fold_consts.16b // H = A2*B - pmull2 t7.8h, ad.16b, bd2.16b // G = A*B2 - pmull2 t6.8h, t6.16b, fold_consts.16b // J = A3*B - pmull2 t9.8h, ad.16b, bd3.16b // I = A*B3 - pmull2 t3.8h, ad.16b, bd4.16b // K = A*B4 - -0: eor t4.16b, t4.16b, t8.16b // L = E + F - eor t5.16b, t5.16b, t7.16b // M = G + H - eor t6.16b, t6.16b, t9.16b // N = I + J - - uzp1 t8.2d, t4.2d, t5.2d - uzp2 t4.2d, t4.2d, t5.2d - uzp1 t7.2d, t6.2d, t3.2d - uzp2 t6.2d, t6.2d, t3.2d - - // t4 = (L) (P0 + P1) << 8 - // t5 = (M) (P2 + P3) << 16 - eor t8.16b, t8.16b, t4.16b - and t4.16b, t4.16b, k32_48.16b - - // t6 = (N) (P4 + P5) << 24 - // t7 = (K) (P6 + P7) << 32 - eor t7.16b, t7.16b, t6.16b - and t6.16b, t6.16b, k00_16.16b - - eor t8.16b, t8.16b, t4.16b - eor t7.16b, t7.16b, t6.16b - - zip2 t5.2d, t8.2d, t4.2d - zip1 t4.2d, t8.2d, t4.2d - zip2 t3.2d, t7.2d, t6.2d - zip1 t6.2d, t7.2d, t6.2d - - ext t4.16b, t4.16b, t4.16b, #15 - ext t5.16b, t5.16b, t5.16b, #14 - ext t6.16b, t6.16b, t6.16b, #13 - ext t3.16b, t3.16b, t3.16b, #12 - - eor t4.16b, t4.16b, t5.16b - eor t6.16b, t6.16b, t3.16b - ret -SYM_FUNC_END(__pmull_p8_core) - - .macro __pmull_p8, rq, ad, bd, i - .ifnc \bd, fold_consts - .err - .endif - mov ad.16b, \ad\().16b - .ifb \i - pmull 
\rq\().8h, \ad\().8b, \bd\().8b // D = A*B - .else - pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B - .endif - - bl .L__pmull_p8_core\i - - eor \rq\().16b, \rq\().16b, t4.16b - eor \rq\().16b, \rq\().16b, t6.16b - .endm - - // Fold reg1, reg2 into the next 32 data bytes, storing the result back - // into reg1, reg2. - .macro fold_32_bytes, p, reg1, reg2 - ldp q11, q12, [buf], #0x20 - - __pmull_\p v8, \reg1, fold_consts, 2 - __pmull_\p \reg1, \reg1, fold_consts - -CPU_LE( rev64 v11.16b, v11.16b ) -CPU_LE( rev64 v12.16b, v12.16b ) - - __pmull_\p v9, \reg2, fold_consts, 2 - __pmull_\p \reg2, \reg2, fold_consts - -CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 ) -CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) - - eor \reg1\().16b, \reg1\().16b, v8.16b - eor \reg2\().16b, \reg2\().16b, v9.16b - eor \reg1\().16b, \reg1\().16b, v11.16b - eor \reg2\().16b, \reg2\().16b, v12.16b - .endm - - // Fold src_reg into dst_reg, optionally loading the next fold constants - .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts - __pmull_\p v8, \src_reg, fold_consts - __pmull_\p \src_reg, \src_reg, fold_consts, 2 - .ifnb \load_next_consts - ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - __pmull_pre_\p fold_consts - .endif - eor \dst_reg\().16b, \dst_reg\().16b, v8.16b - eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b - .endm - - .macro __pmull_p64, rd, rn, rm, n - .ifb \n - pmull \rd\().1q, \rn\().1d, \rm\().1d - .else - pmull2 \rd\().1q, \rn\().2d, \rm\().2d - .endif - .endm - - .macro crc_t10dif_pmull, p - __pmull_init_\p - - // For sizes less than 256 bytes, we can't fold 128 bytes at a time. - cmp len, #256 - b.lt .Lless_than_256_bytes_\@ - - adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts - - // Load the first 128 data bytes. Byte swapping is necessary to make - // the bit order match the polynomial coefficient order. - ldp q0, q1, [buf] - ldp q2, q3, [buf, #0x20] - ldp q4, q5, [buf, #0x40] - ldp q6, q7, [buf, #0x60] - add buf, buf, #0x80 -CPU_LE( rev64 v0.16b, v0.16b ) -CPU_LE( rev64 v1.16b, v1.16b ) -CPU_LE( rev64 v2.16b, v2.16b ) -CPU_LE( rev64 v3.16b, v3.16b ) -CPU_LE( rev64 v4.16b, v4.16b ) -CPU_LE( rev64 v5.16b, v5.16b ) -CPU_LE( rev64 v6.16b, v6.16b ) -CPU_LE( rev64 v7.16b, v7.16b ) -CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) -CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 ) -CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 ) -CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 ) -CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 ) -CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 ) -CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 ) -CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) - - // XOR the first 16 data *bits* with the initial CRC value. - movi v8.16b, #0 - mov v8.h[7], init_crc - eor v0.16b, v0.16b, v8.16b - - // Load the constants for folding across 128 bytes. - ld1 {fold_consts.2d}, [fold_consts_ptr] - __pmull_pre_\p fold_consts - - // Subtract 128 for the 128 data bytes just consumed. Subtract another - // 128 to simplify the termination condition of the following loop. - sub len, len, #256 - - // While >= 128 data bytes remain (not counting v0-v7), fold the 128 - // bytes v0-v7 into them, storing the result back into v0-v7. -.Lfold_128_bytes_loop_\@: - fold_32_bytes \p, v0, v1 - fold_32_bytes \p, v2, v3 - fold_32_bytes \p, v4, v5 - fold_32_bytes \p, v6, v7 - - subs len, len, #128 - b.ge .Lfold_128_bytes_loop_\@ - - // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7. - - // Fold across 64 bytes. 
- add fold_consts_ptr, fold_consts_ptr, #16 - ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - __pmull_pre_\p fold_consts - fold_16_bytes \p, v0, v4 - fold_16_bytes \p, v1, v5 - fold_16_bytes \p, v2, v6 - fold_16_bytes \p, v3, v7, 1 - // Fold across 32 bytes. - fold_16_bytes \p, v4, v6 - fold_16_bytes \p, v5, v7, 1 - // Fold across 16 bytes. - fold_16_bytes \p, v6, v7 - - // Add 128 to get the correct number of data bytes remaining in 0...127 - // (not counting v7), following the previous extra subtraction by 128. - // Then subtract 16 to simplify the termination condition of the - // following loop. - adds len, len, #(128-16) - - // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7 - // into them, storing the result back into v7. - b.lt .Lfold_16_bytes_loop_done_\@ -.Lfold_16_bytes_loop_\@: - __pmull_\p v8, v7, fold_consts - __pmull_\p v7, v7, fold_consts, 2 - eor v7.16b, v7.16b, v8.16b - ldr q0, [buf], #16 -CPU_LE( rev64 v0.16b, v0.16b ) -CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) - eor v7.16b, v7.16b, v0.16b - subs len, len, #16 - b.ge .Lfold_16_bytes_loop_\@ - -.Lfold_16_bytes_loop_done_\@: - // Add 16 to get the correct number of data bytes remaining in 0...15 - // (not counting v7), following the previous extra subtraction by 16. - adds len, len, #16 - b.eq .Lreduce_final_16_bytes_\@ - -.Lhandle_partial_segment_\@: - // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first - // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To - // do this without needing a fold constant for each possible 'len', - // redivide the bytes into a first chunk of 'len' bytes and a second - // chunk of 16 bytes, then fold the first chunk into the second. - - // v0 = last 16 original data bytes - add buf, buf, len - ldr q0, [buf, #-16] -CPU_LE( rev64 v0.16b, v0.16b ) -CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) - - // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes. - adr_l x4, .Lbyteshift_table + 16 - sub x4, x4, len - ld1 {v2.16b}, [x4] - tbl v1.16b, {v7.16b}, v2.16b - - // v3 = first chunk: v7 right-shifted by '16-len' bytes. - movi v3.16b, #0x80 - eor v2.16b, v2.16b, v3.16b - tbl v3.16b, {v7.16b}, v2.16b - - // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes. - sshr v2.16b, v2.16b, #7 - - // v2 = second chunk: 'len' bytes from v0 (low-order bytes), - // then '16-len' bytes from v1 (high-order bytes). - bsl v2.16b, v1.16b, v0.16b - - // Fold the first chunk into the second chunk, storing the result in v7. - __pmull_\p v0, v3, fold_consts - __pmull_\p v7, v3, fold_consts, 2 - eor v7.16b, v7.16b, v0.16b - eor v7.16b, v7.16b, v2.16b - -.Lreduce_final_16_bytes_\@: - // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC. - - movi v2.16b, #0 // init zero register - - // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. - ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - __pmull_pre_\p fold_consts - - // Fold the high 64 bits into the low 64 bits, while also multiplying by - // x^64. This produces a 128-bit value congruent to x^64 * M(x) and - // whose low 48 bits are 0. - ext v0.16b, v2.16b, v7.16b, #8 - __pmull_\p v7, v7, fold_consts, 2 // high bits * x^48 * (x^80 mod G(x)) - eor v0.16b, v0.16b, v7.16b // + low bits * x^64 - - // Fold the high 32 bits into the low 96 bits. This produces a 96-bit - // value congruent to x^64 * M(x) and whose low 48 bits are 0. 
- ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits - mov v0.s[3], v2.s[0] // zero high 32 bits - __pmull_\p v1, v1, fold_consts // high 32 bits * x^48 * (x^48 mod G(x)) - eor v0.16b, v0.16b, v1.16b // + low bits - - // Load G(x) and floor(x^48 / G(x)). - ld1 {fold_consts.2d}, [fold_consts_ptr] - __pmull_pre_\p fold_consts - - // Use Barrett reduction to compute the final CRC value. - __pmull_\p v1, v0, fold_consts, 2 // high 32 bits * floor(x^48 / G(x)) - ushr v1.2d, v1.2d, #32 // /= x^32 - __pmull_\p v1, v1, fold_consts // *= G(x) - ushr v0.2d, v0.2d, #48 - eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits - // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0. - - umov w0, v0.h[0] - .ifc \p, p8 - frame_pop - .endif - ret - -.Lless_than_256_bytes_\@: - // Checksumming a buffer of length 16...255 bytes - - adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts - - // Load the first 16 data bytes. - ldr q7, [buf], #0x10 -CPU_LE( rev64 v7.16b, v7.16b ) -CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) - - // XOR the first 16 data *bits* with the initial CRC value. - movi v0.16b, #0 - mov v0.h[7], init_crc - eor v7.16b, v7.16b, v0.16b - - // Load the fold-across-16-bytes constants. - ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - __pmull_pre_\p fold_consts - - cmp len, #16 - b.eq .Lreduce_final_16_bytes_\@ // len == 16 - subs len, len, #32 - b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255 - add len, len, #16 - b .Lhandle_partial_segment_\@ // 17 <= len <= 31 - .endm - -// -// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len); -// -// Assumes len >= 16. -// -SYM_FUNC_START(crc_t10dif_pmull_p8) - frame_push 1 - crc_t10dif_pmull p8 -SYM_FUNC_END(crc_t10dif_pmull_p8) - - .align 5 -// -// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); -// -// Assumes len >= 16. -// -SYM_FUNC_START(crc_t10dif_pmull_p64) - crc_t10dif_pmull p64 -SYM_FUNC_END(crc_t10dif_pmull_p64) - - .section ".rodata", "a" - .align 4 - -// Fold constants precomputed from the polynomial 0x18bb7 -// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 -.Lfold_across_128_bytes_consts: - .quad 0x0000000000006123 // x^(8*128) mod G(x) - .quad 0x0000000000002295 // x^(8*128+64) mod G(x) -// .Lfold_across_64_bytes_consts: - .quad 0x0000000000001069 // x^(4*128) mod G(x) - .quad 0x000000000000dd31 // x^(4*128+64) mod G(x) -// .Lfold_across_32_bytes_consts: - .quad 0x000000000000857d // x^(2*128) mod G(x) - .quad 0x0000000000007acc // x^(2*128+64) mod G(x) -.Lfold_across_16_bytes_consts: - .quad 0x000000000000a010 // x^(1*128) mod G(x) - .quad 0x0000000000001faa // x^(1*128+64) mod G(x) -// .Lfinal_fold_consts: - .quad 0x1368000000000000 // x^48 * (x^48 mod G(x)) - .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x)) -// .Lbarrett_reduction_consts: - .quad 0x0000000000018bb7 // G(x) - .quad 0x00000001f65a57f8 // floor(x^48 / G(x)) - -// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - -// len] is the index vector to shift left by 'len' bytes, and is also {0x80, -// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes. 
-.Lbyteshift_table: - .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 - .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f - .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 - .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c deleted file mode 100644 index 09eb1456aed4..000000000000 --- a/arch/arm64/crypto/crct10dif-ce-glue.c +++ /dev/null @@ -1,143 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions - * - * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> - */ - -#include <linux/cpufeature.h> -#include <linux/crc-t10dif.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> - -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> - -#include <asm/neon.h> -#include <asm/simd.h> - -#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U - -asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len); -asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); - -static int crct10dif_init(struct shash_desc *desc) -{ - u16 *crc = shash_desc_ctx(desc); - - *crc = 0; - return 0; -} - -static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data, - unsigned int length) -{ - u16 *crc = shash_desc_ctx(desc); - - if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { - do { - unsigned int chunk = length; - - if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE) - chunk = SZ_4K; - - kernel_neon_begin(); - *crc = crc_t10dif_pmull_p8(*crc, data, chunk); - kernel_neon_end(); - data += chunk; - length -= chunk; - } while (length); - } else { - *crc = crc_t10dif_generic(*crc, data, length); - } - - return 0; -} - -static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data, - unsigned int length) -{ - u16 *crc = shash_desc_ctx(desc); - - if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { - do { - unsigned int chunk = length; - - if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE) - chunk = SZ_4K; - - kernel_neon_begin(); - *crc = crc_t10dif_pmull_p64(*crc, data, chunk); - kernel_neon_end(); - data += chunk; - length -= chunk; - } while (length); - } else { - *crc = crc_t10dif_generic(*crc, data, length); - } - - return 0; -} - -static int crct10dif_final(struct shash_desc *desc, u8 *out) -{ - u16 *crc = shash_desc_ctx(desc); - - *(u16 *)out = *crc; - return 0; -} - -static struct shash_alg crc_t10dif_alg[] = {{ - .digestsize = CRC_T10DIF_DIGEST_SIZE, - .init = crct10dif_init, - .update = crct10dif_update_pmull_p8, - .final = crct10dif_final, - .descsize = CRC_T10DIF_DIGEST_SIZE, - - .base.cra_name = "crct10dif", - .base.cra_driver_name = "crct10dif-arm64-neon", - .base.cra_priority = 100, - .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}, { - .digestsize = CRC_T10DIF_DIGEST_SIZE, - .init = crct10dif_init, - .update = crct10dif_update_pmull_p64, - .final = crct10dif_final, - .descsize = CRC_T10DIF_DIGEST_SIZE, - - .base.cra_name = "crct10dif", - .base.cra_driver_name = "crct10dif-arm64-ce", - .base.cra_priority = 200, - .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}}; - -static int __init crc_t10dif_mod_init(void) -{ - if (cpu_have_named_feature(PMULL)) - return crypto_register_shashes(crc_t10dif_alg, - ARRAY_SIZE(crc_t10dif_alg)); - else - /* only register the first array element */ - return 
crypto_register_shash(crc_t10dif_alg); -} - -static void __exit crc_t10dif_mod_exit(void) -{ - if (cpu_have_named_feature(PMULL)) - crypto_unregister_shashes(crc_t10dif_alg, - ARRAY_SIZE(crc_t10dif_alg)); - else - crypto_unregister_shash(crc_t10dif_alg); -} - -module_cpu_feature_match(ASIMD, crc_t10dif_mod_init); -module_exit(crc_t10dif_mod_exit); - -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("crct10dif"); -MODULE_ALIAS_CRYPTO("crct10dif-arm64-ce"); diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 97331b454ea8..da7b7ec1a664 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -7,7 +7,7 @@ #include <asm/neon.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/aes.h> #include <crypto/gcm.h> #include <crypto/algapi.h> diff --git a/arch/arm64/crypto/poly1305-armv8.pl b/arch/arm64/crypto/poly1305-armv8.pl index cbc980fb02e3..22c9069c0650 100644 --- a/arch/arm64/crypto/poly1305-armv8.pl +++ b/arch/arm64/crypto/poly1305-armv8.pl @@ -473,7 +473,8 @@ poly1305_blocks_neon: subs $len,$len,#64 ldp x9,x13,[$inp,#48] add $in2,$inp,#96 - adr $zeros,.Lzeros + adrp $zeros,.Lzeros + add $zeros,$zeros,#:lo12:.Lzeros lsl $padbit,$padbit,#24 add x15,$ctx,#48 @@ -885,10 +886,13 @@ poly1305_blocks_neon: ret .size poly1305_blocks_neon,.-poly1305_blocks_neon +.pushsection .rodata .align 5 .Lzeros: .long 0,0,0,0,0,0,0,0 .asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm" +.popsection + .align 2 #if !defined(__KERNEL__) && !defined(_WIN64) .comm OPENSSL_armcap_P,4,4 diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c index 1fae18ba11ed..18883ea438f3 100644 --- a/arch/arm64/crypto/poly1305-glue.c +++ b/arch/arm64/crypto/poly1305-glue.c @@ -8,7 +8,7 @@ #include <asm/hwcap.h> #include <asm/neon.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/algapi.h> #include <crypto/internal/hash.h> #include <crypto/internal/poly1305.h> @@ -226,6 +226,7 @@ static void __exit neon_poly1305_mod_exit(void) module_init(neon_poly1305_mod_init); module_exit(neon_poly1305_mod_exit); +MODULE_DESCRIPTION("Poly1305 transform using NEON instructions"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("poly1305"); MODULE_ALIAS_CRYPTO("poly1305-neon"); diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index 1dd93e1fcb39..cbd14f208f83 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -7,7 +7,7 @@ #include <asm/neon.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> #include <crypto/sha1.h> diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index 0a44d2e7ee1f..6b4866a88ded 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -7,7 +7,7 @@ #include <asm/neon.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> #include <crypto/sha2.h> diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c index 250e1377c481..5662c3ac49e9 100644 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ b/arch/arm64/crypto/sha3-ce-glue.c @@ -12,7 +12,7 @@ #include <asm/hwcap.h> #include <asm/neon.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include 
<linux/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> #include <crypto/sha3.h> diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c index f3431fc62315..071f64293227 100644 --- a/arch/arm64/crypto/sha512-ce-glue.c +++ b/arch/arm64/crypto/sha512-ce-glue.c @@ -11,7 +11,7 @@ #include <asm/neon.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> #include <crypto/sha2.h> diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c index 54bf6ebcfffb..1a71788c4cda 100644 --- a/arch/arm64/crypto/sm3-ce-glue.c +++ b/arch/arm64/crypto/sm3-ce-glue.c @@ -7,7 +7,7 @@ #include <asm/neon.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> #include <crypto/sm3.h> diff --git a/arch/arm64/crypto/sm3-neon-glue.c b/arch/arm64/crypto/sm3-neon-glue.c index 7182ee683f14..8dd71ce79b69 100644 --- a/arch/arm64/crypto/sm3-neon-glue.c +++ b/arch/arm64/crypto/sm3-neon-glue.c @@ -7,7 +7,7 @@ #include <asm/neon.h> #include <asm/simd.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> #include <crypto/sm3.h> |