Diffstat (limited to 'arch/arm64/lib')
-rw-r--r--  arch/arm64/lib/.gitignore        |   4
-rw-r--r--  arch/arm64/lib/Makefile          |   6
-rw-r--r--  arch/arm64/lib/crc-t10dif-core.S | 469
-rw-r--r--  arch/arm64/lib/crc-t10dif-glue.c |  73
-rw-r--r--  arch/arm64/lib/crc32-glue.c      |  99
-rw-r--r--  arch/arm64/lib/crc32.S           | 362
-rw-r--r--  arch/arm64/lib/insn.c            |  60
-rw-r--r--  arch/arm64/lib/xor-neon.c        |   2
8 files changed, 42 insertions, 1033 deletions
diff --git a/arch/arm64/lib/.gitignore b/arch/arm64/lib/.gitignore new file mode 100644 index 000000000000..647d7a922e68 --- /dev/null +++ b/arch/arm64/lib/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only + +# This now-removed directory used to contain generated files. +/crypto/ diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 4d49dff721a8..633e5223d944 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -13,12 +13,6 @@ endif lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o -obj-$(CONFIG_CRC32_ARCH) += crc32-arm64.o -crc32-arm64-y := crc32.o crc32-glue.o - -obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-arm64.o -crc-t10dif-arm64-y := crc-t10dif-glue.o crc-t10dif-core.o - obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o obj-$(CONFIG_ARM64_MTE) += mte.o diff --git a/arch/arm64/lib/crc-t10dif-core.S b/arch/arm64/lib/crc-t10dif-core.S deleted file mode 100644 index 87dd6d46224d..000000000000 --- a/arch/arm64/lib/crc-t10dif-core.S +++ /dev/null @@ -1,469 +0,0 @@ -// -// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions -// -// Copyright (C) 2016 Linaro Ltd -// Copyright (C) 2019-2024 Google LLC -// -// Authors: Ard Biesheuvel <ardb@google.com> -// Eric Biggers <ebiggers@google.com> -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License version 2 as -// published by the Free Software Foundation. -// - -// Derived from the x86 version: -// -// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions -// -// Copyright (c) 2013, Intel Corporation -// -// Authors: -// Erdinc Ozturk <erdinc.ozturk@intel.com> -// Vinodh Gopal <vinodh.gopal@intel.com> -// James Guilford <james.guilford@intel.com> -// Tim Chen <tim.c.chen@linux.intel.com> -// -// This software is available to you under a choice of one of two -// licenses. You may choose to be licensed under the terms of the GNU -// General Public License (GPL) Version 2, available from the file -// COPYING in the main directory of this source tree, or the -// OpenIB.org BSD license below: -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// * Neither the name of the Intel Corporation nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// -// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL CORPORATION OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Reference paper titled "Fast CRC Computation for Generic -// Polynomials Using PCLMULQDQ Instruction" -// URL: http://www.intel.com/content/dam/www/public/us/en/documents -// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf -// - -#include <linux/linkage.h> -#include <asm/assembler.h> - - .text - .arch armv8-a+crypto - - init_crc .req w0 - buf .req x1 - len .req x2 - fold_consts_ptr .req x5 - - fold_consts .req v10 - - t3 .req v17 - t4 .req v18 - t5 .req v19 - t6 .req v20 - t7 .req v21 - t8 .req v22 - - perm .req v27 - - .macro pmull16x64_p64, a16, b64, c64 - pmull2 \c64\().1q, \a16\().2d, \b64\().2d - pmull \b64\().1q, \a16\().1d, \b64\().1d - .endm - - /* - * Pairwise long polynomial multiplication of two 16-bit values - * - * { w0, w1 }, { y0, y1 } - * - * by two 64-bit values - * - * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 } - * - * where each vector element is a byte, ordered from least to most - * significant. - * - * This can be implemented using 8x8 long polynomial multiplication, by - * reorganizing the input so that each pairwise 8x8 multiplication - * produces one of the terms from the decomposition below, and - * combining the results of each rank and shifting them into place. - * - * Rank - * 0 w0*x0 ^ | y0*z0 ^ - * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^ - * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^ - * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^ - * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^ - * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^ - * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^ - * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^ - * 8 w1*x7 << 64 | y1*z7 << 64 - * - * The inputs can be reorganized into - * - * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 } - * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 } - * - * and after performing 8x8->16 bit long polynomial multiplication of - * each of the halves of the first vector with those of the second one, - * we obtain the following four vectors of 16-bit elements: - * - * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 } - * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 } - * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 } - * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 } - * - * Results b and c can be XORed together, as the vector elements have - * matching ranks. Then, the final XOR (*) can be pulled forward, and - * applied between the halves of each of the remaining three vectors, - * which are then shifted into place, and combined to produce two - * 80-bit results. - * - * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent - * to the 64x64 bit one above, but XOR'ing the outputs together will - * produce the expected result, and this is sufficient in the context of - * this algorithm. 
- */ - .macro pmull16x64_p8, a16, b64, c64 - ext t7.16b, \b64\().16b, \b64\().16b, #1 - tbl t5.16b, {\a16\().16b}, perm.16b - uzp1 t7.16b, \b64\().16b, t7.16b - bl __pmull_p8_16x64 - ext \b64\().16b, t4.16b, t4.16b, #15 - eor \c64\().16b, t8.16b, t5.16b - .endm - -SYM_FUNC_START_LOCAL(__pmull_p8_16x64) - ext t6.16b, t5.16b, t5.16b, #8 - - pmull t3.8h, t7.8b, t5.8b - pmull t4.8h, t7.8b, t6.8b - pmull2 t5.8h, t7.16b, t5.16b - pmull2 t6.8h, t7.16b, t6.16b - - ext t8.16b, t3.16b, t3.16b, #8 - eor t4.16b, t4.16b, t6.16b - ext t7.16b, t5.16b, t5.16b, #8 - ext t6.16b, t4.16b, t4.16b, #8 - eor t8.8b, t8.8b, t3.8b - eor t5.8b, t5.8b, t7.8b - eor t4.8b, t4.8b, t6.8b - ext t5.16b, t5.16b, t5.16b, #14 - ret -SYM_FUNC_END(__pmull_p8_16x64) - - - // Fold reg1, reg2 into the next 32 data bytes, storing the result back - // into reg1, reg2. - .macro fold_32_bytes, p, reg1, reg2 - ldp q11, q12, [buf], #0x20 - - pmull16x64_\p fold_consts, \reg1, v8 - -CPU_LE( rev64 v11.16b, v11.16b ) -CPU_LE( rev64 v12.16b, v12.16b ) - - pmull16x64_\p fold_consts, \reg2, v9 - -CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 ) -CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) - - eor \reg1\().16b, \reg1\().16b, v8.16b - eor \reg2\().16b, \reg2\().16b, v9.16b - eor \reg1\().16b, \reg1\().16b, v11.16b - eor \reg2\().16b, \reg2\().16b, v12.16b - .endm - - // Fold src_reg into dst_reg, optionally loading the next fold constants - .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts - pmull16x64_\p fold_consts, \src_reg, v8 - .ifnb \load_next_consts - ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - .endif - eor \dst_reg\().16b, \dst_reg\().16b, v8.16b - eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b - .endm - - .macro crc_t10dif_pmull, p - - // For sizes less than 256 bytes, we can't fold 128 bytes at a time. - cmp len, #256 - b.lt .Lless_than_256_bytes_\@ - - adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts - - // Load the first 128 data bytes. Byte swapping is necessary to make - // the bit order match the polynomial coefficient order. - ldp q0, q1, [buf] - ldp q2, q3, [buf, #0x20] - ldp q4, q5, [buf, #0x40] - ldp q6, q7, [buf, #0x60] - add buf, buf, #0x80 -CPU_LE( rev64 v0.16b, v0.16b ) -CPU_LE( rev64 v1.16b, v1.16b ) -CPU_LE( rev64 v2.16b, v2.16b ) -CPU_LE( rev64 v3.16b, v3.16b ) -CPU_LE( rev64 v4.16b, v4.16b ) -CPU_LE( rev64 v5.16b, v5.16b ) -CPU_LE( rev64 v6.16b, v6.16b ) -CPU_LE( rev64 v7.16b, v7.16b ) -CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) -CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 ) -CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 ) -CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 ) -CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 ) -CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 ) -CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 ) -CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) - - // XOR the first 16 data *bits* with the initial CRC value. - movi v8.16b, #0 - mov v8.h[7], init_crc - eor v0.16b, v0.16b, v8.16b - - // Load the constants for folding across 128 bytes. - ld1 {fold_consts.2d}, [fold_consts_ptr] - - // Subtract 128 for the 128 data bytes just consumed. Subtract another - // 128 to simplify the termination condition of the following loop. - sub len, len, #256 - - // While >= 128 data bytes remain (not counting v0-v7), fold the 128 - // bytes v0-v7 into them, storing the result back into v0-v7. -.Lfold_128_bytes_loop_\@: - fold_32_bytes \p, v0, v1 - fold_32_bytes \p, v2, v3 - fold_32_bytes \p, v4, v5 - fold_32_bytes \p, v6, v7 - - subs len, len, #128 - b.ge .Lfold_128_bytes_loop_\@ - - // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7. 
- - // Fold across 64 bytes. - add fold_consts_ptr, fold_consts_ptr, #16 - ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - fold_16_bytes \p, v0, v4 - fold_16_bytes \p, v1, v5 - fold_16_bytes \p, v2, v6 - fold_16_bytes \p, v3, v7, 1 - // Fold across 32 bytes. - fold_16_bytes \p, v4, v6 - fold_16_bytes \p, v5, v7, 1 - // Fold across 16 bytes. - fold_16_bytes \p, v6, v7 - - // Add 128 to get the correct number of data bytes remaining in 0...127 - // (not counting v7), following the previous extra subtraction by 128. - // Then subtract 16 to simplify the termination condition of the - // following loop. - adds len, len, #(128-16) - - // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7 - // into them, storing the result back into v7. - b.lt .Lfold_16_bytes_loop_done_\@ -.Lfold_16_bytes_loop_\@: - pmull16x64_\p fold_consts, v7, v8 - eor v7.16b, v7.16b, v8.16b - ldr q0, [buf], #16 -CPU_LE( rev64 v0.16b, v0.16b ) -CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) - eor v7.16b, v7.16b, v0.16b - subs len, len, #16 - b.ge .Lfold_16_bytes_loop_\@ - -.Lfold_16_bytes_loop_done_\@: - // Add 16 to get the correct number of data bytes remaining in 0...15 - // (not counting v7), following the previous extra subtraction by 16. - adds len, len, #16 - b.eq .Lreduce_final_16_bytes_\@ - -.Lhandle_partial_segment_\@: - // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first - // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To - // do this without needing a fold constant for each possible 'len', - // redivide the bytes into a first chunk of 'len' bytes and a second - // chunk of 16 bytes, then fold the first chunk into the second. - - // v0 = last 16 original data bytes - add buf, buf, len - ldr q0, [buf, #-16] -CPU_LE( rev64 v0.16b, v0.16b ) -CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) - - // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes. - adr_l x4, .Lbyteshift_table + 16 - sub x4, x4, len - ld1 {v2.16b}, [x4] - tbl v1.16b, {v7.16b}, v2.16b - - // v3 = first chunk: v7 right-shifted by '16-len' bytes. - movi v3.16b, #0x80 - eor v2.16b, v2.16b, v3.16b - tbl v3.16b, {v7.16b}, v2.16b - - // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes. - sshr v2.16b, v2.16b, #7 - - // v2 = second chunk: 'len' bytes from v0 (low-order bytes), - // then '16-len' bytes from v1 (high-order bytes). - bsl v2.16b, v1.16b, v0.16b - - // Fold the first chunk into the second chunk, storing the result in v7. - pmull16x64_\p fold_consts, v3, v0 - eor v7.16b, v3.16b, v0.16b - eor v7.16b, v7.16b, v2.16b - b .Lreduce_final_16_bytes_\@ - -.Lless_than_256_bytes_\@: - // Checksumming a buffer of length 16...255 bytes - - adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts - - // Load the first 16 data bytes. - ldr q7, [buf], #0x10 -CPU_LE( rev64 v7.16b, v7.16b ) -CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) - - // XOR the first 16 data *bits* with the initial CRC value. - movi v0.16b, #0 - mov v0.h[7], init_crc - eor v7.16b, v7.16b, v0.16b - - // Load the fold-across-16-bytes constants. - ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - - cmp len, #16 - b.eq .Lreduce_final_16_bytes_\@ // len == 16 - subs len, len, #32 - b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255 - add len, len, #16 - b .Lhandle_partial_segment_\@ // 17 <= len <= 31 - -.Lreduce_final_16_bytes_\@: - .endm - -// -// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len); -// -// Assumes len >= 16. 
-// -SYM_FUNC_START(crc_t10dif_pmull_p8) - frame_push 1 - - // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 } - movi perm.4h, #8, lsl #8 - orr perm.2s, #1, lsl #16 - orr perm.2s, #1, lsl #24 - zip1 perm.16b, perm.16b, perm.16b - zip1 perm.16b, perm.16b, perm.16b - - crc_t10dif_pmull p8 - -CPU_LE( rev64 v7.16b, v7.16b ) -CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) - str q7, [x3] - - frame_pop - ret -SYM_FUNC_END(crc_t10dif_pmull_p8) - - .align 5 -// -// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); -// -// Assumes len >= 16. -// -SYM_FUNC_START(crc_t10dif_pmull_p64) - crc_t10dif_pmull p64 - - // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC. - - movi v2.16b, #0 // init zero register - - // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. - ld1 {fold_consts.2d}, [fold_consts_ptr], #16 - - // Fold the high 64 bits into the low 64 bits, while also multiplying by - // x^64. This produces a 128-bit value congruent to x^64 * M(x) and - // whose low 48 bits are 0. - ext v0.16b, v2.16b, v7.16b, #8 - pmull2 v7.1q, v7.2d, fold_consts.2d // high bits * x^48 * (x^80 mod G(x)) - eor v0.16b, v0.16b, v7.16b // + low bits * x^64 - - // Fold the high 32 bits into the low 96 bits. This produces a 96-bit - // value congruent to x^64 * M(x) and whose low 48 bits are 0. - ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits - mov v0.s[3], v2.s[0] // zero high 32 bits - pmull v1.1q, v1.1d, fold_consts.1d // high 32 bits * x^48 * (x^48 mod G(x)) - eor v0.16b, v0.16b, v1.16b // + low bits - - // Load G(x) and floor(x^48 / G(x)). - ld1 {fold_consts.2d}, [fold_consts_ptr] - - // Use Barrett reduction to compute the final CRC value. - pmull2 v1.1q, v0.2d, fold_consts.2d // high 32 bits * floor(x^48 / G(x)) - ushr v1.2d, v1.2d, #32 // /= x^32 - pmull v1.1q, v1.1d, fold_consts.1d // *= G(x) - ushr v0.2d, v0.2d, #48 - eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits - // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0. - - umov w0, v0.h[0] - ret -SYM_FUNC_END(crc_t10dif_pmull_p64) - - .section ".rodata", "a" - .align 4 - -// Fold constants precomputed from the polynomial 0x18bb7 -// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 -.Lfold_across_128_bytes_consts: - .quad 0x0000000000006123 // x^(8*128) mod G(x) - .quad 0x0000000000002295 // x^(8*128+64) mod G(x) -// .Lfold_across_64_bytes_consts: - .quad 0x0000000000001069 // x^(4*128) mod G(x) - .quad 0x000000000000dd31 // x^(4*128+64) mod G(x) -// .Lfold_across_32_bytes_consts: - .quad 0x000000000000857d // x^(2*128) mod G(x) - .quad 0x0000000000007acc // x^(2*128+64) mod G(x) -.Lfold_across_16_bytes_consts: - .quad 0x000000000000a010 // x^(1*128) mod G(x) - .quad 0x0000000000001faa // x^(1*128+64) mod G(x) -// .Lfinal_fold_consts: - .quad 0x1368000000000000 // x^48 * (x^48 mod G(x)) - .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x)) -// .Lbarrett_reduction_consts: - .quad 0x0000000000018bb7 // G(x) - .quad 0x00000001f65a57f8 // floor(x^48 / G(x)) - -// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - -// len] is the index vector to shift left by 'len' bytes, and is also {0x80, -// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes. 
-.Lbyteshift_table: - .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 - .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f - .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 - .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 diff --git a/arch/arm64/lib/crc-t10dif-glue.c b/arch/arm64/lib/crc-t10dif-glue.c deleted file mode 100644 index bacd18f23168..000000000000 --- a/arch/arm64/lib/crc-t10dif-glue.c +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions - * - * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> - */ - -#include <linux/cpufeature.h> -#include <linux/crc-t10dif.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/string.h> - -#include <crypto/internal/simd.h> - -#include <asm/neon.h> -#include <asm/simd.h> - -static DEFINE_STATIC_KEY_FALSE(have_asimd); -static DEFINE_STATIC_KEY_FALSE(have_pmull); - -#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U - -asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len, - u8 out[16]); -asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); - -u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) -{ - if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) { - if (static_branch_likely(&have_pmull)) { - if (crypto_simd_usable()) { - kernel_neon_begin(); - crc = crc_t10dif_pmull_p64(crc, data, length); - kernel_neon_end(); - return crc; - } - } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && - static_branch_likely(&have_asimd) && - crypto_simd_usable()) { - u8 buf[16]; - - kernel_neon_begin(); - crc_t10dif_pmull_p8(crc, data, length, buf); - kernel_neon_end(); - - return crc_t10dif_generic(0, buf, sizeof(buf)); - } - } - return crc_t10dif_generic(crc, data, length); -} -EXPORT_SYMBOL(crc_t10dif_arch); - -static int __init crc_t10dif_arm64_init(void) -{ - if (cpu_have_named_feature(ASIMD)) { - static_branch_enable(&have_asimd); - if (cpu_have_named_feature(PMULL)) - static_branch_enable(&have_pmull); - } - return 0; -} -arch_initcall(crc_t10dif_arm64_init); - -static void __exit crc_t10dif_arm64_exit(void) -{ -} -module_exit(crc_t10dif_arm64_exit); - -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_DESCRIPTION("CRC-T10DIF using arm64 NEON and Crypto Extensions"); -MODULE_LICENSE("GPL v2"); diff --git a/arch/arm64/lib/crc32-glue.c b/arch/arm64/lib/crc32-glue.c deleted file mode 100644 index ed3acd71178f..000000000000 --- a/arch/arm64/lib/crc32-glue.c +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only - -#include <linux/crc32.h> -#include <linux/linkage.h> -#include <linux/module.h> - -#include <asm/alternative.h> -#include <asm/cpufeature.h> -#include <asm/neon.h> -#include <asm/simd.h> - -#include <crypto/internal/simd.h> - -// The minimum input length to consider the 4-way interleaved code path -static const size_t min_len = 1024; - -asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len); -asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len); -asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len); - -asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); -asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); -asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len); - -u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) -{ - if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) - 
return crc32_le_base(crc, p, len); - - if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { - kernel_neon_begin(); - crc = crc32_le_arm64_4way(crc, p, len); - kernel_neon_end(); - - p += round_down(len, 64); - len %= 64; - - if (!len) - return crc; - } - - return crc32_le_arm64(crc, p, len); -} -EXPORT_SYMBOL(crc32_le_arch); - -u32 crc32c_arch(u32 crc, const u8 *p, size_t len) -{ - if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) - return crc32c_base(crc, p, len); - - if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { - kernel_neon_begin(); - crc = crc32c_le_arm64_4way(crc, p, len); - kernel_neon_end(); - - p += round_down(len, 64); - len %= 64; - - if (!len) - return crc; - } - - return crc32c_le_arm64(crc, p, len); -} -EXPORT_SYMBOL(crc32c_arch); - -u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) -{ - if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) - return crc32_be_base(crc, p, len); - - if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { - kernel_neon_begin(); - crc = crc32_be_arm64_4way(crc, p, len); - kernel_neon_end(); - - p += round_down(len, 64); - len %= 64; - - if (!len) - return crc; - } - - return crc32_be_arm64(crc, p, len); -} -EXPORT_SYMBOL(crc32_be_arch); - -u32 crc32_optimizations(void) -{ - if (alternative_has_cap_likely(ARM64_HAS_CRC32)) - return CRC32_LE_OPTIMIZATION | - CRC32_BE_OPTIMIZATION | - CRC32C_OPTIMIZATION; - return 0; -} -EXPORT_SYMBOL(crc32_optimizations); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("arm64-optimized CRC32 functions"); diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S deleted file mode 100644 index 68825317460f..000000000000 --- a/arch/arm64/lib/crc32.S +++ /dev/null @@ -1,362 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Accelerated CRC32(C) using AArch64 CRC and PMULL instructions - * - * Copyright (C) 2016 - 2018 Linaro Ltd. 
- * Copyright (C) 2024 Google LLC - * - * Author: Ard Biesheuvel <ardb@kernel.org> - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> - - .cpu generic+crc+crypto - - .macro bitle, reg - .endm - - .macro bitbe, reg - rbit \reg, \reg - .endm - - .macro bytele, reg - .endm - - .macro bytebe, reg - rbit \reg, \reg - lsr \reg, \reg, #24 - .endm - - .macro hwordle, reg -CPU_BE( rev16 \reg, \reg ) - .endm - - .macro hwordbe, reg -CPU_LE( rev \reg, \reg ) - rbit \reg, \reg -CPU_BE( lsr \reg, \reg, #16 ) - .endm - - .macro le, regs:vararg - .irp r, \regs -CPU_BE( rev \r, \r ) - .endr - .endm - - .macro be, regs:vararg - .irp r, \regs -CPU_LE( rev \r, \r ) - .endr - .irp r, \regs - rbit \r, \r - .endr - .endm - - .macro __crc32, c, order=le - bit\order w0 - cmp x2, #16 - b.lt 8f // less than 16 bytes - - and x7, x2, #0x1f - and x2, x2, #~0x1f - cbz x7, 32f // multiple of 32 bytes - - and x8, x7, #0xf - ldp x3, x4, [x1] - add x8, x8, x1 - add x1, x1, x7 - ldp x5, x6, [x8] - \order x3, x4, x5, x6 - - tst x7, #8 - crc32\c\()x w8, w0, x3 - csel x3, x3, x4, eq - csel w0, w0, w8, eq - tst x7, #4 - lsr x4, x3, #32 - crc32\c\()w w8, w0, w3 - csel x3, x3, x4, eq - csel w0, w0, w8, eq - tst x7, #2 - lsr w4, w3, #16 - crc32\c\()h w8, w0, w3 - csel w3, w3, w4, eq - csel w0, w0, w8, eq - tst x7, #1 - crc32\c\()b w8, w0, w3 - csel w0, w0, w8, eq - tst x7, #16 - crc32\c\()x w8, w0, x5 - crc32\c\()x w8, w8, x6 - csel w0, w0, w8, eq - cbz x2, 0f - -32: ldp x3, x4, [x1], #32 - sub x2, x2, #32 - ldp x5, x6, [x1, #-16] - \order x3, x4, x5, x6 - crc32\c\()x w0, w0, x3 - crc32\c\()x w0, w0, x4 - crc32\c\()x w0, w0, x5 - crc32\c\()x w0, w0, x6 - cbnz x2, 32b -0: bit\order w0 - ret - -8: tbz x2, #3, 4f - ldr x3, [x1], #8 - \order x3 - crc32\c\()x w0, w0, x3 -4: tbz x2, #2, 2f - ldr w3, [x1], #4 - \order w3 - crc32\c\()w w0, w0, w3 -2: tbz x2, #1, 1f - ldrh w3, [x1], #2 - hword\order w3 - crc32\c\()h w0, w0, w3 -1: tbz x2, #0, 0f - ldrb w3, [x1] - byte\order w3 - crc32\c\()b w0, w0, w3 -0: bit\order w0 - ret - .endm - - .align 5 -SYM_FUNC_START(crc32_le_arm64) - __crc32 -SYM_FUNC_END(crc32_le_arm64) - - .align 5 -SYM_FUNC_START(crc32c_le_arm64) - __crc32 c -SYM_FUNC_END(crc32c_le_arm64) - - .align 5 -SYM_FUNC_START(crc32_be_arm64) - __crc32 order=be -SYM_FUNC_END(crc32_be_arm64) - - in .req x1 - len .req x2 - - /* - * w0: input CRC at entry, output CRC at exit - * x1: pointer to input buffer - * x2: length of input in bytes - */ - .macro crc4way, insn, table, order=le - bit\order w0 - lsr len, len, #6 // len := # of 64-byte blocks - - /* Process up to 64 blocks of 64 bytes at a time */ -.La\@: mov x3, #64 - cmp len, #64 - csel x3, x3, len, hi // x3 := min(len, 64) - sub len, len, x3 - - /* Divide the input into 4 contiguous blocks */ - add x4, x3, x3, lsl #1 // x4 := 3 * x3 - add x7, in, x3, lsl #4 // x7 := in + 16 * x3 - add x8, in, x3, lsl #5 // x8 := in + 32 * x3 - add x9, in, x4, lsl #4 // x9 := in + 16 * x4 - - /* Load the folding coefficients from the lookup table */ - adr_l x5, \table - 12 // entry 0 omitted - add x5, x5, x4, lsl #2 // x5 += 12 * x3 - ldp s0, s1, [x5] - ldr s2, [x5, #8] - - /* Zero init partial CRCs for this iteration */ - mov w4, wzr - mov w5, wzr - mov w6, wzr - mov x17, xzr - -.Lb\@: sub x3, x3, #1 - \insn w6, w6, x17 - ldp x10, x11, [in], #16 - ldp x12, x13, [x7], #16 - ldp x14, x15, [x8], #16 - ldp x16, x17, [x9], #16 - - \order x10, x11, x12, x13, x14, x15, x16, x17 - - /* Apply the CRC transform to 4 16-byte blocks in parallel */ - \insn w0, w0, x10 - \insn w4, w4, x12 - \insn w5, 
w5, x14 - \insn w6, w6, x16 - \insn w0, w0, x11 - \insn w4, w4, x13 - \insn w5, w5, x15 - cbnz x3, .Lb\@ - - /* Combine the 4 partial results into w0 */ - mov v3.d[0], x0 - mov v4.d[0], x4 - mov v5.d[0], x5 - pmull v0.1q, v0.1d, v3.1d - pmull v1.1q, v1.1d, v4.1d - pmull v2.1q, v2.1d, v5.1d - eor v0.8b, v0.8b, v1.8b - eor v0.8b, v0.8b, v2.8b - mov x5, v0.d[0] - eor x5, x5, x17 - \insn w0, w6, x5 - - mov in, x9 - cbnz len, .La\@ - - bit\order w0 - ret - .endm - - .align 5 -SYM_FUNC_START(crc32c_le_arm64_4way) - crc4way crc32cx, .L0 -SYM_FUNC_END(crc32c_le_arm64_4way) - - .align 5 -SYM_FUNC_START(crc32_le_arm64_4way) - crc4way crc32x, .L1 -SYM_FUNC_END(crc32_le_arm64_4way) - - .align 5 -SYM_FUNC_START(crc32_be_arm64_4way) - crc4way crc32x, .L1, be -SYM_FUNC_END(crc32_be_arm64_4way) - - .section .rodata, "a", %progbits - .align 6 -.L0: .long 0xddc0152b, 0xba4fc28e, 0x493c7d27 - .long 0x0715ce53, 0x9e4addf8, 0xba4fc28e - .long 0xc96cfdc0, 0x0715ce53, 0xddc0152b - .long 0xab7aff2a, 0x0d3b6092, 0x9e4addf8 - .long 0x299847d5, 0x878a92a7, 0x39d3b296 - .long 0xb6dd949b, 0xab7aff2a, 0x0715ce53 - .long 0xa60ce07b, 0x83348832, 0x47db8317 - .long 0xd270f1a2, 0xb9e02b86, 0x0d3b6092 - .long 0x65863b64, 0xb6dd949b, 0xc96cfdc0 - .long 0xb3e32c28, 0xbac2fd7b, 0x878a92a7 - .long 0xf285651c, 0xce7f39f4, 0xdaece73e - .long 0x271d9844, 0xd270f1a2, 0xab7aff2a - .long 0x6cb08e5c, 0x2b3cac5d, 0x2162d385 - .long 0xcec3662e, 0x1b03397f, 0x83348832 - .long 0x8227bb8a, 0xb3e32c28, 0x299847d5 - .long 0xd7a4825c, 0xdd7e3b0c, 0xb9e02b86 - .long 0xf6076544, 0x10746f3c, 0x18b33a4e - .long 0x98d8d9cb, 0x271d9844, 0xb6dd949b - .long 0x57a3d037, 0x93a5f730, 0x78d9ccb7 - .long 0x3771e98f, 0x6b749fb2, 0xbac2fd7b - .long 0xe0ac139e, 0xcec3662e, 0xa60ce07b - .long 0x6f345e45, 0xe6fc4e6a, 0xce7f39f4 - .long 0xa2b73df1, 0xb0cd4768, 0x61d82e56 - .long 0x86d8e4d2, 0xd7a4825c, 0xd270f1a2 - .long 0xa90fd27a, 0x0167d312, 0xc619809d - .long 0xca6ef3ac, 0x26f6a60a, 0x2b3cac5d - .long 0x4597456a, 0x98d8d9cb, 0x65863b64 - .long 0xc9c8b782, 0x68bce87a, 0x1b03397f - .long 0x62ec6c6d, 0x6956fc3b, 0xebb883bd - .long 0x2342001e, 0x3771e98f, 0xb3e32c28 - .long 0xe8b6368b, 0x2178513a, 0x064f7f26 - .long 0x9ef68d35, 0x170076fa, 0xdd7e3b0c - .long 0x0b0bf8ca, 0x6f345e45, 0xf285651c - .long 0x02ee03b2, 0xff0dba97, 0x10746f3c - .long 0x135c83fd, 0xf872e54c, 0xc7a68855 - .long 0x00bcf5f6, 0x86d8e4d2, 0x271d9844 - .long 0x58ca5f00, 0x5bb8f1bc, 0x8e766a0c - .long 0xded288f8, 0xb3af077a, 0x93a5f730 - .long 0x37170390, 0xca6ef3ac, 0x6cb08e5c - .long 0xf48642e9, 0xdd66cbbb, 0x6b749fb2 - .long 0xb25b29f2, 0xe9e28eb4, 0x1393e203 - .long 0x45cddf4e, 0xc9c8b782, 0xcec3662e - .long 0xdfd94fb2, 0x93e106a4, 0x96c515bb - .long 0x021ac5ef, 0xd813b325, 0xe6fc4e6a - .long 0x8e1450f7, 0x2342001e, 0x8227bb8a - .long 0xe0cdcf86, 0x6d9a4957, 0xb0cd4768 - .long 0x613eee91, 0xd2c3ed1a, 0x39c7ff35 - .long 0xbedc6ba1, 0x9ef68d35, 0xd7a4825c - .long 0x0cd1526a, 0xf2271e60, 0x0ab3844b - .long 0xd6c3a807, 0x2664fd8b, 0x0167d312 - .long 0x1d31175f, 0x02ee03b2, 0xf6076544 - .long 0x4be7fd90, 0x363bd6b3, 0x26f6a60a - .long 0x6eeed1c9, 0x5fabe670, 0xa741c1bf - .long 0xb3a6da94, 0x00bcf5f6, 0x98d8d9cb - .long 0x2e7d11a7, 0x17f27698, 0x49c3cc9c - .long 0x889774e1, 0xaa7c7ad5, 0x68bce87a - .long 0x8a074012, 0xded288f8, 0x57a3d037 - .long 0xbd0bb25f, 0x6d390dec, 0x6956fc3b - .long 0x3be3c09b, 0x6353c1cc, 0x42d98888 - .long 0x465a4eee, 0xf48642e9, 0x3771e98f - .long 0x2e5f3c8c, 0xdd35bc8d, 0xb42ae3d9 - .long 0xa52f58ec, 0x9a5ede41, 0x2178513a - .long 0x47972100, 0x45cddf4e, 0xe0ac139e - 
.long 0x359674f7, 0xa51b6135, 0x170076fa - -.L1: .long 0xaf449247, 0x81256527, 0xccaa009e - .long 0x57c54819, 0x1d9513d7, 0x81256527 - .long 0x3f41287a, 0x57c54819, 0xaf449247 - .long 0xf5e48c85, 0x910eeec1, 0x1d9513d7 - .long 0x1f0c2cdd, 0x9026d5b1, 0xae0b5394 - .long 0x71d54a59, 0xf5e48c85, 0x57c54819 - .long 0x1c63267b, 0xfe807bbd, 0x0cbec0ed - .long 0xd31343ea, 0xe95c1271, 0x910eeec1 - .long 0xf9d9c7ee, 0x71d54a59, 0x3f41287a - .long 0x9ee62949, 0xcec97417, 0x9026d5b1 - .long 0xa55d1514, 0xf183c71b, 0xd1df2327 - .long 0x21aa2b26, 0xd31343ea, 0xf5e48c85 - .long 0x9d842b80, 0xeea395c4, 0x3c656ced - .long 0xd8110ff1, 0xcd669a40, 0xfe807bbd - .long 0x3f9e9356, 0x9ee62949, 0x1f0c2cdd - .long 0x1d6708a0, 0x0c30f51d, 0xe95c1271 - .long 0xef82aa68, 0xdb3935ea, 0xb918a347 - .long 0xd14bcc9b, 0x21aa2b26, 0x71d54a59 - .long 0x99cce860, 0x356d209f, 0xff6f2fc2 - .long 0xd8af8e46, 0xc352f6de, 0xcec97417 - .long 0xf1996890, 0xd8110ff1, 0x1c63267b - .long 0x631bc508, 0xe95c7216, 0xf183c71b - .long 0x8511c306, 0x8e031a19, 0x9b9bdbd0 - .long 0xdb3839f3, 0x1d6708a0, 0xd31343ea - .long 0x7a92fffb, 0xf7003835, 0x4470ac44 - .long 0x6ce68f2a, 0x00eba0c8, 0xeea395c4 - .long 0x4caaa263, 0xd14bcc9b, 0xf9d9c7ee - .long 0xb46f7cff, 0x9a1b53c8, 0xcd669a40 - .long 0x60290934, 0x81b6f443, 0x6d40f445 - .long 0x8e976a7d, 0xd8af8e46, 0x9ee62949 - .long 0xdcf5088a, 0x9dbdc100, 0x145575d5 - .long 0x1753ab84, 0xbbf2f6d6, 0x0c30f51d - .long 0x255b139e, 0x631bc508, 0xa55d1514 - .long 0xd784eaa8, 0xce26786c, 0xdb3935ea - .long 0x6d2c864a, 0x8068c345, 0x2586d334 - .long 0x02072e24, 0xdb3839f3, 0x21aa2b26 - .long 0x06689b0a, 0x5efd72f5, 0xe0575528 - .long 0x1e52f5ea, 0x4117915b, 0x356d209f - .long 0x1d3d1db6, 0x6ce68f2a, 0x9d842b80 - .long 0x3796455c, 0xb8e0e4a8, 0xc352f6de - .long 0xdf3a4eb3, 0xc55a2330, 0xb84ffa9c - .long 0x28ae0976, 0xb46f7cff, 0xd8110ff1 - .long 0x9764bc8d, 0xd7e7a22c, 0x712510f0 - .long 0x13a13e18, 0x3e9a43cd, 0xe95c7216 - .long 0xb8ee242e, 0x8e976a7d, 0x3f9e9356 - .long 0x0c540e7b, 0x753c81ff, 0x8e031a19 - .long 0x9924c781, 0xb9220208, 0x3edcde65 - .long 0x3954de39, 0x1753ab84, 0x1d6708a0 - .long 0xf32238b5, 0xbec81497, 0x9e70b943 - .long 0xbbd2cd2c, 0x0925d861, 0xf7003835 - .long 0xcc401304, 0xd784eaa8, 0xef82aa68 - .long 0x4987e684, 0x6044fbb0, 0x00eba0c8 - .long 0x3aa11427, 0x18fe3b4a, 0x87441142 - .long 0x297aad60, 0x02072e24, 0xd14bcc9b - .long 0xf60c5e51, 0x6ef6f487, 0x5b7fdd0a - .long 0x632d78c5, 0x3fc33de4, 0x9a1b53c8 - .long 0x25b8822a, 0x1e52f5ea, 0x99cce860 - .long 0xd4fc84bc, 0x1af62fb8, 0x81b6f443 - .long 0x5690aa32, 0xa91fdefb, 0x688a110e - .long 0x1357a093, 0x3796455c, 0xd8af8e46 - .long 0x798fdd33, 0xaaa18a37, 0x357b9517 - .long 0xc2815395, 0x54d42691, 0x9dbdc100 - .long 0x21cfc0f7, 0x28ae0976, 0xf1996890 - .long 0xa0decef3, 0x7b4aa8b7, 0xbbf2f6d6 diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c index 9bef696e2230..4e298baddc2e 100644 --- a/arch/arm64/lib/insn.c +++ b/arch/arm64/lib/insn.c @@ -5,6 +5,7 @@ * * Copyright (C) 2014-2016 Zi Shen Lim <zlim.lnx@gmail.com> */ +#include <linux/bitfield.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/printk.h> @@ -1500,43 +1501,41 @@ u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant, return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, Rm); } -u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type) +static u32 __get_barrier_crm_val(enum aarch64_insn_mb_type type) { - u32 opt; - u32 insn; - switch (type) { case AARCH64_INSN_MB_SY: - opt = 0xf; - break; + return 0xf; case AARCH64_INSN_MB_ST: - 
opt = 0xe; - break; + return 0xe; case AARCH64_INSN_MB_LD: - opt = 0xd; - break; + return 0xd; case AARCH64_INSN_MB_ISH: - opt = 0xb; - break; + return 0xb; case AARCH64_INSN_MB_ISHST: - opt = 0xa; - break; + return 0xa; case AARCH64_INSN_MB_ISHLD: - opt = 0x9; - break; + return 0x9; case AARCH64_INSN_MB_NSH: - opt = 0x7; - break; + return 0x7; case AARCH64_INSN_MB_NSHST: - opt = 0x6; - break; + return 0x6; case AARCH64_INSN_MB_NSHLD: - opt = 0x5; - break; + return 0x5; default: - pr_err("%s: unknown dmb type %d\n", __func__, type); + pr_err("%s: unknown barrier type %d\n", __func__, type); return AARCH64_BREAK_FAULT; } +} + +u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type) +{ + u32 opt; + u32 insn; + + opt = __get_barrier_crm_val(type); + if (opt == AARCH64_BREAK_FAULT) + return AARCH64_BREAK_FAULT; insn = aarch64_insn_get_dmb_value(); insn &= ~GENMASK(11, 8); @@ -1545,6 +1544,21 @@ u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type) return insn; } +u32 aarch64_insn_gen_dsb(enum aarch64_insn_mb_type type) +{ + u32 opt, insn; + + opt = __get_barrier_crm_val(type); + if (opt == AARCH64_BREAK_FAULT) + return AARCH64_BREAK_FAULT; + + insn = aarch64_insn_get_dsb_base_value(); + insn &= ~GENMASK(11, 8); + insn |= (opt << 8); + + return insn; +} + u32 aarch64_insn_gen_mrs(enum aarch64_insn_register result, enum aarch64_insn_system_register sysreg) { diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c index f9a53b7f9842..8fffebfa17b2 100644 --- a/arch/arm64/lib/xor-neon.c +++ b/arch/arm64/lib/xor-neon.c @@ -319,7 +319,7 @@ static void xor_arm64_eor3_5(unsigned long bytes, static int __init xor_neon_init(void) { - if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) { + if (cpu_have_named_feature(SHA3)) { xor_block_inner_neon.do_3 = xor_arm64_eor3_3; xor_block_inner_neon.do_4 = xor_arm64_eor3_4; xor_block_inner_neon.do_5 = xor_arm64_eor3_5; |
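
A few notes on the code removed and changed above. The CRC that the deleted crc-t10dif-core.S computes is CRC-T10DIF over the polynomial G(x) = 0x18bb7 listed in its fold-constant table. For reference, here is a minimal bit-at-a-time C sketch of the same function (the name crc_t10dif_bitwise is made up for illustration; the deleted NEON code folds 128 bytes per loop iteration instead of one bit per step):

#include <stddef.h>
#include <stdint.h>

/* CRC-T10DIF, one bit at a time: MSB-first, zero initial value,
 * polynomial x^16 + x^15 + x^11 + ... + 1, i.e. 0x8bb7 with the
 * implicit x^16 term dropped. */
static uint16_t crc_t10dif_bitwise(uint16_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= (uint16_t)*p++ << 8;
		for (int i = 0; i < 8; i++)
			crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7
					     : crc << 1;
	}
	return crc;
}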
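The fold constants in the .rodata section are powers of x reduced modulo G(x), and they can be regenerated in a few lines of C. For example, xpow_mod_g(8 * 128) should reproduce the 0x6123 entry of .Lfold_across_128_bytes_consts and xpow_mod_g(8 * 128 + 64) the 0x2295 entry (xpow_mod_g is a hypothetical helper, shown only to document where the magic numbers come from):

#include <stdint.h>

/* Compute x^n mod G(x) over GF(2), with G(x) = 0x18bb7: multiply by x
 * (shift left) n times, reducing whenever the degree reaches 16. */
static uint16_t xpow_mod_g(unsigned int n)
{
	uint32_t r = 1;

	while (n--) {
		r <<= 1;
		if (r & 0x10000)
			r ^= 0x18bb7;
	}
	return (uint16_t)r;
}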
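The long comment in pmull16x64_p8 describes how a 16x64-bit carry-less multiply is built out of 8x8-bit PMULL instructions. Stripped of all vectorization, what both pmull16x64 variants compute is the schoolbook product below (a model only; unsigned __int128 is a GCC/Clang extension, and the real result always fits in 80 bits):

#include <stdint.h>

/* Carry-less (GF(2)) multiply of a 16-bit value by a 64-bit value:
 * XOR together copies of b shifted left by each set bit position of a. */
static unsigned __int128 clmul_16x64(uint16_t a, uint64_t b)
{
	unsigned __int128 acc = 0;

	for (int i = 0; i < 16; i++)
		if (a & (1u << i))
			acc ^= (unsigned __int128)b << i;
	return acc;
}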
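Finally, the insn.c hunk factors the barrier option lookup into __get_barrier_crm_val() so the new aarch64_insn_gen_dsb() can share it with aarch64_insn_gen_dmb(). Both generators then just splice the option into the CRm field, bits 11:8 of the encoding, as the GENMASK(11, 8) manipulation shows. A standalone sketch (barrier_with_crm is a made-up name, and the 0xd503309f base opcode for DSB is quoted here only as an example; the kernel takes it from aarch64_insn_get_dsb_base_value()):

#include <stdint.h>

/* Insert a barrier option into CRm (bits 11:8) of a DMB/DSB opcode. */
static uint32_t barrier_with_crm(uint32_t base, uint32_t crm)
{
	base &= ~(0xfu << 8);		/* clear CRm */
	return base | (crm << 8);	/* e.g. 0xf = SY, 0xb = ISH */
}

/* barrier_with_crm(0xd503309f, 0xb) encodes "dsb ish". */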