Diffstat (limited to 'arch/arm64/lib')
-rw-r--r--  arch/arm64/lib/.gitignore        |   4
-rw-r--r--  arch/arm64/lib/Makefile          |   6
-rw-r--r--  arch/arm64/lib/crc-t10dif-core.S | 469
-rw-r--r--  arch/arm64/lib/crc-t10dif-glue.c |  73
-rw-r--r--  arch/arm64/lib/crc32-glue.c      |  99
-rw-r--r--  arch/arm64/lib/crc32.S           | 362
-rw-r--r--  arch/arm64/lib/insn.c            |  60
-rw-r--r--  arch/arm64/lib/xor-neon.c        |   2
8 files changed, 42 insertions, 1033 deletions
diff --git a/arch/arm64/lib/.gitignore b/arch/arm64/lib/.gitignore
new file mode 100644
index 000000000000..647d7a922e68
--- /dev/null
+++ b/arch/arm64/lib/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# This now-removed directory used to contain generated files.
+/crypto/
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 4d49dff721a8..633e5223d944 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -13,12 +13,6 @@ endif
lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
-obj-$(CONFIG_CRC32_ARCH) += crc32-arm64.o
-crc32-arm64-y := crc32.o crc32-glue.o
-
-obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-arm64.o
-crc-t10dif-arm64-y := crc-t10dif-glue.o crc-t10dif-core.o
-
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
obj-$(CONFIG_ARM64_MTE) += mte.o
diff --git a/arch/arm64/lib/crc-t10dif-core.S b/arch/arm64/lib/crc-t10dif-core.S
deleted file mode 100644
index 87dd6d46224d..000000000000
--- a/arch/arm64/lib/crc-t10dif-core.S
+++ /dev/null
@@ -1,469 +0,0 @@
-//
-// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
-//
-// Copyright (C) 2016 Linaro Ltd
-// Copyright (C) 2019-2024 Google LLC
-//
-// Authors: Ard Biesheuvel <ardb@google.com>
-// Eric Biggers <ebiggers@google.com>
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License version 2 as
-// published by the Free Software Foundation.
-//
-
-// Derived from the x86 version:
-//
-// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
-//
-// Copyright (c) 2013, Intel Corporation
-//
-// Authors:
-// Erdinc Ozturk <erdinc.ozturk@intel.com>
-// Vinodh Gopal <vinodh.gopal@intel.com>
-// James Guilford <james.guilford@intel.com>
-// Tim Chen <tim.c.chen@linux.intel.com>
-//
-// This software is available to you under a choice of one of two
-// licenses. You may choose to be licensed under the terms of the GNU
-// General Public License (GPL) Version 2, available from the file
-// COPYING in the main directory of this source tree, or the
-// OpenIB.org BSD license below:
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the
-// distribution.
-//
-// * Neither the name of the Intel Corporation nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-//
-// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Reference paper titled "Fast CRC Computation for Generic
-// Polynomials Using PCLMULQDQ Instruction"
-// URL: http://www.intel.com/content/dam/www/public/us/en/documents
-// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-//
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .arch armv8-a+crypto
-
- init_crc .req w0
- buf .req x1
- len .req x2
- fold_consts_ptr .req x5
-
- fold_consts .req v10
-
- t3 .req v17
- t4 .req v18
- t5 .req v19
- t6 .req v20
- t7 .req v21
- t8 .req v22
-
- perm .req v27
-
- .macro pmull16x64_p64, a16, b64, c64
- pmull2 \c64\().1q, \a16\().2d, \b64\().2d
- pmull \b64\().1q, \a16\().1d, \b64\().1d
- .endm
-
- /*
- * Pairwise long polynomial multiplication of two 16-bit values
- *
- * { w0, w1 }, { y0, y1 }
- *
- * by two 64-bit values
- *
- * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
- *
- * where each vector element is a byte, ordered from least to most
- * significant.
- *
- * This can be implemented using 8x8 long polynomial multiplication, by
- * reorganizing the input so that each pairwise 8x8 multiplication
- * produces one of the terms from the decomposition below, and
- * combining the results of each rank and shifting them into place.
- *
- * Rank
- * 0 w0*x0 ^ | y0*z0 ^
- * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^
- * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^
- * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^
- * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^
- * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^
- * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^
- * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^
- * 8 w1*x7 << 64 | y1*z7 << 64
- *
- * The inputs can be reorganized into
- *
- * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
- * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
- *
- * and after performing 8x8->16 bit long polynomial multiplication of
- * each of the halves of the first vector with those of the second one,
- * we obtain the following four vectors of 16-bit elements:
- *
- * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
- * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
- * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
- * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
- *
- * Results b and c can be XORed together, as the vector elements have
- * matching ranks. Then, the final XOR (*) can be pulled forward, and
- * applied between the halves of each of the remaining three vectors,
- * which are then shifted into place, and combined to produce two
- * 80-bit results.
- *
- * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
- * to the 64x64 bit one above, but XOR'ing the outputs together will
- * produce the expected result, and this is sufficient in the context of
- * this algorithm.
- */
- .macro pmull16x64_p8, a16, b64, c64
- ext t7.16b, \b64\().16b, \b64\().16b, #1
- tbl t5.16b, {\a16\().16b}, perm.16b
- uzp1 t7.16b, \b64\().16b, t7.16b
- bl __pmull_p8_16x64
- ext \b64\().16b, t4.16b, t4.16b, #15
- eor \c64\().16b, t8.16b, t5.16b
- .endm
-
-SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
- ext t6.16b, t5.16b, t5.16b, #8
-
- pmull t3.8h, t7.8b, t5.8b
- pmull t4.8h, t7.8b, t6.8b
- pmull2 t5.8h, t7.16b, t5.16b
- pmull2 t6.8h, t7.16b, t6.16b
-
- ext t8.16b, t3.16b, t3.16b, #8
- eor t4.16b, t4.16b, t6.16b
- ext t7.16b, t5.16b, t5.16b, #8
- ext t6.16b, t4.16b, t4.16b, #8
- eor t8.8b, t8.8b, t3.8b
- eor t5.8b, t5.8b, t7.8b
- eor t4.8b, t4.8b, t6.8b
- ext t5.16b, t5.16b, t5.16b, #14
- ret
-SYM_FUNC_END(__pmull_p8_16x64)
-
-
- // Fold reg1, reg2 into the next 32 data bytes, storing the result back
- // into reg1, reg2.
- .macro fold_32_bytes, p, reg1, reg2
- ldp q11, q12, [buf], #0x20
-
- pmull16x64_\p fold_consts, \reg1, v8
-
-CPU_LE( rev64 v11.16b, v11.16b )
-CPU_LE( rev64 v12.16b, v12.16b )
-
- pmull16x64_\p fold_consts, \reg2, v9
-
-CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
-CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
-
- eor \reg1\().16b, \reg1\().16b, v8.16b
- eor \reg2\().16b, \reg2\().16b, v9.16b
- eor \reg1\().16b, \reg1\().16b, v11.16b
- eor \reg2\().16b, \reg2\().16b, v12.16b
- .endm
-
- // Fold src_reg into dst_reg, optionally loading the next fold constants
- .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts
- pmull16x64_\p fold_consts, \src_reg, v8
- .ifnb \load_next_consts
- ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- .endif
- eor \dst_reg\().16b, \dst_reg\().16b, v8.16b
- eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
- .endm
-
- .macro crc_t10dif_pmull, p
-
- // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
- cmp len, #256
- b.lt .Lless_than_256_bytes_\@
-
- adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts
-
- // Load the first 128 data bytes. Byte swapping is necessary to make
- // the bit order match the polynomial coefficient order.
- ldp q0, q1, [buf]
- ldp q2, q3, [buf, #0x20]
- ldp q4, q5, [buf, #0x40]
- ldp q6, q7, [buf, #0x60]
- add buf, buf, #0x80
-CPU_LE( rev64 v0.16b, v0.16b )
-CPU_LE( rev64 v1.16b, v1.16b )
-CPU_LE( rev64 v2.16b, v2.16b )
-CPU_LE( rev64 v3.16b, v3.16b )
-CPU_LE( rev64 v4.16b, v4.16b )
-CPU_LE( rev64 v5.16b, v5.16b )
-CPU_LE( rev64 v6.16b, v6.16b )
-CPU_LE( rev64 v7.16b, v7.16b )
-CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
-CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
-CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
-CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
-CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
-CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
-CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
-CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
-
- // XOR the first 16 data *bits* with the initial CRC value.
- movi v8.16b, #0
- mov v8.h[7], init_crc
- eor v0.16b, v0.16b, v8.16b
-
- // Load the constants for folding across 128 bytes.
- ld1 {fold_consts.2d}, [fold_consts_ptr]
-
- // Subtract 128 for the 128 data bytes just consumed. Subtract another
- // 128 to simplify the termination condition of the following loop.
- sub len, len, #256
-
- // While >= 128 data bytes remain (not counting v0-v7), fold the 128
- // bytes v0-v7 into them, storing the result back into v0-v7.
-.Lfold_128_bytes_loop_\@:
- fold_32_bytes \p, v0, v1
- fold_32_bytes \p, v2, v3
- fold_32_bytes \p, v4, v5
- fold_32_bytes \p, v6, v7
-
- subs len, len, #128
- b.ge .Lfold_128_bytes_loop_\@
-
- // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
-
- // Fold across 64 bytes.
- add fold_consts_ptr, fold_consts_ptr, #16
- ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- fold_16_bytes \p, v0, v4
- fold_16_bytes \p, v1, v5
- fold_16_bytes \p, v2, v6
- fold_16_bytes \p, v3, v7, 1
- // Fold across 32 bytes.
- fold_16_bytes \p, v4, v6
- fold_16_bytes \p, v5, v7, 1
- // Fold across 16 bytes.
- fold_16_bytes \p, v6, v7
-
- // Add 128 to get the correct number of data bytes remaining in 0...127
- // (not counting v7), following the previous extra subtraction by 128.
- // Then subtract 16 to simplify the termination condition of the
- // following loop.
- adds len, len, #(128-16)
-
- // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
- // into them, storing the result back into v7.
- b.lt .Lfold_16_bytes_loop_done_\@
-.Lfold_16_bytes_loop_\@:
- pmull16x64_\p fold_consts, v7, v8
- eor v7.16b, v7.16b, v8.16b
- ldr q0, [buf], #16
-CPU_LE( rev64 v0.16b, v0.16b )
-CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
- eor v7.16b, v7.16b, v0.16b
- subs len, len, #16
- b.ge .Lfold_16_bytes_loop_\@
-
-.Lfold_16_bytes_loop_done_\@:
- // Add 16 to get the correct number of data bytes remaining in 0...15
- // (not counting v7), following the previous extra subtraction by 16.
- adds len, len, #16
- b.eq .Lreduce_final_16_bytes_\@
-
-.Lhandle_partial_segment_\@:
- // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
- // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To
- // do this without needing a fold constant for each possible 'len',
- // redivide the bytes into a first chunk of 'len' bytes and a second
- // chunk of 16 bytes, then fold the first chunk into the second.
-
- // v0 = last 16 original data bytes
- add buf, buf, len
- ldr q0, [buf, #-16]
-CPU_LE( rev64 v0.16b, v0.16b )
-CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
-
- // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
- adr_l x4, .Lbyteshift_table + 16
- sub x4, x4, len
- ld1 {v2.16b}, [x4]
- tbl v1.16b, {v7.16b}, v2.16b
-
- // v3 = first chunk: v7 right-shifted by '16-len' bytes.
- movi v3.16b, #0x80
- eor v2.16b, v2.16b, v3.16b
- tbl v3.16b, {v7.16b}, v2.16b
-
- // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
- sshr v2.16b, v2.16b, #7
-
- // v2 = second chunk: 'len' bytes from v0 (low-order bytes),
- // then '16-len' bytes from v1 (high-order bytes).
- bsl v2.16b, v1.16b, v0.16b
-
- // Fold the first chunk into the second chunk, storing the result in v7.
- pmull16x64_\p fold_consts, v3, v0
- eor v7.16b, v3.16b, v0.16b
- eor v7.16b, v7.16b, v2.16b
- b .Lreduce_final_16_bytes_\@
-
-.Lless_than_256_bytes_\@:
- // Checksumming a buffer of length 16...255 bytes
-
- adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts
-
- // Load the first 16 data bytes.
- ldr q7, [buf], #0x10
-CPU_LE( rev64 v7.16b, v7.16b )
-CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
-
- // XOR the first 16 data *bits* with the initial CRC value.
- movi v0.16b, #0
- mov v0.h[7], init_crc
- eor v7.16b, v7.16b, v0.16b
-
- // Load the fold-across-16-bytes constants.
- ld1 {fold_consts.2d}, [fold_consts_ptr], #16
-
- cmp len, #16
- b.eq .Lreduce_final_16_bytes_\@ // len == 16
- subs len, len, #32
- b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255
- add len, len, #16
- b .Lhandle_partial_segment_\@ // 17 <= len <= 31
-
-.Lreduce_final_16_bytes_\@:
- .endm
-
-//
-// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-SYM_FUNC_START(crc_t10dif_pmull_p8)
- frame_push 1
-
- // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
- movi perm.4h, #8, lsl #8
- orr perm.2s, #1, lsl #16
- orr perm.2s, #1, lsl #24
- zip1 perm.16b, perm.16b, perm.16b
- zip1 perm.16b, perm.16b, perm.16b
-
- crc_t10dif_pmull p8
-
-CPU_LE( rev64 v7.16b, v7.16b )
-CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
- str q7, [x3]
-
- frame_pop
- ret
-SYM_FUNC_END(crc_t10dif_pmull_p8)
-
- .align 5
-//
-// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-SYM_FUNC_START(crc_t10dif_pmull_p64)
- crc_t10dif_pmull p64
-
- // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
-
- movi v2.16b, #0 // init zero register
-
- // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
- ld1 {fold_consts.2d}, [fold_consts_ptr], #16
-
- // Fold the high 64 bits into the low 64 bits, while also multiplying by
- // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
- // whose low 48 bits are 0.
- ext v0.16b, v2.16b, v7.16b, #8
- pmull2 v7.1q, v7.2d, fold_consts.2d // high bits * x^48 * (x^80 mod G(x))
- eor v0.16b, v0.16b, v7.16b // + low bits * x^64
-
- // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
- // value congruent to x^64 * M(x) and whose low 48 bits are 0.
- ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
- mov v0.s[3], v2.s[0] // zero high 32 bits
- pmull v1.1q, v1.1d, fold_consts.1d // high 32 bits * x^48 * (x^48 mod G(x))
- eor v0.16b, v0.16b, v1.16b // + low bits
-
- // Load G(x) and floor(x^48 / G(x)).
- ld1 {fold_consts.2d}, [fold_consts_ptr]
-
- // Use Barrett reduction to compute the final CRC value.
- pmull2 v1.1q, v0.2d, fold_consts.2d // high 32 bits * floor(x^48 / G(x))
- ushr v1.2d, v1.2d, #32 // /= x^32
- pmull v1.1q, v1.1d, fold_consts.1d // *= G(x)
- ushr v0.2d, v0.2d, #48
- eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
- // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
-
- umov w0, v0.h[0]
- ret
-SYM_FUNC_END(crc_t10dif_pmull_p64)
-
- .section ".rodata", "a"
- .align 4
-
-// Fold constants precomputed from the polynomial 0x18bb7
-// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
-.Lfold_across_128_bytes_consts:
- .quad 0x0000000000006123 // x^(8*128) mod G(x)
- .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
-// .Lfold_across_64_bytes_consts:
- .quad 0x0000000000001069 // x^(4*128) mod G(x)
- .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
-// .Lfold_across_32_bytes_consts:
- .quad 0x000000000000857d // x^(2*128) mod G(x)
- .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
-.Lfold_across_16_bytes_consts:
- .quad 0x000000000000a010 // x^(1*128) mod G(x)
- .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
-// .Lfinal_fold_consts:
- .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
- .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
-// .Lbarrett_reduction_consts:
- .quad 0x0000000000018bb7 // G(x)
- .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
-
-// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
-// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
-// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
-.Lbyteshift_table:
- .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
- .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
- .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
- .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
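For reference only, not part of this patch: the removed assembly computes the plain CRC-T10DIF over the polynomial G(x) = 0x18bb7 listed in the constants above; the folding and Barrett-reduction steps only make that computation fast by consuming 128 bytes per iteration with carryless multiplies. A minimal bitwise C sketch of the same function:

#include <stddef.h>
#include <stdint.h>

/* bit-at-a-time CRC-T10DIF: poly 0x8bb7, no reflection, zero init/xorout */
static uint16_t crc_t10dif_bitwise(uint16_t crc, const uint8_t *buf, size_t len)
{
	while (len--) {
		crc ^= (uint16_t)(*buf++) << 8;		/* next message byte enters the top bits */
		for (int i = 0; i < 8; i++)		/* one polynomial-division step per bit */
			crc = (crc & 0x8000) ? (uint16_t)((crc << 1) ^ 0x8bb7)
					     : (uint16_t)(crc << 1);
	}
	return crc;
}

The fold constants in the .rodata section are precomputed powers of x modulo this G(x), which is what lets the wide multiplies stand in for the inner bit loop.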
diff --git a/arch/arm64/lib/crc-t10dif-glue.c b/arch/arm64/lib/crc-t10dif-glue.c
deleted file mode 100644
index bacd18f23168..000000000000
--- a/arch/arm64/lib/crc-t10dif-glue.c
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
- *
- * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/cpufeature.h>
-#include <linux/crc-t10dif.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <crypto/internal/simd.h>
-
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-static DEFINE_STATIC_KEY_FALSE(have_asimd);
-static DEFINE_STATIC_KEY_FALSE(have_pmull);
-
-#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
-
-asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
- u8 out[16]);
-asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
-
-u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
-{
- if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {
- if (static_branch_likely(&have_pmull)) {
- if (crypto_simd_usable()) {
- kernel_neon_begin();
- crc = crc_t10dif_pmull_p64(crc, data, length);
- kernel_neon_end();
- return crc;
- }
- } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
- static_branch_likely(&have_asimd) &&
- crypto_simd_usable()) {
- u8 buf[16];
-
- kernel_neon_begin();
- crc_t10dif_pmull_p8(crc, data, length, buf);
- kernel_neon_end();
-
- return crc_t10dif_generic(0, buf, sizeof(buf));
- }
- }
- return crc_t10dif_generic(crc, data, length);
-}
-EXPORT_SYMBOL(crc_t10dif_arch);
-
-static int __init crc_t10dif_arm64_init(void)
-{
- if (cpu_have_named_feature(ASIMD)) {
- static_branch_enable(&have_asimd);
- if (cpu_have_named_feature(PMULL))
- static_branch_enable(&have_pmull);
- }
- return 0;
-}
-arch_initcall(crc_t10dif_arm64_init);
-
-static void __exit crc_t10dif_arm64_exit(void)
-{
-}
-module_exit(crc_t10dif_arm64_exit);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_DESCRIPTION("CRC-T10DIF using arm64 NEON and Crypto Extensions");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/lib/crc32-glue.c b/arch/arm64/lib/crc32-glue.c
deleted file mode 100644
index ed3acd71178f..000000000000
--- a/arch/arm64/lib/crc32-glue.c
+++ /dev/null
@@ -1,99 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-#include <linux/crc32.h>
-#include <linux/linkage.h>
-#include <linux/module.h>
-
-#include <asm/alternative.h>
-#include <asm/cpufeature.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-#include <crypto/internal/simd.h>
-
-// The minimum input length to consider the 4-way interleaved code path
-static const size_t min_len = 1024;
-
-asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len);
-asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len);
-asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len);
-
-asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
-asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
-asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len);
-
-u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
-{
- if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
- return crc32_le_base(crc, p, len);
-
- if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
- kernel_neon_begin();
- crc = crc32_le_arm64_4way(crc, p, len);
- kernel_neon_end();
-
- p += round_down(len, 64);
- len %= 64;
-
- if (!len)
- return crc;
- }
-
- return crc32_le_arm64(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_le_arch);
-
-u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
-{
- if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
- return crc32c_base(crc, p, len);
-
- if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
- kernel_neon_begin();
- crc = crc32c_le_arm64_4way(crc, p, len);
- kernel_neon_end();
-
- p += round_down(len, 64);
- len %= 64;
-
- if (!len)
- return crc;
- }
-
- return crc32c_le_arm64(crc, p, len);
-}
-EXPORT_SYMBOL(crc32c_arch);
-
-u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
-{
- if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
- return crc32_be_base(crc, p, len);
-
- if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
- kernel_neon_begin();
- crc = crc32_be_arm64_4way(crc, p, len);
- kernel_neon_end();
-
- p += round_down(len, 64);
- len %= 64;
-
- if (!len)
- return crc;
- }
-
- return crc32_be_arm64(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_be_arch);
-
-u32 crc32_optimizations(void)
-{
- if (alternative_has_cap_likely(ARM64_HAS_CRC32))
- return CRC32_LE_OPTIMIZATION |
- CRC32_BE_OPTIMIZATION |
- CRC32C_OPTIMIZATION;
- return 0;
-}
-EXPORT_SYMBOL(crc32_optimizations);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("arm64-optimized CRC32 functions");
diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S
deleted file mode 100644
index 68825317460f..000000000000
--- a/arch/arm64/lib/crc32.S
+++ /dev/null
@@ -1,362 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Accelerated CRC32(C) using AArch64 CRC and PMULL instructions
- *
- * Copyright (C) 2016 - 2018 Linaro Ltd.
- * Copyright (C) 2024 Google LLC
- *
- * Author: Ard Biesheuvel <ardb@kernel.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .cpu generic+crc+crypto
-
- .macro bitle, reg
- .endm
-
- .macro bitbe, reg
- rbit \reg, \reg
- .endm
-
- .macro bytele, reg
- .endm
-
- .macro bytebe, reg
- rbit \reg, \reg
- lsr \reg, \reg, #24
- .endm
-
- .macro hwordle, reg
-CPU_BE( rev16 \reg, \reg )
- .endm
-
- .macro hwordbe, reg
-CPU_LE( rev \reg, \reg )
- rbit \reg, \reg
-CPU_BE( lsr \reg, \reg, #16 )
- .endm
-
- .macro le, regs:vararg
- .irp r, \regs
-CPU_BE( rev \r, \r )
- .endr
- .endm
-
- .macro be, regs:vararg
- .irp r, \regs
-CPU_LE( rev \r, \r )
- .endr
- .irp r, \regs
- rbit \r, \r
- .endr
- .endm
-
- .macro __crc32, c, order=le
- bit\order w0
- cmp x2, #16
- b.lt 8f // less than 16 bytes
-
- and x7, x2, #0x1f
- and x2, x2, #~0x1f
- cbz x7, 32f // multiple of 32 bytes
-
- and x8, x7, #0xf
- ldp x3, x4, [x1]
- add x8, x8, x1
- add x1, x1, x7
- ldp x5, x6, [x8]
- \order x3, x4, x5, x6
-
- tst x7, #8
- crc32\c\()x w8, w0, x3
- csel x3, x3, x4, eq
- csel w0, w0, w8, eq
- tst x7, #4
- lsr x4, x3, #32
- crc32\c\()w w8, w0, w3
- csel x3, x3, x4, eq
- csel w0, w0, w8, eq
- tst x7, #2
- lsr w4, w3, #16
- crc32\c\()h w8, w0, w3
- csel w3, w3, w4, eq
- csel w0, w0, w8, eq
- tst x7, #1
- crc32\c\()b w8, w0, w3
- csel w0, w0, w8, eq
- tst x7, #16
- crc32\c\()x w8, w0, x5
- crc32\c\()x w8, w8, x6
- csel w0, w0, w8, eq
- cbz x2, 0f
-
-32: ldp x3, x4, [x1], #32
- sub x2, x2, #32
- ldp x5, x6, [x1, #-16]
- \order x3, x4, x5, x6
- crc32\c\()x w0, w0, x3
- crc32\c\()x w0, w0, x4
- crc32\c\()x w0, w0, x5
- crc32\c\()x w0, w0, x6
- cbnz x2, 32b
-0: bit\order w0
- ret
-
-8: tbz x2, #3, 4f
- ldr x3, [x1], #8
- \order x3
- crc32\c\()x w0, w0, x3
-4: tbz x2, #2, 2f
- ldr w3, [x1], #4
- \order w3
- crc32\c\()w w0, w0, w3
-2: tbz x2, #1, 1f
- ldrh w3, [x1], #2
- hword\order w3
- crc32\c\()h w0, w0, w3
-1: tbz x2, #0, 0f
- ldrb w3, [x1]
- byte\order w3
- crc32\c\()b w0, w0, w3
-0: bit\order w0
- ret
- .endm
-
- .align 5
-SYM_FUNC_START(crc32_le_arm64)
- __crc32
-SYM_FUNC_END(crc32_le_arm64)
-
- .align 5
-SYM_FUNC_START(crc32c_le_arm64)
- __crc32 c
-SYM_FUNC_END(crc32c_le_arm64)
-
- .align 5
-SYM_FUNC_START(crc32_be_arm64)
- __crc32 order=be
-SYM_FUNC_END(crc32_be_arm64)
-
- in .req x1
- len .req x2
-
- /*
- * w0: input CRC at entry, output CRC at exit
- * x1: pointer to input buffer
- * x2: length of input in bytes
- */
- .macro crc4way, insn, table, order=le
- bit\order w0
- lsr len, len, #6 // len := # of 64-byte blocks
-
- /* Process up to 64 blocks of 64 bytes at a time */
-.La\@: mov x3, #64
- cmp len, #64
- csel x3, x3, len, hi // x3 := min(len, 64)
- sub len, len, x3
-
- /* Divide the input into 4 contiguous blocks */
- add x4, x3, x3, lsl #1 // x4 := 3 * x3
- add x7, in, x3, lsl #4 // x7 := in + 16 * x3
- add x8, in, x3, lsl #5 // x8 := in + 32 * x3
- add x9, in, x4, lsl #4 // x9 := in + 16 * x4
-
- /* Load the folding coefficients from the lookup table */
- adr_l x5, \table - 12 // entry 0 omitted
- add x5, x5, x4, lsl #2 // x5 += 12 * x3
- ldp s0, s1, [x5]
- ldr s2, [x5, #8]
-
- /* Zero init partial CRCs for this iteration */
- mov w4, wzr
- mov w5, wzr
- mov w6, wzr
- mov x17, xzr
-
-.Lb\@: sub x3, x3, #1
- \insn w6, w6, x17
- ldp x10, x11, [in], #16
- ldp x12, x13, [x7], #16
- ldp x14, x15, [x8], #16
- ldp x16, x17, [x9], #16
-
- \order x10, x11, x12, x13, x14, x15, x16, x17
-
- /* Apply the CRC transform to 4 16-byte blocks in parallel */
- \insn w0, w0, x10
- \insn w4, w4, x12
- \insn w5, w5, x14
- \insn w6, w6, x16
- \insn w0, w0, x11
- \insn w4, w4, x13
- \insn w5, w5, x15
- cbnz x3, .Lb\@
-
- /* Combine the 4 partial results into w0 */
- mov v3.d[0], x0
- mov v4.d[0], x4
- mov v5.d[0], x5
- pmull v0.1q, v0.1d, v3.1d
- pmull v1.1q, v1.1d, v4.1d
- pmull v2.1q, v2.1d, v5.1d
- eor v0.8b, v0.8b, v1.8b
- eor v0.8b, v0.8b, v2.8b
- mov x5, v0.d[0]
- eor x5, x5, x17
- \insn w0, w6, x5
-
- mov in, x9
- cbnz len, .La\@
-
- bit\order w0
- ret
- .endm
-
- .align 5
-SYM_FUNC_START(crc32c_le_arm64_4way)
- crc4way crc32cx, .L0
-SYM_FUNC_END(crc32c_le_arm64_4way)
-
- .align 5
-SYM_FUNC_START(crc32_le_arm64_4way)
- crc4way crc32x, .L1
-SYM_FUNC_END(crc32_le_arm64_4way)
-
- .align 5
-SYM_FUNC_START(crc32_be_arm64_4way)
- crc4way crc32x, .L1, be
-SYM_FUNC_END(crc32_be_arm64_4way)
-
- .section .rodata, "a", %progbits
- .align 6
-.L0: .long 0xddc0152b, 0xba4fc28e, 0x493c7d27
- .long 0x0715ce53, 0x9e4addf8, 0xba4fc28e
- .long 0xc96cfdc0, 0x0715ce53, 0xddc0152b
- .long 0xab7aff2a, 0x0d3b6092, 0x9e4addf8
- .long 0x299847d5, 0x878a92a7, 0x39d3b296
- .long 0xb6dd949b, 0xab7aff2a, 0x0715ce53
- .long 0xa60ce07b, 0x83348832, 0x47db8317
- .long 0xd270f1a2, 0xb9e02b86, 0x0d3b6092
- .long 0x65863b64, 0xb6dd949b, 0xc96cfdc0
- .long 0xb3e32c28, 0xbac2fd7b, 0x878a92a7
- .long 0xf285651c, 0xce7f39f4, 0xdaece73e
- .long 0x271d9844, 0xd270f1a2, 0xab7aff2a
- .long 0x6cb08e5c, 0x2b3cac5d, 0x2162d385
- .long 0xcec3662e, 0x1b03397f, 0x83348832
- .long 0x8227bb8a, 0xb3e32c28, 0x299847d5
- .long 0xd7a4825c, 0xdd7e3b0c, 0xb9e02b86
- .long 0xf6076544, 0x10746f3c, 0x18b33a4e
- .long 0x98d8d9cb, 0x271d9844, 0xb6dd949b
- .long 0x57a3d037, 0x93a5f730, 0x78d9ccb7
- .long 0x3771e98f, 0x6b749fb2, 0xbac2fd7b
- .long 0xe0ac139e, 0xcec3662e, 0xa60ce07b
- .long 0x6f345e45, 0xe6fc4e6a, 0xce7f39f4
- .long 0xa2b73df1, 0xb0cd4768, 0x61d82e56
- .long 0x86d8e4d2, 0xd7a4825c, 0xd270f1a2
- .long 0xa90fd27a, 0x0167d312, 0xc619809d
- .long 0xca6ef3ac, 0x26f6a60a, 0x2b3cac5d
- .long 0x4597456a, 0x98d8d9cb, 0x65863b64
- .long 0xc9c8b782, 0x68bce87a, 0x1b03397f
- .long 0x62ec6c6d, 0x6956fc3b, 0xebb883bd
- .long 0x2342001e, 0x3771e98f, 0xb3e32c28
- .long 0xe8b6368b, 0x2178513a, 0x064f7f26
- .long 0x9ef68d35, 0x170076fa, 0xdd7e3b0c
- .long 0x0b0bf8ca, 0x6f345e45, 0xf285651c
- .long 0x02ee03b2, 0xff0dba97, 0x10746f3c
- .long 0x135c83fd, 0xf872e54c, 0xc7a68855
- .long 0x00bcf5f6, 0x86d8e4d2, 0x271d9844
- .long 0x58ca5f00, 0x5bb8f1bc, 0x8e766a0c
- .long 0xded288f8, 0xb3af077a, 0x93a5f730
- .long 0x37170390, 0xca6ef3ac, 0x6cb08e5c
- .long 0xf48642e9, 0xdd66cbbb, 0x6b749fb2
- .long 0xb25b29f2, 0xe9e28eb4, 0x1393e203
- .long 0x45cddf4e, 0xc9c8b782, 0xcec3662e
- .long 0xdfd94fb2, 0x93e106a4, 0x96c515bb
- .long 0x021ac5ef, 0xd813b325, 0xe6fc4e6a
- .long 0x8e1450f7, 0x2342001e, 0x8227bb8a
- .long 0xe0cdcf86, 0x6d9a4957, 0xb0cd4768
- .long 0x613eee91, 0xd2c3ed1a, 0x39c7ff35
- .long 0xbedc6ba1, 0x9ef68d35, 0xd7a4825c
- .long 0x0cd1526a, 0xf2271e60, 0x0ab3844b
- .long 0xd6c3a807, 0x2664fd8b, 0x0167d312
- .long 0x1d31175f, 0x02ee03b2, 0xf6076544
- .long 0x4be7fd90, 0x363bd6b3, 0x26f6a60a
- .long 0x6eeed1c9, 0x5fabe670, 0xa741c1bf
- .long 0xb3a6da94, 0x00bcf5f6, 0x98d8d9cb
- .long 0x2e7d11a7, 0x17f27698, 0x49c3cc9c
- .long 0x889774e1, 0xaa7c7ad5, 0x68bce87a
- .long 0x8a074012, 0xded288f8, 0x57a3d037
- .long 0xbd0bb25f, 0x6d390dec, 0x6956fc3b
- .long 0x3be3c09b, 0x6353c1cc, 0x42d98888
- .long 0x465a4eee, 0xf48642e9, 0x3771e98f
- .long 0x2e5f3c8c, 0xdd35bc8d, 0xb42ae3d9
- .long 0xa52f58ec, 0x9a5ede41, 0x2178513a
- .long 0x47972100, 0x45cddf4e, 0xe0ac139e
- .long 0x359674f7, 0xa51b6135, 0x170076fa
-
-.L1: .long 0xaf449247, 0x81256527, 0xccaa009e
- .long 0x57c54819, 0x1d9513d7, 0x81256527
- .long 0x3f41287a, 0x57c54819, 0xaf449247
- .long 0xf5e48c85, 0x910eeec1, 0x1d9513d7
- .long 0x1f0c2cdd, 0x9026d5b1, 0xae0b5394
- .long 0x71d54a59, 0xf5e48c85, 0x57c54819
- .long 0x1c63267b, 0xfe807bbd, 0x0cbec0ed
- .long 0xd31343ea, 0xe95c1271, 0x910eeec1
- .long 0xf9d9c7ee, 0x71d54a59, 0x3f41287a
- .long 0x9ee62949, 0xcec97417, 0x9026d5b1
- .long 0xa55d1514, 0xf183c71b, 0xd1df2327
- .long 0x21aa2b26, 0xd31343ea, 0xf5e48c85
- .long 0x9d842b80, 0xeea395c4, 0x3c656ced
- .long 0xd8110ff1, 0xcd669a40, 0xfe807bbd
- .long 0x3f9e9356, 0x9ee62949, 0x1f0c2cdd
- .long 0x1d6708a0, 0x0c30f51d, 0xe95c1271
- .long 0xef82aa68, 0xdb3935ea, 0xb918a347
- .long 0xd14bcc9b, 0x21aa2b26, 0x71d54a59
- .long 0x99cce860, 0x356d209f, 0xff6f2fc2
- .long 0xd8af8e46, 0xc352f6de, 0xcec97417
- .long 0xf1996890, 0xd8110ff1, 0x1c63267b
- .long 0x631bc508, 0xe95c7216, 0xf183c71b
- .long 0x8511c306, 0x8e031a19, 0x9b9bdbd0
- .long 0xdb3839f3, 0x1d6708a0, 0xd31343ea
- .long 0x7a92fffb, 0xf7003835, 0x4470ac44
- .long 0x6ce68f2a, 0x00eba0c8, 0xeea395c4
- .long 0x4caaa263, 0xd14bcc9b, 0xf9d9c7ee
- .long 0xb46f7cff, 0x9a1b53c8, 0xcd669a40
- .long 0x60290934, 0x81b6f443, 0x6d40f445
- .long 0x8e976a7d, 0xd8af8e46, 0x9ee62949
- .long 0xdcf5088a, 0x9dbdc100, 0x145575d5
- .long 0x1753ab84, 0xbbf2f6d6, 0x0c30f51d
- .long 0x255b139e, 0x631bc508, 0xa55d1514
- .long 0xd784eaa8, 0xce26786c, 0xdb3935ea
- .long 0x6d2c864a, 0x8068c345, 0x2586d334
- .long 0x02072e24, 0xdb3839f3, 0x21aa2b26
- .long 0x06689b0a, 0x5efd72f5, 0xe0575528
- .long 0x1e52f5ea, 0x4117915b, 0x356d209f
- .long 0x1d3d1db6, 0x6ce68f2a, 0x9d842b80
- .long 0x3796455c, 0xb8e0e4a8, 0xc352f6de
- .long 0xdf3a4eb3, 0xc55a2330, 0xb84ffa9c
- .long 0x28ae0976, 0xb46f7cff, 0xd8110ff1
- .long 0x9764bc8d, 0xd7e7a22c, 0x712510f0
- .long 0x13a13e18, 0x3e9a43cd, 0xe95c7216
- .long 0xb8ee242e, 0x8e976a7d, 0x3f9e9356
- .long 0x0c540e7b, 0x753c81ff, 0x8e031a19
- .long 0x9924c781, 0xb9220208, 0x3edcde65
- .long 0x3954de39, 0x1753ab84, 0x1d6708a0
- .long 0xf32238b5, 0xbec81497, 0x9e70b943
- .long 0xbbd2cd2c, 0x0925d861, 0xf7003835
- .long 0xcc401304, 0xd784eaa8, 0xef82aa68
- .long 0x4987e684, 0x6044fbb0, 0x00eba0c8
- .long 0x3aa11427, 0x18fe3b4a, 0x87441142
- .long 0x297aad60, 0x02072e24, 0xd14bcc9b
- .long 0xf60c5e51, 0x6ef6f487, 0x5b7fdd0a
- .long 0x632d78c5, 0x3fc33de4, 0x9a1b53c8
- .long 0x25b8822a, 0x1e52f5ea, 0x99cce860
- .long 0xd4fc84bc, 0x1af62fb8, 0x81b6f443
- .long 0x5690aa32, 0xa91fdefb, 0x688a110e
- .long 0x1357a093, 0x3796455c, 0xd8af8e46
- .long 0x798fdd33, 0xaaa18a37, 0x357b9517
- .long 0xc2815395, 0x54d42691, 0x9dbdc100
- .long 0x21cfc0f7, 0x28ae0976, 0xf1996890
- .long 0xa0decef3, 0x7b4aa8b7, 0xbbf2f6d6
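For reference only, not from this patch: per byte, the scalar __crc32 macro above (via the CRC32B/CRC32CB family of instructions) performs the standard reflected CRC update with no pre- or post-inversion; seeding and final inversion are left to callers. A minimal sketch, assuming the usual little-endian polynomials 0xedb88320 (CRC-32) and 0x82f63b78 (CRC-32C):

#include <stddef.h>
#include <stdint.h>

/* reflected (LSB-first) CRC update; pass poly = 0xedb88320 for CRC-32
 * or poly = 0x82f63b78 for CRC-32C */
static uint32_t crc32_reflected(uint32_t crc, const uint8_t *p, size_t len,
				uint32_t poly)
{
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc & 1) ? (crc >> 1) ^ poly : crc >> 1;
	}
	return crc;
}

The crc4way path above computes four such partial CRCs over contiguous sub-blocks and merges them with carryless multiplies using the coefficients in the .L0/.L1 tables.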
diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c
index 9bef696e2230..4e298baddc2e 100644
--- a/arch/arm64/lib/insn.c
+++ b/arch/arm64/lib/insn.c
@@ -5,6 +5,7 @@
*
* Copyright (C) 2014-2016 Zi Shen Lim <zlim.lnx@gmail.com>
*/
+#include <linux/bitfield.h>
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/printk.h>
@@ -1500,43 +1501,41 @@ u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant,
return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, Rm);
}
-u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type)
+static u32 __get_barrier_crm_val(enum aarch64_insn_mb_type type)
{
- u32 opt;
- u32 insn;
-
switch (type) {
case AARCH64_INSN_MB_SY:
- opt = 0xf;
- break;
+ return 0xf;
case AARCH64_INSN_MB_ST:
- opt = 0xe;
- break;
+ return 0xe;
case AARCH64_INSN_MB_LD:
- opt = 0xd;
- break;
+ return 0xd;
case AARCH64_INSN_MB_ISH:
- opt = 0xb;
- break;
+ return 0xb;
case AARCH64_INSN_MB_ISHST:
- opt = 0xa;
- break;
+ return 0xa;
case AARCH64_INSN_MB_ISHLD:
- opt = 0x9;
- break;
+ return 0x9;
case AARCH64_INSN_MB_NSH:
- opt = 0x7;
- break;
+ return 0x7;
case AARCH64_INSN_MB_NSHST:
- opt = 0x6;
- break;
+ return 0x6;
case AARCH64_INSN_MB_NSHLD:
- opt = 0x5;
- break;
+ return 0x5;
default:
- pr_err("%s: unknown dmb type %d\n", __func__, type);
+ pr_err("%s: unknown barrier type %d\n", __func__, type);
return AARCH64_BREAK_FAULT;
}
+}
+
+u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type)
+{
+ u32 opt;
+ u32 insn;
+
+ opt = __get_barrier_crm_val(type);
+ if (opt == AARCH64_BREAK_FAULT)
+ return AARCH64_BREAK_FAULT;
insn = aarch64_insn_get_dmb_value();
insn &= ~GENMASK(11, 8);
@@ -1545,6 +1544,21 @@ u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type)
return insn;
}
+u32 aarch64_insn_gen_dsb(enum aarch64_insn_mb_type type)
+{
+ u32 opt, insn;
+
+ opt = __get_barrier_crm_val(type);
+ if (opt == AARCH64_BREAK_FAULT)
+ return AARCH64_BREAK_FAULT;
+
+ insn = aarch64_insn_get_dsb_base_value();
+ insn &= ~GENMASK(11, 8);
+ insn |= (opt << 8);
+
+ return insn;
+}
+
u32 aarch64_insn_gen_mrs(enum aarch64_insn_register result,
enum aarch64_insn_system_register sysreg)
{
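Illustration only; the base opcode values below are assumptions and are not taken from this patch. The refactoring shares one CRm lookup between DMB and DSB generation: the option value is inserted into bits 11:8 of the corresponding base encoding, exactly as the GENMASK(11, 8) masking above does. A standalone sketch:

#include <stdint.h>
#include <stdio.h>

#define CRM_SY	0xf	/* full-system barrier option, as in the lookup table */

/* insert a barrier option into CRm (bits 11:8) of a base opcode */
static uint32_t gen_barrier(uint32_t base, uint32_t crm)
{
	return (base & ~(0xfu << 8)) | (crm << 8);
}

int main(void)
{
	/* assumed A64 base encodings with CRm cleared */
	uint32_t dmb_base = 0xd50330bf, dsb_base = 0xd503309f;

	printf("dmb sy = 0x%08x\n", gen_barrier(dmb_base, CRM_SY));	/* 0xd5033fbf */
	printf("dsb sy = 0x%08x\n", gen_barrier(dsb_base, CRM_SY));	/* 0xd5033f9f */
	return 0;
}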
diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
index f9a53b7f9842..8fffebfa17b2 100644
--- a/arch/arm64/lib/xor-neon.c
+++ b/arch/arm64/lib/xor-neon.c
@@ -319,7 +319,7 @@ static void xor_arm64_eor3_5(unsigned long bytes,
static int __init xor_neon_init(void)
{
- if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
+ if (cpu_have_named_feature(SHA3)) {
xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
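For reference, not part of the patch: the eor3 helpers selected here compute the same result as the generic XOR routines; for the 3-source case that is simply p1[i] ^= p2[i] ^ p3[i] per word, which the SHA3 EOR3 instruction evaluates in a single vector operation. A scalar sketch, with the argument convention assumed from the generic xor_blocks interface:

#include <stddef.h>

/* scalar equivalent of the 3-source XOR: p1 ^= p2 ^ p3 over 'bytes' bytes */
static void xor_3_scalar(unsigned long bytes, unsigned long *p1,
			 const unsigned long *p2, const unsigned long *p3)
{
	for (size_t i = 0; i < bytes / sizeof(unsigned long); i++)
		p1[i] ^= p2[i] ^ p3[i];
}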