Diffstat (limited to 'lib/crc')
-rw-r--r--  lib/crc/.gitignore                      |   5
-rw-r--r--  lib/crc/Kconfig                         | 119
-rw-r--r--  lib/crc/Makefile                        |  63
-rw-r--r--  lib/crc/arm/crc-t10dif-core.S           | 468
-rw-r--r--  lib/crc/arm/crc-t10dif.h                |  55
-rw-r--r--  lib/crc/arm/crc32-core.S                | 306
-rw-r--r--  lib/crc/arm/crc32.h                     | 101
-rw-r--r--  lib/crc/arm64/crc-t10dif-core.S         | 469
-rw-r--r--  lib/crc/arm64/crc-t10dif.h              |  57
-rw-r--r--  lib/crc/arm64/crc32-core.S              | 362
-rw-r--r--  lib/crc/arm64/crc32.h                   |  88
-rw-r--r--  lib/crc/crc-ccitt.c                     |  66
-rw-r--r--  lib/crc/crc-itu-t.c                     |  68
-rw-r--r--  lib/crc/crc-t10dif-main.c               |  89
-rw-r--r--  lib/crc/crc16.c                         |  65
-rw-r--r--  lib/crc/crc32-main.c                    | 105
-rw-r--r--  lib/crc/crc4.c                          |  45
-rw-r--r--  lib/crc/crc64-main.c                    |  93
-rw-r--r--  lib/crc/crc7.c                          |  73
-rw-r--r--  lib/crc/crc8.c                          |  87
-rw-r--r--  lib/crc/gen_crc32table.c                |  89
-rw-r--r--  lib/crc/gen_crc64table.c                |  88
-rw-r--r--  lib/crc/loongarch/crc32.h               | 115
-rw-r--r--  lib/crc/mips/crc32.h                    | 162
-rw-r--r--  lib/crc/powerpc/crc-t10dif.h            |  69
-rw-r--r--  lib/crc/powerpc/crc-vpmsum-template.S   | 746
-rw-r--r--  lib/crc/powerpc/crc32.h                 |  69
-rw-r--r--  lib/crc/powerpc/crc32c-vpmsum_asm.S     | 842
-rw-r--r--  lib/crc/powerpc/crct10dif-vpmsum_asm.S  | 845
-rw-r--r--  lib/crc/riscv/crc-clmul-consts.h        | 122
-rw-r--r--  lib/crc/riscv/crc-clmul-template.h      | 265
-rw-r--r--  lib/crc/riscv/crc-clmul.h               |  23
-rw-r--r--  lib/crc/riscv/crc-t10dif.h              |  18
-rw-r--r--  lib/crc/riscv/crc16_msb.c               |  18
-rw-r--r--  lib/crc/riscv/crc32.h                   |  44
-rw-r--r--  lib/crc/riscv/crc32_lsb.c               |  18
-rw-r--r--  lib/crc/riscv/crc32_msb.c               |  18
-rw-r--r--  lib/crc/riscv/crc64.h                   |  27
-rw-r--r--  lib/crc/riscv/crc64_lsb.c               |  18
-rw-r--r--  lib/crc/riscv/crc64_msb.c               |  18
-rw-r--r--  lib/crc/s390/crc32-vx.h                 |  12
-rw-r--r--  lib/crc/s390/crc32.h                    |  67
-rw-r--r--  lib/crc/s390/crc32be-vx.c               | 174
-rw-r--r--  lib/crc/s390/crc32le-vx.c               | 240
-rw-r--r--  lib/crc/sparc/crc32.h                   |  67
-rw-r--r--  lib/crc/sparc/crc32c_asm.S              |  20
-rw-r--r--  lib/crc/tests/Makefile                  |   2
-rw-r--r--  lib/crc/tests/crc_kunit.c               | 452
-rw-r--r--  lib/crc/x86/crc-pclmul-consts.h         | 240
-rw-r--r--  lib/crc/x86/crc-pclmul-template.S       | 575
-rw-r--r--  lib/crc/x86/crc-pclmul-template.h       |  72
-rw-r--r--  lib/crc/x86/crc-t10dif.h                |  35
-rw-r--r--  lib/crc/x86/crc16-msb-pclmul.S          |   6
-rw-r--r--  lib/crc/x86/crc32-pclmul.S              |   6
-rw-r--r--  lib/crc/x86/crc32.h                     | 137
-rw-r--r--  lib/crc/x86/crc32c-3way.S               | 360
-rw-r--r--  lib/crc/x86/crc64-pclmul.S              |   7
-rw-r--r--  lib/crc/x86/crc64.h                     |  48
58 files changed, 8918 insertions(+), 0 deletions(-)
diff --git a/lib/crc/.gitignore b/lib/crc/.gitignore
new file mode 100644
index 000000000000..a9e48103c9fb
--- /dev/null
+++ b/lib/crc/.gitignore
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/crc32table.h
+/crc64table.h
+/gen_crc32table
+/gen_crc64table
diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig
new file mode 100644
index 000000000000..70e7a6016de3
--- /dev/null
+++ b/lib/crc/Kconfig
@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# Kconfig for the kernel's cyclic redundancy check (CRC) library code
+
+config CRC4
+ tristate
+ help
+ The CRC4 library functions. Select this if your module uses any of
+ the functions from <linux/crc4.h>.
+
+config CRC7
+ tristate
+ help
+ The CRC7 library functions. Select this if your module uses any of
+ the functions from <linux/crc7.h>.
+
+config CRC8
+ tristate
+ help
+ The CRC8 library functions. Select this if your module uses any of
+ the functions from <linux/crc8.h>.
+
+config CRC16
+ tristate
+ help
+ The CRC16 library functions. Select this if your module uses any of
+ the functions from <linux/crc16.h>.
+
+config CRC_CCITT
+ tristate
+ help
+ The CRC-CCITT library functions. Select this if your module uses any
+ of the functions from <linux/crc-ccitt.h>.
+
+config CRC_ITU_T
+ tristate
+ help
+ The CRC-ITU-T library functions. Select this if your module uses
+ any of the functions from <linux/crc-itu-t.h>.
+
+config CRC_T10DIF
+ tristate
+ help
+ The CRC-T10DIF library functions. Select this if your module uses
+ any of the functions from <linux/crc-t10dif.h>.
+
+config CRC_T10DIF_ARCH
+ bool
+ depends on CRC_T10DIF && CRC_OPTIMIZATIONS
+ default y if ARM && KERNEL_MODE_NEON
+ default y if ARM64 && KERNEL_MODE_NEON
+ default y if PPC64 && ALTIVEC
+ default y if RISCV && RISCV_ISA_ZBC
+ default y if X86
+
+config CRC32
+ tristate
+ select BITREVERSE
+ help
+ The CRC32 library functions. Select this if your module uses any of
+ the functions from <linux/crc32.h> or <linux/crc32c.h>.
+
+config CRC32_ARCH
+ bool
+ depends on CRC32 && CRC_OPTIMIZATIONS
+ default y if ARM && KERNEL_MODE_NEON
+ default y if ARM64
+ default y if LOONGARCH
+ default y if MIPS && CPU_MIPSR6
+ default y if PPC64 && ALTIVEC
+ default y if RISCV && RISCV_ISA_ZBC
+ default y if S390
+ default y if SPARC64
+ default y if X86
+
+config CRC64
+ tristate
+ help
+ The CRC64 library functions. Select this if your module uses any of
+ the functions from <linux/crc64.h>.
+
+config CRC64_ARCH
+ bool
+ depends on CRC64 && CRC_OPTIMIZATIONS
+ default y if RISCV && RISCV_ISA_ZBC && 64BIT
+ default y if X86_64
+
+config CRC_OPTIMIZATIONS
+ bool "Enable optimized CRC implementations" if EXPERT
+ depends on !UML
+ default y
+ help
+ Disabling this option reduces code size slightly by disabling the
+ architecture-optimized implementations of any CRC variants that are
+ enabled, at the cost of much slower CRC checksumming performance.
+
+ Keep this enabled unless you're really trying to minimize the size of
+ the kernel.
+
+config CRC_KUNIT_TEST
+ tristate "KUnit tests for CRC functions" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
+ select CRC7
+ select CRC16
+ select CRC_T10DIF
+ select CRC32
+ select CRC64
+ help
+ Unit tests for the CRC library functions.
+
+ This is intended to help people writing architecture-specific
+ optimized versions. If unsure, say N.
+
+config CRC_BENCHMARK
+ bool "Benchmark for the CRC functions"
+ depends on CRC_KUNIT_TEST
+ help
+ Include benchmarks in the KUnit test suite for the CRC functions.
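
As the help texts above describe, a consumer selects these symbols from its
own Kconfig entry (e.g. "select CRC32") and then simply calls the library
functions. An illustrative sketch of such a consumer, not part of this patch
(module name and message are made up):

/*
 * Illustrative consumer of the CRC32 library functions gated by the
 * Kconfig symbols above. The module's own Kconfig entry would carry
 * "select CRC32". Hypothetical example, not code from this patch.
 */
#include <linux/module.h>
#include <linux/crc32.h>

static int __init crc_demo_init(void)
{
	static const u8 data[] = "123456789";
	u32 crc = crc32_le(~0, data, sizeof(data) - 1) ^ ~0;

	pr_info("crc32 of check string: 0x%08x\n", crc);	/* 0xcbf43926 */
	return 0;
}
module_init(crc_demo_init);

MODULE_DESCRIPTION("CRC32 library usage sketch");
MODULE_LICENSE("GPL");
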
diff --git a/lib/crc/Makefile b/lib/crc/Makefile
new file mode 100644
index 000000000000..7543ad295ab6
--- /dev/null
+++ b/lib/crc/Makefile
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# Makefile for the kernel's cyclic redundancy check (CRC) library code
+
+obj-$(CONFIG_CRC4) += crc4.o
+obj-$(CONFIG_CRC7) += crc7.o
+obj-$(CONFIG_CRC8) += crc8.o
+obj-$(CONFIG_CRC16) += crc16.o
+obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o
+obj-$(CONFIG_CRC_ITU_T) += crc-itu-t.o
+
+obj-$(CONFIG_CRC_T10DIF) += crc-t10dif.o
+crc-t10dif-y := crc-t10dif-main.o
+ifeq ($(CONFIG_CRC_T10DIF_ARCH),y)
+CFLAGS_crc-t10dif-main.o += -I$(src)/$(SRCARCH)
+crc-t10dif-$(CONFIG_ARM) += arm/crc-t10dif-core.o
+crc-t10dif-$(CONFIG_ARM64) += arm64/crc-t10dif-core.o
+crc-t10dif-$(CONFIG_PPC) += powerpc/crct10dif-vpmsum_asm.o
+crc-t10dif-$(CONFIG_RISCV) += riscv/crc16_msb.o
+crc-t10dif-$(CONFIG_X86) += x86/crc16-msb-pclmul.o
+endif
+
+obj-$(CONFIG_CRC32) += crc32.o
+crc32-y := crc32-main.o
+ifeq ($(CONFIG_CRC32_ARCH),y)
+CFLAGS_crc32-main.o += -I$(src)/$(SRCARCH)
+crc32-$(CONFIG_ARM) += arm/crc32-core.o
+crc32-$(CONFIG_ARM64) += arm64/crc32-core.o
+crc32-$(CONFIG_PPC) += powerpc/crc32c-vpmsum_asm.o
+crc32-$(CONFIG_RISCV) += riscv/crc32_lsb.o riscv/crc32_msb.o
+crc32-$(CONFIG_S390) += s390/crc32le-vx.o s390/crc32be-vx.o
+crc32-$(CONFIG_SPARC) += sparc/crc32c_asm.o
+crc32-$(CONFIG_X86) += x86/crc32-pclmul.o
+crc32-$(CONFIG_X86_64) += x86/crc32c-3way.o
+endif
+
+obj-$(CONFIG_CRC64) += crc64.o
+crc64-y := crc64-main.o
+ifeq ($(CONFIG_CRC64_ARCH),y)
+CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH)
+crc64-$(CONFIG_RISCV) += riscv/crc64_lsb.o riscv/crc64_msb.o
+crc64-$(CONFIG_X86) += x86/crc64-pclmul.o
+endif
+
+obj-y += tests/
+
+hostprogs := gen_crc32table gen_crc64table
+clean-files := crc32table.h crc64table.h
+
+$(obj)/crc32-main.o: $(obj)/crc32table.h
+$(obj)/crc64-main.o: $(obj)/crc64table.h
+
+quiet_cmd_crc32 = GEN $@
+ cmd_crc32 = $< > $@
+
+quiet_cmd_crc64 = GEN $@
+ cmd_crc64 = $< > $@
+
+$(obj)/crc32table.h: $(obj)/gen_crc32table
+ $(call cmd,crc32)
+
+$(obj)/crc64table.h: $(obj)/gen_crc64table
+ $(call cmd,crc64)
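
The table headers crc32table.h and crc64table.h are generated at build time
by running the two hostprogs and capturing their standard output via the GEN
rules above. A minimal sketch of the general shape of such a generator (the
real gen_crc32table.c is more elaborate and emits several table layouts; this
shows only the core idea):

/*
 * Host-side sketch: emit a 256-entry lookup table for the reflected
 * CRC32 polynomial 0xedb88320 as a C initializer, the way the
 * "GEN crc32table.h" rule captures gen_crc32table's stdout.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	printf("static const uint32_t crc32table_le[256] = {\n");
	for (int n = 0; n < 256; n++) {
		uint32_t crc = n;

		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320 : 0);
		printf("\t0x%08x,\n", crc);
	}
	printf("};\n");
	return 0;
}
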
diff --git a/lib/crc/arm/crc-t10dif-core.S b/lib/crc/arm/crc-t10dif-core.S
new file mode 100644
index 000000000000..2bbf2df9c1e2
--- /dev/null
+++ b/lib/crc/arm/crc-t10dif-core.S
@@ -0,0 +1,468 @@
+//
+// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
+//
+// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+// Copyright (C) 2019 Google LLC <ebiggers@google.com>
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License version 2 as
+// published by the Free Software Foundation.
+//
+
+// Derived from the x86 version:
+//
+// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+//
+// Copyright (c) 2013, Intel Corporation
+//
+// Authors:
+// Erdinc Ozturk <erdinc.ozturk@intel.com>
+// Vinodh Gopal <vinodh.gopal@intel.com>
+// James Guilford <james.guilford@intel.com>
+// Tim Chen <tim.c.chen@linux.intel.com>
+//
+// This software is available to you under a choice of one of two
+// licenses. You may choose to be licensed under the terms of the GNU
+// General Public License (GPL) Version 2, available from the file
+// COPYING in the main directory of this source tree, or the
+// OpenIB.org BSD license below:
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the
+// distribution.
+//
+// * Neither the name of the Intel Corporation nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+//
+// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Reference paper titled "Fast CRC Computation for Generic
+// Polynomials Using PCLMULQDQ Instruction"
+// URL: http://www.intel.com/content/dam/www/public/us/en/documents
+// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+//
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+#ifdef CONFIG_CPU_ENDIAN_BE8
+#define CPU_LE(code...)
+#else
+#define CPU_LE(code...) code
+#endif
+
+ .text
+ .arch armv8-a
+ .fpu crypto-neon-fp-armv8
+
+ init_crc .req r0
+ buf .req r1
+ len .req r2
+
+ fold_consts_ptr .req ip
+
+ q0l .req d0
+ q0h .req d1
+ q1l .req d2
+ q1h .req d3
+ q2l .req d4
+ q2h .req d5
+ q3l .req d6
+ q3h .req d7
+ q4l .req d8
+ q4h .req d9
+ q5l .req d10
+ q5h .req d11
+ q6l .req d12
+ q6h .req d13
+ q7l .req d14
+ q7h .req d15
+ q8l .req d16
+ q8h .req d17
+ q9l .req d18
+ q9h .req d19
+ q10l .req d20
+ q10h .req d21
+ q11l .req d22
+ q11h .req d23
+ q12l .req d24
+ q12h .req d25
+
+ FOLD_CONSTS .req q10
+ FOLD_CONST_L .req q10l
+ FOLD_CONST_H .req q10h
+
+ /*
+ * Pairwise long polynomial multiplication of two 16-bit values
+ *
+ * { w0, w1 }, { y0, y1 }
+ *
+ * by two 64-bit values
+ *
+ * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+ *
+ * where each vector element is a byte, ordered from least to most
+ * significant. The resulting 80-bit vectors are XOR'ed together.
+ *
+ * This can be implemented using 8x8 long polynomial multiplication, by
+ * reorganizing the input so that each pairwise 8x8 multiplication
+ * produces one of the terms from the decomposition below, and
+ * combining the results of each rank and shifting them into place.
+ *
+ * Rank
+ * 0 w0*x0 ^ | y0*z0 ^
+ * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^
+ * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^
+ * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^
+ * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^
+ * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^
+ * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^
+ * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^
+ * 8 w1*x7 << 64 | y1*z7 << 64
+ *
+ * The inputs can be reorganized into
+ *
+ * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+ * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+ *
+ * and after performing 8x8->16 bit long polynomial multiplication of
+ * each of the halves of the first vector with those of the second one,
+ * we obtain the following four vectors of 16-bit elements:
+ *
+ * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
+ * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
+ * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
+ * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
+ *
+ * Results b and c can be XORed together, as the vector elements have
+ * matching ranks. Then, the final XOR can be pulled forward, and
+ * applied between the halves of each of the remaining three vectors,
+ * which are then shifted into place, and XORed together to produce the
+ * final 80-bit result.
+ */
+ .macro pmull16x64_p8, v16, v64
+ vext.8 q11, \v64, \v64, #1
+ vld1.64 {q12}, [r4, :128]
+ vuzp.8 q11, \v64
+ vtbl.8 d24, {\v16\()_L-\v16\()_H}, d24
+ vtbl.8 d25, {\v16\()_L-\v16\()_H}, d25
+ bl __pmull16x64_p8
+ veor \v64, q12, q14
+ .endm
+
+__pmull16x64_p8:
+ vmull.p8 q13, d23, d24
+ vmull.p8 q14, d23, d25
+ vmull.p8 q15, d22, d24
+ vmull.p8 q12, d22, d25
+
+ veor q14, q14, q15
+ veor d24, d24, d25
+ veor d26, d26, d27
+ veor d28, d28, d29
+ vmov.i32 d25, #0
+ vmov.i32 d29, #0
+ vext.8 q12, q12, q12, #14
+ vext.8 q14, q14, q14, #15
+ veor d24, d24, d26
+ bx lr
+ENDPROC(__pmull16x64_p8)
+
+ .macro pmull16x64_p64, v16, v64
+ vmull.p64 q11, \v64\()l, \v16\()_L
+ vmull.p64 \v64, \v64\()h, \v16\()_H
+ veor \v64, \v64, q11
+ .endm
+
+ // Fold reg1, reg2 into the next 32 data bytes, storing the result back
+ // into reg1, reg2.
+ .macro fold_32_bytes, reg1, reg2, p
+ vld1.64 {q8-q9}, [buf]!
+
+ pmull16x64_\p FOLD_CONST, \reg1
+ pmull16x64_\p FOLD_CONST, \reg2
+
+CPU_LE( vrev64.8 q8, q8 )
+CPU_LE( vrev64.8 q9, q9 )
+ vswp q8l, q8h
+ vswp q9l, q9h
+
+ veor.8 \reg1, \reg1, q8
+ veor.8 \reg2, \reg2, q9
+ .endm
+
+ // Fold src_reg into dst_reg, optionally loading the next fold constants
+ .macro fold_16_bytes, src_reg, dst_reg, p, load_next_consts
+ pmull16x64_\p FOLD_CONST, \src_reg
+ .ifnb \load_next_consts
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
+ .endif
+ veor.8 \dst_reg, \dst_reg, \src_reg
+ .endm
+
+ .macro crct10dif, p
+ // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+ cmp len, #256
+ blt .Lless_than_256_bytes\@
+
+ mov_l fold_consts_ptr, .Lfold_across_128_bytes_consts
+
+ // Load the first 128 data bytes. Byte swapping is necessary to make
+ // the bit order match the polynomial coefficient order.
+ vld1.64 {q0-q1}, [buf]!
+ vld1.64 {q2-q3}, [buf]!
+ vld1.64 {q4-q5}, [buf]!
+ vld1.64 {q6-q7}, [buf]!
+CPU_LE( vrev64.8 q0, q0 )
+CPU_LE( vrev64.8 q1, q1 )
+CPU_LE( vrev64.8 q2, q2 )
+CPU_LE( vrev64.8 q3, q3 )
+CPU_LE( vrev64.8 q4, q4 )
+CPU_LE( vrev64.8 q5, q5 )
+CPU_LE( vrev64.8 q6, q6 )
+CPU_LE( vrev64.8 q7, q7 )
+ vswp q0l, q0h
+ vswp q1l, q1h
+ vswp q2l, q2h
+ vswp q3l, q3h
+ vswp q4l, q4h
+ vswp q5l, q5h
+ vswp q6l, q6h
+ vswp q7l, q7h
+
+ // XOR the first 16 data *bits* with the initial CRC value.
+ vmov.i8 q8h, #0
+ vmov.u16 q8h[3], init_crc
+ veor q0h, q0h, q8h
+
+ // Load the constants for folding across 128 bytes.
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
+
+ // Subtract 128 for the 128 data bytes just consumed. Subtract another
+ // 128 to simplify the termination condition of the following loop.
+ sub len, len, #256
+
+ // While >= 128 data bytes remain (not counting q0-q7), fold the 128
+ // bytes q0-q7 into them, storing the result back into q0-q7.
+.Lfold_128_bytes_loop\@:
+ fold_32_bytes q0, q1, \p
+ fold_32_bytes q2, q3, \p
+ fold_32_bytes q4, q5, \p
+ fold_32_bytes q6, q7, \p
+ subs len, len, #128
+ bge .Lfold_128_bytes_loop\@
+
+ // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
+
+ // Fold across 64 bytes.
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
+ fold_16_bytes q0, q4, \p
+ fold_16_bytes q1, q5, \p
+ fold_16_bytes q2, q6, \p
+ fold_16_bytes q3, q7, \p, 1
+ // Fold across 32 bytes.
+ fold_16_bytes q4, q6, \p
+ fold_16_bytes q5, q7, \p, 1
+ // Fold across 16 bytes.
+ fold_16_bytes q6, q7, \p
+
+ // Add 128 to get the correct number of data bytes remaining in 0...127
+ // (not counting q7), following the previous extra subtraction by 128.
+ // Then subtract 16 to simplify the termination condition of the
+ // following loop.
+ adds len, len, #(128-16)
+
+ // While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
+ // into them, storing the result back into q7.
+ blt .Lfold_16_bytes_loop_done\@
+.Lfold_16_bytes_loop\@:
+ pmull16x64_\p FOLD_CONST, q7
+ vld1.64 {q0}, [buf]!
+CPU_LE( vrev64.8 q0, q0 )
+ vswp q0l, q0h
+ veor.8 q7, q7, q0
+ subs len, len, #16
+ bge .Lfold_16_bytes_loop\@
+
+.Lfold_16_bytes_loop_done\@:
+ // Add 16 to get the correct number of data bytes remaining in 0...15
+ // (not counting q7), following the previous extra subtraction by 16.
+ adds len, len, #16
+ beq .Lreduce_final_16_bytes\@
+
+.Lhandle_partial_segment\@:
+ // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
+ // 16 bytes are in q7 and the rest are the remaining data in 'buf'. To
+ // do this without needing a fold constant for each possible 'len',
+ // redivide the bytes into a first chunk of 'len' bytes and a second
+ // chunk of 16 bytes, then fold the first chunk into the second.
+
+ // q0 = last 16 original data bytes
+ add buf, buf, len
+ sub buf, buf, #16
+ vld1.64 {q0}, [buf]
+CPU_LE( vrev64.8 q0, q0 )
+ vswp q0l, q0h
+
+ // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
+ mov_l r1, .Lbyteshift_table + 16
+ sub r1, r1, len
+ vld1.8 {q2}, [r1]
+ vtbl.8 q1l, {q7l-q7h}, q2l
+ vtbl.8 q1h, {q7l-q7h}, q2h
+
+ // q3 = first chunk: q7 right-shifted by '16-len' bytes.
+ vmov.i8 q3, #0x80
+ veor.8 q2, q2, q3
+ vtbl.8 q3l, {q7l-q7h}, q2l
+ vtbl.8 q3h, {q7l-q7h}, q2h
+
+ // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
+ vshr.s8 q2, q2, #7
+
+ // q2 = second chunk: 'len' bytes from q0 (low-order bytes),
+ // then '16-len' bytes from q1 (high-order bytes).
+ vbsl.8 q2, q1, q0
+
+ // Fold the first chunk into the second chunk, storing the result in q7.
+ pmull16x64_\p FOLD_CONST, q3
+ veor.8 q7, q3, q2
+ b .Lreduce_final_16_bytes\@
+
+.Lless_than_256_bytes\@:
+ // Checksumming a buffer of length 16...255 bytes
+
+ mov_l fold_consts_ptr, .Lfold_across_16_bytes_consts
+
+ // Load the first 16 data bytes.
+ vld1.64 {q7}, [buf]!
+CPU_LE( vrev64.8 q7, q7 )
+ vswp q7l, q7h
+
+ // XOR the first 16 data *bits* with the initial CRC value.
+ vmov.i8 q0h, #0
+ vmov.u16 q0h[3], init_crc
+ veor.8 q7h, q7h, q0h
+
+ // Load the fold-across-16-bytes constants.
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
+
+ cmp len, #16
+ beq .Lreduce_final_16_bytes\@ // len == 16
+ subs len, len, #32
+ addlt len, len, #16
+ blt .Lhandle_partial_segment\@ // 17 <= len <= 31
+ b .Lfold_16_bytes_loop\@ // 32 <= len <= 255
+
+.Lreduce_final_16_bytes\@:
+ .endm
+
+//
+// u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+ENTRY(crc_t10dif_pmull64)
+ crct10dif p64
+
+ // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
+
+ // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
+
+ // Fold the high 64 bits into the low 64 bits, while also multiplying by
+ // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+ // whose low 48 bits are 0.
+ vmull.p64 q0, q7h, FOLD_CONST_H // high bits * x^48 * (x^80 mod G(x))
+ veor.8 q0h, q0h, q7l // + low bits * x^64
+
+ // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+ // value congruent to x^64 * M(x) and whose low 48 bits are 0.
+ vmov.i8 q1, #0
+ vmov s4, s3 // extract high 32 bits
+ vmov s3, s5 // zero high 32 bits
+ vmull.p64 q1, q1l, FOLD_CONST_L // high 32 bits * x^48 * (x^48 mod G(x))
+ veor.8 q0, q0, q1 // + low bits
+
+ // Load G(x) and floor(x^48 / G(x)).
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]
+
+ // Use Barrett reduction to compute the final CRC value.
+ vmull.p64 q1, q0h, FOLD_CONST_H // high 32 bits * floor(x^48 / G(x))
+ vshr.u64 q1l, q1l, #32 // /= x^32
+ vmull.p64 q1, q1l, FOLD_CONST_L // *= G(x)
+ vshr.u64 q0l, q0l, #48
+ veor.8 q0l, q0l, q1l // + low 16 nonzero bits
+ // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of q0.
+
+ vmov.u16 r0, q0l[0]
+ bx lr
+ENDPROC(crc_t10dif_pmull64)
+
+ENTRY(crc_t10dif_pmull8)
+ push {r4, lr}
+ mov_l r4, .L16x64perm
+
+ crct10dif p8
+
+CPU_LE( vrev64.8 q7, q7 )
+ vswp q7l, q7h
+ vst1.64 {q7}, [r3, :128]
+ pop {r4, pc}
+ENDPROC(crc_t10dif_pmull8)
+
+ .section ".rodata", "a"
+ .align 4
+
+// Fold constants precomputed from the polynomial 0x18bb7
+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+ .quad 0x0000000000006123 // x^(8*128) mod G(x)
+ .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
+// .Lfold_across_64_bytes_consts:
+ .quad 0x0000000000001069 // x^(4*128) mod G(x)
+ .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
+// .Lfold_across_32_bytes_consts:
+ .quad 0x000000000000857d // x^(2*128) mod G(x)
+ .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
+.Lfold_across_16_bytes_consts:
+ .quad 0x000000000000a010 // x^(1*128) mod G(x)
+ .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
+// .Lfinal_fold_consts:
+ .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
+ .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
+// .Lbarrett_reduction_consts:
+ .quad 0x0000000000018bb7 // G(x)
+ .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
+
+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
+ .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
+ .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0x0
+
+.L16x64perm:
+ .quad 0x808080800000000, 0x909090901010101
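
The pairwise 16x64 polynomial multiplication documented in the long comment
above can be sanity-checked against a plain bitwise carry-less multiply. A
scalar reference sketch for one 16x64 product follows (the pmull16x64 macros
compute two such products in parallel and XOR the 80-bit results together);
userspace-style code for verification only, not part of this patch:

/*
 * Reference: carry-less (GF(2)) multiply of a 16-bit value by a 64-bit
 * value, returning the 80-bit product as a low/high pair. Only for
 * cross-checking the NEON pmull16x64 decomposition.
 */
#include <stdint.h>

static void clmul_16x64(uint16_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
	*lo = 0;
	*hi = 0;	/* at most the low 15 bits can become nonzero */
	for (int i = 0; i < 16; i++) {
		if (a & (1u << i)) {
			*lo ^= b << i;
			if (i)
				*hi ^= b >> (64 - i);
		}
	}
}
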
diff --git a/lib/crc/arm/crc-t10dif.h b/lib/crc/arm/crc-t10dif.h
new file mode 100644
index 000000000000..2edf7e9681d0
--- /dev/null
+++ b/lib/crc/arm/crc-t10dif.h
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <crypto/internal/simd.h>
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
+
+#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
+
+asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len,
+ u8 out[16]);
+
+static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
+{
+ if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {
+ if (static_branch_likely(&have_pmull)) {
+ if (crypto_simd_usable()) {
+ kernel_neon_begin();
+ crc = crc_t10dif_pmull64(crc, data, length);
+ kernel_neon_end();
+ return crc;
+ }
+ } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
+ static_branch_likely(&have_neon) &&
+ crypto_simd_usable()) {
+ u8 buf[16] __aligned(16);
+
+ kernel_neon_begin();
+ crc_t10dif_pmull8(crc, data, length, buf);
+ kernel_neon_end();
+
+ return crc_t10dif_generic(0, buf, sizeof(buf));
+ }
+ }
+ return crc_t10dif_generic(crc, data, length);
+}
+
+#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch
+static inline void crc_t10dif_mod_init_arch(void)
+{
+ if (elf_hwcap & HWCAP_NEON) {
+ static_branch_enable(&have_neon);
+ if (elf_hwcap2 & HWCAP2_PMULL)
+ static_branch_enable(&have_pmull);
+ }
+}
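
crc_t10dif_arch() is not called directly by drivers; it is the per-arch hook
behind the generic entry points in <linux/crc-t10dif.h>, and it falls back to
crc_t10dif_generic() when the NEON/PMULL paths are unavailable. A
hypothetical caller, for illustration only:

/*
 * Illustrative caller: consumers use the generic API, which dispatches
 * into crc_t10dif_arch() above when CRC_T10DIF_ARCH is enabled.
 */
#include <linux/crc-t10dif.h>

static u16 pi_guard_tag(const void *sector, size_t len)
{
	return crc_t10dif(sector, len);	/* same as crc_t10dif_update(0, ...) */
}
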
diff --git a/lib/crc/arm/crc32-core.S b/lib/crc/arm/crc32-core.S
new file mode 100644
index 000000000000..6f674f30c70b
--- /dev/null
+++ b/lib/crc/arm/crc32-core.S
@@ -0,0 +1,306 @@
+/*
+ * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
+ * calculation.
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
+ * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
+ * at:
+ * https://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2B: Instruction Set Reference, N-Z
+ *
+ * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
+ * Alexander Boyko <Alexander_Boyko@xyratex.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+ .align 6
+ .arch armv8-a
+ .arch_extension crc
+ .fpu crypto-neon-fp-armv8
+
+.Lcrc32_constants:
+ /*
+ * [(x^(4*128+32) mod P(x) << 32)]' << 1 = 0x154442bd4
+ * #define CONSTANT_R1 0x154442bd4LL
+ *
+ * [(x^(4*128-32) mod P(x) << 32)]' << 1 = 0x1c6e41596
+ * #define CONSTANT_R2 0x1c6e41596LL
+ */
+ .quad 0x0000000154442bd4
+ .quad 0x00000001c6e41596
+
+ /*
+ * [(x^(128+32) mod P(x) << 32)]' << 1 = 0x1751997d0
+ * #define CONSTANT_R3 0x1751997d0LL
+ *
+ * [(x^(128-32) mod P(x) << 32)]' << 1 = 0x0ccaa009e
+ * #define CONSTANT_R4 0x0ccaa009eLL
+ */
+ .quad 0x00000001751997d0
+ .quad 0x00000000ccaa009e
+
+ /*
+ * [(x^64 mod P(x) << 32)]' << 1 = 0x163cd6124
+ * #define CONSTANT_R5 0x163cd6124LL
+ */
+ .quad 0x0000000163cd6124
+ .quad 0x00000000FFFFFFFF
+
+ /*
+ * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
+ *
+ * Barrett reduction constant (u64`) = u` = (x^64 / P(x))`
+ * = 0x1F7011641LL
+ * #define CONSTANT_RU 0x1F7011641LL
+ */
+ .quad 0x00000001DB710641
+ .quad 0x00000001F7011641
+
+.Lcrc32c_constants:
+ .quad 0x00000000740eef02
+ .quad 0x000000009e4addf8
+ .quad 0x00000000f20c0dfe
+ .quad 0x000000014cd00bd6
+ .quad 0x00000000dd45aab8
+ .quad 0x00000000FFFFFFFF
+ .quad 0x0000000105ec76f0
+ .quad 0x00000000dea713f1
+
+ dCONSTANTl .req d0
+ dCONSTANTh .req d1
+ qCONSTANT .req q0
+
+ BUF .req r0
+ LEN .req r1
+ CRC .req r2
+
+ qzr .req q9
+
+ /**
+ * Calculate crc32
+ * BUF - buffer
+ * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
+ * CRC - initial crc32
+ * return crc32 in r0
+ * uint crc32_pmull_le(unsigned char const *buffer,
+ * size_t len, uint crc32)
+ */
+SYM_FUNC_START(crc32_pmull_le)
+ adr r3, .Lcrc32_constants
+ b 0f
+SYM_FUNC_END(crc32_pmull_le)
+
+SYM_FUNC_START(crc32c_pmull_le)
+ adr r3, .Lcrc32c_constants
+
+0: bic LEN, LEN, #15
+ vld1.8 {q1-q2}, [BUF, :128]!
+ vld1.8 {q3-q4}, [BUF, :128]!
+ vmov.i8 qzr, #0
+ vmov.i8 qCONSTANT, #0
+ vmov.32 dCONSTANTl[0], CRC
+ veor.8 d2, d2, dCONSTANTl
+ sub LEN, LEN, #0x40
+ cmp LEN, #0x40
+ blt less_64
+
+ vld1.64 {qCONSTANT}, [r3]
+
+loop_64: /* 64 bytes Full cache line folding */
+ sub LEN, LEN, #0x40
+
+ vmull.p64 q5, d3, dCONSTANTh
+ vmull.p64 q6, d5, dCONSTANTh
+ vmull.p64 q7, d7, dCONSTANTh
+ vmull.p64 q8, d9, dCONSTANTh
+
+ vmull.p64 q1, d2, dCONSTANTl
+ vmull.p64 q2, d4, dCONSTANTl
+ vmull.p64 q3, d6, dCONSTANTl
+ vmull.p64 q4, d8, dCONSTANTl
+
+ veor.8 q1, q1, q5
+ vld1.8 {q5}, [BUF, :128]!
+ veor.8 q2, q2, q6
+ vld1.8 {q6}, [BUF, :128]!
+ veor.8 q3, q3, q7
+ vld1.8 {q7}, [BUF, :128]!
+ veor.8 q4, q4, q8
+ vld1.8 {q8}, [BUF, :128]!
+
+ veor.8 q1, q1, q5
+ veor.8 q2, q2, q6
+ veor.8 q3, q3, q7
+ veor.8 q4, q4, q8
+
+ cmp LEN, #0x40
+ bge loop_64
+
+less_64: /* Folding cache line into 128bit */
+ vldr dCONSTANTl, [r3, #16]
+ vldr dCONSTANTh, [r3, #24]
+
+ vmull.p64 q5, d3, dCONSTANTh
+ vmull.p64 q1, d2, dCONSTANTl
+ veor.8 q1, q1, q5
+ veor.8 q1, q1, q2
+
+ vmull.p64 q5, d3, dCONSTANTh
+ vmull.p64 q1, d2, dCONSTANTl
+ veor.8 q1, q1, q5
+ veor.8 q1, q1, q3
+
+ vmull.p64 q5, d3, dCONSTANTh
+ vmull.p64 q1, d2, dCONSTANTl
+ veor.8 q1, q1, q5
+ veor.8 q1, q1, q4
+
+ teq LEN, #0
+ beq fold_64
+
+loop_16: /* Folding rest buffer into 128bit */
+ subs LEN, LEN, #0x10
+
+ vld1.8 {q2}, [BUF, :128]!
+ vmull.p64 q5, d3, dCONSTANTh
+ vmull.p64 q1, d2, dCONSTANTl
+ veor.8 q1, q1, q5
+ veor.8 q1, q1, q2
+
+ bne loop_16
+
+fold_64:
+ /* perform the last 64 bit fold, also adds 32 zeroes
+ * to the input stream */
+ vmull.p64 q2, d2, dCONSTANTh
+ vext.8 q1, q1, qzr, #8
+ veor.8 q1, q1, q2
+
+ /* final 32-bit fold */
+ vldr dCONSTANTl, [r3, #32]
+ vldr d6, [r3, #40]
+ vmov.i8 d7, #0
+
+ vext.8 q2, q1, qzr, #4
+ vand.8 d2, d2, d6
+ vmull.p64 q1, d2, dCONSTANTl
+ veor.8 q1, q1, q2
+
+ /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
+ vldr dCONSTANTl, [r3, #48]
+ vldr dCONSTANTh, [r3, #56]
+
+ vand.8 q2, q1, q3
+ vext.8 q2, qzr, q2, #8
+ vmull.p64 q2, d5, dCONSTANTh
+ vand.8 q2, q2, q3
+ vmull.p64 q2, d4, dCONSTANTl
+ veor.8 q1, q1, q2
+ vmov r0, s5
+
+ bx lr
+SYM_FUNC_END(crc32c_pmull_le)
+
+ .macro __crc32, c
+ subs ip, r2, #8
+ bmi .Ltail\c
+
+ tst r1, #3
+ bne .Lunaligned\c
+
+ teq ip, #0
+.Laligned8\c:
+ ldrd r2, r3, [r1], #8
+ARM_BE8(rev r2, r2 )
+ARM_BE8(rev r3, r3 )
+ crc32\c\()w r0, r0, r2
+ crc32\c\()w r0, r0, r3
+ bxeq lr
+ subs ip, ip, #8
+ bpl .Laligned8\c
+
+.Ltail\c:
+ tst ip, #4
+ beq 2f
+ ldr r3, [r1], #4
+ARM_BE8(rev r3, r3 )
+ crc32\c\()w r0, r0, r3
+
+2: tst ip, #2
+ beq 1f
+ ldrh r3, [r1], #2
+ARM_BE8(rev16 r3, r3 )
+ crc32\c\()h r0, r0, r3
+
+1: tst ip, #1
+ bxeq lr
+ ldrb r3, [r1]
+ crc32\c\()b r0, r0, r3
+ bx lr
+
+.Lunaligned\c:
+ tst r1, #1
+ beq 2f
+ ldrb r3, [r1], #1
+ subs r2, r2, #1
+ crc32\c\()b r0, r0, r3
+
+ tst r1, #2
+ beq 0f
+2: ldrh r3, [r1], #2
+ subs r2, r2, #2
+ARM_BE8(rev16 r3, r3 )
+ crc32\c\()h r0, r0, r3
+
+0: subs ip, r2, #8
+ bpl .Laligned8\c
+ b .Ltail\c
+ .endm
+
+ .align 5
+SYM_FUNC_START(crc32_armv8_le)
+ __crc32
+SYM_FUNC_END(crc32_armv8_le)
+
+ .align 5
+SYM_FUNC_START(crc32c_armv8_le)
+ __crc32 c
+SYM_FUNC_END(crc32c_armv8_le)
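
The constants annotated above are all values of the form x^n mod P(x),
bit-reversed and shifted left by one for the little-endian forms. A sketch of
the core x^n mod P(x) computation for the CRC32 polynomial (the reflection
and shift steps that produce the tabulated values are omitted here):

/*
 * Compute x^n mod P(x) over GF(2) for P(x) = x^32 + 0x04c11db7 by
 * repeated multiply-by-x with reduction; e.g. n = 32 yields 0x04c11db7.
 * Userspace sketch for deriving fold constants, not kernel code.
 */
#include <stdint.h>

static uint32_t xn_mod_p(unsigned int n)
{
	uint32_t r = 1;	/* x^0 */

	while (n--)
		r = (r << 1) ^ ((r & 0x80000000) ? 0x04c11db7 : 0);
	return r;
}
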
diff --git a/lib/crc/arm/crc32.h b/lib/crc/arm/crc32.h
new file mode 100644
index 000000000000..018007e162a2
--- /dev/null
+++ b/lib/crc/arm/crc32.h
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/cpufeature.h>
+
+#include <crypto/internal/simd.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
+
+#define PMULL_MIN_LEN 64 /* min size of buffer for pmull functions */
+
+asmlinkage u32 crc32_pmull_le(const u8 buf[], u32 len, u32 init_crc);
+asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], u32 len);
+
+asmlinkage u32 crc32c_pmull_le(const u8 buf[], u32 len, u32 init_crc);
+asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], u32 len);
+
+static inline u32 crc32_le_scalar(u32 crc, const u8 *p, size_t len)
+{
+ if (static_branch_likely(&have_crc32))
+ return crc32_armv8_le(crc, p, len);
+ return crc32_le_base(crc, p, len);
+}
+
+static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (len >= PMULL_MIN_LEN + 15 &&
+ static_branch_likely(&have_pmull) && crypto_simd_usable()) {
+ size_t n = -(uintptr_t)p & 15;
+
+ /* align p to 16-byte boundary */
+ if (n) {
+ crc = crc32_le_scalar(crc, p, n);
+ p += n;
+ len -= n;
+ }
+ n = round_down(len, 16);
+ kernel_neon_begin();
+ crc = crc32_pmull_le(p, n, crc);
+ kernel_neon_end();
+ p += n;
+ len -= n;
+ }
+ return crc32_le_scalar(crc, p, len);
+}
+
+static inline u32 crc32c_scalar(u32 crc, const u8 *p, size_t len)
+{
+ if (static_branch_likely(&have_crc32))
+ return crc32c_armv8_le(crc, p, len);
+ return crc32c_base(crc, p, len);
+}
+
+static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (len >= PMULL_MIN_LEN + 15 &&
+ static_branch_likely(&have_pmull) && crypto_simd_usable()) {
+ size_t n = -(uintptr_t)p & 15;
+
+ /* align p to 16-byte boundary */
+ if (n) {
+ crc = crc32c_scalar(crc, p, n);
+ p += n;
+ len -= n;
+ }
+ n = round_down(len, 16);
+ kernel_neon_begin();
+ crc = crc32c_pmull_le(p, n, crc);
+ kernel_neon_end();
+ p += n;
+ len -= n;
+ }
+ return crc32c_scalar(crc, p, len);
+}
+
+#define crc32_be_arch crc32_be_base /* not implemented on this arch */
+
+#define crc32_mod_init_arch crc32_mod_init_arch
+static inline void crc32_mod_init_arch(void)
+{
+ if (elf_hwcap2 & HWCAP2_CRC32)
+ static_branch_enable(&have_crc32);
+ if (elf_hwcap2 & HWCAP2_PMULL)
+ static_branch_enable(&have_pmull);
+}
+
+static inline u32 crc32_optimizations_arch(void)
+{
+ if (elf_hwcap2 & (HWCAP2_CRC32 | HWCAP2_PMULL))
+ return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION;
+ return 0;
+}
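
Both dispatch routines above share the same head/bulk/tail pattern: a scalar
head brings the pointer to a 16-byte boundary, the NEON body consumes whole
16-byte blocks, and a scalar tail finishes the remainder. The alignment
expression is a standard idiom, shown standalone below for illustration:

/* Bytes needed to advance p to the next 16-byte boundary (0 if aligned). */
#include <stddef.h>
#include <stdint.h>

static size_t bytes_to_align16(const void *p)
{
	return -(uintptr_t)p & 15;
}
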
diff --git a/lib/crc/arm64/crc-t10dif-core.S b/lib/crc/arm64/crc-t10dif-core.S
new file mode 100644
index 000000000000..87dd6d46224d
--- /dev/null
+++ b/lib/crc/arm64/crc-t10dif-core.S
@@ -0,0 +1,469 @@
+//
+// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
+//
+// Copyright (C) 2016 Linaro Ltd
+// Copyright (C) 2019-2024 Google LLC
+//
+// Authors: Ard Biesheuvel <ardb@google.com>
+// Eric Biggers <ebiggers@google.com>
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License version 2 as
+// published by the Free Software Foundation.
+//
+
+// Derived from the x86 version:
+//
+// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+//
+// Copyright (c) 2013, Intel Corporation
+//
+// Authors:
+// Erdinc Ozturk <erdinc.ozturk@intel.com>
+// Vinodh Gopal <vinodh.gopal@intel.com>
+// James Guilford <james.guilford@intel.com>
+// Tim Chen <tim.c.chen@linux.intel.com>
+//
+// This software is available to you under a choice of one of two
+// licenses. You may choose to be licensed under the terms of the GNU
+// General Public License (GPL) Version 2, available from the file
+// COPYING in the main directory of this source tree, or the
+// OpenIB.org BSD license below:
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the
+// distribution.
+//
+// * Neither the name of the Intel Corporation nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+//
+// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Reference paper titled "Fast CRC Computation for Generic
+// Polynomials Using PCLMULQDQ Instruction"
+// URL: http://www.intel.com/content/dam/www/public/us/en/documents
+// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+//
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+ .arch armv8-a+crypto
+
+ init_crc .req w0
+ buf .req x1
+ len .req x2
+ fold_consts_ptr .req x5
+
+ fold_consts .req v10
+
+ t3 .req v17
+ t4 .req v18
+ t5 .req v19
+ t6 .req v20
+ t7 .req v21
+ t8 .req v22
+
+ perm .req v27
+
+ .macro pmull16x64_p64, a16, b64, c64
+ pmull2 \c64\().1q, \a16\().2d, \b64\().2d
+ pmull \b64\().1q, \a16\().1d, \b64\().1d
+ .endm
+
+ /*
+ * Pairwise long polynomial multiplication of two 16-bit values
+ *
+ * { w0, w1 }, { y0, y1 }
+ *
+ * by two 64-bit values
+ *
+ * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+ *
+ * where each vector element is a byte, ordered from least to most
+ * significant.
+ *
+ * This can be implemented using 8x8 long polynomial multiplication, by
+ * reorganizing the input so that each pairwise 8x8 multiplication
+ * produces one of the terms from the decomposition below, and
+ * combining the results of each rank and shifting them into place.
+ *
+ * Rank
+ * 0 w0*x0 ^ | y0*z0 ^
+ * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^
+ * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^
+ * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^
+ * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^
+ * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^
+ * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^
+ * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^
+ * 8 w1*x7 << 64 | y1*z7 << 64
+ *
+ * The inputs can be reorganized into
+ *
+ * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+ * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+ *
+ * and after performing 8x8->16 bit long polynomial multiplication of
+ * each of the halves of the first vector with those of the second one,
+ * we obtain the following four vectors of 16-bit elements:
+ *
+ * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
+ * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
+ * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
+ * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
+ *
+ * Results b and c can be XORed together, as the vector elements have
+ * matching ranks. Then, the final XOR (*) can be pulled forward, and
+ * applied between the halves of each of the remaining three vectors,
+ * which are then shifted into place, and combined to produce two
+ * 80-bit results.
+ *
+ * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
+ * to the 64x64 bit one above, but XOR'ing the outputs together will
+ * produce the expected result, and this is sufficient in the context of
+ * this algorithm.
+ */
+ .macro pmull16x64_p8, a16, b64, c64
+ ext t7.16b, \b64\().16b, \b64\().16b, #1
+ tbl t5.16b, {\a16\().16b}, perm.16b
+ uzp1 t7.16b, \b64\().16b, t7.16b
+ bl __pmull_p8_16x64
+ ext \b64\().16b, t4.16b, t4.16b, #15
+ eor \c64\().16b, t8.16b, t5.16b
+ .endm
+
+SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
+ ext t6.16b, t5.16b, t5.16b, #8
+
+ pmull t3.8h, t7.8b, t5.8b
+ pmull t4.8h, t7.8b, t6.8b
+ pmull2 t5.8h, t7.16b, t5.16b
+ pmull2 t6.8h, t7.16b, t6.16b
+
+ ext t8.16b, t3.16b, t3.16b, #8
+ eor t4.16b, t4.16b, t6.16b
+ ext t7.16b, t5.16b, t5.16b, #8
+ ext t6.16b, t4.16b, t4.16b, #8
+ eor t8.8b, t8.8b, t3.8b
+ eor t5.8b, t5.8b, t7.8b
+ eor t4.8b, t4.8b, t6.8b
+ ext t5.16b, t5.16b, t5.16b, #14
+ ret
+SYM_FUNC_END(__pmull_p8_16x64)
+
+
+ // Fold reg1, reg2 into the next 32 data bytes, storing the result back
+ // into reg1, reg2.
+ .macro fold_32_bytes, p, reg1, reg2
+ ldp q11, q12, [buf], #0x20
+
+ pmull16x64_\p fold_consts, \reg1, v8
+
+CPU_LE( rev64 v11.16b, v11.16b )
+CPU_LE( rev64 v12.16b, v12.16b )
+
+ pmull16x64_\p fold_consts, \reg2, v9
+
+CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
+CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
+
+ eor \reg1\().16b, \reg1\().16b, v8.16b
+ eor \reg2\().16b, \reg2\().16b, v9.16b
+ eor \reg1\().16b, \reg1\().16b, v11.16b
+ eor \reg2\().16b, \reg2\().16b, v12.16b
+ .endm
+
+ // Fold src_reg into dst_reg, optionally loading the next fold constants
+ .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts
+ pmull16x64_\p fold_consts, \src_reg, v8
+ .ifnb \load_next_consts
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+ .endif
+ eor \dst_reg\().16b, \dst_reg\().16b, v8.16b
+ eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
+ .endm
+
+ .macro crc_t10dif_pmull, p
+
+ // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+ cmp len, #256
+ b.lt .Lless_than_256_bytes_\@
+
+ adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts
+
+ // Load the first 128 data bytes. Byte swapping is necessary to make
+ // the bit order match the polynomial coefficient order.
+ ldp q0, q1, [buf]
+ ldp q2, q3, [buf, #0x20]
+ ldp q4, q5, [buf, #0x40]
+ ldp q6, q7, [buf, #0x60]
+ add buf, buf, #0x80
+CPU_LE( rev64 v0.16b, v0.16b )
+CPU_LE( rev64 v1.16b, v1.16b )
+CPU_LE( rev64 v2.16b, v2.16b )
+CPU_LE( rev64 v3.16b, v3.16b )
+CPU_LE( rev64 v4.16b, v4.16b )
+CPU_LE( rev64 v5.16b, v5.16b )
+CPU_LE( rev64 v6.16b, v6.16b )
+CPU_LE( rev64 v7.16b, v7.16b )
+CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
+CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
+CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
+CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
+CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
+CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
+CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
+CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
+
+ // XOR the first 16 data *bits* with the initial CRC value.
+ movi v8.16b, #0
+ mov v8.h[7], init_crc
+ eor v0.16b, v0.16b, v8.16b
+
+ // Load the constants for folding across 128 bytes.
+ ld1 {fold_consts.2d}, [fold_consts_ptr]
+
+ // Subtract 128 for the 128 data bytes just consumed. Subtract another
+ // 128 to simplify the termination condition of the following loop.
+ sub len, len, #256
+
+ // While >= 128 data bytes remain (not counting v0-v7), fold the 128
+ // bytes v0-v7 into them, storing the result back into v0-v7.
+.Lfold_128_bytes_loop_\@:
+ fold_32_bytes \p, v0, v1
+ fold_32_bytes \p, v2, v3
+ fold_32_bytes \p, v4, v5
+ fold_32_bytes \p, v6, v7
+
+ subs len, len, #128
+ b.ge .Lfold_128_bytes_loop_\@
+
+ // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
+
+ // Fold across 64 bytes.
+ add fold_consts_ptr, fold_consts_ptr, #16
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+ fold_16_bytes \p, v0, v4
+ fold_16_bytes \p, v1, v5
+ fold_16_bytes \p, v2, v6
+ fold_16_bytes \p, v3, v7, 1
+ // Fold across 32 bytes.
+ fold_16_bytes \p, v4, v6
+ fold_16_bytes \p, v5, v7, 1
+ // Fold across 16 bytes.
+ fold_16_bytes \p, v6, v7
+
+ // Add 128 to get the correct number of data bytes remaining in 0...127
+ // (not counting v7), following the previous extra subtraction by 128.
+ // Then subtract 16 to simplify the termination condition of the
+ // following loop.
+ adds len, len, #(128-16)
+
+ // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
+ // into them, storing the result back into v7.
+ b.lt .Lfold_16_bytes_loop_done_\@
+.Lfold_16_bytes_loop_\@:
+ pmull16x64_\p fold_consts, v7, v8
+ eor v7.16b, v7.16b, v8.16b
+ ldr q0, [buf], #16
+CPU_LE( rev64 v0.16b, v0.16b )
+CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
+ eor v7.16b, v7.16b, v0.16b
+ subs len, len, #16
+ b.ge .Lfold_16_bytes_loop_\@
+
+.Lfold_16_bytes_loop_done_\@:
+ // Add 16 to get the correct number of data bytes remaining in 0...15
+ // (not counting v7), following the previous extra subtraction by 16.
+ adds len, len, #16
+ b.eq .Lreduce_final_16_bytes_\@
+
+.Lhandle_partial_segment_\@:
+ // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
+ // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To
+ // do this without needing a fold constant for each possible 'len',
+ // redivide the bytes into a first chunk of 'len' bytes and a second
+ // chunk of 16 bytes, then fold the first chunk into the second.
+
+ // v0 = last 16 original data bytes
+ add buf, buf, len
+ ldr q0, [buf, #-16]
+CPU_LE( rev64 v0.16b, v0.16b )
+CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
+
+ // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
+ adr_l x4, .Lbyteshift_table + 16
+ sub x4, x4, len
+ ld1 {v2.16b}, [x4]
+ tbl v1.16b, {v7.16b}, v2.16b
+
+ // v3 = first chunk: v7 right-shifted by '16-len' bytes.
+ movi v3.16b, #0x80
+ eor v2.16b, v2.16b, v3.16b
+ tbl v3.16b, {v7.16b}, v2.16b
+
+ // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
+ sshr v2.16b, v2.16b, #7
+
+ // v2 = second chunk: 'len' bytes from v0 (low-order bytes),
+ // then '16-len' bytes from v1 (high-order bytes).
+ bsl v2.16b, v1.16b, v0.16b
+
+ // Fold the first chunk into the second chunk, storing the result in v7.
+ pmull16x64_\p fold_consts, v3, v0
+ eor v7.16b, v3.16b, v0.16b
+ eor v7.16b, v7.16b, v2.16b
+ b .Lreduce_final_16_bytes_\@
+
+.Lless_than_256_bytes_\@:
+ // Checksumming a buffer of length 16...255 bytes
+
+ adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts
+
+ // Load the first 16 data bytes.
+ ldr q7, [buf], #0x10
+CPU_LE( rev64 v7.16b, v7.16b )
+CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
+
+ // XOR the first 16 data *bits* with the initial CRC value.
+ movi v0.16b, #0
+ mov v0.h[7], init_crc
+ eor v7.16b, v7.16b, v0.16b
+
+ // Load the fold-across-16-bytes constants.
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+
+ cmp len, #16
+ b.eq .Lreduce_final_16_bytes_\@ // len == 16
+ subs len, len, #32
+ b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255
+ add len, len, #16
+ b .Lhandle_partial_segment_\@ // 17 <= len <= 31
+
+.Lreduce_final_16_bytes_\@:
+ .endm
+
+//
+// void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
+//                          u8 out[16]);
+//
+// Assumes len >= 16.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p8)
+ frame_push 1
+
+ // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
+ movi perm.4h, #8, lsl #8
+ orr perm.2s, #1, lsl #16
+ orr perm.2s, #1, lsl #24
+ zip1 perm.16b, perm.16b, perm.16b
+ zip1 perm.16b, perm.16b, perm.16b
+
+ crc_t10dif_pmull p8
+
+CPU_LE( rev64 v7.16b, v7.16b )
+CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
+ str q7, [x3]
+
+ frame_pop
+ ret
+SYM_FUNC_END(crc_t10dif_pmull_p8)
+
+ .align 5
+//
+// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p64)
+ crc_t10dif_pmull p64
+
+ // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
+
+ movi v2.16b, #0 // init zero register
+
+ // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+
+ // Fold the high 64 bits into the low 64 bits, while also multiplying by
+ // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+ // whose low 48 bits are 0.
+ ext v0.16b, v2.16b, v7.16b, #8
+ pmull2 v7.1q, v7.2d, fold_consts.2d // high bits * x^48 * (x^80 mod G(x))
+ eor v0.16b, v0.16b, v7.16b // + low bits * x^64
+
+ // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+ // value congruent to x^64 * M(x) and whose low 48 bits are 0.
+ ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
+ mov v0.s[3], v2.s[0] // zero high 32 bits
+ pmull v1.1q, v1.1d, fold_consts.1d // high 32 bits * x^48 * (x^48 mod G(x))
+ eor v0.16b, v0.16b, v1.16b // + low bits
+
+ // Load G(x) and floor(x^48 / G(x)).
+ ld1 {fold_consts.2d}, [fold_consts_ptr]
+
+ // Use Barrett reduction to compute the final CRC value.
+ pmull2 v1.1q, v0.2d, fold_consts.2d // high 32 bits * floor(x^48 / G(x))
+ ushr v1.2d, v1.2d, #32 // /= x^32
+ pmull v1.1q, v1.1d, fold_consts.1d // *= G(x)
+ ushr v0.2d, v0.2d, #48
+ eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
+ // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+ umov w0, v0.h[0]
+ ret
+SYM_FUNC_END(crc_t10dif_pmull_p64)
+
+ .section ".rodata", "a"
+ .align 4
+
+// Fold constants precomputed from the polynomial 0x18bb7
+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+ .quad 0x0000000000006123 // x^(8*128) mod G(x)
+ .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
+// .Lfold_across_64_bytes_consts:
+ .quad 0x0000000000001069 // x^(4*128) mod G(x)
+ .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
+// .Lfold_across_32_bytes_consts:
+ .quad 0x000000000000857d // x^(2*128) mod G(x)
+ .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
+.Lfold_across_16_bytes_consts:
+ .quad 0x000000000000a010 // x^(1*128) mod G(x)
+ .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
+// .Lfinal_fold_consts:
+ .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
+ .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
+// .Lbarrett_reduction_consts:
+ .quad 0x0000000000018bb7 // G(x)
+ .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
+
+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
+ .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
+ .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0x0
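
The fold constants above are all derived from G(x) = 0x18bb7, so any
implementation folded with them must agree with a plain bit-at-a-time
computation. A userspace reference sketch, including the standard check
value (the kernel's generic fallback is table-driven instead):

/*
 * Bit-at-a-time reference for CRC-T10DIF: G(x) = 0x18bb7, MSB-first,
 * initial value 0. For cross-checking only, not kernel code.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= (uint16_t)(*p++) << 8;
		for (int i = 0; i < 8; i++)
			crc = (crc << 1) ^ ((crc & 0x8000) ? 0x8bb7 : 0);
	}
	return crc;
}

int main(void)
{
	/* Standard check value for the ASCII string "123456789". */
	assert(crc_t10dif_ref(0, (const uint8_t *)"123456789", 9) == 0xd0db);
	return 0;
}
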
diff --git a/lib/crc/arm64/crc-t10dif.h b/lib/crc/arm64/crc-t10dif.h
new file mode 100644
index 000000000000..c4521a7f1ee9
--- /dev/null
+++ b/lib/crc/arm64/crc-t10dif.h
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/cpufeature.h>
+
+#include <crypto/internal/simd.h>
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_asimd);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
+
+#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
+
+asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
+ u8 out[16]);
+asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
+
+static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
+{
+ if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {
+ if (static_branch_likely(&have_pmull)) {
+ if (crypto_simd_usable()) {
+ kernel_neon_begin();
+ crc = crc_t10dif_pmull_p64(crc, data, length);
+ kernel_neon_end();
+ return crc;
+ }
+ } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
+ static_branch_likely(&have_asimd) &&
+ crypto_simd_usable()) {
+ u8 buf[16];
+
+ kernel_neon_begin();
+ crc_t10dif_pmull_p8(crc, data, length, buf);
+ kernel_neon_end();
+
+ return crc_t10dif_generic(0, buf, sizeof(buf));
+ }
+ }
+ return crc_t10dif_generic(crc, data, length);
+}
+
+#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch
+static inline void crc_t10dif_mod_init_arch(void)
+{
+ if (cpu_have_named_feature(ASIMD)) {
+ static_branch_enable(&have_asimd);
+ if (cpu_have_named_feature(PMULL))
+ static_branch_enable(&have_pmull);
+ }
+}
diff --git a/lib/crc/arm64/crc32-core.S b/lib/crc/arm64/crc32-core.S
new file mode 100644
index 000000000000..68825317460f
--- /dev/null
+++ b/lib/crc/arm64/crc32-core.S
@@ -0,0 +1,362 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Accelerated CRC32(C) using AArch64 CRC and PMULL instructions
+ *
+ * Copyright (C) 2016 - 2018 Linaro Ltd.
+ * Copyright (C) 2024 Google LLC
+ *
+ * Author: Ard Biesheuvel <ardb@kernel.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .cpu generic+crc+crypto
+
+ .macro bitle, reg
+ .endm
+
+ .macro bitbe, reg
+ rbit \reg, \reg
+ .endm
+
+ .macro bytele, reg
+ .endm
+
+ .macro bytebe, reg
+ rbit \reg, \reg
+ lsr \reg, \reg, #24
+ .endm
+
+ .macro hwordle, reg
+CPU_BE( rev16 \reg, \reg )
+ .endm
+
+ .macro hwordbe, reg
+CPU_LE( rev \reg, \reg )
+ rbit \reg, \reg
+CPU_BE( lsr \reg, \reg, #16 )
+ .endm
+
+ .macro le, regs:vararg
+ .irp r, \regs
+CPU_BE( rev \r, \r )
+ .endr
+ .endm
+
+ .macro be, regs:vararg
+ .irp r, \regs
+CPU_LE( rev \r, \r )
+ .endr
+ .irp r, \regs
+ rbit \r, \r
+ .endr
+ .endm
+
+ .macro __crc32, c, order=le
+ bit\order w0
+ cmp x2, #16
+ b.lt 8f // less than 16 bytes
+
+ and x7, x2, #0x1f
+ and x2, x2, #~0x1f
+ cbz x7, 32f // multiple of 32 bytes
+
+ and x8, x7, #0xf
+ ldp x3, x4, [x1]
+ add x8, x8, x1
+ add x1, x1, x7
+ ldp x5, x6, [x8]
+ \order x3, x4, x5, x6
+
+ tst x7, #8
+ crc32\c\()x w8, w0, x3
+ csel x3, x3, x4, eq
+ csel w0, w0, w8, eq
+ tst x7, #4
+ lsr x4, x3, #32
+ crc32\c\()w w8, w0, w3
+ csel x3, x3, x4, eq
+ csel w0, w0, w8, eq
+ tst x7, #2
+ lsr w4, w3, #16
+ crc32\c\()h w8, w0, w3
+ csel w3, w3, w4, eq
+ csel w0, w0, w8, eq
+ tst x7, #1
+ crc32\c\()b w8, w0, w3
+ csel w0, w0, w8, eq
+ tst x7, #16
+ crc32\c\()x w8, w0, x5
+ crc32\c\()x w8, w8, x6
+ csel w0, w0, w8, eq
+ cbz x2, 0f
+
+32: ldp x3, x4, [x1], #32
+ sub x2, x2, #32
+ ldp x5, x6, [x1, #-16]
+ \order x3, x4, x5, x6
+ crc32\c\()x w0, w0, x3
+ crc32\c\()x w0, w0, x4
+ crc32\c\()x w0, w0, x5
+ crc32\c\()x w0, w0, x6
+ cbnz x2, 32b
+0: bit\order w0
+ ret
+
+8: tbz x2, #3, 4f
+ ldr x3, [x1], #8
+ \order x3
+ crc32\c\()x w0, w0, x3
+4: tbz x2, #2, 2f
+ ldr w3, [x1], #4
+ \order w3
+ crc32\c\()w w0, w0, w3
+2: tbz x2, #1, 1f
+ ldrh w3, [x1], #2
+ hword\order w3
+ crc32\c\()h w0, w0, w3
+1: tbz x2, #0, 0f
+ ldrb w3, [x1]
+ byte\order w3
+ crc32\c\()b w0, w0, w3
+0: bit\order w0
+ ret
+ .endm
+
+ .align 5
+SYM_FUNC_START(crc32_le_arm64)
+ __crc32
+SYM_FUNC_END(crc32_le_arm64)
+
+ .align 5
+SYM_FUNC_START(crc32c_le_arm64)
+ __crc32 c
+SYM_FUNC_END(crc32c_le_arm64)
+
+ .align 5
+SYM_FUNC_START(crc32_be_arm64)
+ __crc32 order=be
+SYM_FUNC_END(crc32_be_arm64)
+
+ in .req x1
+ len .req x2
+
+ /*
+ * w0: input CRC at entry, output CRC at exit
+ * x1: pointer to input buffer
+ * x2: length of input in bytes
+ */
+ .macro crc4way, insn, table, order=le
+ bit\order w0
+ lsr len, len, #6 // len := # of 64-byte blocks
+
+ /* Process up to 64 blocks of 64 bytes at a time */
+.La\@: mov x3, #64
+ cmp len, #64
+ csel x3, x3, len, hi // x3 := min(len, 64)
+ sub len, len, x3
+
+ /* Divide the input into 4 contiguous blocks */
+ add x4, x3, x3, lsl #1 // x4 := 3 * x3
+ add x7, in, x3, lsl #4 // x7 := in + 16 * x3
+ add x8, in, x3, lsl #5 // x8 := in + 32 * x3
+ add x9, in, x4, lsl #4 // x9 := in + 16 * x4
+
+ /* Load the folding coefficients from the lookup table */
+ adr_l x5, \table - 12 // entry 0 omitted
+ add x5, x5, x4, lsl #2 // x5 += 12 * x3
+ ldp s0, s1, [x5]
+ ldr s2, [x5, #8]
+
+ /* Zero init partial CRCs for this iteration */
+ mov w4, wzr
+ mov w5, wzr
+ mov w6, wzr
+ mov x17, xzr
+
+.Lb\@: sub x3, x3, #1
+ \insn w6, w6, x17
+ ldp x10, x11, [in], #16
+ ldp x12, x13, [x7], #16
+ ldp x14, x15, [x8], #16
+ ldp x16, x17, [x9], #16
+
+ \order x10, x11, x12, x13, x14, x15, x16, x17
+
+ /* Apply the CRC transform to 4 16-byte blocks in parallel */
+ \insn w0, w0, x10
+ \insn w4, w4, x12
+ \insn w5, w5, x14
+ \insn w6, w6, x16
+ \insn w0, w0, x11
+ \insn w4, w4, x13
+ \insn w5, w5, x15
+ cbnz x3, .Lb\@
+
+ /* Combine the 4 partial results into w0 */
+ mov v3.d[0], x0
+ mov v4.d[0], x4
+ mov v5.d[0], x5
+ pmull v0.1q, v0.1d, v3.1d
+ pmull v1.1q, v1.1d, v4.1d
+ pmull v2.1q, v2.1d, v5.1d
+ eor v0.8b, v0.8b, v1.8b
+ eor v0.8b, v0.8b, v2.8b
+ mov x5, v0.d[0]
+ eor x5, x5, x17
+ \insn w0, w6, x5
+
+ mov in, x9
+ cbnz len, .La\@
+
+ bit\order w0
+ ret
+ .endm
+
+ .align 5
+SYM_FUNC_START(crc32c_le_arm64_4way)
+ crc4way crc32cx, .L0
+SYM_FUNC_END(crc32c_le_arm64_4way)
+
+ .align 5
+SYM_FUNC_START(crc32_le_arm64_4way)
+ crc4way crc32x, .L1
+SYM_FUNC_END(crc32_le_arm64_4way)
+
+ .align 5
+SYM_FUNC_START(crc32_be_arm64_4way)
+ crc4way crc32x, .L1, be
+SYM_FUNC_END(crc32_be_arm64_4way)
+
+ .section .rodata, "a", %progbits
+ .align 6
+.L0: .long 0xddc0152b, 0xba4fc28e, 0x493c7d27
+ .long 0x0715ce53, 0x9e4addf8, 0xba4fc28e
+ .long 0xc96cfdc0, 0x0715ce53, 0xddc0152b
+ .long 0xab7aff2a, 0x0d3b6092, 0x9e4addf8
+ .long 0x299847d5, 0x878a92a7, 0x39d3b296
+ .long 0xb6dd949b, 0xab7aff2a, 0x0715ce53
+ .long 0xa60ce07b, 0x83348832, 0x47db8317
+ .long 0xd270f1a2, 0xb9e02b86, 0x0d3b6092
+ .long 0x65863b64, 0xb6dd949b, 0xc96cfdc0
+ .long 0xb3e32c28, 0xbac2fd7b, 0x878a92a7
+ .long 0xf285651c, 0xce7f39f4, 0xdaece73e
+ .long 0x271d9844, 0xd270f1a2, 0xab7aff2a
+ .long 0x6cb08e5c, 0x2b3cac5d, 0x2162d385
+ .long 0xcec3662e, 0x1b03397f, 0x83348832
+ .long 0x8227bb8a, 0xb3e32c28, 0x299847d5
+ .long 0xd7a4825c, 0xdd7e3b0c, 0xb9e02b86
+ .long 0xf6076544, 0x10746f3c, 0x18b33a4e
+ .long 0x98d8d9cb, 0x271d9844, 0xb6dd949b
+ .long 0x57a3d037, 0x93a5f730, 0x78d9ccb7
+ .long 0x3771e98f, 0x6b749fb2, 0xbac2fd7b
+ .long 0xe0ac139e, 0xcec3662e, 0xa60ce07b
+ .long 0x6f345e45, 0xe6fc4e6a, 0xce7f39f4
+ .long 0xa2b73df1, 0xb0cd4768, 0x61d82e56
+ .long 0x86d8e4d2, 0xd7a4825c, 0xd270f1a2
+ .long 0xa90fd27a, 0x0167d312, 0xc619809d
+ .long 0xca6ef3ac, 0x26f6a60a, 0x2b3cac5d
+ .long 0x4597456a, 0x98d8d9cb, 0x65863b64
+ .long 0xc9c8b782, 0x68bce87a, 0x1b03397f
+ .long 0x62ec6c6d, 0x6956fc3b, 0xebb883bd
+ .long 0x2342001e, 0x3771e98f, 0xb3e32c28
+ .long 0xe8b6368b, 0x2178513a, 0x064f7f26
+ .long 0x9ef68d35, 0x170076fa, 0xdd7e3b0c
+ .long 0x0b0bf8ca, 0x6f345e45, 0xf285651c
+ .long 0x02ee03b2, 0xff0dba97, 0x10746f3c
+ .long 0x135c83fd, 0xf872e54c, 0xc7a68855
+ .long 0x00bcf5f6, 0x86d8e4d2, 0x271d9844
+ .long 0x58ca5f00, 0x5bb8f1bc, 0x8e766a0c
+ .long 0xded288f8, 0xb3af077a, 0x93a5f730
+ .long 0x37170390, 0xca6ef3ac, 0x6cb08e5c
+ .long 0xf48642e9, 0xdd66cbbb, 0x6b749fb2
+ .long 0xb25b29f2, 0xe9e28eb4, 0x1393e203
+ .long 0x45cddf4e, 0xc9c8b782, 0xcec3662e
+ .long 0xdfd94fb2, 0x93e106a4, 0x96c515bb
+ .long 0x021ac5ef, 0xd813b325, 0xe6fc4e6a
+ .long 0x8e1450f7, 0x2342001e, 0x8227bb8a
+ .long 0xe0cdcf86, 0x6d9a4957, 0xb0cd4768
+ .long 0x613eee91, 0xd2c3ed1a, 0x39c7ff35
+ .long 0xbedc6ba1, 0x9ef68d35, 0xd7a4825c
+ .long 0x0cd1526a, 0xf2271e60, 0x0ab3844b
+ .long 0xd6c3a807, 0x2664fd8b, 0x0167d312
+ .long 0x1d31175f, 0x02ee03b2, 0xf6076544
+ .long 0x4be7fd90, 0x363bd6b3, 0x26f6a60a
+ .long 0x6eeed1c9, 0x5fabe670, 0xa741c1bf
+ .long 0xb3a6da94, 0x00bcf5f6, 0x98d8d9cb
+ .long 0x2e7d11a7, 0x17f27698, 0x49c3cc9c
+ .long 0x889774e1, 0xaa7c7ad5, 0x68bce87a
+ .long 0x8a074012, 0xded288f8, 0x57a3d037
+ .long 0xbd0bb25f, 0x6d390dec, 0x6956fc3b
+ .long 0x3be3c09b, 0x6353c1cc, 0x42d98888
+ .long 0x465a4eee, 0xf48642e9, 0x3771e98f
+ .long 0x2e5f3c8c, 0xdd35bc8d, 0xb42ae3d9
+ .long 0xa52f58ec, 0x9a5ede41, 0x2178513a
+ .long 0x47972100, 0x45cddf4e, 0xe0ac139e
+ .long 0x359674f7, 0xa51b6135, 0x170076fa
+
+.L1: .long 0xaf449247, 0x81256527, 0xccaa009e
+ .long 0x57c54819, 0x1d9513d7, 0x81256527
+ .long 0x3f41287a, 0x57c54819, 0xaf449247
+ .long 0xf5e48c85, 0x910eeec1, 0x1d9513d7
+ .long 0x1f0c2cdd, 0x9026d5b1, 0xae0b5394
+ .long 0x71d54a59, 0xf5e48c85, 0x57c54819
+ .long 0x1c63267b, 0xfe807bbd, 0x0cbec0ed
+ .long 0xd31343ea, 0xe95c1271, 0x910eeec1
+ .long 0xf9d9c7ee, 0x71d54a59, 0x3f41287a
+ .long 0x9ee62949, 0xcec97417, 0x9026d5b1
+ .long 0xa55d1514, 0xf183c71b, 0xd1df2327
+ .long 0x21aa2b26, 0xd31343ea, 0xf5e48c85
+ .long 0x9d842b80, 0xeea395c4, 0x3c656ced
+ .long 0xd8110ff1, 0xcd669a40, 0xfe807bbd
+ .long 0x3f9e9356, 0x9ee62949, 0x1f0c2cdd
+ .long 0x1d6708a0, 0x0c30f51d, 0xe95c1271
+ .long 0xef82aa68, 0xdb3935ea, 0xb918a347
+ .long 0xd14bcc9b, 0x21aa2b26, 0x71d54a59
+ .long 0x99cce860, 0x356d209f, 0xff6f2fc2
+ .long 0xd8af8e46, 0xc352f6de, 0xcec97417
+ .long 0xf1996890, 0xd8110ff1, 0x1c63267b
+ .long 0x631bc508, 0xe95c7216, 0xf183c71b
+ .long 0x8511c306, 0x8e031a19, 0x9b9bdbd0
+ .long 0xdb3839f3, 0x1d6708a0, 0xd31343ea
+ .long 0x7a92fffb, 0xf7003835, 0x4470ac44
+ .long 0x6ce68f2a, 0x00eba0c8, 0xeea395c4
+ .long 0x4caaa263, 0xd14bcc9b, 0xf9d9c7ee
+ .long 0xb46f7cff, 0x9a1b53c8, 0xcd669a40
+ .long 0x60290934, 0x81b6f443, 0x6d40f445
+ .long 0x8e976a7d, 0xd8af8e46, 0x9ee62949
+ .long 0xdcf5088a, 0x9dbdc100, 0x145575d5
+ .long 0x1753ab84, 0xbbf2f6d6, 0x0c30f51d
+ .long 0x255b139e, 0x631bc508, 0xa55d1514
+ .long 0xd784eaa8, 0xce26786c, 0xdb3935ea
+ .long 0x6d2c864a, 0x8068c345, 0x2586d334
+ .long 0x02072e24, 0xdb3839f3, 0x21aa2b26
+ .long 0x06689b0a, 0x5efd72f5, 0xe0575528
+ .long 0x1e52f5ea, 0x4117915b, 0x356d209f
+ .long 0x1d3d1db6, 0x6ce68f2a, 0x9d842b80
+ .long 0x3796455c, 0xb8e0e4a8, 0xc352f6de
+ .long 0xdf3a4eb3, 0xc55a2330, 0xb84ffa9c
+ .long 0x28ae0976, 0xb46f7cff, 0xd8110ff1
+ .long 0x9764bc8d, 0xd7e7a22c, 0x712510f0
+ .long 0x13a13e18, 0x3e9a43cd, 0xe95c7216
+ .long 0xb8ee242e, 0x8e976a7d, 0x3f9e9356
+ .long 0x0c540e7b, 0x753c81ff, 0x8e031a19
+ .long 0x9924c781, 0xb9220208, 0x3edcde65
+ .long 0x3954de39, 0x1753ab84, 0x1d6708a0
+ .long 0xf32238b5, 0xbec81497, 0x9e70b943
+ .long 0xbbd2cd2c, 0x0925d861, 0xf7003835
+ .long 0xcc401304, 0xd784eaa8, 0xef82aa68
+ .long 0x4987e684, 0x6044fbb0, 0x00eba0c8
+ .long 0x3aa11427, 0x18fe3b4a, 0x87441142
+ .long 0x297aad60, 0x02072e24, 0xd14bcc9b
+ .long 0xf60c5e51, 0x6ef6f487, 0x5b7fdd0a
+ .long 0x632d78c5, 0x3fc33de4, 0x9a1b53c8
+ .long 0x25b8822a, 0x1e52f5ea, 0x99cce860
+ .long 0xd4fc84bc, 0x1af62fb8, 0x81b6f443
+ .long 0x5690aa32, 0xa91fdefb, 0x688a110e
+ .long 0x1357a093, 0x3796455c, 0xd8af8e46
+ .long 0x798fdd33, 0xaaa18a37, 0x357b9517
+ .long 0xc2815395, 0x54d42691, 0x9dbdc100
+ .long 0x21cfc0f7, 0x28ae0976, 0xf1996890
+ .long 0xa0decef3, 0x7b4aa8b7, 0xbbf2f6d6
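
A hedged C model of the outer blocking in the crc4way macro above may help; this is an illustrative sketch (the names p and nblocks are hypothetical, not from this patch):

    /* Each outer pass consumes min(nblocks, 64) 64-byte blocks, split
     * into four contiguous streams at p, p + 16*n, p + 32*n and
     * p + 48*n, which are CRC'd in parallel and then folded together
     * with pmull using the coefficients from the lookup table. */
    static void crc4way_blocking_model(const u8 *p, size_t nblocks)
    {
            while (nblocks) {
                    size_t n = nblocks < 64 ? nblocks : 64; /* x3 := min(len, 64) */

                    /* ... 4-way CRC over the four streams, pmull combine ... */
                    p += 64 * n;
                    nblocks -= n;
            }
    }
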
diff --git a/lib/crc/arm64/crc32.h b/lib/crc/arm64/crc32.h
new file mode 100644
index 000000000000..6e5dec45f05d
--- /dev/null
+++ b/lib/crc/arm64/crc32.h
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <asm/alternative.h>
+#include <asm/cpufeature.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+#include <crypto/internal/simd.h>
+
+// The minimum input length to consider the 4-way interleaved code path
+static const size_t min_len = 1024;
+
+asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len);
+asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len);
+asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len);
+
+asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
+asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
+asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len);
+
+static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
+ return crc32_le_base(crc, p, len);
+
+ if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
+ kernel_neon_begin();
+ crc = crc32_le_arm64_4way(crc, p, len);
+ kernel_neon_end();
+
+ p += round_down(len, 64);
+ len %= 64;
+
+ if (!len)
+ return crc;
+ }
+
+ return crc32_le_arm64(crc, p, len);
+}
+
+static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
+ return crc32c_base(crc, p, len);
+
+ if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
+ kernel_neon_begin();
+ crc = crc32c_le_arm64_4way(crc, p, len);
+ kernel_neon_end();
+
+ p += round_down(len, 64);
+ len %= 64;
+
+ if (!len)
+ return crc;
+ }
+
+ return crc32c_le_arm64(crc, p, len);
+}
+
+static inline u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
+ return crc32_be_base(crc, p, len);
+
+ if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
+ kernel_neon_begin();
+ crc = crc32_be_arm64_4way(crc, p, len);
+ kernel_neon_end();
+
+ p += round_down(len, 64);
+ len %= 64;
+
+ if (!len)
+ return crc;
+ }
+
+ return crc32_be_arm64(crc, p, len);
+}
+
+static inline u32 crc32_optimizations_arch(void)
+{
+ if (alternative_has_cap_likely(ARM64_HAS_CRC32))
+ return CRC32_LE_OPTIMIZATION |
+ CRC32_BE_OPTIMIZATION |
+ CRC32C_OPTIMIZATION;
+ return 0;
+}
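
As a worked example of the tail handling above (numbers illustrative only): for len = 1500, the 4-way path consumes round_down(1500, 64) = 1472 bytes between kernel_neon_begin() and kernel_neon_end(), and the remaining 28 bytes fall through to the scalar CRC32 instructions via crc32_le_arm64().
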
diff --git a/lib/crc/crc-ccitt.c b/lib/crc/crc-ccitt.c
new file mode 100644
index 000000000000..f8692c3de101
--- /dev/null
+++ b/lib/crc/crc-ccitt.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/crc-ccitt.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+/*
+ * This mysterious table is just the CRC of each possible byte. It can be
+ * computed using the standard bit-at-a-time methods. The polynomial can
+ * be seen in entry 128, 0x8408. This corresponds to x^0 + x^5 + x^12.
+ * Add the implicit x^16, and you have the standard CRC-CCITT.
+ */
+u16 const crc_ccitt_table[256] = {
+ 0x0000, 0x1189, 0x2312, 0x329b, 0x4624, 0x57ad, 0x6536, 0x74bf,
+ 0x8c48, 0x9dc1, 0xaf5a, 0xbed3, 0xca6c, 0xdbe5, 0xe97e, 0xf8f7,
+ 0x1081, 0x0108, 0x3393, 0x221a, 0x56a5, 0x472c, 0x75b7, 0x643e,
+ 0x9cc9, 0x8d40, 0xbfdb, 0xae52, 0xdaed, 0xcb64, 0xf9ff, 0xe876,
+ 0x2102, 0x308b, 0x0210, 0x1399, 0x6726, 0x76af, 0x4434, 0x55bd,
+ 0xad4a, 0xbcc3, 0x8e58, 0x9fd1, 0xeb6e, 0xfae7, 0xc87c, 0xd9f5,
+ 0x3183, 0x200a, 0x1291, 0x0318, 0x77a7, 0x662e, 0x54b5, 0x453c,
+ 0xbdcb, 0xac42, 0x9ed9, 0x8f50, 0xfbef, 0xea66, 0xd8fd, 0xc974,
+ 0x4204, 0x538d, 0x6116, 0x709f, 0x0420, 0x15a9, 0x2732, 0x36bb,
+ 0xce4c, 0xdfc5, 0xed5e, 0xfcd7, 0x8868, 0x99e1, 0xab7a, 0xbaf3,
+ 0x5285, 0x430c, 0x7197, 0x601e, 0x14a1, 0x0528, 0x37b3, 0x263a,
+ 0xdecd, 0xcf44, 0xfddf, 0xec56, 0x98e9, 0x8960, 0xbbfb, 0xaa72,
+ 0x6306, 0x728f, 0x4014, 0x519d, 0x2522, 0x34ab, 0x0630, 0x17b9,
+ 0xef4e, 0xfec7, 0xcc5c, 0xddd5, 0xa96a, 0xb8e3, 0x8a78, 0x9bf1,
+ 0x7387, 0x620e, 0x5095, 0x411c, 0x35a3, 0x242a, 0x16b1, 0x0738,
+ 0xffcf, 0xee46, 0xdcdd, 0xcd54, 0xb9eb, 0xa862, 0x9af9, 0x8b70,
+ 0x8408, 0x9581, 0xa71a, 0xb693, 0xc22c, 0xd3a5, 0xe13e, 0xf0b7,
+ 0x0840, 0x19c9, 0x2b52, 0x3adb, 0x4e64, 0x5fed, 0x6d76, 0x7cff,
+ 0x9489, 0x8500, 0xb79b, 0xa612, 0xd2ad, 0xc324, 0xf1bf, 0xe036,
+ 0x18c1, 0x0948, 0x3bd3, 0x2a5a, 0x5ee5, 0x4f6c, 0x7df7, 0x6c7e,
+ 0xa50a, 0xb483, 0x8618, 0x9791, 0xe32e, 0xf2a7, 0xc03c, 0xd1b5,
+ 0x2942, 0x38cb, 0x0a50, 0x1bd9, 0x6f66, 0x7eef, 0x4c74, 0x5dfd,
+ 0xb58b, 0xa402, 0x9699, 0x8710, 0xf3af, 0xe226, 0xd0bd, 0xc134,
+ 0x39c3, 0x284a, 0x1ad1, 0x0b58, 0x7fe7, 0x6e6e, 0x5cf5, 0x4d7c,
+ 0xc60c, 0xd785, 0xe51e, 0xf497, 0x8028, 0x91a1, 0xa33a, 0xb2b3,
+ 0x4a44, 0x5bcd, 0x6956, 0x78df, 0x0c60, 0x1de9, 0x2f72, 0x3efb,
+ 0xd68d, 0xc704, 0xf59f, 0xe416, 0x90a9, 0x8120, 0xb3bb, 0xa232,
+ 0x5ac5, 0x4b4c, 0x79d7, 0x685e, 0x1ce1, 0x0d68, 0x3ff3, 0x2e7a,
+ 0xe70e, 0xf687, 0xc41c, 0xd595, 0xa12a, 0xb0a3, 0x8238, 0x93b1,
+ 0x6b46, 0x7acf, 0x4854, 0x59dd, 0x2d62, 0x3ceb, 0x0e70, 0x1ff9,
+ 0xf78f, 0xe606, 0xd49d, 0xc514, 0xb1ab, 0xa022, 0x92b9, 0x8330,
+ 0x7bc7, 0x6a4e, 0x58d5, 0x495c, 0x3de3, 0x2c6a, 0x1ef1, 0x0f78
+};
+EXPORT_SYMBOL(crc_ccitt_table);
+
+/**
+ * crc_ccitt - recompute the CRC (CRC-CCITT variant) for the data buffer
+ * @crc: previous CRC value
+ * @buffer: data pointer
+ * @len: number of bytes in the buffer
+ */
+u16 crc_ccitt(u16 crc, u8 const *buffer, size_t len)
+{
+ while (len--)
+ crc = crc_ccitt_byte(crc, *buffer++);
+ return crc;
+}
+EXPORT_SYMBOL(crc_ccitt);
+
+MODULE_DESCRIPTION("CRC-CCITT calculations");
+MODULE_LICENSE("GPL");
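
The claim in the table comment above can be checked bit-at-a-time; a hedged sketch (the helper name is hypothetical, not part of the patch) that reproduces any table entry from the reversed polynomial 0x8408:

    static u16 crc_ccitt_entry(u8 i)
    {
            u16 crc = i;
            int k;

            for (k = 0; k < 8; k++)
                    crc = (crc >> 1) ^ ((crc & 1) ? 0x8408 : 0);
            return crc;     /* crc_ccitt_entry(128) == 0x8408, as noted above */
    }
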
diff --git a/lib/crc/crc-itu-t.c b/lib/crc/crc-itu-t.c
new file mode 100644
index 000000000000..6e413a290f54
--- /dev/null
+++ b/lib/crc/crc-itu-t.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * crc-itu-t.c
+ */
+
+#include <linux/crc-itu-t.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+/* CRC table for the CRC ITU-T V.41 0x1021 (x^16 + x^12 + x^5 + 1) */
+const u16 crc_itu_t_table[256] = {
+ 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7,
+ 0x8108, 0x9129, 0xa14a, 0xb16b, 0xc18c, 0xd1ad, 0xe1ce, 0xf1ef,
+ 0x1231, 0x0210, 0x3273, 0x2252, 0x52b5, 0x4294, 0x72f7, 0x62d6,
+ 0x9339, 0x8318, 0xb37b, 0xa35a, 0xd3bd, 0xc39c, 0xf3ff, 0xe3de,
+ 0x2462, 0x3443, 0x0420, 0x1401, 0x64e6, 0x74c7, 0x44a4, 0x5485,
+ 0xa56a, 0xb54b, 0x8528, 0x9509, 0xe5ee, 0xf5cf, 0xc5ac, 0xd58d,
+ 0x3653, 0x2672, 0x1611, 0x0630, 0x76d7, 0x66f6, 0x5695, 0x46b4,
+ 0xb75b, 0xa77a, 0x9719, 0x8738, 0xf7df, 0xe7fe, 0xd79d, 0xc7bc,
+ 0x48c4, 0x58e5, 0x6886, 0x78a7, 0x0840, 0x1861, 0x2802, 0x3823,
+ 0xc9cc, 0xd9ed, 0xe98e, 0xf9af, 0x8948, 0x9969, 0xa90a, 0xb92b,
+ 0x5af5, 0x4ad4, 0x7ab7, 0x6a96, 0x1a71, 0x0a50, 0x3a33, 0x2a12,
+ 0xdbfd, 0xcbdc, 0xfbbf, 0xeb9e, 0x9b79, 0x8b58, 0xbb3b, 0xab1a,
+ 0x6ca6, 0x7c87, 0x4ce4, 0x5cc5, 0x2c22, 0x3c03, 0x0c60, 0x1c41,
+ 0xedae, 0xfd8f, 0xcdec, 0xddcd, 0xad2a, 0xbd0b, 0x8d68, 0x9d49,
+ 0x7e97, 0x6eb6, 0x5ed5, 0x4ef4, 0x3e13, 0x2e32, 0x1e51, 0x0e70,
+ 0xff9f, 0xefbe, 0xdfdd, 0xcffc, 0xbf1b, 0xaf3a, 0x9f59, 0x8f78,
+ 0x9188, 0x81a9, 0xb1ca, 0xa1eb, 0xd10c, 0xc12d, 0xf14e, 0xe16f,
+ 0x1080, 0x00a1, 0x30c2, 0x20e3, 0x5004, 0x4025, 0x7046, 0x6067,
+ 0x83b9, 0x9398, 0xa3fb, 0xb3da, 0xc33d, 0xd31c, 0xe37f, 0xf35e,
+ 0x02b1, 0x1290, 0x22f3, 0x32d2, 0x4235, 0x5214, 0x6277, 0x7256,
+ 0xb5ea, 0xa5cb, 0x95a8, 0x8589, 0xf56e, 0xe54f, 0xd52c, 0xc50d,
+ 0x34e2, 0x24c3, 0x14a0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405,
+ 0xa7db, 0xb7fa, 0x8799, 0x97b8, 0xe75f, 0xf77e, 0xc71d, 0xd73c,
+ 0x26d3, 0x36f2, 0x0691, 0x16b0, 0x6657, 0x7676, 0x4615, 0x5634,
+ 0xd94c, 0xc96d, 0xf90e, 0xe92f, 0x99c8, 0x89e9, 0xb98a, 0xa9ab,
+ 0x5844, 0x4865, 0x7806, 0x6827, 0x18c0, 0x08e1, 0x3882, 0x28a3,
+ 0xcb7d, 0xdb5c, 0xeb3f, 0xfb1e, 0x8bf9, 0x9bd8, 0xabbb, 0xbb9a,
+ 0x4a75, 0x5a54, 0x6a37, 0x7a16, 0x0af1, 0x1ad0, 0x2ab3, 0x3a92,
+ 0xfd2e, 0xed0f, 0xdd6c, 0xcd4d, 0xbdaa, 0xad8b, 0x9de8, 0x8dc9,
+ 0x7c26, 0x6c07, 0x5c64, 0x4c45, 0x3ca2, 0x2c83, 0x1ce0, 0x0cc1,
+ 0xef1f, 0xff3e, 0xcf5d, 0xdf7c, 0xaf9b, 0xbfba, 0x8fd9, 0x9ff8,
+ 0x6e17, 0x7e36, 0x4e55, 0x5e74, 0x2e93, 0x3eb2, 0x0ed1, 0x1ef0
+};
+
+EXPORT_SYMBOL(crc_itu_t_table);
+
+/**
+ * crc_itu_t - Compute the CRC-ITU-T for the data buffer
+ *
+ * @crc: previous CRC value
+ * @buffer: data pointer
+ * @len: number of bytes in the buffer
+ *
+ * Returns the updated CRC value
+ */
+u16 crc_itu_t(u16 crc, const u8 *buffer, size_t len)
+{
+ while (len--)
+ crc = crc_itu_t_byte(crc, *buffer++);
+ return crc;
+}
+EXPORT_SYMBOL(crc_itu_t);
+
+MODULE_DESCRIPTION("CRC ITU-T V.41 calculations");
+MODULE_LICENSE("GPL");
+
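
Two hedged sanity checks (the 0x31c3 figure is the published CRC-16/XMODEM check value from the public CRC catalogue, not something this file states; the selftest function is hypothetical):

    static void crc_itu_t_selftest(void)
    {
            /* entry 1 of the table above is the polynomial itself */
            BUG_ON(crc_itu_t_table[1] != 0x1021);
            BUG_ON(crc_itu_t(0, (const u8 *)"123456789", 9) != 0x31c3);
    }
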
diff --git a/lib/crc/crc-t10dif-main.c b/lib/crc/crc-t10dif-main.c
new file mode 100644
index 000000000000..08dde238e89f
--- /dev/null
+++ b/lib/crc/crc-t10dif-main.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * T10 Data Integrity Field CRC16 calculation
+ *
+ * Copyright (c) 2007 Oracle Corporation. All rights reserved.
+ * Written by Martin K. Petersen <martin.petersen@oracle.com>
+ */
+
+#include <linux/crc-t10dif.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+/*
+ * Table generated using the following polynomial:
+ * x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1
+ * generator polynomial in hex (x^16 term implicit): 0x8bb7
+ */
+static const u16 t10_dif_crc_table[256] = {
+ 0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B,
+ 0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6,
+ 0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6,
+ 0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B,
+ 0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1,
+ 0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C,
+ 0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C,
+ 0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781,
+ 0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8,
+ 0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255,
+ 0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925,
+ 0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698,
+ 0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472,
+ 0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF,
+ 0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF,
+ 0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02,
+ 0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA,
+ 0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067,
+ 0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17,
+ 0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA,
+ 0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640,
+ 0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD,
+ 0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D,
+ 0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30,
+ 0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759,
+ 0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4,
+ 0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394,
+ 0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29,
+ 0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3,
+ 0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E,
+ 0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E,
+ 0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3
+};
+
+static inline u16 __maybe_unused
+crc_t10dif_generic(u16 crc, const u8 *p, size_t len)
+{
+ while (len--)
+ crc = (crc << 8) ^ t10_dif_crc_table[(crc >> 8) ^ *p++];
+ return crc;
+}
+
+#ifdef CONFIG_CRC_T10DIF_ARCH
+#include "crc-t10dif.h" /* $(SRCARCH)/crc-t10dif.h */
+#else
+#define crc_t10dif_arch crc_t10dif_generic
+#endif
+
+u16 crc_t10dif_update(u16 crc, const u8 *p, size_t len)
+{
+ return crc_t10dif_arch(crc, p, len);
+}
+EXPORT_SYMBOL(crc_t10dif_update);
+
+#ifdef crc_t10dif_mod_init_arch
+static int __init crc_t10dif_mod_init(void)
+{
+ crc_t10dif_mod_init_arch();
+ return 0;
+}
+subsys_initcall(crc_t10dif_mod_init);
+
+static void __exit crc_t10dif_mod_exit(void)
+{
+}
+module_exit(crc_t10dif_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("CRC-T10DIF library functions");
+MODULE_LICENSE("GPL");
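
A hedged usage note (the expected value is the published CRC-16/T10-DIF check value from the public CRC catalogue, an assumption not stated in this patch):

    static void crc_t10dif_selftest(void)
    {
            /* seed with 0; "123456789" should give 0xd0db */
            BUG_ON(crc_t10dif_update(0, (const u8 *)"123456789", 9) != 0xd0db);
    }
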
diff --git a/lib/crc/crc16.c b/lib/crc/crc16.c
new file mode 100644
index 000000000000..931660a8cbaa
--- /dev/null
+++ b/lib/crc/crc16.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * crc16.c
+ */
+
+#include <linux/crc16.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+/* CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1) */
+static const u16 crc16_table[256] = {
+ 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
+ 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
+ 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
+ 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
+ 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
+ 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
+ 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
+ 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
+ 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
+ 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
+ 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
+ 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
+ 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
+ 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
+ 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
+ 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
+ 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
+ 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
+ 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
+ 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
+ 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
+ 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
+ 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
+ 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
+ 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
+ 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
+ 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
+ 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
+ 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
+ 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
+ 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
+ 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
+};
+
+/**
+ * crc16 - compute the CRC-16 for the data buffer
+ * @crc: previous CRC value
+ * @p: data pointer
+ * @len: number of bytes in the buffer
+ *
+ * Returns the updated CRC value.
+ */
+u16 crc16(u16 crc, const u8 *p, size_t len)
+{
+ while (len--)
+ crc = (crc >> 8) ^ crc16_table[(crc & 0xff) ^ *p++];
+ return crc;
+}
+EXPORT_SYMBOL(crc16);
+
+MODULE_DESCRIPTION("CRC16 calculations");
+MODULE_LICENSE("GPL");
+
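
A quick hedged sanity check against the public CRC catalogue (0xbb3d is the CRC-16/ARC check value, not something this file states; the function is hypothetical):

    static void crc16_selftest(void)
    {
            BUG_ON(crc16(0, (const u8 *)"123456789", 9) != 0xbb3d);
    }
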
diff --git a/lib/crc/crc32-main.c b/lib/crc/crc32-main.c
new file mode 100644
index 000000000000..fbb90c9006e5
--- /dev/null
+++ b/lib/crc/crc32-main.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
+ * cleaned up code to current version of sparse and added the slicing-by-8
+ * algorithm to the closely similar existing slicing-by-4 algorithm.
+ *
+ * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com>
+ * Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks!
+ * Code was from the public domain, copyright abandoned. Code was
+ * subsequently included in the kernel, thus was re-licensed under the
+ * GNU GPL v2.
+ *
+ * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com>
+ * Same crc32 function was used in 5 other places in the kernel.
+ * I made one version, and deleted the others.
+ * There are various incantations of crc32(). Some use a seed of 0 or ~0.
+ * Some xor at the end with ~0. The generic crc32() function takes
+ * seed as an argument, and doesn't xor at the end. Then individual
+ * users can do whatever they need.
+ * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0.
+ * fs/jffs2 uses seed 0, doesn't xor with ~0.
+ * fs/partitions/efi.c uses seed ~0, xor's with ~0.
+ */
+
+/* see: Documentation/staging/crc32.rst for a description of algorithms */
+
+#include <linux/crc32.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "crc32table.h"
+
+static inline u32 __maybe_unused
+crc32_le_base(u32 crc, const u8 *p, size_t len)
+{
+ while (len--)
+ crc = (crc >> 8) ^ crc32table_le[(crc & 255) ^ *p++];
+ return crc;
+}
+
+static inline u32 __maybe_unused
+crc32_be_base(u32 crc, const u8 *p, size_t len)
+{
+ while (len--)
+ crc = (crc << 8) ^ crc32table_be[(crc >> 24) ^ *p++];
+ return crc;
+}
+
+static inline u32 __maybe_unused
+crc32c_base(u32 crc, const u8 *p, size_t len)
+{
+ while (len--)
+ crc = (crc >> 8) ^ crc32ctable_le[(crc & 255) ^ *p++];
+ return crc;
+}
+
+#ifdef CONFIG_CRC32_ARCH
+#include "crc32.h" /* $(SRCARCH)/crc32.h */
+
+u32 crc32_optimizations(void)
+{
+ return crc32_optimizations_arch();
+}
+EXPORT_SYMBOL(crc32_optimizations);
+#else
+#define crc32_le_arch crc32_le_base
+#define crc32_be_arch crc32_be_base
+#define crc32c_arch crc32c_base
+#endif
+
+u32 crc32_le(u32 crc, const void *p, size_t len)
+{
+ return crc32_le_arch(crc, p, len);
+}
+EXPORT_SYMBOL(crc32_le);
+
+u32 crc32_be(u32 crc, const void *p, size_t len)
+{
+ return crc32_be_arch(crc, p, len);
+}
+EXPORT_SYMBOL(crc32_be);
+
+u32 crc32c(u32 crc, const void *p, size_t len)
+{
+ return crc32c_arch(crc, p, len);
+}
+EXPORT_SYMBOL(crc32c);
+
+#ifdef crc32_mod_init_arch
+static int __init crc32_mod_init(void)
+{
+ crc32_mod_init_arch();
+ return 0;
+}
+subsys_initcall(crc32_mod_init);
+
+static void __exit crc32_mod_exit(void)
+{
+}
+module_exit(crc32_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("CRC32 library functions");
+MODULE_LICENSE("GPL");
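
One such incantation, as a hedged example (the wrapper name crc32_ieee is hypothetical): the conventional IEEE CRC-32 used by gzip seeds with ~0 and inverts the result:

    static u32 crc32_ieee(const void *p, size_t len)
    {
            return ~crc32_le(~0, p, len);   /* "123456789" -> 0xcbf43926 */
    }
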
diff --git a/lib/crc/crc4.c b/lib/crc/crc4.c
new file mode 100644
index 000000000000..8e83fbe60bdc
--- /dev/null
+++ b/lib/crc/crc4.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * crc4.c - simple crc-4 calculations.
+ */
+
+#include <linux/crc4.h>
+#include <linux/export.h>
+#include <linux/module.h>
+
+static const uint8_t crc4_tab[] = {
+ 0x0, 0x7, 0xe, 0x9, 0xb, 0xc, 0x5, 0x2,
+ 0x1, 0x6, 0xf, 0x8, 0xa, 0xd, 0x4, 0x3,
+};
+
+/**
+ * crc4 - calculate the 4-bit crc of a value.
+ * @c: starting crc4
+ * @x: value to checksum
+ * @bits: number of bits in @x to checksum
+ *
+ * Returns the crc4 value of @x, using polynomial 0b10111.
+ *
+ * The @x value is treated as left-aligned, and bits above @bits are ignored
+ * in the crc calculations.
+ */
+uint8_t crc4(uint8_t c, uint64_t x, int bits)
+{
+ int i;
+
+	/* mask off any bits above the @bits-wide field */
+ x &= (1ull << bits) - 1;
+
+ /* Align to 4-bits */
+ bits = (bits + 3) & ~0x3;
+
+ /* Calculate crc4 over four-bit nibbles, starting at the MSbit */
+ for (i = bits - 4; i >= 0; i -= 4)
+ c = crc4_tab[c ^ ((x >> i) & 0xf)];
+
+ return c;
+}
+EXPORT_SYMBOL_GPL(crc4);
+
+MODULE_DESCRIPTION("CRC4 calculations");
+MODULE_LICENSE("GPL");
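
A worked example of the nibble-at-a-time loop above, with the values read off crc4_tab[] (the wrapper is illustrative only):

    static u8 crc4_example(void)
    {
            /* high nibble 5 first, then low nibble 0:
             * crc4_tab[0 ^ 5] = 0xc, then crc4_tab[0xc ^ 0] = 0xa */
            return crc4(0, 0x50, 8);        /* == 0xa */
    }
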
diff --git a/lib/crc/crc64-main.c b/lib/crc/crc64-main.c
new file mode 100644
index 000000000000..1337036010fe
--- /dev/null
+++ b/lib/crc/crc64-main.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Normal 64-bit CRC calculation.
+ *
+ * This is a basic crc64 implementation following the ECMA-182
+ * specification, which can be found at
+ * https://www.ecma-international.org/publications/standards/Ecma-182.htm
+ *
+ * Dr. Ross N. Williams's paper is a great introduction to the ideas
+ * behind CRC algorithms; the table-driven algorithm and the detailed
+ * example in it also inspired the CRC64 code here. The paper can be
+ * found at
+ * http://www.ross.net/crc/download/crc_v3.txt
+ *
+ * crc64table[256] is the lookup table for table-driven 64-bit CRC
+ * calculation; it is generated by gen_crc64table.c at kernel build
+ * time. The crc64 polynomial also comes from the ECMA-182
+ * specification and is defined as
+ *
+ * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
+ * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
+ * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
+ * x^7 + x^4 + x + 1
+ *
+ * crc64nvmetable[256] uses the CRC64 polynomial from the NVME NVM Command Set
+ * Specification and uses least-significant-bit first bit order:
+ *
+ * x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 + x^47 +
+ * x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 + x^26 + x^23 +
+ * x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 + x^4 + x^3 + 1
+ *
+ * Copyright 2018 SUSE Linux.
+ * Author: Coly Li <colyli@suse.de>
+ */
+
+#include <linux/crc64.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "crc64table.h"
+
+static inline u64 __maybe_unused
+crc64_be_generic(u64 crc, const u8 *p, size_t len)
+{
+ while (len--)
+ crc = (crc << 8) ^ crc64table[(crc >> 56) ^ *p++];
+ return crc;
+}
+
+static inline u64 __maybe_unused
+crc64_nvme_generic(u64 crc, const u8 *p, size_t len)
+{
+ while (len--)
+ crc = (crc >> 8) ^ crc64nvmetable[(crc & 0xff) ^ *p++];
+ return crc;
+}
+
+#ifdef CONFIG_CRC64_ARCH
+#include "crc64.h" /* $(SRCARCH)/crc64.h */
+#else
+#define crc64_be_arch crc64_be_generic
+#define crc64_nvme_arch crc64_nvme_generic
+#endif
+
+u64 crc64_be(u64 crc, const void *p, size_t len)
+{
+ return crc64_be_arch(crc, p, len);
+}
+EXPORT_SYMBOL_GPL(crc64_be);
+
+u64 crc64_nvme(u64 crc, const void *p, size_t len)
+{
+ return ~crc64_nvme_arch(~crc, p, len);
+}
+EXPORT_SYMBOL_GPL(crc64_nvme);
+
+#ifdef crc64_mod_init_arch
+static int __init crc64_mod_init(void)
+{
+ crc64_mod_init_arch();
+ return 0;
+}
+subsys_initcall(crc64_mod_init);
+
+static void __exit crc64_mod_exit(void)
+{
+}
+module_exit(crc64_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("CRC64 library functions");
+MODULE_LICENSE("GPL");
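
A usage note following directly from the wrappers above (buf/len are hypothetical names): crc64_nvme() folds the NVMe ~0 pre/post conditioning in internally, so callers of both variants simply seed with 0:

    u64 ecma = crc64_be(0, buf, len);    /* raw ECMA-182, no inversion */
    u64 nvme = crc64_nvme(0, buf, len);  /* == ~crc64_nvme_arch(~0, buf, len) */
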
diff --git a/lib/crc/crc7.c b/lib/crc/crc7.c
new file mode 100644
index 000000000000..46b95d7ac6ce
--- /dev/null
+++ b/lib/crc/crc7.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * crc7.c
+ */
+
+#include <linux/crc7.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+/*
+ * Table for CRC-7 (polynomial x^7 + x^3 + 1).
+ * This is a big-endian CRC (msbit is highest power of x),
+ * aligned so the msbit of the byte is the x^6 coefficient
+ * and the lsbit is not used.
+ */
+static const u8 crc7_be_syndrome_table[256] = {
+ 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e,
+ 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee,
+ 0x32, 0x20, 0x16, 0x04, 0x7a, 0x68, 0x5e, 0x4c,
+ 0xa2, 0xb0, 0x86, 0x94, 0xea, 0xf8, 0xce, 0xdc,
+ 0x64, 0x76, 0x40, 0x52, 0x2c, 0x3e, 0x08, 0x1a,
+ 0xf4, 0xe6, 0xd0, 0xc2, 0xbc, 0xae, 0x98, 0x8a,
+ 0x56, 0x44, 0x72, 0x60, 0x1e, 0x0c, 0x3a, 0x28,
+ 0xc6, 0xd4, 0xe2, 0xf0, 0x8e, 0x9c, 0xaa, 0xb8,
+ 0xc8, 0xda, 0xec, 0xfe, 0x80, 0x92, 0xa4, 0xb6,
+ 0x58, 0x4a, 0x7c, 0x6e, 0x10, 0x02, 0x34, 0x26,
+ 0xfa, 0xe8, 0xde, 0xcc, 0xb2, 0xa0, 0x96, 0x84,
+ 0x6a, 0x78, 0x4e, 0x5c, 0x22, 0x30, 0x06, 0x14,
+ 0xac, 0xbe, 0x88, 0x9a, 0xe4, 0xf6, 0xc0, 0xd2,
+ 0x3c, 0x2e, 0x18, 0x0a, 0x74, 0x66, 0x50, 0x42,
+ 0x9e, 0x8c, 0xba, 0xa8, 0xd6, 0xc4, 0xf2, 0xe0,
+ 0x0e, 0x1c, 0x2a, 0x38, 0x46, 0x54, 0x62, 0x70,
+ 0x82, 0x90, 0xa6, 0xb4, 0xca, 0xd8, 0xee, 0xfc,
+ 0x12, 0x00, 0x36, 0x24, 0x5a, 0x48, 0x7e, 0x6c,
+ 0xb0, 0xa2, 0x94, 0x86, 0xf8, 0xea, 0xdc, 0xce,
+ 0x20, 0x32, 0x04, 0x16, 0x68, 0x7a, 0x4c, 0x5e,
+ 0xe6, 0xf4, 0xc2, 0xd0, 0xae, 0xbc, 0x8a, 0x98,
+ 0x76, 0x64, 0x52, 0x40, 0x3e, 0x2c, 0x1a, 0x08,
+ 0xd4, 0xc6, 0xf0, 0xe2, 0x9c, 0x8e, 0xb8, 0xaa,
+ 0x44, 0x56, 0x60, 0x72, 0x0c, 0x1e, 0x28, 0x3a,
+ 0x4a, 0x58, 0x6e, 0x7c, 0x02, 0x10, 0x26, 0x34,
+ 0xda, 0xc8, 0xfe, 0xec, 0x92, 0x80, 0xb6, 0xa4,
+ 0x78, 0x6a, 0x5c, 0x4e, 0x30, 0x22, 0x14, 0x06,
+ 0xe8, 0xfa, 0xcc, 0xde, 0xa0, 0xb2, 0x84, 0x96,
+ 0x2e, 0x3c, 0x0a, 0x18, 0x66, 0x74, 0x42, 0x50,
+ 0xbe, 0xac, 0x9a, 0x88, 0xf6, 0xe4, 0xd2, 0xc0,
+ 0x1c, 0x0e, 0x38, 0x2a, 0x54, 0x46, 0x70, 0x62,
+ 0x8c, 0x9e, 0xa8, 0xba, 0xc4, 0xd6, 0xe0, 0xf2
+};
+
+/**
+ * crc7_be - update the CRC7 for the data buffer
+ * @crc: previous CRC7 value
+ * @buffer: data pointer
+ * @len: number of bytes in the buffer
+ * Context: any
+ *
+ * Returns the updated CRC7 value.
+ * The CRC7 is left-aligned in the byte (the lsbit is always 0), as that
+ * makes the computation easier, and all callers want it in that form.
+ *
+ */
+u8 crc7_be(u8 crc, const u8 *buffer, size_t len)
+{
+ while (len--)
+ crc = crc7_be_syndrome_table[crc ^ *buffer++];
+ return crc;
+}
+EXPORT_SYMBOL(crc7_be);
+
+MODULE_DESCRIPTION("CRC7 calculations");
+MODULE_LICENSE("GPL");
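
Because the result is left-aligned with the lsbit free (see the comment above), an SD/MMC-style caller can OR in the trailing end bit directly; a hedged sketch (buffer name hypothetical):

    u8 crc_byte = crc7_be(0, cmd, 5) | 0x01;    /* 7-bit CRC plus end bit */
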
diff --git a/lib/crc/crc8.c b/lib/crc/crc8.c
new file mode 100644
index 000000000000..329c52158c45
--- /dev/null
+++ b/lib/crc/crc8.c
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2011 Broadcom Corporation
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/crc8.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+
+/**
+ * crc8_populate_msb - fill crc table for given polynomial in reverse bit order.
+ *
+ * @table: table to be filled.
+ * @polynomial: polynomial for which table is to be filled.
+ */
+void crc8_populate_msb(u8 table[CRC8_TABLE_SIZE], u8 polynomial)
+{
+ int i, j;
+ const u8 msbit = 0x80;
+ u8 t = msbit;
+
+ table[0] = 0;
+
+ for (i = 1; i < CRC8_TABLE_SIZE; i *= 2) {
+ t = (t << 1) ^ (t & msbit ? polynomial : 0);
+ for (j = 0; j < i; j++)
+ table[i+j] = table[j] ^ t;
+ }
+}
+EXPORT_SYMBOL(crc8_populate_msb);
+
+/**
+ * crc8_populate_lsb - fill crc table for given polynomial in regular bit order.
+ *
+ * @table: table to be filled.
+ * @polynomial: polynomial for which table is to be filled.
+ */
+void crc8_populate_lsb(u8 table[CRC8_TABLE_SIZE], u8 polynomial)
+{
+ int i, j;
+ u8 t = 1;
+
+ table[0] = 0;
+
+ for (i = (CRC8_TABLE_SIZE >> 1); i; i >>= 1) {
+ t = (t >> 1) ^ (t & 1 ? polynomial : 0);
+ for (j = 0; j < CRC8_TABLE_SIZE; j += 2*i)
+ table[i+j] = table[j] ^ t;
+ }
+}
+EXPORT_SYMBOL(crc8_populate_lsb);
+
+/**
+ * crc8 - calculate a crc8 over the given input data.
+ *
+ * @table: crc table used for calculation.
+ * @pdata: pointer to data buffer.
+ * @nbytes: number of bytes in data buffer.
+ * @crc: previous returned crc8 value.
+ */
+u8 crc8(const u8 table[CRC8_TABLE_SIZE], const u8 *pdata, size_t nbytes, u8 crc)
+{
+ /* loop over the buffer data */
+ while (nbytes-- > 0)
+ crc = table[(crc ^ *pdata++) & 0xff];
+
+ return crc;
+}
+EXPORT_SYMBOL(crc8);
+
+MODULE_DESCRIPTION("CRC8 (by Williams, Ross N.) function");
+MODULE_AUTHOR("Broadcom Corporation");
+MODULE_LICENSE("Dual BSD/GPL");
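
A minimal usage sketch, assuming the DECLARE_CRC8_TABLE() and CRC8_INIT_VALUE helpers from <linux/crc8.h>; the polynomial 0x07 and all names here are illustrative, not from this patch:

    #include <linux/crc8.h>

    DECLARE_CRC8_TABLE(my_crc8_table);

    static int __init my_init(void)
    {
            crc8_populate_msb(my_crc8_table, 0x07); /* x^8 + x^2 + x + 1 */
            return 0;
    }

    static u8 my_checksum(const u8 *data, size_t len)
    {
            return crc8(my_crc8_table, data, len, CRC8_INIT_VALUE);
    }
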
diff --git a/lib/crc/gen_crc32table.c b/lib/crc/gen_crc32table.c
new file mode 100644
index 000000000000..9a7f31658e35
--- /dev/null
+++ b/lib/crc/gen_crc32table.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include "../../include/linux/crc32poly.h"
+#include "../../include/generated/autoconf.h"
+#include <inttypes.h>
+
+static uint32_t crc32table_le[256];
+static uint32_t crc32table_be[256];
+static uint32_t crc32ctable_le[256];
+
+/**
+ * crc32init_le() - initialize the LE table data
+ *
+ * crc is the crc of the byte i; other entries are filled in based on the
+ * fact that crctable[i^j] = crctable[i] ^ crctable[j].
+ *
+ */
+static void crc32init_le_generic(const uint32_t polynomial, uint32_t tab[256])
+{
+ unsigned i, j;
+ uint32_t crc = 1;
+
+ tab[0] = 0;
+
+ for (i = 128; i; i >>= 1) {
+ crc = (crc >> 1) ^ ((crc & 1) ? polynomial : 0);
+ for (j = 0; j < 256; j += 2 * i)
+ tab[i + j] = crc ^ tab[j];
+ }
+}
+
+static void crc32init_le(void)
+{
+ crc32init_le_generic(CRC32_POLY_LE, crc32table_le);
+}
+
+static void crc32cinit_le(void)
+{
+ crc32init_le_generic(CRC32C_POLY_LE, crc32ctable_le);
+}
+
+/**
+ * crc32init_be() - initialize the BE table data
+ */
+static void crc32init_be(void)
+{
+ unsigned i, j;
+ uint32_t crc = 0x80000000;
+
+ crc32table_be[0] = 0;
+
+ for (i = 1; i < 256; i <<= 1) {
+ crc = (crc << 1) ^ ((crc & 0x80000000) ? CRC32_POLY_BE : 0);
+ for (j = 0; j < i; j++)
+ crc32table_be[i + j] = crc ^ crc32table_be[j];
+ }
+}
+
+static void output_table(const uint32_t table[256])
+{
+ int i;
+
+ for (i = 0; i < 256; i += 4) {
+ printf("\t0x%08x, 0x%08x, 0x%08x, 0x%08x,\n",
+ table[i], table[i + 1], table[i + 2], table[i + 3]);
+ }
+}
+
+int main(int argc, char** argv)
+{
+ printf("/* this file is generated - do not edit */\n\n");
+
+ crc32init_le();
+ printf("static const u32 ____cacheline_aligned crc32table_le[256] = {\n");
+ output_table(crc32table_le);
+ printf("};\n");
+
+ crc32init_be();
+ printf("static const u32 ____cacheline_aligned crc32table_be[256] = {\n");
+ output_table(crc32table_be);
+ printf("};\n");
+
+ crc32cinit_le();
+ printf("static const u32 ____cacheline_aligned crc32ctable_le[256] = {\n");
+ output_table(crc32ctable_le);
+ printf("};\n");
+
+ return 0;
+}
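
The linearity noted in the crc32init_le() comment can be spot-checked against the emitted tables; an illustrative assertion (indices chosen arbitrarily) that could be dropped into main() in this host program:

    #include <assert.h>

    /* crctable[i ^ j] == crctable[i] ^ crctable[j] for any i, j */
    assert(crc32table_le[0x12 ^ 0x34] ==
           (crc32table_le[0x12] ^ crc32table_le[0x34]));
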
diff --git a/lib/crc/gen_crc64table.c b/lib/crc/gen_crc64table.c
new file mode 100644
index 000000000000..f2be9f62bab7
--- /dev/null
+++ b/lib/crc/gen_crc64table.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This host program runs at kernel build time and generates the lookup tables
+ * used by the generic CRC64 code.
+ *
+ * Copyright 2018 SUSE Linux.
+ * Author: Coly Li <colyli@suse.de>
+ */
+#include <inttypes.h>
+#include <stdio.h>
+
+#define CRC64_ECMA182_POLY 0x42F0E1EBA9EA3693ULL
+#define CRC64_NVME_POLY 0x9A6C9329AC4BC9B5ULL
+
+static uint64_t crc64_table[256] = {0};
+static uint64_t crc64_nvme_table[256] = {0};
+
+static void generate_reflected_crc64_table(uint64_t table[256], uint64_t poly)
+{
+ uint64_t i, j, c, crc;
+
+ for (i = 0; i < 256; i++) {
+ crc = 0ULL;
+ c = i;
+
+ for (j = 0; j < 8; j++) {
+ if ((crc ^ (c >> j)) & 1)
+ crc = (crc >> 1) ^ poly;
+ else
+ crc >>= 1;
+ }
+ table[i] = crc;
+ }
+}
+
+static void generate_crc64_table(uint64_t table[256], uint64_t poly)
+{
+ uint64_t i, j, c, crc;
+
+ for (i = 0; i < 256; i++) {
+ crc = 0;
+ c = i << 56;
+
+ for (j = 0; j < 8; j++) {
+ if ((crc ^ c) & 0x8000000000000000ULL)
+ crc = (crc << 1) ^ poly;
+ else
+ crc <<= 1;
+ c <<= 1;
+ }
+
+ table[i] = crc;
+ }
+}
+
+static void output_table(uint64_t table[256])
+{
+ int i;
+
+ for (i = 0; i < 256; i++) {
+ printf("\t0x%016" PRIx64 "ULL", table[i]);
+ if (i & 0x1)
+ printf(",\n");
+ else
+ printf(", ");
+ }
+ printf("};\n");
+}
+
+static void print_crc64_tables(void)
+{
+ printf("/* this file is generated - do not edit */\n\n");
+ printf("#include <linux/types.h>\n");
+ printf("#include <linux/cache.h>\n\n");
+ printf("static const u64 ____cacheline_aligned crc64table[256] = {\n");
+ output_table(crc64_table);
+
+ printf("\nstatic const u64 ____cacheline_aligned crc64nvmetable[256] = {\n");
+ output_table(crc64_nvme_table);
+}
+
+int main(int argc, char *argv[])
+{
+ generate_crc64_table(crc64_table, CRC64_ECMA182_POLY);
+ generate_reflected_crc64_table(crc64_nvme_table, CRC64_NVME_POLY);
+ print_crc64_tables();
+ return 0;
+}
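
One hedged sanity observation on the normal (MSB-first) generator above: crc64_table[1] comes out as CRC64_ECMA182_POLY itself, since byte 0x01 starts at bit 56, is shifted up to bit 63, and then triggers exactly one reduction.
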
diff --git a/lib/crc/loongarch/crc32.h b/lib/crc/loongarch/crc32.h
new file mode 100644
index 000000000000..6de5c96594af
--- /dev/null
+++ b/lib/crc/loongarch/crc32.h
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CRC32 and CRC32C using LoongArch crc* instructions
+ *
+ * Module based on mips/crypto/crc32-mips.c
+ *
+ * Copyright (C) 2014 Linaro Ltd <yazen.ghannam@linaro.org>
+ * Copyright (C) 2018 MIPS Tech, LLC
+ * Copyright (C) 2020-2023 Loongson Technology Corporation Limited
+ */
+
+#include <asm/cpu-features.h>
+#include <linux/unaligned.h>
+
+#define _CRC32(crc, value, size, type) \
+do { \
+ __asm__ __volatile__( \
+ #type ".w." #size ".w" " %0, %1, %0\n\t"\
+ : "+r" (crc) \
+ : "r" (value) \
+ : "memory"); \
+} while (0)
+
+#define CRC32(crc, value, size) _CRC32(crc, value, size, crc)
+#define CRC32C(crc, value, size) _CRC32(crc, value, size, crcc)
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
+
+static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (!static_branch_likely(&have_crc32))
+ return crc32_le_base(crc, p, len);
+
+ while (len >= sizeof(u64)) {
+ u64 value = get_unaligned_le64(p);
+
+ CRC32(crc, value, d);
+ p += sizeof(u64);
+ len -= sizeof(u64);
+ }
+
+ if (len & sizeof(u32)) {
+ u32 value = get_unaligned_le32(p);
+
+ CRC32(crc, value, w);
+ p += sizeof(u32);
+ }
+
+ if (len & sizeof(u16)) {
+ u16 value = get_unaligned_le16(p);
+
+ CRC32(crc, value, h);
+ p += sizeof(u16);
+ }
+
+ if (len & sizeof(u8)) {
+ u8 value = *p++;
+
+ CRC32(crc, value, b);
+ }
+
+ return crc;
+}
+
+static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (!static_branch_likely(&have_crc32))
+ return crc32c_base(crc, p, len);
+
+ while (len >= sizeof(u64)) {
+ u64 value = get_unaligned_le64(p);
+
+ CRC32C(crc, value, d);
+ p += sizeof(u64);
+ len -= sizeof(u64);
+ }
+
+ if (len & sizeof(u32)) {
+ u32 value = get_unaligned_le32(p);
+
+ CRC32C(crc, value, w);
+ p += sizeof(u32);
+ }
+
+ if (len & sizeof(u16)) {
+ u16 value = get_unaligned_le16(p);
+
+ CRC32C(crc, value, h);
+ p += sizeof(u16);
+ }
+
+ if (len & sizeof(u8)) {
+ u8 value = *p++;
+
+ CRC32C(crc, value, b);
+ }
+
+ return crc;
+}
+
+#define crc32_be_arch crc32_be_base /* not implemented on this arch */
+
+#define crc32_mod_init_arch crc32_mod_init_arch
+static inline void crc32_mod_init_arch(void)
+{
+ if (cpu_has_crc32)
+ static_branch_enable(&have_crc32);
+}
+
+static inline u32 crc32_optimizations_arch(void)
+{
+ if (static_key_enabled(&have_crc32))
+ return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION;
+ return 0;
+}
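
As an illustrative expansion of the stringizing macro above: CRC32(crc, value, d) assembles to

    crc.w.d.w   %0, %1, %0

folding a 64-bit chunk of input into the running 32-bit CRC, while CRC32C(crc, value, w) yields crcc.w.w.w for the Castagnoli polynomial.
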
diff --git a/lib/crc/mips/crc32.h b/lib/crc/mips/crc32.h
new file mode 100644
index 000000000000..11cb272c63a6
--- /dev/null
+++ b/lib/crc/mips/crc32.h
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * crc32-mips.c - CRC32 and CRC32C using optional MIPSr6 instructions
+ *
+ * Module based on arm64/crypto/crc32-arm.c
+ *
+ * Copyright (C) 2014 Linaro Ltd <yazen.ghannam@linaro.org>
+ * Copyright (C) 2018 MIPS Tech, LLC
+ */
+
+#include <linux/cpufeature.h>
+#include <asm/mipsregs.h>
+#include <linux/unaligned.h>
+
+#ifndef TOOLCHAIN_SUPPORTS_CRC
+#define _ASM_SET_CRC(OP, SZ, TYPE) \
+_ASM_MACRO_3R(OP, rt, rs, rt2, \
+ ".ifnc \\rt, \\rt2\n\t" \
+ ".error \"invalid operands \\\"" #OP " \\rt,\\rs,\\rt2\\\"\"\n\t" \
+ ".endif\n\t" \
+ _ASM_INSN_IF_MIPS(0x7c00000f | (__rt << 16) | (__rs << 21) | \
+ ((SZ) << 6) | ((TYPE) << 8)) \
+ _ASM_INSN32_IF_MM(0x00000030 | (__rs << 16) | (__rt << 21) | \
+ ((SZ) << 14) | ((TYPE) << 3)))
+#define _ASM_UNSET_CRC(op, SZ, TYPE) ".purgem " #op "\n\t"
+#else /* !TOOLCHAIN_SUPPORTS_CRC */
+#define _ASM_SET_CRC(op, SZ, TYPE) ".set\tcrc\n\t"
+#define _ASM_UNSET_CRC(op, SZ, TYPE)
+#endif
+
+#define __CRC32(crc, value, op, SZ, TYPE) \
+do { \
+ __asm__ __volatile__( \
+ ".set push\n\t" \
+ _ASM_SET_CRC(op, SZ, TYPE) \
+ #op " %0, %1, %0\n\t" \
+ _ASM_UNSET_CRC(op, SZ, TYPE) \
+ ".set pop" \
+ : "+r" (crc) \
+ : "r" (value)); \
+} while (0)
+
+#define _CRC32_crc32b(crc, value) __CRC32(crc, value, crc32b, 0, 0)
+#define _CRC32_crc32h(crc, value) __CRC32(crc, value, crc32h, 1, 0)
+#define _CRC32_crc32w(crc, value) __CRC32(crc, value, crc32w, 2, 0)
+#define _CRC32_crc32d(crc, value) __CRC32(crc, value, crc32d, 3, 0)
+#define _CRC32_crc32cb(crc, value) __CRC32(crc, value, crc32cb, 0, 1)
+#define _CRC32_crc32ch(crc, value) __CRC32(crc, value, crc32ch, 1, 1)
+#define _CRC32_crc32cw(crc, value) __CRC32(crc, value, crc32cw, 2, 1)
+#define _CRC32_crc32cd(crc, value) __CRC32(crc, value, crc32cd, 3, 1)
+
+#define _CRC32(crc, value, size, op) \
+ _CRC32_##op##size(crc, value)
+
+#define CRC32(crc, value, size) \
+ _CRC32(crc, value, size, crc32)
+
+#define CRC32C(crc, value, size) \
+ _CRC32(crc, value, size, crc32c)
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
+
+static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (!static_branch_likely(&have_crc32))
+ return crc32_le_base(crc, p, len);
+
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ for (; len >= sizeof(u64); p += sizeof(u64), len -= sizeof(u64)) {
+ u64 value = get_unaligned_le64(p);
+
+ CRC32(crc, value, d);
+ }
+
+ if (len & sizeof(u32)) {
+ u32 value = get_unaligned_le32(p);
+
+ CRC32(crc, value, w);
+ p += sizeof(u32);
+ }
+ } else {
+ for (; len >= sizeof(u32); len -= sizeof(u32)) {
+ u32 value = get_unaligned_le32(p);
+
+ CRC32(crc, value, w);
+ p += sizeof(u32);
+ }
+ }
+
+ if (len & sizeof(u16)) {
+ u16 value = get_unaligned_le16(p);
+
+ CRC32(crc, value, h);
+ p += sizeof(u16);
+ }
+
+ if (len & sizeof(u8)) {
+ u8 value = *p++;
+
+ CRC32(crc, value, b);
+ }
+
+ return crc;
+}
+
+static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (!static_branch_likely(&have_crc32))
+ return crc32c_base(crc, p, len);
+
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ for (; len >= sizeof(u64); p += sizeof(u64), len -= sizeof(u64)) {
+ u64 value = get_unaligned_le64(p);
+
+ CRC32C(crc, value, d);
+ }
+
+ if (len & sizeof(u32)) {
+ u32 value = get_unaligned_le32(p);
+
+ CRC32C(crc, value, w);
+ p += sizeof(u32);
+ }
+ } else {
+ for (; len >= sizeof(u32); len -= sizeof(u32)) {
+ u32 value = get_unaligned_le32(p);
+
+ CRC32C(crc, value, w);
+ p += sizeof(u32);
+ }
+ }
+
+ if (len & sizeof(u16)) {
+ u16 value = get_unaligned_le16(p);
+
+ CRC32C(crc, value, h);
+ p += sizeof(u16);
+ }
+
+ if (len & sizeof(u8)) {
+ u8 value = *p++;
+
+ CRC32C(crc, value, b);
+ }
+ return crc;
+}
+
+#define crc32_be_arch crc32_be_base /* not implemented on this arch */
+
+#define crc32_mod_init_arch crc32_mod_init_arch
+static inline void crc32_mod_init_arch(void)
+{
+ if (cpu_have_feature(cpu_feature(MIPS_CRC32)))
+ static_branch_enable(&have_crc32);
+}
+
+static inline u32 crc32_optimizations_arch(void)
+{
+ if (static_key_enabled(&have_crc32))
+ return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION;
+ return 0;
+}
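
An illustrative reading of the !TOOLCHAIN_SUPPORTS_CRC path above: when the assembler lacks ".set crc", _ASM_SET_CRC() defines an assembler macro that hand-encodes the opcode word, e.g. crc32w (SZ=2, TYPE=0) becomes 0x7c00000f | (rt << 16) | (rs << 21) | (2 << 6).
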
diff --git a/lib/crc/powerpc/crc-t10dif.h b/lib/crc/powerpc/crc-t10dif.h
new file mode 100644
index 000000000000..59e16804a6ea
--- /dev/null
+++ b/lib/crc/powerpc/crc-t10dif.h
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Calculate a CRC T10-DIF with vpmsum acceleration
+ *
+ * Copyright 2017, Daniel Axtens, IBM Corporation.
+ * [based on crc32c-vpmsum_glue.c]
+ */
+
+#include <asm/switch_to.h>
+#include <crypto/internal/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/preempt.h>
+#include <linux/uaccess.h>
+
+#define VMX_ALIGN 16
+#define VMX_ALIGN_MASK (VMX_ALIGN-1)
+
+#define VECTOR_BREAKPOINT 64
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto);
+
+u32 __crct10dif_vpmsum(u32 crc, unsigned char const *p, size_t len);
+
+static inline u16 crc_t10dif_arch(u16 crci, const u8 *p, size_t len)
+{
+ unsigned int prealign;
+ unsigned int tail;
+ u32 crc = crci;
+
+ if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) ||
+ !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable())
+ return crc_t10dif_generic(crc, p, len);
+
+ if ((unsigned long)p & VMX_ALIGN_MASK) {
+ prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
+ crc = crc_t10dif_generic(crc, p, prealign);
+ len -= prealign;
+ p += prealign;
+ }
+
+ if (len & ~VMX_ALIGN_MASK) {
+ crc <<= 16;
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_altivec();
+ crc = __crct10dif_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
+ disable_kernel_altivec();
+ pagefault_enable();
+ preempt_enable();
+ crc >>= 16;
+ }
+
+ tail = len & VMX_ALIGN_MASK;
+ if (tail) {
+ p += len & ~VMX_ALIGN_MASK;
+ crc = crc_t10dif_generic(crc, p, tail);
+ }
+
+ return crc & 0xffff;
+}
+
+#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch
+static inline void crc_t10dif_mod_init_arch(void)
+{
+ if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+ (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO))
+ static_branch_enable(&have_vec_crypto);
+}
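
A hedged note on the crc <<= 16 / crc >>= 16 pair in crc_t10dif_arch() above: __crct10dif_vpmsum() appears to be a 32-bit, non-reflected CRC core, so the 16-bit T10-DIF value is kept left-aligned in the top half of a u32 across the vectorised middle section and shifted back down afterwards.
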
diff --git a/lib/crc/powerpc/crc-vpmsum-template.S b/lib/crc/powerpc/crc-vpmsum-template.S
new file mode 100644
index 000000000000..b0f87f595b26
--- /dev/null
+++ b/lib/crc/powerpc/crc-vpmsum-template.S
@@ -0,0 +1,746 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Core of the accelerated CRC algorithm.
+ * In your file, define the constants and CRC_FUNCTION_NAME, then
+ * include this file.
+ *
+ * Calculates the checksum of data that is 16-byte aligned and a
+ * multiple of 16 bytes long.
+ *
+ * The first step is to reduce it to 1024 bits. We do this in 8 parallel
+ * chunks in order to mask the latency of the vpmsum instructions. If we
+ * have more than 32 kB of data to checksum we repeat this step multiple
+ * times, passing in the previous 1024 bits.
+ *
+ * The next step is to reduce the 1024 bits to 64 bits. This step adds
+ * 32 bits of 0s to the end - this matches what a CRC does. We just
+ * calculate constants that land the data in this 32 bits.
+ *
+ * We then use fixed point Barrett reduction to compute a mod n over GF(2)
+ * for n = CRC using POWER8 instructions. We use x = 32.
+ *
+ * https://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+*/
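
A hedged reading of the Barrett step described above, in carry-less (GF(2)) arithmetic, where a is the 64-bit value left after folding and n is the degree-32 CRC polynomial:

    m   = floor(2^64 / n)       carry-less division; stored in .barrett_constants
    q   = (a clmul m) >> 64     the multiple of n to subtract
    crc = a xor (q clmul n)     subtraction is xor in GF(2)

This is what the VPMSUMD/vsldoi pairs at .Lbarrett_reduction below implement; the reflected variant applies the same idea with bit-reversed constants.
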
+
+#include <asm/ppc_asm.h>
+#include <asm/ppc-opcode.h>
+
+#define MAX_SIZE 32768
+
+ .text
+
+#if defined(__BIG_ENDIAN__) && defined(REFLECT)
+#define BYTESWAP_DATA
+#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
+#define BYTESWAP_DATA
+#else
+#undef BYTESWAP_DATA
+#endif
+
+#define off16 r25
+#define off32 r26
+#define off48 r27
+#define off64 r28
+#define off80 r29
+#define off96 r30
+#define off112 r31
+
+#define const1 v24
+#define const2 v25
+
+#define byteswap v26
+#define mask_32bit v27
+#define mask_64bit v28
+#define zeroes v29
+
+#ifdef BYTESWAP_DATA
+#define VPERM(A, B, C, D) vperm A, B, C, D
+#else
+#define VPERM(A, B, C, D)
+#endif
+
+/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
+FUNC_START(CRC_FUNCTION_NAME)
+ std r31,-8(r1)
+ std r30,-16(r1)
+ std r29,-24(r1)
+ std r28,-32(r1)
+ std r27,-40(r1)
+ std r26,-48(r1)
+ std r25,-56(r1)
+
+ li off16,16
+ li off32,32
+ li off48,48
+ li off64,64
+ li off80,80
+ li off96,96
+ li off112,112
+ li r0,0
+
+	/* Enough room for saving 10 non-volatile VMX registers */
+ subi r6,r1,56+10*16
+ subi r7,r1,56+2*16
+
+ stvx v20,0,r6
+ stvx v21,off16,r6
+ stvx v22,off32,r6
+ stvx v23,off48,r6
+ stvx v24,off64,r6
+ stvx v25,off80,r6
+ stvx v26,off96,r6
+ stvx v27,off112,r6
+ stvx v28,0,r7
+ stvx v29,off16,r7
+
+ mr r10,r3
+
+ vxor zeroes,zeroes,zeroes
+ vspltisw v0,-1
+
+ vsldoi mask_32bit,zeroes,v0,4
+ vsldoi mask_64bit,zeroes,v0,8
+
+ /* Get the initial value into v8 */
+ vxor v8,v8,v8
+ MTVRD(v8, R3)
+#ifdef REFLECT
+ vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */
+#else
+ vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */
+#endif
+
+#ifdef BYTESWAP_DATA
+ LOAD_REG_ADDR(r3, .byteswap_constant)
+ lvx byteswap,0,r3
+ addi r3,r3,16
+#endif
+
+ cmpdi r5,256
+ blt .Lshort
+
+ rldicr r6,r5,0,56
+
+ /* Checksum in blocks of MAX_SIZE */
+1: lis r7,MAX_SIZE@h
+ ori r7,r7,MAX_SIZE@l
+ mr r9,r7
+ cmpd r6,r7
+ bgt 2f
+ mr r7,r6
+2: subf r6,r7,r6
+
+ /* our main loop does 128 bytes at a time */
+ srdi r7,r7,7
+
+ /*
+ * Work out the offset into the constants table to start at. Each
+ * constant is 16 bytes, and it is used against 128 bytes of input
+ * data - 128 / 16 = 8
+ */
+ sldi r8,r7,4
+ srdi r9,r9,3
+ subf r8,r8,r9
+
+ /* We reduce our final 128 bytes in a separate step */
+ addi r7,r7,-1
+ mtctr r7
+
+ LOAD_REG_ADDR(r3, .constants)
+
+ /* Find the start of our constants */
+ add r3,r3,r8
+
+ /* zero v0-v7 which will contain our checksums */
+ vxor v0,v0,v0
+ vxor v1,v1,v1
+ vxor v2,v2,v2
+ vxor v3,v3,v3
+ vxor v4,v4,v4
+ vxor v5,v5,v5
+ vxor v6,v6,v6
+ vxor v7,v7,v7
+
+ lvx const1,0,r3
+
+ /*
+ * If we are looping back to consume more data we use the values
+ * already in v16-v23.
+ */
+ cmpdi r0,1
+ beq 2f
+
+ /* First warm up pass */
+ lvx v16,0,r4
+ lvx v17,off16,r4
+ VPERM(v16,v16,v16,byteswap)
+ VPERM(v17,v17,v17,byteswap)
+ lvx v18,off32,r4
+ lvx v19,off48,r4
+ VPERM(v18,v18,v18,byteswap)
+ VPERM(v19,v19,v19,byteswap)
+ lvx v20,off64,r4
+ lvx v21,off80,r4
+ VPERM(v20,v20,v20,byteswap)
+ VPERM(v21,v21,v21,byteswap)
+ lvx v22,off96,r4
+ lvx v23,off112,r4
+ VPERM(v22,v22,v22,byteswap)
+ VPERM(v23,v23,v23,byteswap)
+ addi r4,r4,8*16
+
+ /* xor in initial value */
+ vxor v16,v16,v8
+
+2: bdz .Lfirst_warm_up_done
+
+ addi r3,r3,16
+ lvx const2,0,r3
+
+ /* Second warm up pass */
+ VPMSUMD(v8,v16,const1)
+ lvx v16,0,r4
+ VPERM(v16,v16,v16,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v9,v17,const1)
+ lvx v17,off16,r4
+ VPERM(v17,v17,v17,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v10,v18,const1)
+ lvx v18,off32,r4
+ VPERM(v18,v18,v18,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v11,v19,const1)
+ lvx v19,off48,r4
+ VPERM(v19,v19,v19,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v12,v20,const1)
+ lvx v20,off64,r4
+ VPERM(v20,v20,v20,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v13,v21,const1)
+ lvx v21,off80,r4
+ VPERM(v21,v21,v21,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v14,v22,const1)
+ lvx v22,off96,r4
+ VPERM(v22,v22,v22,byteswap)
+ ori r2,r2,0
+
+ VPMSUMD(v15,v23,const1)
+ lvx v23,off112,r4
+ VPERM(v23,v23,v23,byteswap)
+
+ addi r4,r4,8*16
+
+ bdz .Lfirst_cool_down
+
+ /*
+ * main loop. We modulo schedule it such that it takes three iterations
+ * to complete - first iteration load, second iteration vpmsum, third
+ * iteration xor.
+ */
+ .balign 16
+4: lvx const1,0,r3
+ addi r3,r3,16
+ ori r2,r2,0
+
+ vxor v0,v0,v8
+ VPMSUMD(v8,v16,const2)
+ lvx v16,0,r4
+ VPERM(v16,v16,v16,byteswap)
+ ori r2,r2,0
+
+ vxor v1,v1,v9
+ VPMSUMD(v9,v17,const2)
+ lvx v17,off16,r4
+ VPERM(v17,v17,v17,byteswap)
+ ori r2,r2,0
+
+ vxor v2,v2,v10
+ VPMSUMD(v10,v18,const2)
+ lvx v18,off32,r4
+ VPERM(v18,v18,v18,byteswap)
+ ori r2,r2,0
+
+ vxor v3,v3,v11
+ VPMSUMD(v11,v19,const2)
+ lvx v19,off48,r4
+ VPERM(v19,v19,v19,byteswap)
+ lvx const2,0,r3
+ ori r2,r2,0
+
+ vxor v4,v4,v12
+ VPMSUMD(v12,v20,const1)
+ lvx v20,off64,r4
+ VPERM(v20,v20,v20,byteswap)
+ ori r2,r2,0
+
+ vxor v5,v5,v13
+ VPMSUMD(v13,v21,const1)
+ lvx v21,off80,r4
+ VPERM(v21,v21,v21,byteswap)
+ ori r2,r2,0
+
+ vxor v6,v6,v14
+ VPMSUMD(v14,v22,const1)
+ lvx v22,off96,r4
+ VPERM(v22,v22,v22,byteswap)
+ ori r2,r2,0
+
+ vxor v7,v7,v15
+ VPMSUMD(v15,v23,const1)
+ lvx v23,off112,r4
+ VPERM(v23,v23,v23,byteswap)
+
+ addi r4,r4,8*16
+
+ bdnz 4b
+
+.Lfirst_cool_down:
+ /* First cool down pass */
+ lvx const1,0,r3
+ addi r3,r3,16
+
+ vxor v0,v0,v8
+ VPMSUMD(v8,v16,const1)
+ ori r2,r2,0
+
+ vxor v1,v1,v9
+ VPMSUMD(v9,v17,const1)
+ ori r2,r2,0
+
+ vxor v2,v2,v10
+ VPMSUMD(v10,v18,const1)
+ ori r2,r2,0
+
+ vxor v3,v3,v11
+ VPMSUMD(v11,v19,const1)
+ ori r2,r2,0
+
+ vxor v4,v4,v12
+ VPMSUMD(v12,v20,const1)
+ ori r2,r2,0
+
+ vxor v5,v5,v13
+ VPMSUMD(v13,v21,const1)
+ ori r2,r2,0
+
+ vxor v6,v6,v14
+ VPMSUMD(v14,v22,const1)
+ ori r2,r2,0
+
+ vxor v7,v7,v15
+ VPMSUMD(v15,v23,const1)
+ ori r2,r2,0
+
+.Lsecond_cool_down:
+ /* Second cool down pass */
+ vxor v0,v0,v8
+ vxor v1,v1,v9
+ vxor v2,v2,v10
+ vxor v3,v3,v11
+ vxor v4,v4,v12
+ vxor v5,v5,v13
+ vxor v6,v6,v14
+ vxor v7,v7,v15
+
+#ifdef REFLECT
+ /*
+ * vpmsumd produces a 96 bit result in the least significant bits
+ * of the register. Since we are bit reflected we have to shift it
+ * left 32 bits so it occupies the least significant bits in the
+ * bit reflected domain.
+ */
+ vsldoi v0,v0,zeroes,4
+ vsldoi v1,v1,zeroes,4
+ vsldoi v2,v2,zeroes,4
+ vsldoi v3,v3,zeroes,4
+ vsldoi v4,v4,zeroes,4
+ vsldoi v5,v5,zeroes,4
+ vsldoi v6,v6,zeroes,4
+ vsldoi v7,v7,zeroes,4
+#endif
+
+ /* xor with last 1024 bits */
+ lvx v8,0,r4
+ lvx v9,off16,r4
+ VPERM(v8,v8,v8,byteswap)
+ VPERM(v9,v9,v9,byteswap)
+ lvx v10,off32,r4
+ lvx v11,off48,r4
+ VPERM(v10,v10,v10,byteswap)
+ VPERM(v11,v11,v11,byteswap)
+ lvx v12,off64,r4
+ lvx v13,off80,r4
+ VPERM(v12,v12,v12,byteswap)
+ VPERM(v13,v13,v13,byteswap)
+ lvx v14,off96,r4
+ lvx v15,off112,r4
+ VPERM(v14,v14,v14,byteswap)
+ VPERM(v15,v15,v15,byteswap)
+
+ addi r4,r4,8*16
+
+ vxor v16,v0,v8
+ vxor v17,v1,v9
+ vxor v18,v2,v10
+ vxor v19,v3,v11
+ vxor v20,v4,v12
+ vxor v21,v5,v13
+ vxor v22,v6,v14
+ vxor v23,v7,v15
+
+ li r0,1
+ cmpdi r6,0
+ addi r6,r6,128
+ bne 1b
+
+ /* Work out how many bytes we have left */
+ andi. r5,r5,127
+
+ /* Calculate where in the constant table we need to start */
+ subfic r6,r5,128
+ add r3,r3,r6
+
+ /* How many 16 byte chunks are in the tail */
+ srdi r7,r5,4
+ mtctr r7
+
+ /*
+ * Reduce the previously calculated 1024 bits to 64 bits, shifting
+ * 32 bits to include the trailing 32 bits of zeros
+ */
+ lvx v0,0,r3
+ lvx v1,off16,r3
+ lvx v2,off32,r3
+ lvx v3,off48,r3
+ lvx v4,off64,r3
+ lvx v5,off80,r3
+ lvx v6,off96,r3
+ lvx v7,off112,r3
+ addi r3,r3,8*16
+
+ VPMSUMW(v0,v16,v0)
+ VPMSUMW(v1,v17,v1)
+ VPMSUMW(v2,v18,v2)
+ VPMSUMW(v3,v19,v3)
+ VPMSUMW(v4,v20,v4)
+ VPMSUMW(v5,v21,v5)
+ VPMSUMW(v6,v22,v6)
+ VPMSUMW(v7,v23,v7)
+
+ /* Now reduce the tail (0 - 112 bytes) */
+ cmpdi r7,0
+ beq 1f
+
+ lvx v16,0,r4
+ lvx v17,0,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off16,r4
+ lvx v17,off16,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off32,r4
+ lvx v17,off32,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off48,r4
+ lvx v17,off48,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off64,r4
+ lvx v17,off64,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off80,r4
+ lvx v17,off80,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+ bdz 1f
+
+ lvx v16,off96,r4
+ lvx v17,off96,r3
+ VPERM(v16,v16,v16,byteswap)
+ VPMSUMW(v16,v16,v17)
+ vxor v0,v0,v16
+
+ /* Now xor all the parallel chunks together */
+1: vxor v0,v0,v1
+ vxor v2,v2,v3
+ vxor v4,v4,v5
+ vxor v6,v6,v7
+
+ vxor v0,v0,v2
+ vxor v4,v4,v6
+
+ vxor v0,v0,v4
+
+.Lbarrett_reduction:
+ /* Barrett constants */
+ LOAD_REG_ADDR(r3, .barrett_constants)
+
+ lvx const1,0,r3
+ lvx const2,off16,r3
+
+ vsldoi v1,v0,v0,8
+ vxor v0,v0,v1 /* xor two 64 bit results together */
+
+#ifdef REFLECT
+ /* shift left one bit */
+ vspltisb v1,1
+ vsl v0,v0,v1
+#endif
+
+ vand v0,v0,mask_64bit
+#ifndef REFLECT
+ /*
+ * Now for the Barrett reduction algorithm. The idea is to calculate q,
+ * the multiple of our polynomial that we need to subtract. By
+ * doing the computation 2x bits higher (ie 64 bits) and shifting the
+ * result back down 2x bits, we round down to the nearest multiple.
+ */
+ VPMSUMD(v1,v0,const1) /* ma */
+ vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */
+ VPMSUMD(v1,v1,const2) /* qn */
+ vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
+
+ /*
+ * Get the result into r3. We need to shift it left 8 bytes:
+ * V0 [ 0 1 2 X ]
+ * V0 [ 0 X 2 3 ]
+ */
+ vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */
+#else
+ /*
+ * The reflected version of Barrett reduction. Instead of bit
+ * reflecting our data (which is expensive to do), we bit reflect our
+ * constants and our algorithm, which means the intermediate data in
+ * our vector registers goes from 0-63 instead of 63-0. We can reflect
+ * the algorithm because we don't carry in mod 2 arithmetic.
+ */
+ vand v1,v0,mask_32bit /* bottom 32 bits of a */
+ VPMSUMD(v1,v1,const1) /* ma */
+ vand v1,v1,mask_32bit /* bottom 32bits of ma */
+ VPMSUMD(v1,v1,const2) /* qn */
+ vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
+
+ /*
+ * Since we are bit reflected, the result (ie the low 32 bits) is in
+ * the high 32 bits. We just need to shift it left 4 bytes
+ * V0 [ 0 1 X 3 ]
+ * V0 [ 0 X 2 3 ]
+ */
+	vsldoi	v0,v0,zeroes,4	/* shift result into top 64 bits */
+#endif
+
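+	/*
+	 * Expressed in C, the reflected path above computes roughly the
+	 * following, with clmul() standing in for vpmsumd, m and n taken
+	 * from .barrett_constants (the values shown are the crc32c
+	 * instantiation's), and a being the 64-bit value in v0 at this
+	 * point. A sketch for reference, not a drop-in replacement:
+	 *
+	 *	static uint64_t clmul(uint64_t a, uint64_t b)
+	 *	{
+	 *		uint64_t r = 0;
+	 *		int i;
+	 *
+	 *		for (i = 0; i < 33; i++)	// operands are <= 33 bits
+	 *			if (b & (1ULL << i))
+	 *				r ^= a << i;
+	 *		return r;
+	 *	}
+	 *
+	 *	static uint32_t barrett_reflected(uint64_t a)
+	 *	{
+	 *		uint64_t m = 0xdea713f1;	// x^64 div p(x)`
+	 *		uint64_t n = 0x105ec76f1;	// p(x)`, 33 bits
+	 *		uint64_t q = clmul(a & 0xffffffff, m) & 0xffffffff;
+	 *
+	 *		return (a ^ clmul(q, n)) >> 32;	// low 32 bits of the
+	 *						// CRC, reflected into
+	 *						// the high word
+	 *	}
+	 */
+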
+ /* Get it into r3 */
+ MFVRD(R3, v0)
+
+.Lout:
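+	/* restore the non-volatile vector and general purpose registers */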
+ subi r6,r1,56+10*16
+ subi r7,r1,56+2*16
+
+ lvx v20,0,r6
+ lvx v21,off16,r6
+ lvx v22,off32,r6
+ lvx v23,off48,r6
+ lvx v24,off64,r6
+ lvx v25,off80,r6
+ lvx v26,off96,r6
+ lvx v27,off112,r6
+ lvx v28,0,r7
+ lvx v29,off16,r7
+
+ ld r31,-8(r1)
+ ld r30,-16(r1)
+ ld r29,-24(r1)
+ ld r28,-32(r1)
+ ld r27,-40(r1)
+ ld r26,-48(r1)
+ ld r25,-56(r1)
+
+ blr
+
+.Lfirst_warm_up_done:
+ lvx const1,0,r3
+ addi r3,r3,16
+
+ VPMSUMD(v8,v16,const1)
+ VPMSUMD(v9,v17,const1)
+ VPMSUMD(v10,v18,const1)
+ VPMSUMD(v11,v19,const1)
+ VPMSUMD(v12,v20,const1)
+ VPMSUMD(v13,v21,const1)
+ VPMSUMD(v14,v22,const1)
+ VPMSUMD(v15,v23,const1)
+
+ b .Lsecond_cool_down
+
+.Lshort:
+ cmpdi r5,0
+ beq .Lzero
+
+ LOAD_REG_ADDR(r3, .short_constants)
+
+ /* Calculate where in the constant table we need to start */
+ subfic r6,r5,256
+ add r3,r3,r6
+
+ /* How many 16 byte chunks? */
+ srdi r7,r5,4
+ mtctr r7
+
+ vxor v19,v19,v19
+ vxor v20,v20,v20
+
+ lvx v0,0,r4
+ lvx v16,0,r3
+ VPERM(v0,v0,v16,byteswap)
+ vxor v0,v0,v8 /* xor in initial value */
+ VPMSUMW(v0,v0,v16)
+ bdz .Lv0
+
+ lvx v1,off16,r4
+ lvx v17,off16,r3
+ VPERM(v1,v1,v17,byteswap)
+ VPMSUMW(v1,v1,v17)
+ bdz .Lv1
+
+ lvx v2,off32,r4
+ lvx v16,off32,r3
+ VPERM(v2,v2,v16,byteswap)
+ VPMSUMW(v2,v2,v16)
+ bdz .Lv2
+
+ lvx v3,off48,r4
+ lvx v17,off48,r3
+ VPERM(v3,v3,v17,byteswap)
+ VPMSUMW(v3,v3,v17)
+ bdz .Lv3
+
+ lvx v4,off64,r4
+ lvx v16,off64,r3
+ VPERM(v4,v4,v16,byteswap)
+ VPMSUMW(v4,v4,v16)
+ bdz .Lv4
+
+ lvx v5,off80,r4
+ lvx v17,off80,r3
+ VPERM(v5,v5,v17,byteswap)
+ VPMSUMW(v5,v5,v17)
+ bdz .Lv5
+
+ lvx v6,off96,r4
+ lvx v16,off96,r3
+ VPERM(v6,v6,v16,byteswap)
+ VPMSUMW(v6,v6,v16)
+ bdz .Lv6
+
+ lvx v7,off112,r4
+ lvx v17,off112,r3
+ VPERM(v7,v7,v17,byteswap)
+ VPMSUMW(v7,v7,v17)
+ bdz .Lv7
+
+ addi r3,r3,128
+ addi r4,r4,128
+
+ lvx v8,0,r4
+ lvx v16,0,r3
+ VPERM(v8,v8,v16,byteswap)
+ VPMSUMW(v8,v8,v16)
+ bdz .Lv8
+
+ lvx v9,off16,r4
+ lvx v17,off16,r3
+ VPERM(v9,v9,v17,byteswap)
+ VPMSUMW(v9,v9,v17)
+ bdz .Lv9
+
+ lvx v10,off32,r4
+ lvx v16,off32,r3
+ VPERM(v10,v10,v16,byteswap)
+ VPMSUMW(v10,v10,v16)
+ bdz .Lv10
+
+ lvx v11,off48,r4
+ lvx v17,off48,r3
+ VPERM(v11,v11,v17,byteswap)
+ VPMSUMW(v11,v11,v17)
+ bdz .Lv11
+
+ lvx v12,off64,r4
+ lvx v16,off64,r3
+ VPERM(v12,v12,v16,byteswap)
+ VPMSUMW(v12,v12,v16)
+ bdz .Lv12
+
+ lvx v13,off80,r4
+ lvx v17,off80,r3
+ VPERM(v13,v13,v17,byteswap)
+ VPMSUMW(v13,v13,v17)
+ bdz .Lv13
+
+ lvx v14,off96,r4
+ lvx v16,off96,r3
+ VPERM(v14,v14,v16,byteswap)
+ VPMSUMW(v14,v14,v16)
+ bdz .Lv14
+
+ lvx v15,off112,r4
+ lvx v17,off112,r3
+ VPERM(v15,v15,v17,byteswap)
+ VPMSUMW(v15,v15,v17)
+
+.Lv15: vxor v19,v19,v15
+.Lv14: vxor v20,v20,v14
+.Lv13: vxor v19,v19,v13
+.Lv12: vxor v20,v20,v12
+.Lv11: vxor v19,v19,v11
+.Lv10: vxor v20,v20,v10
+.Lv9: vxor v19,v19,v9
+.Lv8: vxor v20,v20,v8
+.Lv7: vxor v19,v19,v7
+.Lv6: vxor v20,v20,v6
+.Lv5: vxor v19,v19,v5
+.Lv4: vxor v20,v20,v4
+.Lv3: vxor v19,v19,v3
+.Lv2: vxor v20,v20,v2
+.Lv1: vxor v19,v19,v1
+.Lv0: vxor v20,v20,v0
+
+ vxor v0,v19,v20
+
+ b .Lbarrett_reduction
+
+.Lzero:
+ mr r3,r10
+ b .Lout
+
+FUNC_END(CRC_FUNCTION_NAME)
diff --git a/lib/crc/powerpc/crc32.h b/lib/crc/powerpc/crc32.h
new file mode 100644
index 000000000000..811cc2e6ed24
--- /dev/null
+++ b/lib/crc/powerpc/crc32.h
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <asm/switch_to.h>
+#include <crypto/internal/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/preempt.h>
+#include <linux/uaccess.h>
+
+#define VMX_ALIGN 16
+#define VMX_ALIGN_MASK (VMX_ALIGN-1)
+
+#define VECTOR_BREAKPOINT 512
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto);
+
+#define crc32_le_arch crc32_le_base /* not implemented on this arch */
+#define crc32_be_arch crc32_be_base /* not implemented on this arch */
+
+u32 __crc32c_vpmsum(u32 crc, const u8 *p, size_t len);
+
+static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
+{
+ unsigned int prealign;
+ unsigned int tail;
+
+ if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) ||
+ !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable())
+ return crc32c_base(crc, p, len);
+
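+	/* Align p to a 16-byte boundary using the generic implementation. */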
+ if ((unsigned long)p & VMX_ALIGN_MASK) {
+ prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
+ crc = crc32c_base(crc, p, prealign);
+ len -= prealign;
+ p += prealign;
+ }
+
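+	/* Process the aligned middle in full 16-byte blocks under AltiVec. */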
+ if (len & ~VMX_ALIGN_MASK) {
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_altivec();
+ crc = __crc32c_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
+ disable_kernel_altivec();
+ pagefault_enable();
+ preempt_enable();
+ }
+
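+	/* Finish any sub-16-byte tail with the generic implementation. */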
+ tail = len & VMX_ALIGN_MASK;
+ if (tail) {
+ p += len & ~VMX_ALIGN_MASK;
+ crc = crc32c_base(crc, p, tail);
+ }
+
+ return crc;
+}
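+
+/*
+ * For example, with p misaligned by 3 and len = 1000: the 13 prealign
+ * bytes go through crc32c_base(), the next 976 bytes (987 & ~15) through
+ * __crc32c_vpmsum(), and the final 11 bytes through crc32c_base() again.
+ */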
+
+#define crc32_mod_init_arch crc32_mod_init_arch
+static inline void crc32_mod_init_arch(void)
+{
+ if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+ (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO))
+ static_branch_enable(&have_vec_crypto);
+}
+
+static inline u32 crc32_optimizations_arch(void)
+{
+ if (static_key_enabled(&have_vec_crypto))
+ return CRC32C_OPTIMIZATION;
+ return 0;
+}
diff --git a/lib/crc/powerpc/crc32c-vpmsum_asm.S b/lib/crc/powerpc/crc32c-vpmsum_asm.S
new file mode 100644
index 000000000000..1b35c55cce0a
--- /dev/null
+++ b/lib/crc/powerpc/crc32c-vpmsum_asm.S
@@ -0,0 +1,842 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Calculate a crc32c with vpmsum acceleration
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ */
+ .section .rodata
+.balign 16
+
+.byteswap_constant:
+ /* byte reverse permute constant */
+ .octa 0x0F0E0D0C0B0A09080706050403020100
+
+.constants:
+
+	/* Reduce 262144 bits to 1024 bits */
+ /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
+ .octa 0x00000000b6ca9e20000000009c37c408
+
+ /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
+ .octa 0x00000000350249a800000001b51df26c
+
+ /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
+ .octa 0x00000001862dac54000000000724b9d0
+
+ /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
+ .octa 0x00000001d87fb48c00000001c00532fe
+
+ /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
+ .octa 0x00000001f39b699e00000000f05a9362
+
+ /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
+ .octa 0x0000000101da11b400000001e1007970
+
+ /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
+ .octa 0x00000001cab571e000000000a57366ee
+
+ /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
+ .octa 0x00000000c7020cfe0000000192011284
+
+ /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
+ .octa 0x00000000cdaed1ae0000000162716d9a
+
+ /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
+ .octa 0x00000001e804effc00000000cd97ecde
+
+ /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
+ .octa 0x0000000077c3ea3a0000000058812bc0
+
+ /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
+ .octa 0x0000000068df31b40000000088b8c12e
+
+ /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
+ .octa 0x00000000b059b6c200000001230b234c
+
+ /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
+ .octa 0x0000000145fb8ed800000001120b416e
+
+ /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
+ .octa 0x00000000cbc0916800000001974aecb0
+
+ /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
+ .octa 0x000000005ceeedc2000000008ee3f226
+
+ /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
+ .octa 0x0000000047d74e8600000001089aba9a
+
+ /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
+ .octa 0x00000001407e9e220000000065113872
+
+ /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
+ .octa 0x00000001da967bda000000005c07ec10
+
+ /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
+ .octa 0x000000006c8983680000000187590924
+
+ /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
+ .octa 0x00000000f2d14c9800000000e35da7c6
+
+ /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
+ .octa 0x00000001993c6ad4000000000415855a
+
+ /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
+ .octa 0x000000014683d1ac0000000073617758
+
+ /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
+ .octa 0x00000001a7c93e6c0000000176021d28
+
+ /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
+ .octa 0x000000010211e90a00000001c358fd0a
+
+ /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
+ .octa 0x000000001119403e00000001ff7a2c18
+
+ /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
+ .octa 0x000000001c3261aa00000000f2d9f7e4
+
+ /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
+ .octa 0x000000014e37a634000000016cf1f9c8
+
+ /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
+ .octa 0x0000000073786c0c000000010af9279a
+
+ /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
+ .octa 0x000000011dc037f80000000004f101e8
+
+ /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
+ .octa 0x0000000031433dfc0000000070bcf184
+
+ /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
+ .octa 0x000000009cde8348000000000a8de642
+
+ /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
+ .octa 0x0000000038d3c2a60000000062ea130c
+
+ /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
+ .octa 0x000000011b25f26000000001eb31cbb2
+
+ /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
+ .octa 0x000000001629e6f00000000170783448
+
+ /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
+ .octa 0x0000000160838b4c00000001a684b4c6
+
+ /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
+ .octa 0x000000007a44011c00000000253ca5b4
+
+ /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
+ .octa 0x00000000226f417a0000000057b4b1e2
+
+ /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
+ .octa 0x0000000045eb2eb400000000b6bd084c
+
+ /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
+ .octa 0x000000014459d70c0000000123c2d592
+
+ /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
+ .octa 0x00000001d406ed8200000000159dafce
+
+ /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
+ .octa 0x0000000160c8e1a80000000127e1a64e
+
+ /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
+ .octa 0x0000000027ba80980000000056860754
+
+ /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
+ .octa 0x000000006d92d01800000001e661aae8
+
+ /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
+ .octa 0x000000012ed7e3f200000000f82c6166
+
+ /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
+ .octa 0x000000002dc8778800000000c4f9c7ae
+
+ /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
+ .octa 0x0000000018240bb80000000074203d20
+
+ /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
+ .octa 0x000000001ad381580000000198173052
+
+ /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
+ .octa 0x00000001396b78f200000001ce8aba54
+
+ /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
+ .octa 0x000000011a68133400000001850d5d94
+
+ /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
+ .octa 0x000000012104732e00000001d609239c
+
+ /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
+ .octa 0x00000000a140d90c000000001595f048
+
+ /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
+ .octa 0x00000001b7215eda0000000042ccee08
+
+ /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
+ .octa 0x00000001aaf1df3c000000010a389d74
+
+ /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
+ .octa 0x0000000029d15b8a000000012a840da6
+
+ /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
+ .octa 0x00000000f1a96922000000001d181c0c
+
+ /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
+ .octa 0x00000001ac80d03c0000000068b7d1f6
+
+ /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
+ .octa 0x000000000f11d56a000000005b0f14fc
+
+ /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
+ .octa 0x00000001f1c022a20000000179e9e730
+
+ /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
+ .octa 0x0000000173d00ae200000001ce1368d6
+
+ /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
+ .octa 0x00000001d4ffe4ac0000000112c3a84c
+
+ /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
+ .octa 0x000000016edc5ae400000000de940fee
+
+ /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
+ .octa 0x00000001f1a0214000000000fe896b7e
+
+ /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
+ .octa 0x00000000ca0b28a000000001f797431c
+
+ /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
+ .octa 0x00000001928e30a20000000053e989ba
+
+ /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
+ .octa 0x0000000097b1b002000000003920cd16
+
+ /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
+ .octa 0x00000000b15bf90600000001e6f579b8
+
+ /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
+ .octa 0x00000000411c5d52000000007493cb0a
+
+ /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
+ .octa 0x00000001c36f330000000001bdd376d8
+
+ /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
+ .octa 0x00000001119227e0000000016badfee6
+
+ /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
+ .octa 0x00000000114d47020000000071de5c58
+
+ /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
+ .octa 0x00000000458b5b9800000000453f317c
+
+ /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
+ .octa 0x000000012e31fb8e0000000121675cce
+
+ /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
+ .octa 0x000000005cf619d800000001f409ee92
+
+ /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
+ .octa 0x0000000063f4d8b200000000f36b9c88
+
+ /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
+ .octa 0x000000004138dc8a0000000036b398f4
+
+ /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
+ .octa 0x00000001d29ee8e000000001748f9adc
+
+ /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
+ .octa 0x000000006a08ace800000001be94ec00
+
+ /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
+ .octa 0x0000000127d4201000000000b74370d6
+
+ /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
+ .octa 0x0000000019d76b6200000001174d0b98
+
+ /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
+ .octa 0x00000001b1471f6e00000000befc06a4
+
+ /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
+ .octa 0x00000001f64c19cc00000001ae125288
+
+ /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
+ .octa 0x00000000003c0ea00000000095c19b34
+
+ /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
+ .octa 0x000000014d73abf600000001a78496f2
+
+ /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
+ .octa 0x00000001620eb84400000001ac5390a0
+
+ /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
+ .octa 0x0000000147655048000000002a80ed6e
+
+ /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
+ .octa 0x0000000067b5077e00000001fa9b0128
+
+ /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
+ .octa 0x0000000010ffe20600000001ea94929e
+
+ /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
+ .octa 0x000000000fee8f1e0000000125f4305c
+
+ /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
+ .octa 0x00000001da26fbae00000001471e2002
+
+ /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
+ .octa 0x00000001b3a8bd880000000132d2253a
+
+ /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
+ .octa 0x00000000e8f3898e00000000f26b3592
+
+ /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
+ .octa 0x00000000b0d0d28c00000000bc8b67b0
+
+ /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
+ .octa 0x0000000030f2a798000000013a826ef2
+
+ /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
+ .octa 0x000000000fba10020000000081482c84
+
+ /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
+ .octa 0x00000000bdb9bd7200000000e77307c2
+
+ /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
+ .octa 0x0000000075d3bf5a00000000d4a07ec8
+
+ /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
+ .octa 0x00000000ef1f98a00000000017102100
+
+ /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
+ .octa 0x00000000689c760200000000db406486
+
+ /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
+ .octa 0x000000016d5fa5fe0000000192db7f88
+
+ /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
+ .octa 0x00000001d0d2b9ca000000018bf67b1e
+
+ /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
+ .octa 0x0000000041e7b470000000007c09163e
+
+ /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
+ .octa 0x00000001cbb6495e000000000adac060
+
+ /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
+ .octa 0x000000010052a0b000000000bd8316ae
+
+ /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
+ .octa 0x00000001d8effb5c000000019f09ab54
+
+ /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
+ .octa 0x00000001d969853c0000000125155542
+
+ /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
+ .octa 0x00000000523ccce2000000018fdb5882
+
+ /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
+ .octa 0x000000001e2436bc00000000e794b3f4
+
+ /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
+ .octa 0x00000000ddd1c3a2000000016f9bb022
+
+ /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
+ .octa 0x0000000019fcfe3800000000290c9978
+
+ /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
+ .octa 0x00000001ce95db640000000083c0f350
+
+ /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
+ .octa 0x00000000af5828060000000173ea6628
+
+ /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
+ .octa 0x00000001006388f600000001c8b4e00a
+
+ /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
+ .octa 0x0000000179eca00a00000000de95d6aa
+
+ /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
+ .octa 0x0000000122410a6a000000010b7f7248
+
+ /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
+ .octa 0x000000004288e87c00000001326e3a06
+
+ /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
+ .octa 0x000000016c5490da00000000bb62c2e6
+
+ /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
+ .octa 0x00000000d1c71f6e0000000156a4b2c2
+
+ /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
+ .octa 0x00000001b4ce08a6000000011dfe763a
+
+ /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
+ .octa 0x00000001466ba60c000000007bcca8e2
+
+ /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
+ .octa 0x00000001f6c488a40000000186118faa
+
+ /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
+ .octa 0x000000013bfb06820000000111a65a88
+
+ /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
+ .octa 0x00000000690e9e54000000003565e1c4
+
+ /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
+ .octa 0x00000000281346b6000000012ed02a82
+
+ /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
+ .octa 0x000000015646402400000000c486ecfc
+
+ /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
+ .octa 0x000000016063a8dc0000000001b951b2
+
+ /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
+ .octa 0x0000000116a663620000000048143916
+
+ /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
+ .octa 0x000000017e8aa4d200000001dc2ae124
+
+ /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
+ .octa 0x00000001728eb10c00000001416c58d6
+
+ /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
+ .octa 0x00000001b08fd7fa00000000a479744a
+
+ /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
+ .octa 0x00000001092a16e80000000096ca3a26
+
+ /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
+ .octa 0x00000000a505637c00000000ff223d4e
+
+ /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
+ .octa 0x00000000d94869b2000000010e84da42
+
+ /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
+ .octa 0x00000001c8b203ae00000001b61ba3d0
+
+ /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
+ .octa 0x000000005704aea000000000680f2de8
+
+ /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
+ .octa 0x000000012e295fa2000000008772a9a8
+
+ /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
+ .octa 0x000000011d0908bc0000000155f295bc
+
+ /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
+ .octa 0x0000000193ed97ea00000000595f9282
+
+ /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
+ .octa 0x000000013a0f1c520000000164b1c25a
+
+ /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
+ .octa 0x000000010c2c40c000000000fbd67c50
+
+ /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
+ .octa 0x00000000ff6fac3e0000000096076268
+
+ /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
+ .octa 0x000000017b3609c000000001d288e4cc
+
+ /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
+ .octa 0x0000000088c8c92200000001eaac1bdc
+
+ /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
+ .octa 0x00000001751baae600000001f1ea39e2
+
+ /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
+ .octa 0x000000010795297200000001eb6506fc
+
+ /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
+ .octa 0x0000000162b00abe000000010f806ffe
+
+ /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
+ .octa 0x000000000d7b404c000000010408481e
+
+ /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
+ .octa 0x00000000763b13d40000000188260534
+
+ /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
+ .octa 0x00000000f6dc22d80000000058fc73e0
+
+ /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
+ .octa 0x000000007daae06000000000391c59b8
+
+ /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
+ .octa 0x000000013359ab7c000000018b638400
+
+ /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
+ .octa 0x000000008add438a000000011738f5c4
+
+ /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
+ .octa 0x00000001edbefdea000000008cf7c6da
+
+ /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
+ .octa 0x000000004104e0f800000001ef97fb16
+
+ /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
+ .octa 0x00000000b48a82220000000102130e20
+
+ /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
+ .octa 0x00000001bcb4684400000000db968898
+
+ /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
+ .octa 0x000000013293ce0a00000000b5047b5e
+
+ /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
+ .octa 0x00000001710d0844000000010b90fdb2
+
+ /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
+ .octa 0x0000000117907f6e000000004834a32e
+
+ /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
+ .octa 0x0000000087ddf93e0000000059c8f2b0
+
+ /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
+ .octa 0x000000005970e9b00000000122cec508
+
+ /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
+ .octa 0x0000000185b2b7d0000000000a330cda
+
+ /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
+ .octa 0x00000001dcee0efc000000014a47148c
+
+ /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
+ .octa 0x0000000030da27220000000042c61cb8
+
+ /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
+ .octa 0x000000012f925a180000000012fe6960
+
+ /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
+ .octa 0x00000000dd2e357c00000000dbda2c20
+
+ /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
+ .octa 0x00000000071c80de000000011122410c
+
+ /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
+ .octa 0x000000011513140a00000000977b2070
+
+ /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
+ .octa 0x00000001df876e8e000000014050438e
+
+ /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
+ .octa 0x000000015f81d6ce0000000147c840e8
+
+ /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
+ .octa 0x000000019dd94dbe00000001cc7c88ce
+
+ /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
+ .octa 0x00000001373d206e00000001476b35a4
+
+ /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
+ .octa 0x00000000668ccade000000013d52d508
+
+ /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
+ .octa 0x00000001b192d268000000008e4be32e
+
+ /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
+ .octa 0x00000000e30f3a7800000000024120fe
+
+ /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
+ .octa 0x000000010ef1f7bc00000000ddecddb4
+
+ /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
+ .octa 0x00000001f5ac738000000000d4d403bc
+
+ /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
+ .octa 0x000000011822ea7000000001734b89aa
+
+ /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
+ .octa 0x00000000c3a33848000000010e7a58d6
+
+ /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
+ .octa 0x00000001bd151c2400000001f9f04e9c
+
+ /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
+ .octa 0x0000000056002d7600000000b692225e
+
+ /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
+ .octa 0x000000014657c4f4000000019b8d3f3e
+
+ /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
+ .octa 0x0000000113742d7c00000001a874f11e
+
+ /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
+ .octa 0x000000019c5920ba000000010d5a4254
+
+ /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
+ .octa 0x000000005216d2d600000000bbb2f5d6
+
+ /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
+ .octa 0x0000000136f5ad8a0000000179cc0e36
+
+ /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
+ .octa 0x000000018b07beb600000001dca1da4a
+
+ /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
+ .octa 0x00000000db1e93b000000000feb1a192
+
+ /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
+ .octa 0x000000000b96fa3a00000000d1eeedd6
+
+ /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
+ .octa 0x00000001d9968af0000000008fad9bb4
+
+ /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
+ .octa 0x000000000e4a77a200000001884938e4
+
+ /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
+ .octa 0x00000000508c2ac800000001bc2e9bc0
+
+ /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
+ .octa 0x0000000021572a8000000001f9658a68
+
+ /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
+ .octa 0x00000001b859daf2000000001b9224fc
+
+ /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
+ .octa 0x000000016f7884740000000055b2fb84
+
+ /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
+ .octa 0x00000001b438810e000000018b090348
+
+ /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
+ .octa 0x0000000095ddc6f2000000011ccbd5ea
+
+ /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
+ .octa 0x00000001d977c20c0000000007ae47f8
+
+ /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
+ .octa 0x00000000ebedb99a0000000172acbec0
+
+ /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
+ .octa 0x00000001df9e9e9200000001c6e3ff20
+
+ /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
+ .octa 0x00000001a4a3f95200000000e1b38744
+
+ /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
+ .octa 0x00000000e2f5122000000000791585b2
+
+ /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
+ .octa 0x000000004aa01f3e00000000ac53b894
+
+ /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
+ .octa 0x00000000b3e90a5800000001ed5f2cf4
+
+ /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
+ .octa 0x000000000c9ca2aa00000001df48b2e0
+
+ /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
+ .octa 0x000000015168231600000000049c1c62
+
+ /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
+ .octa 0x0000000036fce78c000000017c460c12
+
+ /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
+ .octa 0x000000009037dc10000000015be4da7e
+
+ /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
+ .octa 0x00000000d3298582000000010f38f668
+
+ /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
+ .octa 0x00000001b42e8ad60000000039f40a00
+
+ /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
+ .octa 0x00000000142a983800000000bd4c10c4
+
+ /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
+ .octa 0x0000000109c7f1900000000042db1d98
+
+ /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
+ .octa 0x0000000056ff931000000001c905bae6
+
+ /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
+ .octa 0x00000001594513aa00000000069d40ea
+
+ /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
+ .octa 0x00000001e3b5b1e8000000008e4fbad0
+
+ /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
+ .octa 0x000000011dd5fc080000000047bedd46
+
+ /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
+ .octa 0x00000001675f0cc20000000026396bf8
+
+ /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
+ .octa 0x00000000d1c8dd4400000000379beb92
+
+ /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
+ .octa 0x0000000115ebd3d8000000000abae54a
+
+ /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
+ .octa 0x00000001ecbd0dac0000000007e6a128
+
+ /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
+ .octa 0x00000000cdf67af2000000000ade29d2
+
+ /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
+ .octa 0x000000004c01ff4c00000000f974c45c
+
+ /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
+ .octa 0x00000000f2d8657e00000000e77ac60a
+
+ /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
+ .octa 0x000000006bae74c40000000145895816
+
+ /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
+ .octa 0x0000000152af8aa00000000038e362be
+
+ /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
+ .octa 0x0000000004663802000000007f991a64
+
+ /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
+ .octa 0x00000001ab2f5afc00000000fa366d3a
+
+ /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
+ .octa 0x0000000074a4ebd400000001a2bb34f0
+
+ /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
+ .octa 0x00000001d7ab3a4c0000000028a9981e
+
+ /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
+ .octa 0x00000001a8da60c600000001dbc672be
+
+ /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
+ .octa 0x000000013cf6382000000000b04d77f6
+
+ /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
+ .octa 0x00000000bec12e1e0000000124400d96
+
+ /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
+ .octa 0x00000001c6368010000000014ca4b414
+
+ /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
+ .octa 0x00000001e6e78758000000012fe2c938
+
+ /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
+ .octa 0x000000008d7f2b3c00000001faed01e6
+
+ /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
+ .octa 0x000000016b4a156e000000007e80ecfe
+
+ /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
+ .octa 0x00000001c63cfeb60000000098daee94
+
+ /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
+ .octa 0x000000015f902670000000010a04edea
+
+ /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
+ .octa 0x00000001cd5de11e00000001c00b4524
+
+ /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
+ .octa 0x000000001acaec540000000170296550
+
+ /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
+ .octa 0x000000002bd0ca780000000181afaa48
+
+ /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
+ .octa 0x0000000032d63d5c0000000185a31ffa
+
+ /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
+ .octa 0x000000001c6d4e4c000000002469f608
+
+ /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
+ .octa 0x0000000106a60b92000000006980102a
+
+ /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
+ .octa 0x00000000d3855e120000000111ea9ca8
+
+ /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
+ .octa 0x00000000e312563600000001bd1d29ce
+
+ /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
+ .octa 0x000000009e8f7ea400000001b34b9580
+
+ /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
+ .octa 0x00000001c82e562c000000003076054e
+
+ /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
+ .octa 0x00000000ca9f09ce000000012a608ea4
+
+ /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
+ .octa 0x00000000c63764e600000000784d05fe
+
+ /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
+ .octa 0x0000000168d2e49e000000016ef0d82a
+
+ /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
+ .octa 0x00000000e986c1480000000075bda454
+
+ /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
+ .octa 0x00000000cfb65894000000003dc0a1c4
+
+ /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
+ .octa 0x0000000111cadee400000000e9a5d8be
+
+ /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
+ .octa 0x0000000171fb63ce00000001609bc4b4
+
+.short_constants:
+
+ /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
+ /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */
+ .octa 0x7fec2963e5bf80485cf015c388e56f72
+
+ /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */
+ .octa 0x38e888d4844752a9963a18920246e2e6
+
+ /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */
+ .octa 0x42316c00730206ad419a441956993a31
+
+ /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */
+ .octa 0x543d5c543e65ddf9924752ba2b830011
+
+ /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */
+ .octa 0x78e87aaf56767c9255bd7f9518e4a304
+
+ /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */
+ .octa 0x8f68fcec1903da7f6d76739fe0553f1e
+
+ /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */
+ .octa 0x3f4840246791d588c133722b1fe0b5c3
+
+ /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */
+ .octa 0x34c96751b04de25a64b67ee0e55ef1f3
+
+ /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */
+ .octa 0x156c8e180b4a395b069db049b8fdb1e7
+
+ /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */
+ .octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e
+
+ /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */
+ .octa 0x041d37768cd75659817cdc5119b29a35
+
+ /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */
+ .octa 0x3a0777818cfaa9651ce9d94b36c41f1c
+
+ /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */
+ .octa 0x0e148e8252377a554f256efcb82be955
+
+ /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */
+ .octa 0x9c25531d19e65ddeec1631edb2dea967
+
+ /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */
+ .octa 0x790606ff9957c0a65d27e147510ac59a
+
+ /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */
+ .octa 0x82f63b786ea2d55ca66805eb18b8ea18
+
+
+.barrett_constants:
+	/* 33 bit reflected Barrett constant m = (4^32)/n */
+ .octa 0x000000000000000000000000dea713f1 /* x^64 div p(x)` */
+ /* 33 bit reflected Barrett constant n */
+ .octa 0x00000000000000000000000105ec76f1
+
+#define CRC_FUNCTION_NAME __crc32c_vpmsum
+#define REFLECT
+#include "crc-vpmsum-template.S"
diff --git a/lib/crc/powerpc/crct10dif-vpmsum_asm.S b/lib/crc/powerpc/crct10dif-vpmsum_asm.S
new file mode 100644
index 000000000000..47a6266d89a8
--- /dev/null
+++ b/lib/crc/powerpc/crct10dif-vpmsum_asm.S
@@ -0,0 +1,845 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Calculate a CRC T10DIF with vpmsum acceleration
+ *
+ * Constants generated by crc32-vpmsum, available at
+ * https://github.com/antonblanchard/crc32-vpmsum
+ *
+ * crc32-vpmsum is
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ */
+ .section .rodata
+.balign 16
+
+.byteswap_constant:
+ /* byte reverse permute constant */
+ .octa 0x0F0E0D0C0B0A09080706050403020100
+
+.constants:
+
+	/* Reduce 262144 bits to 1024 bits */
+ /* x^261184 mod p(x), x^261120 mod p(x) */
+ .octa 0x0000000056d300000000000052550000
+
+ /* x^260160 mod p(x), x^260096 mod p(x) */
+ .octa 0x00000000ee67000000000000a1e40000
+
+ /* x^259136 mod p(x), x^259072 mod p(x) */
+ .octa 0x0000000060830000000000004ad10000
+
+ /* x^258112 mod p(x), x^258048 mod p(x) */
+ .octa 0x000000008cfe0000000000009ab40000
+
+ /* x^257088 mod p(x), x^257024 mod p(x) */
+ .octa 0x000000003e93000000000000fdb50000
+
+ /* x^256064 mod p(x), x^256000 mod p(x) */
+ .octa 0x000000003c2000000000000045480000
+
+ /* x^255040 mod p(x), x^254976 mod p(x) */
+ .octa 0x00000000b1fc0000000000008d690000
+
+ /* x^254016 mod p(x), x^253952 mod p(x) */
+ .octa 0x00000000f82b00000000000024ad0000
+
+ /* x^252992 mod p(x), x^252928 mod p(x) */
+ .octa 0x0000000044420000000000009f1a0000
+
+ /* x^251968 mod p(x), x^251904 mod p(x) */
+ .octa 0x00000000e88c00000000000066ec0000
+
+ /* x^250944 mod p(x), x^250880 mod p(x) */
+ .octa 0x00000000385c000000000000c87d0000
+
+ /* x^249920 mod p(x), x^249856 mod p(x) */
+ .octa 0x000000003227000000000000c8ff0000
+
+ /* x^248896 mod p(x), x^248832 mod p(x) */
+ .octa 0x00000000a9a900000000000033440000
+
+ /* x^247872 mod p(x), x^247808 mod p(x) */
+ .octa 0x00000000abaa00000000000066eb0000
+
+ /* x^246848 mod p(x), x^246784 mod p(x) */
+ .octa 0x000000001ac3000000000000c4ef0000
+
+ /* x^245824 mod p(x), x^245760 mod p(x) */
+ .octa 0x0000000063f000000000000056f30000
+
+ /* x^244800 mod p(x), x^244736 mod p(x) */
+ .octa 0x0000000032cc00000000000002050000
+
+ /* x^243776 mod p(x), x^243712 mod p(x) */
+ .octa 0x00000000f8b5000000000000568e0000
+
+ /* x^242752 mod p(x), x^242688 mod p(x) */
+ .octa 0x000000008db100000000000064290000
+
+ /* x^241728 mod p(x), x^241664 mod p(x) */
+ .octa 0x0000000059ca0000000000006b660000
+
+ /* x^240704 mod p(x), x^240640 mod p(x) */
+ .octa 0x000000005f5c00000000000018f80000
+
+ /* x^239680 mod p(x), x^239616 mod p(x) */
+ .octa 0x0000000061af000000000000b6090000
+
+ /* x^238656 mod p(x), x^238592 mod p(x) */
+ .octa 0x00000000e29e000000000000099a0000
+
+ /* x^237632 mod p(x), x^237568 mod p(x) */
+ .octa 0x000000000975000000000000a8360000
+
+ /* x^236608 mod p(x), x^236544 mod p(x) */
+ .octa 0x0000000043900000000000004f570000
+
+ /* x^235584 mod p(x), x^235520 mod p(x) */
+ .octa 0x00000000f9cd000000000000134c0000
+
+ /* x^234560 mod p(x), x^234496 mod p(x) */
+ .octa 0x000000007c29000000000000ec380000
+
+ /* x^233536 mod p(x), x^233472 mod p(x) */
+ .octa 0x000000004c6a000000000000b0d10000
+
+ /* x^232512 mod p(x), x^232448 mod p(x) */
+ .octa 0x00000000e7290000000000007d3e0000
+
+ /* x^231488 mod p(x), x^231424 mod p(x) */
+ .octa 0x00000000f1ab000000000000f0b20000
+
+ /* x^230464 mod p(x), x^230400 mod p(x) */
+ .octa 0x0000000039db0000000000009c270000
+
+ /* x^229440 mod p(x), x^229376 mod p(x) */
+ .octa 0x000000005e2800000000000092890000
+
+ /* x^228416 mod p(x), x^228352 mod p(x) */
+ .octa 0x00000000d44e000000000000d5ee0000
+
+ /* x^227392 mod p(x), x^227328 mod p(x) */
+ .octa 0x00000000cd0a00000000000041f50000
+
+ /* x^226368 mod p(x), x^226304 mod p(x) */
+ .octa 0x00000000c5b400000000000010520000
+
+ /* x^225344 mod p(x), x^225280 mod p(x) */
+ .octa 0x00000000fd2100000000000042170000
+
+ /* x^224320 mod p(x), x^224256 mod p(x) */
+ .octa 0x000000002f2500000000000095c20000
+
+ /* x^223296 mod p(x), x^223232 mod p(x) */
+ .octa 0x000000001b0100000000000001ce0000
+
+ /* x^222272 mod p(x), x^222208 mod p(x) */
+ .octa 0x000000000d430000000000002aca0000
+
+ /* x^221248 mod p(x), x^221184 mod p(x) */
+ .octa 0x0000000030a6000000000000385e0000
+
+ /* x^220224 mod p(x), x^220160 mod p(x) */
+ .octa 0x00000000e37b0000000000006f7a0000
+
+ /* x^219200 mod p(x), x^219136 mod p(x) */
+ .octa 0x00000000873600000000000024320000
+
+ /* x^218176 mod p(x), x^218112 mod p(x) */
+ .octa 0x00000000e9fb000000000000bd9c0000
+
+ /* x^217152 mod p(x), x^217088 mod p(x) */
+ .octa 0x000000003b9500000000000054bc0000
+
+ /* x^216128 mod p(x), x^216064 mod p(x) */
+ .octa 0x00000000133e000000000000a4660000
+
+ /* x^215104 mod p(x), x^215040 mod p(x) */
+ .octa 0x00000000784500000000000079930000
+
+ /* x^214080 mod p(x), x^214016 mod p(x) */
+ .octa 0x00000000b9800000000000001bb80000
+
+ /* x^213056 mod p(x), x^212992 mod p(x) */
+ .octa 0x00000000687600000000000024400000
+
+ /* x^212032 mod p(x), x^211968 mod p(x) */
+ .octa 0x00000000aff300000000000029e10000
+
+ /* x^211008 mod p(x), x^210944 mod p(x) */
+ .octa 0x0000000024b50000000000005ded0000
+
+ /* x^209984 mod p(x), x^209920 mod p(x) */
+ .octa 0x0000000017e8000000000000b12e0000
+
+ /* x^208960 mod p(x), x^208896 mod p(x) */
+ .octa 0x00000000128400000000000026d20000
+
+ /* x^207936 mod p(x), x^207872 mod p(x) */
+ .octa 0x000000002115000000000000a32a0000
+
+ /* x^206912 mod p(x), x^206848 mod p(x) */
+ .octa 0x000000009595000000000000a1210000
+
+ /* x^205888 mod p(x), x^205824 mod p(x) */
+ .octa 0x00000000281e000000000000ee8b0000
+
+ /* x^204864 mod p(x), x^204800 mod p(x) */
+ .octa 0x0000000006010000000000003d0d0000
+
+ /* x^203840 mod p(x), x^203776 mod p(x) */
+ .octa 0x00000000e2b600000000000034e90000
+
+ /* x^202816 mod p(x), x^202752 mod p(x) */
+ .octa 0x000000001bd40000000000004cdb0000
+
+ /* x^201792 mod p(x), x^201728 mod p(x) */
+ .octa 0x00000000df2800000000000030e90000
+
+ /* x^200768 mod p(x), x^200704 mod p(x) */
+ .octa 0x0000000049c200000000000042590000
+
+ /* x^199744 mod p(x), x^199680 mod p(x) */
+ .octa 0x000000009b97000000000000df950000
+
+ /* x^198720 mod p(x), x^198656 mod p(x) */
+ .octa 0x000000006184000000000000da7b0000
+
+ /* x^197696 mod p(x), x^197632 mod p(x) */
+ .octa 0x00000000461700000000000012510000
+
+ /* x^196672 mod p(x), x^196608 mod p(x) */
+ .octa 0x000000009b40000000000000f37e0000
+
+ /* x^195648 mod p(x), x^195584 mod p(x) */
+ .octa 0x00000000eeb2000000000000ecf10000
+
+ /* x^194624 mod p(x), x^194560 mod p(x) */
+ .octa 0x00000000b2e800000000000050f20000
+
+ /* x^193600 mod p(x), x^193536 mod p(x) */
+ .octa 0x00000000f59a000000000000e0b30000
+
+ /* x^192576 mod p(x), x^192512 mod p(x) */
+ .octa 0x00000000467f0000000000004d5a0000
+
+ /* x^191552 mod p(x), x^191488 mod p(x) */
+ .octa 0x00000000da92000000000000bb010000
+
+ /* x^190528 mod p(x), x^190464 mod p(x) */
+ .octa 0x000000001e1000000000000022a40000
+
+ /* x^189504 mod p(x), x^189440 mod p(x) */
+ .octa 0x0000000058fe000000000000836f0000
+
+ /* x^188480 mod p(x), x^188416 mod p(x) */
+ .octa 0x00000000b9ce000000000000d78d0000
+
+ /* x^187456 mod p(x), x^187392 mod p(x) */
+ .octa 0x0000000022210000000000004f8d0000
+
+ /* x^186432 mod p(x), x^186368 mod p(x) */
+ .octa 0x00000000744600000000000033760000
+
+ /* x^185408 mod p(x), x^185344 mod p(x) */
+ .octa 0x000000001c2e000000000000a1e50000
+
+ /* x^184384 mod p(x), x^184320 mod p(x) */
+ .octa 0x00000000dcc8000000000000a1a40000
+
+ /* x^183360 mod p(x), x^183296 mod p(x) */
+ .octa 0x00000000910f00000000000019a20000
+
+ /* x^182336 mod p(x), x^182272 mod p(x) */
+ .octa 0x0000000055d5000000000000f6ae0000
+
+ /* x^181312 mod p(x), x^181248 mod p(x) */
+ .octa 0x00000000c8ba000000000000a7ac0000
+
+ /* x^180288 mod p(x), x^180224 mod p(x) */
+ .octa 0x0000000031f8000000000000eea20000
+
+ /* x^179264 mod p(x), x^179200 mod p(x) */
+ .octa 0x000000001966000000000000c4d90000
+
+ /* x^178240 mod p(x), x^178176 mod p(x) */
+ .octa 0x00000000b9810000000000002b470000
+
+ /* x^177216 mod p(x), x^177152 mod p(x) */
+ .octa 0x000000008303000000000000f7cf0000
+
+ /* x^176192 mod p(x), x^176128 mod p(x) */
+ .octa 0x000000002ce500000000000035b30000
+
+ /* x^175168 mod p(x), x^175104 mod p(x) */
+ .octa 0x000000002fae0000000000000c7c0000
+
+ /* x^174144 mod p(x), x^174080 mod p(x) */
+ .octa 0x00000000f50c0000000000009edf0000
+
+ /* x^173120 mod p(x), x^173056 mod p(x) */
+ .octa 0x00000000714f00000000000004cd0000
+
+ /* x^172096 mod p(x), x^172032 mod p(x) */
+ .octa 0x00000000c161000000000000541b0000
+
+ /* x^171072 mod p(x), x^171008 mod p(x) */
+ .octa 0x0000000021c8000000000000e2700000
+
+ /* x^170048 mod p(x), x^169984 mod p(x) */
+ .octa 0x00000000b93d00000000000009a60000
+
+ /* x^169024 mod p(x), x^168960 mod p(x) */
+ .octa 0x00000000fbcf000000000000761c0000
+
+ /* x^168000 mod p(x), x^167936 mod p(x) */
+ .octa 0x0000000026350000000000009db30000
+
+ /* x^166976 mod p(x), x^166912 mod p(x) */
+ .octa 0x00000000b64f0000000000003e9f0000
+
+ /* x^165952 mod p(x), x^165888 mod p(x) */
+ .octa 0x00000000bd0e00000000000078590000
+
+ /* x^164928 mod p(x), x^164864 mod p(x) */
+ .octa 0x00000000d9360000000000008bc80000
+
+ /* x^163904 mod p(x), x^163840 mod p(x) */
+ .octa 0x000000002f140000000000008c9f0000
+
+ /* x^162880 mod p(x), x^162816 mod p(x) */
+ .octa 0x000000006a270000000000006af70000
+
+ /* x^161856 mod p(x), x^161792 mod p(x) */
+ .octa 0x000000006685000000000000e5210000
+
+ /* x^160832 mod p(x), x^160768 mod p(x) */
+ .octa 0x0000000062da00000000000008290000
+
+ /* x^159808 mod p(x), x^159744 mod p(x) */
+ .octa 0x00000000bb4b000000000000e4d00000
+
+ /* x^158784 mod p(x), x^158720 mod p(x) */
+ .octa 0x00000000d2490000000000004ae10000
+
+ /* x^157760 mod p(x), x^157696 mod p(x) */
+ .octa 0x00000000c85b00000000000000e70000
+
+ /* x^156736 mod p(x), x^156672 mod p(x) */
+ .octa 0x00000000c37a00000000000015650000
+
+ /* x^155712 mod p(x), x^155648 mod p(x) */
+ .octa 0x0000000018530000000000001c2f0000
+
+ /* x^154688 mod p(x), x^154624 mod p(x) */
+ .octa 0x00000000b46600000000000037bd0000
+
+ /* x^153664 mod p(x), x^153600 mod p(x) */
+ .octa 0x00000000439b00000000000012190000
+
+ /* x^152640 mod p(x), x^152576 mod p(x) */
+ .octa 0x00000000b1260000000000005ece0000
+
+ /* x^151616 mod p(x), x^151552 mod p(x) */
+ .octa 0x00000000d8110000000000002a5e0000
+
+ /* x^150592 mod p(x), x^150528 mod p(x) */
+ .octa 0x00000000099f00000000000052330000
+
+ /* x^149568 mod p(x), x^149504 mod p(x) */
+ .octa 0x00000000f9f9000000000000f9120000
+
+ /* x^148544 mod p(x), x^148480 mod p(x) */
+ .octa 0x000000005cc00000000000000ddc0000
+
+ /* x^147520 mod p(x), x^147456 mod p(x) */
+ .octa 0x00000000343b00000000000012200000
+
+ /* x^146496 mod p(x), x^146432 mod p(x) */
+ .octa 0x000000009222000000000000d12b0000
+
+ /* x^145472 mod p(x), x^145408 mod p(x) */
+ .octa 0x00000000d781000000000000eb2d0000
+
+ /* x^144448 mod p(x), x^144384 mod p(x) */
+ .octa 0x000000000bf400000000000058970000
+
+ /* x^143424 mod p(x), x^143360 mod p(x) */
+ .octa 0x00000000094200000000000013690000
+
+ /* x^142400 mod p(x), x^142336 mod p(x) */
+ .octa 0x00000000d55100000000000051950000
+
+ /* x^141376 mod p(x), x^141312 mod p(x) */
+ .octa 0x000000008f11000000000000954b0000
+
+ /* x^140352 mod p(x), x^140288 mod p(x) */
+ .octa 0x00000000140f000000000000b29e0000
+
+ /* x^139328 mod p(x), x^139264 mod p(x) */
+ .octa 0x00000000c6db000000000000db5d0000
+
+ /* x^138304 mod p(x), x^138240 mod p(x) */
+ .octa 0x00000000715b000000000000dfaf0000
+
+ /* x^137280 mod p(x), x^137216 mod p(x) */
+ .octa 0x000000000dea000000000000e3b60000
+
+ /* x^136256 mod p(x), x^136192 mod p(x) */
+ .octa 0x000000006f94000000000000ddaf0000
+
+ /* x^135232 mod p(x), x^135168 mod p(x) */
+ .octa 0x0000000024e1000000000000e4f70000
+
+ /* x^134208 mod p(x), x^134144 mod p(x) */
+ .octa 0x000000008810000000000000aa110000
+
+ /* x^133184 mod p(x), x^133120 mod p(x) */
+ .octa 0x0000000030c2000000000000a8e60000
+
+ /* x^132160 mod p(x), x^132096 mod p(x) */
+ .octa 0x00000000e6d0000000000000ccf30000
+
+ /* x^131136 mod p(x), x^131072 mod p(x) */
+ .octa 0x000000004da000000000000079bf0000
+
+ /* x^130112 mod p(x), x^130048 mod p(x) */
+ .octa 0x000000007759000000000000b3a30000
+
+ /* x^129088 mod p(x), x^129024 mod p(x) */
+ .octa 0x00000000597400000000000028790000
+
+ /* x^128064 mod p(x), x^128000 mod p(x) */
+ .octa 0x000000007acd000000000000b5820000
+
+ /* x^127040 mod p(x), x^126976 mod p(x) */
+ .octa 0x00000000e6e400000000000026ad0000
+
+ /* x^126016 mod p(x), x^125952 mod p(x) */
+ .octa 0x000000006d49000000000000985b0000
+
+ /* x^124992 mod p(x), x^124928 mod p(x) */
+ .octa 0x000000000f0800000000000011520000
+
+ /* x^123968 mod p(x), x^123904 mod p(x) */
+ .octa 0x000000002c7f000000000000846c0000
+
+ /* x^122944 mod p(x), x^122880 mod p(x) */
+ .octa 0x000000005ce7000000000000ae1d0000
+
+ /* x^121920 mod p(x), x^121856 mod p(x) */
+ .octa 0x00000000d4cb000000000000e21d0000
+
+ /* x^120896 mod p(x), x^120832 mod p(x) */
+ .octa 0x000000003a2300000000000019bb0000
+
+ /* x^119872 mod p(x), x^119808 mod p(x) */
+ .octa 0x000000000e1700000000000095290000
+
+ /* x^118848 mod p(x), x^118784 mod p(x) */
+ .octa 0x000000006e6400000000000050d20000
+
+ /* x^117824 mod p(x), x^117760 mod p(x) */
+ .octa 0x000000008d5c0000000000000cd10000
+
+ /* x^116800 mod p(x), x^116736 mod p(x) */
+ .octa 0x00000000ef310000000000007b570000
+
+ /* x^115776 mod p(x), x^115712 mod p(x) */
+ .octa 0x00000000645d00000000000053d60000
+
+ /* x^114752 mod p(x), x^114688 mod p(x) */
+ .octa 0x0000000018fc00000000000077510000
+
+ /* x^113728 mod p(x), x^113664 mod p(x) */
+ .octa 0x000000000cb3000000000000a7b70000
+
+ /* x^112704 mod p(x), x^112640 mod p(x) */
+ .octa 0x00000000991b000000000000d0780000
+
+ /* x^111680 mod p(x), x^111616 mod p(x) */
+ .octa 0x00000000845a000000000000be3c0000
+
+ /* x^110656 mod p(x), x^110592 mod p(x) */
+ .octa 0x00000000d3a9000000000000df020000
+
+ /* x^109632 mod p(x), x^109568 mod p(x) */
+ .octa 0x0000000017d7000000000000063e0000
+
+ /* x^108608 mod p(x), x^108544 mod p(x) */
+ .octa 0x000000007a860000000000008ab40000
+
+ /* x^107584 mod p(x), x^107520 mod p(x) */
+ .octa 0x00000000fd7c000000000000c7bd0000
+
+ /* x^106560 mod p(x), x^106496 mod p(x) */
+ .octa 0x00000000a56b000000000000efd60000
+
+ /* x^105536 mod p(x), x^105472 mod p(x) */
+ .octa 0x0000000010e400000000000071380000
+
+ /* x^104512 mod p(x), x^104448 mod p(x) */
+ .octa 0x00000000994500000000000004d30000
+
+ /* x^103488 mod p(x), x^103424 mod p(x) */
+ .octa 0x00000000b83c0000000000003b0e0000
+
+ /* x^102464 mod p(x), x^102400 mod p(x) */
+ .octa 0x00000000d6c10000000000008b020000
+
+ /* x^101440 mod p(x), x^101376 mod p(x) */
+ .octa 0x000000009efc000000000000da940000
+
+ /* x^100416 mod p(x), x^100352 mod p(x) */
+ .octa 0x000000005e87000000000000f9f70000
+
+ /* x^99392 mod p(x), x^99328 mod p(x) */
+ .octa 0x000000006c9b00000000000045e40000
+
+ /* x^98368 mod p(x), x^98304 mod p(x) */
+ .octa 0x00000000178a00000000000083940000
+
+ /* x^97344 mod p(x), x^97280 mod p(x) */
+ .octa 0x00000000f0c8000000000000f0a00000
+
+ /* x^96320 mod p(x), x^96256 mod p(x) */
+ .octa 0x00000000f699000000000000b74b0000
+
+ /* x^95296 mod p(x), x^95232 mod p(x) */
+ .octa 0x00000000316d000000000000c1cf0000
+
+ /* x^94272 mod p(x), x^94208 mod p(x) */
+ .octa 0x00000000987e00000000000072680000
+
+ /* x^93248 mod p(x), x^93184 mod p(x) */
+ .octa 0x00000000acff000000000000e0ab0000
+
+ /* x^92224 mod p(x), x^92160 mod p(x) */
+ .octa 0x00000000a1f6000000000000c5a80000
+
+ /* x^91200 mod p(x), x^91136 mod p(x) */
+ .octa 0x0000000061bd000000000000cf690000
+
+ /* x^90176 mod p(x), x^90112 mod p(x) */
+ .octa 0x00000000c9f2000000000000cbcc0000
+
+ /* x^89152 mod p(x), x^89088 mod p(x) */
+ .octa 0x000000005a33000000000000de050000
+
+ /* x^88128 mod p(x), x^88064 mod p(x) */
+ .octa 0x00000000e416000000000000ccd70000
+
+ /* x^87104 mod p(x), x^87040 mod p(x) */
+ .octa 0x0000000058930000000000002f670000
+
+ /* x^86080 mod p(x), x^86016 mod p(x) */
+ .octa 0x00000000a9d3000000000000152f0000
+
+ /* x^85056 mod p(x), x^84992 mod p(x) */
+ .octa 0x00000000c114000000000000ecc20000
+
+ /* x^84032 mod p(x), x^83968 mod p(x) */
+ .octa 0x00000000b9270000000000007c890000
+
+ /* x^83008 mod p(x), x^82944 mod p(x) */
+ .octa 0x000000002e6000000000000006ee0000
+
+ /* x^81984 mod p(x), x^81920 mod p(x) */
+ .octa 0x00000000dfc600000000000009100000
+
+ /* x^80960 mod p(x), x^80896 mod p(x) */
+ .octa 0x000000004911000000000000ad4e0000
+
+ /* x^79936 mod p(x), x^79872 mod p(x) */
+ .octa 0x00000000ae1b000000000000b04d0000
+
+ /* x^78912 mod p(x), x^78848 mod p(x) */
+ .octa 0x0000000005fa000000000000e9900000
+
+ /* x^77888 mod p(x), x^77824 mod p(x) */
+ .octa 0x0000000004a1000000000000cc6f0000
+
+ /* x^76864 mod p(x), x^76800 mod p(x) */
+ .octa 0x00000000af73000000000000ed110000
+
+ /* x^75840 mod p(x), x^75776 mod p(x) */
+ .octa 0x0000000082530000000000008f7e0000
+
+ /* x^74816 mod p(x), x^74752 mod p(x) */
+ .octa 0x00000000cfdc000000000000594f0000
+
+ /* x^73792 mod p(x), x^73728 mod p(x) */
+ .octa 0x00000000a6b6000000000000a8750000
+
+ /* x^72768 mod p(x), x^72704 mod p(x) */
+ .octa 0x00000000fd76000000000000aa0c0000
+
+ /* x^71744 mod p(x), x^71680 mod p(x) */
+ .octa 0x0000000006f500000000000071db0000
+
+ /* x^70720 mod p(x), x^70656 mod p(x) */
+ .octa 0x0000000037ca000000000000ab0c0000
+
+ /* x^69696 mod p(x), x^69632 mod p(x) */
+ .octa 0x00000000d7ab000000000000b7a00000
+
+ /* x^68672 mod p(x), x^68608 mod p(x) */
+ .octa 0x00000000440800000000000090d30000
+
+ /* x^67648 mod p(x), x^67584 mod p(x) */
+ .octa 0x00000000186100000000000054730000
+
+ /* x^66624 mod p(x), x^66560 mod p(x) */
+ .octa 0x000000007368000000000000a3a20000
+
+ /* x^65600 mod p(x), x^65536 mod p(x) */
+ .octa 0x0000000026d0000000000000f9040000
+
+ /* x^64576 mod p(x), x^64512 mod p(x) */
+ .octa 0x00000000fe770000000000009c0a0000
+
+ /* x^63552 mod p(x), x^63488 mod p(x) */
+ .octa 0x000000002cba000000000000d1e70000
+
+ /* x^62528 mod p(x), x^62464 mod p(x) */
+ .octa 0x00000000f8bd0000000000005ac10000
+
+ /* x^61504 mod p(x), x^61440 mod p(x) */
+ .octa 0x000000007372000000000000d68d0000
+
+ /* x^60480 mod p(x), x^60416 mod p(x) */
+ .octa 0x00000000f37f00000000000089f60000
+
+ /* x^59456 mod p(x), x^59392 mod p(x) */
+ .octa 0x00000000078400000000000008a90000
+
+ /* x^58432 mod p(x), x^58368 mod p(x) */
+ .octa 0x00000000d3e400000000000042360000
+
+ /* x^57408 mod p(x), x^57344 mod p(x) */
+ .octa 0x00000000eba800000000000092d50000
+
+ /* x^56384 mod p(x), x^56320 mod p(x) */
+ .octa 0x00000000afbe000000000000b4d50000
+
+ /* x^55360 mod p(x), x^55296 mod p(x) */
+ .octa 0x00000000d8ca000000000000c9060000
+
+ /* x^54336 mod p(x), x^54272 mod p(x) */
+ .octa 0x00000000c2d00000000000008f4f0000
+
+ /* x^53312 mod p(x), x^53248 mod p(x) */
+ .octa 0x00000000373200000000000028690000
+
+ /* x^52288 mod p(x), x^52224 mod p(x) */
+ .octa 0x0000000046ae000000000000c3b30000
+
+ /* x^51264 mod p(x), x^51200 mod p(x) */
+ .octa 0x00000000b243000000000000f8700000
+
+ /* x^50240 mod p(x), x^50176 mod p(x) */
+ .octa 0x00000000f7f500000000000029eb0000
+
+ /* x^49216 mod p(x), x^49152 mod p(x) */
+ .octa 0x000000000c7e000000000000fe730000
+
+ /* x^48192 mod p(x), x^48128 mod p(x) */
+ .octa 0x00000000c38200000000000096000000
+
+ /* x^47168 mod p(x), x^47104 mod p(x) */
+ .octa 0x000000008956000000000000683c0000
+
+ /* x^46144 mod p(x), x^46080 mod p(x) */
+ .octa 0x00000000422d0000000000005f1e0000
+
+ /* x^45120 mod p(x), x^45056 mod p(x) */
+ .octa 0x00000000ac0f0000000000006f810000
+
+ /* x^44096 mod p(x), x^44032 mod p(x) */
+ .octa 0x00000000ce30000000000000031f0000
+
+ /* x^43072 mod p(x), x^43008 mod p(x) */
+ .octa 0x000000003d43000000000000455a0000
+
+ /* x^42048 mod p(x), x^41984 mod p(x) */
+ .octa 0x000000007ebe000000000000a6050000
+
+ /* x^41024 mod p(x), x^40960 mod p(x) */
+ .octa 0x00000000976e00000000000077eb0000
+
+ /* x^40000 mod p(x), x^39936 mod p(x) */
+ .octa 0x000000000872000000000000389c0000
+
+ /* x^38976 mod p(x), x^38912 mod p(x) */
+ .octa 0x000000008979000000000000c7b20000
+
+ /* x^37952 mod p(x), x^37888 mod p(x) */
+ .octa 0x000000005c1e0000000000001d870000
+
+ /* x^36928 mod p(x), x^36864 mod p(x) */
+ .octa 0x00000000aebb00000000000045810000
+
+ /* x^35904 mod p(x), x^35840 mod p(x) */
+ .octa 0x000000004f7e0000000000006d4a0000
+
+ /* x^34880 mod p(x), x^34816 mod p(x) */
+ .octa 0x00000000ea98000000000000b9200000
+
+ /* x^33856 mod p(x), x^33792 mod p(x) */
+ .octa 0x00000000f39600000000000022f20000
+
+ /* x^32832 mod p(x), x^32768 mod p(x) */
+ .octa 0x000000000bc500000000000041ca0000
+
+ /* x^31808 mod p(x), x^31744 mod p(x) */
+ .octa 0x00000000786400000000000078500000
+
+ /* x^30784 mod p(x), x^30720 mod p(x) */
+ .octa 0x00000000be970000000000009e7e0000
+
+ /* x^29760 mod p(x), x^29696 mod p(x) */
+ .octa 0x00000000dd6d000000000000a53c0000
+
+ /* x^28736 mod p(x), x^28672 mod p(x) */
+ .octa 0x000000004c3f00000000000039340000
+
+ /* x^27712 mod p(x), x^27648 mod p(x) */
+ .octa 0x0000000093a4000000000000b58e0000
+
+ /* x^26688 mod p(x), x^26624 mod p(x) */
+ .octa 0x0000000050fb00000000000062d40000
+
+ /* x^25664 mod p(x), x^25600 mod p(x) */
+ .octa 0x00000000f505000000000000a26f0000
+
+ /* x^24640 mod p(x), x^24576 mod p(x) */
+ .octa 0x0000000064f900000000000065e60000
+
+ /* x^23616 mod p(x), x^23552 mod p(x) */
+ .octa 0x00000000e8c2000000000000aad90000
+
+ /* x^22592 mod p(x), x^22528 mod p(x) */
+ .octa 0x00000000720b000000000000a3b00000
+
+ /* x^21568 mod p(x), x^21504 mod p(x) */
+ .octa 0x00000000e992000000000000d2680000
+
+ /* x^20544 mod p(x), x^20480 mod p(x) */
+ .octa 0x000000009132000000000000cf4c0000
+
+ /* x^19520 mod p(x), x^19456 mod p(x) */
+ .octa 0x00000000608a00000000000076610000
+
+ /* x^18496 mod p(x), x^18432 mod p(x) */
+ .octa 0x000000009948000000000000fb9f0000
+
+ /* x^17472 mod p(x), x^17408 mod p(x) */
+ .octa 0x00000000173000000000000003770000
+
+ /* x^16448 mod p(x), x^16384 mod p(x) */
+ .octa 0x000000006fe300000000000004880000
+
+ /* x^15424 mod p(x), x^15360 mod p(x) */
+ .octa 0x00000000e15300000000000056a70000
+
+ /* x^14400 mod p(x), x^14336 mod p(x) */
+ .octa 0x0000000092d60000000000009dfd0000
+
+ /* x^13376 mod p(x), x^13312 mod p(x) */
+ .octa 0x0000000002fd00000000000074c80000
+
+ /* x^12352 mod p(x), x^12288 mod p(x) */
+ .octa 0x00000000c78b000000000000a3ec0000
+
+ /* x^11328 mod p(x), x^11264 mod p(x) */
+ .octa 0x000000009262000000000000b3530000
+
+ /* x^10304 mod p(x), x^10240 mod p(x) */
+ .octa 0x0000000084f200000000000047bf0000
+
+ /* x^9280 mod p(x), x^9216 mod p(x) */
+ .octa 0x0000000067ee000000000000e97c0000
+
+ /* x^8256 mod p(x), x^8192 mod p(x) */
+ .octa 0x00000000535b00000000000091e10000
+
+ /* x^7232 mod p(x), x^7168 mod p(x) */
+ .octa 0x000000007ebb00000000000055060000
+
+ /* x^6208 mod p(x), x^6144 mod p(x) */
+ .octa 0x00000000c6a1000000000000fd360000
+
+ /* x^5184 mod p(x), x^5120 mod p(x) */
+ .octa 0x000000001be500000000000055860000
+
+ /* x^4160 mod p(x), x^4096 mod p(x) */
+ .octa 0x00000000ae0e0000000000005bd00000
+
+ /* x^3136 mod p(x), x^3072 mod p(x) */
+ .octa 0x0000000022040000000000008db20000
+
+ /* x^2112 mod p(x), x^2048 mod p(x) */
+ .octa 0x00000000c9eb000000000000efe20000
+
+ /* x^1088 mod p(x), x^1024 mod p(x) */
+ .octa 0x0000000039b400000000000051d10000
+
+.short_constants:
+
+ /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
+ /* x^2048 mod p(x), x^2016 mod p(x), x^1984 mod p(x), x^1952 mod p(x) */
+ .octa 0xefe20000dccf00009440000033590000
+
+ /* x^1920 mod p(x), x^1888 mod p(x), x^1856 mod p(x), x^1824 mod p(x) */
+ .octa 0xee6300002f3f000062180000e0ed0000
+
+ /* x^1792 mod p(x), x^1760 mod p(x), x^1728 mod p(x), x^1696 mod p(x) */
+ .octa 0xcf5f000017ef0000ccbe000023d30000
+
+ /* x^1664 mod p(x), x^1632 mod p(x), x^1600 mod p(x), x^1568 mod p(x) */
+ .octa 0x6d0c0000a30e00000920000042630000
+
+ /* x^1536 mod p(x), x^1504 mod p(x), x^1472 mod p(x), x^1440 mod p(x) */
+ .octa 0x21d30000932b0000a7a00000efcc0000
+
+ /* x^1408 mod p(x), x^1376 mod p(x), x^1344 mod p(x), x^1312 mod p(x) */
+ .octa 0x10be00000b310000666f00000d1c0000
+
+ /* x^1280 mod p(x), x^1248 mod p(x), x^1216 mod p(x), x^1184 mod p(x) */
+ .octa 0x1f240000ce9e0000caad0000589e0000
+
+ /* x^1152 mod p(x), x^1120 mod p(x), x^1088 mod p(x), x^1056 mod p(x) */
+ .octa 0x29610000d02b000039b400007cf50000
+
+ /* x^1024 mod p(x), x^992 mod p(x), x^960 mod p(x), x^928 mod p(x) */
+ .octa 0x51d100009d9d00003c0e0000bfd60000
+
+ /* x^896 mod p(x), x^864 mod p(x), x^832 mod p(x), x^800 mod p(x) */
+ .octa 0xda390000ceae000013830000713c0000
+
+ /* x^768 mod p(x), x^736 mod p(x), x^704 mod p(x), x^672 mod p(x) */
+ .octa 0xb67800001e16000085c0000080a60000
+
+ /* x^640 mod p(x), x^608 mod p(x), x^576 mod p(x), x^544 mod p(x) */
+ .octa 0x0db40000f7f90000371d0000e6580000
+
+ /* x^512 mod p(x), x^480 mod p(x), x^448 mod p(x), x^416 mod p(x) */
+ .octa 0x87e70000044c0000aadb0000a4970000
+
+ /* x^384 mod p(x), x^352 mod p(x), x^320 mod p(x), x^288 mod p(x) */
+ .octa 0x1f990000ad180000d8b30000e7b50000
+
+ /* x^256 mod p(x), x^224 mod p(x), x^192 mod p(x), x^160 mod p(x) */
+ .octa 0xbe6c00006ee300004c1a000006df0000
+
+ /* x^128 mod p(x), x^96 mod p(x), x^64 mod p(x), x^32 mod p(x) */
+ .octa 0xfb0b00002d560000136800008bb70000
+
+
+.barrett_constants:
+ /* Barrett constant m = (4^32)/n */
+ .octa 0x000000000000000000000001f65a57f8 /* x^64 div p(x) */
+ /* Barrett constant n */
+ .octa 0x0000000000000000000000018bb70000
+
+#define CRC_FUNCTION_NAME __crct10dif_vpmsum
+#include "crc-vpmsum-template.S"
diff --git a/lib/crc/riscv/crc-clmul-consts.h b/lib/crc/riscv/crc-clmul-consts.h
new file mode 100644
index 000000000000..8d73449235ef
--- /dev/null
+++ b/lib/crc/riscv/crc-clmul-consts.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * CRC constants generated by:
+ *
+ * ./scripts/gen-crc-consts.py riscv_clmul crc16_msb_0x8bb7,crc32_msb_0x04c11db7,crc32_lsb_0xedb88320,crc32_lsb_0x82f63b78,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
+ *
+ * Do not edit manually.
+ */
+
+struct crc_clmul_consts {
+ unsigned long fold_across_2_longs_const_hi;
+ unsigned long fold_across_2_longs_const_lo;
+ unsigned long barrett_reduction_const_1;
+ unsigned long barrett_reduction_const_2;
+};
+
+/*
+ * Constants generated for most-significant-bit-first CRC-16 using
+ * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+ */
+static const struct crc_clmul_consts crc16_msb_0x8bb7_consts __maybe_unused = {
+#ifdef CONFIG_64BIT
+ .fold_across_2_longs_const_hi = 0x0000000000001faa, /* x^192 mod G */
+ .fold_across_2_longs_const_lo = 0x000000000000a010, /* x^128 mod G */
+ .barrett_reduction_const_1 = 0xfb2d2bfc0e99d245, /* floor(x^79 / G) */
+ .barrett_reduction_const_2 = 0x0000000000008bb7, /* G - x^16 */
+#else
+ .fold_across_2_longs_const_hi = 0x00005890, /* x^96 mod G */
+ .fold_across_2_longs_const_lo = 0x0000f249, /* x^64 mod G */
+ .barrett_reduction_const_1 = 0xfb2d2bfc, /* floor(x^47 / G) */
+ .barrett_reduction_const_2 = 0x00008bb7, /* G - x^16 */
+#endif
+};
+
+/*
+ * Constants generated for most-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
+ * x^5 + x^4 + x^2 + x^1 + x^0
+ */
+static const struct crc_clmul_consts crc32_msb_0x04c11db7_consts __maybe_unused = {
+#ifdef CONFIG_64BIT
+ .fold_across_2_longs_const_hi = 0x00000000c5b9cd4c, /* x^192 mod G */
+ .fold_across_2_longs_const_lo = 0x00000000e8a45605, /* x^128 mod G */
+ .barrett_reduction_const_1 = 0x826880efa40da72d, /* floor(x^95 / G) */
+ .barrett_reduction_const_2 = 0x0000000004c11db7, /* G - x^32 */
+#else
+ .fold_across_2_longs_const_hi = 0xf200aa66, /* x^96 mod G */
+ .fold_across_2_longs_const_lo = 0x490d678d, /* x^64 mod G */
+ .barrett_reduction_const_1 = 0x826880ef, /* floor(x^63 / G) */
+ .barrett_reduction_const_2 = 0x04c11db7, /* G - x^32 */
+#endif
+};
+
+/*
+ * Constants generated for least-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
+ * x^5 + x^4 + x^2 + x^1 + x^0
+ */
+static const struct crc_clmul_consts crc32_lsb_0xedb88320_consts __maybe_unused = {
+#ifdef CONFIG_64BIT
+ .fold_across_2_longs_const_hi = 0x65673b4600000000, /* x^191 mod G */
+ .fold_across_2_longs_const_lo = 0x9ba54c6f00000000, /* x^127 mod G */
+ .barrett_reduction_const_1 = 0xb4e5b025f7011641, /* floor(x^95 / G) */
+ .barrett_reduction_const_2 = 0x00000000edb88320, /* (G - x^32) * x^32 */
+#else
+ .fold_across_2_longs_const_hi = 0xccaa009e, /* x^95 mod G */
+ .fold_across_2_longs_const_lo = 0xb8bc6765, /* x^63 mod G */
+ .barrett_reduction_const_1 = 0xf7011641, /* floor(x^63 / G) */
+ .barrett_reduction_const_2 = 0xedb88320, /* (G - x^32) * x^0 */
+#endif
+};
+
+/*
+ * Constants generated for least-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^28 + x^27 + x^26 + x^25 + x^23 + x^22 + x^20 + x^19 + x^18 +
+ * x^14 + x^13 + x^11 + x^10 + x^9 + x^8 + x^6 + x^0
+ */
+static const struct crc_clmul_consts crc32_lsb_0x82f63b78_consts __maybe_unused = {
+#ifdef CONFIG_64BIT
+ .fold_across_2_longs_const_hi = 0x3743f7bd00000000, /* x^191 mod G */
+ .fold_across_2_longs_const_lo = 0x3171d43000000000, /* x^127 mod G */
+ .barrett_reduction_const_1 = 0x4869ec38dea713f1, /* floor(x^95 / G) */
+ .barrett_reduction_const_2 = 0x0000000082f63b78, /* (G - x^32) * x^32 */
+#else
+ .fold_across_2_longs_const_hi = 0x493c7d27, /* x^95 mod G */
+ .fold_across_2_longs_const_lo = 0xdd45aab8, /* x^63 mod G */
+ .barrett_reduction_const_1 = 0xdea713f1, /* floor(x^63 / G) */
+ .barrett_reduction_const_2 = 0x82f63b78, /* (G - x^32) * x^0 */
+#endif
+};
+
+/*
+ * Constants generated for most-significant-bit-first CRC-64 using
+ * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
+ * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
+ * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
+ * x^7 + x^4 + x^1 + x^0
+ */
+#ifdef CONFIG_64BIT
+static const struct crc_clmul_consts crc64_msb_0x42f0e1eba9ea3693_consts __maybe_unused = {
+ .fold_across_2_longs_const_hi = 0x4eb938a7d257740e, /* x^192 mod G */
+ .fold_across_2_longs_const_lo = 0x05f5c3c7eb52fab6, /* x^128 mod G */
+ .barrett_reduction_const_1 = 0xabc694e836627c39, /* floor(x^127 / G) */
+ .barrett_reduction_const_2 = 0x42f0e1eba9ea3693, /* G - x^64 */
+};
+#endif
+
+/*
+ * Constants generated for least-significant-bit-first CRC-64 using
+ * G(x) = x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 +
+ * x^47 + x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 +
+ * x^26 + x^23 + x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 +
+ * x^4 + x^3 + x^0
+ */
+#ifdef CONFIG_64BIT
+static const struct crc_clmul_consts crc64_lsb_0x9a6c9329ac4bc9b5_consts __maybe_unused = {
+ .fold_across_2_longs_const_hi = 0xeadc41fd2ba3d420, /* x^191 mod G */
+ .fold_across_2_longs_const_lo = 0x21e9761e252621ac, /* x^127 mod G */
+ .barrett_reduction_const_1 = 0x27ecfa329aef9f77, /* floor(x^127 / G) */
+ .barrett_reduction_const_2 = 0x9a6c9329ac4bc9b5, /* (G - x^64) * x^0 */
+};
+#endif
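+
+/*
+ * Editorial sketch (not generated output): entries like "x^128 mod G"
+ * above can be sanity-checked with a bit-at-a-time reduction, where
+ * 'poly' is G with its x^n term omitted and n < BITS_PER_LONG. For
+ * example, xpow_mod_g(128, 0x8bb7, 16) should reproduce the 0xa010
+ * entry of crc16_msb_0x8bb7_consts.
+ */
+static inline unsigned long __maybe_unused
+xpow_mod_g(unsigned int k, unsigned long poly, unsigned int n)
+{
+ unsigned long r = 1; /* the polynomial x^0 */
+
+ while (k--) {
+ /* Multiply by x; reduce mod G when the x^n term appears. */
+ unsigned long carry = (r >> (n - 1)) & 1;
+
+ r = (r << 1) & ((1UL << n) - 1);
+ if (carry)
+ r ^= poly;
+ }
+ return r;
+}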
diff --git a/lib/crc/riscv/crc-clmul-template.h b/lib/crc/riscv/crc-clmul-template.h
new file mode 100644
index 000000000000..77187e7f1762
--- /dev/null
+++ b/lib/crc/riscv/crc-clmul-template.h
@@ -0,0 +1,265 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright 2025 Google LLC */
+
+/*
+ * This file is a "template" that generates a CRC function optimized using the
+ * RISC-V Zbc (scalar carryless multiplication) extension. The includer of this
+ * file must define the following parameters to specify the type of CRC:
+ *
+ * crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC
+ * LSB_CRC: 0 for an msb (most-significant-bit) first CRC, i.e. natural
+ * mapping between bits and polynomial coefficients
+ * 1 for an lsb (least-significant-bit) first CRC, i.e. reflected
+ * mapping between bits and polynomial coefficients
+ */
+
+#include <asm/byteorder.h>
+#include <linux/minmax.h>
+
+#define CRC_BITS (8 * sizeof(crc_t)) /* a.k.a. 'n' */
+
+static inline unsigned long clmul(unsigned long a, unsigned long b)
+{
+ unsigned long res;
+
+ asm(".option push\n"
+ ".option arch,+zbc\n"
+ "clmul %0, %1, %2\n"
+ ".option pop\n"
+ : "=r" (res) : "r" (a), "r" (b));
+ return res;
+}
+
+static inline unsigned long clmulh(unsigned long a, unsigned long b)
+{
+ unsigned long res;
+
+ asm(".option push\n"
+ ".option arch,+zbc\n"
+ "clmulh %0, %1, %2\n"
+ ".option pop\n"
+ : "=r" (res) : "r" (a), "r" (b));
+ return res;
+}
+
+static inline unsigned long clmulr(unsigned long a, unsigned long b)
+{
+ unsigned long res;
+
+ asm(".option push\n"
+ ".option arch,+zbc\n"
+ "clmulr %0, %1, %2\n"
+ ".option pop\n"
+ : "=r" (res) : "r" (a), "r" (b));
+ return res;
+}
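+
+/*
+ * Editorial reference for the Zbc semantics assumed above (illustrative
+ * only): clmul returns the low BITS_PER_LONG bits and clmulh the high
+ * BITS_PER_LONG bits of the 2*BITS_PER_LONG-bit carryless product, while
+ * clmulr returns product bits BITS_PER_LONG-1 .. 2*BITS_PER_LONG-2.
+ */
+static inline void __maybe_unused
+clmul_ref(unsigned long a, unsigned long b, unsigned long *lo,
+ unsigned long *hi)
+{
+ unsigned long l = 0, h = 0;
+ int i;
+
+ for (i = 0; i < BITS_PER_LONG; i++) {
+ if ((b >> i) & 1) {
+ l ^= a << i; /* low half, as clmul computes */
+ if (i)
+ h ^= a >> (BITS_PER_LONG - i); /* high half, as clmulh */
+ }
+ }
+ *lo = l;
+ *hi = h;
+}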
+
+/*
+ * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a
+ * polynomial whose bit order matches the CRC's bit order.
+ */
+#ifdef CONFIG_64BIT
+# if LSB_CRC
+# define crc_load_long(x) le64_to_cpup(x)
+# else
+# define crc_load_long(x) be64_to_cpup(x)
+# endif
+#else
+# if LSB_CRC
+# define crc_load_long(x) le32_to_cpup(x)
+# else
+# define crc_load_long(x) be32_to_cpup(x)
+# endif
+#endif
+
+/* XOR @crc into the end of @msgpoly that represents the high-order terms. */
+static inline unsigned long
+crc_clmul_prep(crc_t crc, unsigned long msgpoly)
+{
+#if LSB_CRC
+ return msgpoly ^ crc;
+#else
+ return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS));
+#endif
+}
+
+/*
+ * Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it
+ * modulo the generator polynomial G. This gives the CRC of @msgpoly.
+ */
+static inline crc_t
+crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts)
+{
+ unsigned long tmp;
+
+ /*
+ * First step of Barrett reduction with integrated multiplication by
+ * x^n: calculate floor((msgpoly * x^n) / G). This is the value by
+ * which G needs to be multiplied to cancel out the x^n and higher terms
+ * of msgpoly * x^n. Do it using the following formula:
+ *
+ * msb-first:
+ * floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1))
+ * lsb-first:
+ * floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG)
+ *
+ * barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G),
+ * which fits a long exactly. Using any lower power of x there would
+ * not carry enough precision through the calculation, while using any
+ * higher power of x would require extra instructions to handle a wider
+ * multiplication. In the msb-first case, using this power of x results
+ * in needing a floored division by x^(BITS_PER_LONG-1), which matches
+ * what clmulr produces. In the lsb-first case, a factor of x gets
+ * implicitly introduced by each carryless multiplication (shown as
+ * '* x' above), and the floored division instead needs to be by
+ * x^BITS_PER_LONG which matches what clmul produces.
+ */
+#if LSB_CRC
+ tmp = clmul(msgpoly, consts->barrett_reduction_const_1);
+#else
+ tmp = clmulr(msgpoly, consts->barrett_reduction_const_1);
+#endif
+
+ /*
+ * Second step of Barrett reduction:
+ *
+ * crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G))
+ *
+ * This reduces (msgpoly * x^n) modulo G by adding the appropriate
+ * multiple of G to it. The result uses only the x^0..x^(n-1) terms.
+ * HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those
+ * terms in the first place, it is more efficient to do the equivalent:
+ *
+ * crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n
+ *
+ * In the lsb-first case further modify it to the following which avoids
+ * a shift, as the crc ends up in the physically low n bits from clmulr:
+ *
+ * product := ((G - x^n) * x^(BITS_PER_LONG - n)) * floor((msgpoly * x^n) / G) * x
+ * crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n
+ *
+ * barrett_reduction_const_2 contains the constant multiplier (G - x^n)
+ * or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above. The
+ * cast of the result to crc_t is essential, as it applies the mod x^n!
+ */
+#if LSB_CRC
+ return clmulr(tmp, consts->barrett_reduction_const_2);
+#else
+ return clmul(tmp, consts->barrett_reduction_const_2);
+#endif
+}
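+
+/*
+ * Concrete instance of the above (see crc-clmul-consts.h): for crc_t = u16
+ * on rv64, n = 16 and BITS_PER_LONG = 64, so barrett_reduction_const_1
+ * holds floor(x^79 / G) and barrett_reduction_const_2 holds G - x^16.
+ */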
+
+/* Update @crc with the data from @msgpoly. */
+static inline crc_t
+crc_clmul_update_long(crc_t crc, unsigned long msgpoly,
+ const struct crc_clmul_consts *consts)
+{
+ return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts);
+}
+
+/* Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. */
+static inline crc_t
+crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len,
+ const struct crc_clmul_consts *consts)
+{
+ unsigned long msgpoly;
+ size_t i;
+
+#if LSB_CRC
+ msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8);
+ for (i = 1; i < len; i++)
+ msgpoly = (msgpoly >> 8) ^ ((unsigned long)p[i] << (BITS_PER_LONG - 8));
+#else
+ msgpoly = p[0];
+ for (i = 1; i < len; i++)
+ msgpoly = (msgpoly << 8) ^ p[i];
+#endif
+
+ if (len >= sizeof(crc_t)) {
+ #if LSB_CRC
+ msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
+ #else
+ msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS);
+ #endif
+ return crc_clmul_long(msgpoly, consts);
+ }
+#if LSB_CRC
+ msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
+ return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len));
+#else
+ msgpoly ^= crc >> (CRC_BITS - 8*len);
+ return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len));
+#endif
+}
+
+static inline crc_t
+crc_clmul(crc_t crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts)
+{
+ size_t align;
+
+ /* This implementation assumes that the CRC fits in an unsigned long. */
+ BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long));
+
+ /* If the buffer is not long-aligned, align it. */
+ align = (unsigned long)p % sizeof(unsigned long);
+ if (align && len) {
+ align = min(sizeof(unsigned long) - align, len);
+ crc = crc_clmul_update_partial(crc, p, align, consts);
+ p += align;
+ len -= align;
+ }
+
+ if (len >= 4 * sizeof(unsigned long)) {
+ unsigned long m0, m1;
+
+ m0 = crc_clmul_prep(crc, crc_load_long(p));
+ m1 = crc_load_long(p + sizeof(unsigned long));
+ p += 2 * sizeof(unsigned long);
+ len -= 2 * sizeof(unsigned long);
+ /*
+ * Main loop. Each iteration starts with a message polynomial
+ * (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two
+ * more longs of data to form x^(3*BITS_PER_LONG)*m0 +
+ * x^(2*BITS_PER_LONG)*m1 + x^BITS_PER_LONG*m2 + m3, then
+ * "folds" that back into a congruent (modulo G) value that uses
+ * just m0 and m1 again. This is done by multiplying m0 by the
+ * precomputed constant (x^(3*BITS_PER_LONG) mod G) and m1 by
+ * the precomputed constant (x^(2*BITS_PER_LONG) mod G), then
+ * adding the results to m2 and m3 as appropriate. Each such
+ * multiplication produces a result twice the length of a long,
+ * which in RISC-V is two instructions clmul and clmulh.
+ *
+ * This could be changed to fold across more than 2 longs at a
+ * time if there is a CPU that can take advantage of it.
+ */
+ do {
+ unsigned long p0, p1, p2, p3;
+
+ p0 = clmulh(m0, consts->fold_across_2_longs_const_hi);
+ p1 = clmul(m0, consts->fold_across_2_longs_const_hi);
+ p2 = clmulh(m1, consts->fold_across_2_longs_const_lo);
+ p3 = clmul(m1, consts->fold_across_2_longs_const_lo);
+ m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p);
+ m1 = (LSB_CRC ? p0 ^ p2 : p1 ^ p3) ^
+ crc_load_long(p + sizeof(unsigned long));
+
+ p += 2 * sizeof(unsigned long);
+ len -= 2 * sizeof(unsigned long);
+ } while (len >= 2 * sizeof(unsigned long));
+
+ crc = crc_clmul_long(m0, consts);
+ crc = crc_clmul_update_long(crc, m1, consts);
+ }
+
+ while (len >= sizeof(unsigned long)) {
+ crc = crc_clmul_update_long(crc, crc_load_long(p), consts);
+ p += sizeof(unsigned long);
+ len -= sizeof(unsigned long);
+ }
+
+ if (len)
+ crc = crc_clmul_update_partial(crc, p, len, consts);
+
+ return crc;
+}
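+
+/*
+ * For reference, the congruence behind the folding step above: with
+ * B = BITS_PER_LONG, x^(3B)*m0 + x^(2B)*m1 + x^B*m2 + m3 is congruent
+ * modulo G to x^B*m0' + m1', where the 2B-bit quantity (m0':m1') equals
+ * (x^(3B) mod G)*m0 xor (x^(2B) mod G)*m1 xor (m2:m3). The clmulh/clmul
+ * pairs and XORs in the loop compute exactly this, with the product
+ * halves swapped for lsb-first CRCs.
+ */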
diff --git a/lib/crc/riscv/crc-clmul.h b/lib/crc/riscv/crc-clmul.h
new file mode 100644
index 000000000000..dd1736245815
--- /dev/null
+++ b/lib/crc/riscv/crc-clmul.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright 2025 Google LLC */
+
+#ifndef _RISCV_CRC_CLMUL_H
+#define _RISCV_CRC_CLMUL_H
+
+#include <linux/types.h>
+#include "crc-clmul-consts.h"
+
+u16 crc16_msb_clmul(u16 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts);
+u32 crc32_msb_clmul(u32 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts);
+u32 crc32_lsb_clmul(u32 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts);
+#ifdef CONFIG_64BIT
+u64 crc64_msb_clmul(u64 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts);
+u64 crc64_lsb_clmul(u64 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts);
+#endif
+
+#endif /* _RISCV_CRC_CLMUL_H */
diff --git a/lib/crc/riscv/crc-t10dif.h b/lib/crc/riscv/crc-t10dif.h
new file mode 100644
index 000000000000..cd6136cbfda1
--- /dev/null
+++ b/lib/crc/riscv/crc-t10dif.h
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RISC-V optimized CRC-T10DIF function
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include <asm/hwcap.h>
+#include <asm/alternative-macros.h>
+
+#include "crc-clmul.h"
+
+static inline u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len)
+{
+ if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
+ return crc16_msb_clmul(crc, p, len, &crc16_msb_0x8bb7_consts);
+ return crc_t10dif_generic(crc, p, len);
+}
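+
+/*
+ * Usage note (illustrative): callers use the generic entry point, e.g.
+ * crc = crc_t10dif_update(crc, buf, len), and reach crc_t10dif_arch()
+ * through the CRC library's dispatch when this header is in use.
+ */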
diff --git a/lib/crc/riscv/crc16_msb.c b/lib/crc/riscv/crc16_msb.c
new file mode 100644
index 000000000000..554d295e95f5
--- /dev/null
+++ b/lib/crc/riscv/crc16_msb.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RISC-V optimized most-significant-bit-first CRC16
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include "crc-clmul.h"
+
+typedef u16 crc_t;
+#define LSB_CRC 0
+#include "crc-clmul-template.h"
+
+u16 crc16_msb_clmul(u16 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts)
+{
+ return crc_clmul(crc, p, len, consts);
+}
diff --git a/lib/crc/riscv/crc32.h b/lib/crc/riscv/crc32.h
new file mode 100644
index 000000000000..3ec6eee98afa
--- /dev/null
+++ b/lib/crc/riscv/crc32.h
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RISC-V optimized CRC32 functions
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include <asm/hwcap.h>
+#include <asm/alternative-macros.h>
+
+#include "crc-clmul.h"
+
+static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
+ return crc32_lsb_clmul(crc, p, len,
+ &crc32_lsb_0xedb88320_consts);
+ return crc32_le_base(crc, p, len);
+}
+
+static inline u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
+ return crc32_msb_clmul(crc, p, len,
+ &crc32_msb_0x04c11db7_consts);
+ return crc32_be_base(crc, p, len);
+}
+
+static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
+{
+ if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
+ return crc32_lsb_clmul(crc, p, len,
+ &crc32_lsb_0x82f63b78_consts);
+ return crc32c_base(crc, p, len);
+}
+
+static inline u32 crc32_optimizations_arch(void)
+{
+ if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
+ return CRC32_LE_OPTIMIZATION |
+ CRC32_BE_OPTIMIZATION |
+ CRC32C_OPTIMIZATION;
+ return 0;
+}
diff --git a/lib/crc/riscv/crc32_lsb.c b/lib/crc/riscv/crc32_lsb.c
new file mode 100644
index 000000000000..72fd67e7470c
--- /dev/null
+++ b/lib/crc/riscv/crc32_lsb.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RISC-V optimized least-significant-bit-first CRC32
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include "crc-clmul.h"
+
+typedef u32 crc_t;
+#define LSB_CRC 1
+#include "crc-clmul-template.h"
+
+u32 crc32_lsb_clmul(u32 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts)
+{
+ return crc_clmul(crc, p, len, consts);
+}
diff --git a/lib/crc/riscv/crc32_msb.c b/lib/crc/riscv/crc32_msb.c
new file mode 100644
index 000000000000..fdbeaccc369f
--- /dev/null
+++ b/lib/crc/riscv/crc32_msb.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RISC-V optimized most-significant-bit-first CRC32
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include "crc-clmul.h"
+
+typedef u32 crc_t;
+#define LSB_CRC 0
+#include "crc-clmul-template.h"
+
+u32 crc32_msb_clmul(u32 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts)
+{
+ return crc_clmul(crc, p, len, consts);
+}
diff --git a/lib/crc/riscv/crc64.h b/lib/crc/riscv/crc64.h
new file mode 100644
index 000000000000..a1b7873fde57
--- /dev/null
+++ b/lib/crc/riscv/crc64.h
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RISC-V optimized CRC64 functions
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include <asm/hwcap.h>
+#include <asm/alternative-macros.h>
+
+#include "crc-clmul.h"
+
+static inline u64 crc64_be_arch(u64 crc, const u8 *p, size_t len)
+{
+ if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
+ return crc64_msb_clmul(crc, p, len,
+ &crc64_msb_0x42f0e1eba9ea3693_consts);
+ return crc64_be_generic(crc, p, len);
+}
+
+static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
+{
+ if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
+ return crc64_lsb_clmul(crc, p, len,
+ &crc64_lsb_0x9a6c9329ac4bc9b5_consts);
+ return crc64_nvme_generic(crc, p, len);
+}
diff --git a/lib/crc/riscv/crc64_lsb.c b/lib/crc/riscv/crc64_lsb.c
new file mode 100644
index 000000000000..c5371bb85d90
--- /dev/null
+++ b/lib/crc/riscv/crc64_lsb.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RISC-V optimized least-significant-bit-first CRC64
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include "crc-clmul.h"
+
+typedef u64 crc_t;
+#define LSB_CRC 1
+#include "crc-clmul-template.h"
+
+u64 crc64_lsb_clmul(u64 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts)
+{
+ return crc_clmul(crc, p, len, consts);
+}
diff --git a/lib/crc/riscv/crc64_msb.c b/lib/crc/riscv/crc64_msb.c
new file mode 100644
index 000000000000..1925d1dbe225
--- /dev/null
+++ b/lib/crc/riscv/crc64_msb.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RISC-V optimized most-significant-bit-first CRC64
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include "crc-clmul.h"
+
+typedef u64 crc_t;
+#define LSB_CRC 0
+#include "crc-clmul-template.h"
+
+u64 crc64_msb_clmul(u64 crc, const void *p, size_t len,
+ const struct crc_clmul_consts *consts)
+{
+ return crc_clmul(crc, p, len, consts);
+}
diff --git a/lib/crc/s390/crc32-vx.h b/lib/crc/s390/crc32-vx.h
new file mode 100644
index 000000000000..652c96e1a822
--- /dev/null
+++ b/lib/crc/s390/crc32-vx.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _CRC32_VX_S390_H
+#define _CRC32_VX_S390_H
+
+#include <linux/types.h>
+
+u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
+u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
+u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
+
+#endif /* _CRC32_VX_S390_H */
diff --git a/lib/crc/s390/crc32.h b/lib/crc/s390/crc32.h
new file mode 100644
index 000000000000..59c8983d428b
--- /dev/null
+++ b/lib/crc/s390/crc32.h
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CRC-32 implemented with the z/Architecture Vector Extension Facility.
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+
+#include <linux/cpufeature.h>
+#include <asm/fpu.h>
+#include "crc32-vx.h"
+
+#define VX_MIN_LEN 64
+#define VX_ALIGNMENT 16L
+#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
+
+/*
+ * DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension
+ *
+ * Creates a function to perform a particular CRC-32 computation. Depending
+ * on the message buffer, the hardware-accelerated or software implementation
+ * is used. Note that the message buffer is aligned to improve fetch
+ * operations of VECTOR LOAD MULTIPLE instructions.
+ */
+#define DEFINE_CRC32_VX(___fname, ___crc32_vx, ___crc32_sw) \
+ static inline u32 ___fname(u32 crc, const u8 *data, size_t datalen) \
+ { \
+ unsigned long prealign, aligned, remaining; \
+ DECLARE_KERNEL_FPU_ONSTACK16(vxstate); \
+ \
+ if (datalen < VX_MIN_LEN + VX_ALIGN_MASK || !cpu_has_vx()) \
+ return ___crc32_sw(crc, data, datalen); \
+ \
+ if ((unsigned long)data & VX_ALIGN_MASK) { \
+ prealign = VX_ALIGNMENT - \
+ ((unsigned long)data & VX_ALIGN_MASK); \
+ datalen -= prealign; \
+ crc = ___crc32_sw(crc, data, prealign); \
+ data = (void *)((unsigned long)data + prealign); \
+ } \
+ \
+ aligned = datalen & ~VX_ALIGN_MASK; \
+ remaining = datalen & VX_ALIGN_MASK; \
+ \
+ kernel_fpu_begin(&vxstate, KERNEL_VXR_LOW); \
+ crc = ___crc32_vx(crc, data, aligned); \
+ kernel_fpu_end(&vxstate, KERNEL_VXR_LOW); \
+ \
+ if (remaining) \
+ crc = ___crc32_sw(crc, data + aligned, remaining); \
+ \
+ return crc; \
+ }
+
+DEFINE_CRC32_VX(crc32_le_arch, crc32_le_vgfm_16, crc32_le_base)
+DEFINE_CRC32_VX(crc32_be_arch, crc32_be_vgfm_16, crc32_be_base)
+DEFINE_CRC32_VX(crc32c_arch, crc32c_le_vgfm_16, crc32c_base)
+
+static inline u32 crc32_optimizations_arch(void)
+{
+ if (cpu_has_vx()) {
+ return CRC32_LE_OPTIMIZATION |
+ CRC32_BE_OPTIMIZATION |
+ CRC32C_OPTIMIZATION;
+ }
+ return 0;
+}
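+
+/*
+ * Worked example of the split DEFINE_CRC32_VX() performs (illustrative
+ * values): for data with (unsigned long)data % 16 == 5 and datalen == 100,
+ * prealign = 16 - 5 = 11 bytes are handled in software, aligned =
+ * (100 - 11) & ~15 = 80 bytes go through the vector routine, and
+ * remaining = 9 trailing bytes are handled in software again.
+ */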
diff --git a/lib/crc/s390/crc32be-vx.c b/lib/crc/s390/crc32be-vx.c
new file mode 100644
index 000000000000..fed7c9c70d05
--- /dev/null
+++ b/lib/crc/s390/crc32be-vx.c
@@ -0,0 +1,174 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Hardware-accelerated CRC-32 variants for Linux on z Systems
+ *
+ * Use the z/Architecture Vector Extension Facility to accelerate the
+ * computing of CRC-32 checksums.
+ *
+ * This CRC-32 implementation algorithm processes the most-significant
+ * bit first (BE).
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <asm/fpu.h>
+#include "crc32-vx.h"
+
+/* Vector register range containing CRC-32 constants */
+#define CONST_R1R2 9
+#define CONST_R3R4 10
+#define CONST_R5 11
+#define CONST_R6 12
+#define CONST_RU_POLY 13
+#define CONST_CRC_POLY 14
+
+/*
+ * The CRC-32 constant block contains reduction constants to fold and
+ * process particular chunks of the input data stream in parallel.
+ *
+ * For the CRC-32 variants, the constants are precomputed according to
+ * these definitions:
+ *
+ * R1 = x^(4*128+64) mod P(x)
+ * R2 = x^(4*128) mod P(x)
+ * R3 = x^(128+64) mod P(x)
+ * R4 = x^128 mod P(x)
+ * R5 = x^96 mod P(x)
+ * R6 = x^64 mod P(x)
+ *
+ * The Barrett reduction constant, u, is defined as floor(x^64 / P(x)).
+ *
+ * where P(x) is the polynomial in the normal domain and P'(x) is the
+ * polynomial in the reversed (bitreflected) domain.
+ *
+ * Note that the constant definitions below are extended in order to compute
+ * intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
+ * The rightmost doubleword can be 0 to prevent contribution to the result or
+ * can be multiplied by 1 to perform an XOR without the need for a separate
+ * VECTOR EXCLUSIVE OR instruction.
+ *
+ * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
+ *
+ * P(x) = 0x04C11DB7
+ * P'(x) = 0xEDB88320
+ */
+
+static unsigned long constants_CRC_32_BE[] = {
+ 0x08833794c, 0x0e6228b11, /* R1, R2 */
+ 0x0c5b9cd4c, 0x0e8a45605, /* R3, R4 */
+ 0x0f200aa66, 1UL << 32, /* R5, x32 */
+ 0x0490d678d, 1, /* R6, 1 */
+ 0x104d101df, 0, /* u */
+ 0x104C11DB7, 0, /* P(x) */
+};
+
+/**
+ * crc32_be_vgfm_16 - Compute CRC-32 (BE variant) with vector registers
+ * @crc: Initial CRC value, typically ~0.
+ * @buf: Input buffer pointer, performance might be improved if the
+ * buffer is on a doubleword boundary.
+ * @size: Size of the buffer, must be 64 bytes or greater.
+ *
+ * Register usage:
+ * V0: Initial CRC value and intermediate constants and results.
+ * V1..V4: Data for CRC computation.
+ * V5..V8: Next data chunks that are fetched from the input buffer.
+ * V9..V14: CRC-32 constants.
+ */
+u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
+{
+ /* Load CRC-32 constants */
+ fpu_vlm(CONST_R1R2, CONST_CRC_POLY, &constants_CRC_32_BE);
+ fpu_vzero(0);
+
+ /* Load the initial CRC value into the leftmost word of V0. */
+ fpu_vlvgf(0, crc, 0);
+
+ /* Load a 64-byte data chunk and XOR with CRC */
+ fpu_vlm(1, 4, buf);
+ fpu_vx(1, 0, 1);
+ buf += 64;
+ size -= 64;
+
+ while (size >= 64) {
+ /* Load the next 64-byte data chunk into V5 to V8 */
+ fpu_vlm(5, 8, buf);
+
+ /*
+ * Perform a GF(2) multiplication of the doublewords in V1 with
+ * the reduction constants in V0. The intermediate result is
+ * then folded (accumulated) with the next data chunk in V5 and
+ * stored in V1. Repeat this step for the register contents
+ * in V2, V3, and V4 respectively.
+ */
+ fpu_vgfmag(1, CONST_R1R2, 1, 5);
+ fpu_vgfmag(2, CONST_R1R2, 2, 6);
+ fpu_vgfmag(3, CONST_R1R2, 3, 7);
+ fpu_vgfmag(4, CONST_R1R2, 4, 8);
+ buf += 64;
+ size -= 64;
+ }
+
+ /* Fold V1 to V4 into a single 128-bit value in V1 */
+ fpu_vgfmag(1, CONST_R3R4, 1, 2);
+ fpu_vgfmag(1, CONST_R3R4, 1, 3);
+ fpu_vgfmag(1, CONST_R3R4, 1, 4);
+
+ while (size >= 16) {
+ fpu_vl(2, buf);
+ fpu_vgfmag(1, CONST_R3R4, 1, 2);
+ buf += 16;
+ size -= 16;
+ }
+
+ /*
+ * The R5 constant is used to fold a 128-bit value into a 96-bit value
+ * that is XORed with the next 96-bit input data chunk. To use a single
+ * VGFMG instruction, multiply the rightmost 64 bits by x^32 (1<<32) to
+ * form an intermediate 96-bit value (with appended zeros) which is then
+ * XORed with the intermediate reduction result.
+ */
+ fpu_vgfmg(1, CONST_R5, 1);
+
+ /*
+ * Further reduce the remaining 96-bit value to a 64-bit value using a
+ * single VGFMG: the rightmost doubleword is multiplied by 0x1. The
+ * intermediate result is then XORed with the product of the leftmost
+ * doubleword with R6. The result is a 64-bit value and is subject to
+ * the Barrett reduction.
+ */
+ fpu_vgfmg(1, CONST_R6, 1);
+
+ /*
+ * The input values to the Barrett reduction are the degree-63 polynomial
+ * in V1 (R(x)), the degree-32 generator polynomial, and the reduction
+ * constant u. The Barrett reduction result is the CRC value of R(x) mod
+ * P(x).
+ *
+ * The Barrett reduction algorithm is defined as:
+ *
+ * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+ * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+ * 3. C(x) = R(x) XOR T2(x) mod x^32
+ *
+ * Note: To compensate for the division by x^32, use the vector unpack
+ * instruction to move the leftmost word into the leftmost doubleword
+ * of the vector register. The rightmost doubleword is multiplied
+ * with zero to not contribute to the intermediate results.
+ */
+
+ /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
+ fpu_vupllf(2, 1);
+ fpu_vgfmg(2, CONST_RU_POLY, 2);
+
+ /*
+ * Compute the GF(2) product of the CRC polynomial in V0 with T1(x) in
+ * V2 and XOR the intermediate result, T2(x), with the value in V1.
+ * The final result is in the rightmost word of V2.
+ */
+ fpu_vupllf(2, 2);
+ fpu_vgfmag(2, CONST_CRC_POLY, 2, 1);
+ return fpu_vlgvf(2, 3);
+}
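+
+/*
+ * Editorial scalar sketch of the three Barrett steps above (illustrative
+ * only, not used by the kernel), with u = 0x104d101df and P(x) =
+ * 0x104c11db7 taken from constants_CRC_32_BE:
+ */
+static u64 __maybe_unused clmul64(u64 a, u64 b)
+{
+ u64 r = 0;
+ int i;
+
+ for (i = 0; i < 64; i++)
+ if ((b >> i) & 1)
+ r ^= a << i;
+ return r;
+}
+
+static u32 __maybe_unused barrett_be_ref(u64 r)
+{
+ u64 t1 = clmul64(r >> 32, 0x104d101dfULL); /* T1 = floor(R/x^32) GF2MUL u */
+ u64 t2 = clmul64(t1 >> 32, 0x104c11db7ULL); /* T2 = floor(T1/x^32) GF2MUL P */
+
+ return (u32)(r ^ t2); /* C = R xor T2 mod x^32 */
+}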
diff --git a/lib/crc/s390/crc32le-vx.c b/lib/crc/s390/crc32le-vx.c
new file mode 100644
index 000000000000..2f629f394df7
--- /dev/null
+++ b/lib/crc/s390/crc32le-vx.c
@@ -0,0 +1,240 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Hardware-accelerated CRC-32 variants for Linux on z Systems
+ *
+ * Use the z/Architecture Vector Extension Facility to accelerate the
+ * computing of bitreflected CRC-32 checksums for IEEE 802.3 Ethernet
+ * and Castagnoli.
+ *
+ * This CRC-32 implementation algorithm is bitreflected and processes
+ * the least-significant bit first (Little-Endian).
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <asm/fpu.h>
+#include "crc32-vx.h"
+
+/* Vector register range containing CRC-32 constants */
+#define CONST_PERM_LE2BE 9
+#define CONST_R2R1 10
+#define CONST_R4R3 11
+#define CONST_R5 12
+#define CONST_RU_POLY 13
+#define CONST_CRC_POLY 14
+
+/*
+ * The CRC-32 constant block contains reduction constants to fold and
+ * process particular chunks of the input data stream in parallel.
+ *
+ * For the CRC-32 variants, the constants are precomputed according to
+ * these definitions:
+ *
+ * R1 = [(x^(4*128+32) mod P'(x) << 32)]' << 1
+ * R2 = [(x^(4*128-32) mod P'(x) << 32)]' << 1
+ * R3 = [(x^(128+32) mod P'(x) << 32)]' << 1
+ * R4 = [(x^(128-32) mod P'(x) << 32)]' << 1
+ * R5 = [(x^64 mod P'(x) << 32)]' << 1
+ * R6 = [(x^32 mod P'(x) << 32)]' << 1
+ *
+ * The bitreflected Barrett reduction constant, u', is defined as
+ * the bit reversal of floor(x^64 / P(x)).
+ *
+ * where P(x) is the polynomial in the normal domain and P'(x) is the
+ * polynomial in the reversed (bitreflected) domain.
+ *
+ * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
+ *
+ * P(x) = 0x04C11DB7
+ * P'(x) = 0xEDB88320
+ *
+ * CRC-32C (Castagnoli) polynomials:
+ *
+ * P(x) = 0x1EDC6F41
+ * P'(x) = 0x82F63B78
+ */
+
+static unsigned long constants_CRC_32_LE[] = {
+ 0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */
+ 0x1c6e41596, 0x154442bd4, /* R2, R1 */
+ 0x0ccaa009e, 0x1751997d0, /* R4, R3 */
+ 0x0, 0x163cd6124, /* R5 */
+ 0x0, 0x1f7011641, /* u' */
+ 0x0, 0x1db710641 /* P'(x) << 1 */
+};
+
+static unsigned long constants_CRC_32C_LE[] = {
+ 0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */
+ 0x09e4addf8, 0x740eef02, /* R2, R1 */
+ 0x14cd00bd6, 0xf20c0dfe, /* R4, R3 */
+ 0x0, 0x0dd45aab8, /* R5 */
+ 0x0, 0x0dea713f1, /* u' */
+ 0x0, 0x105ec76f0 /* P'(x) << 1 */
+};
+
+/**
+ * crc32_le_vgfm_generic - Compute CRC-32 (LE variant) with vector registers
+ * @crc: Initial CRC value, typically ~0.
+ * @buf: Input buffer pointer, performance might be improved if the
+ * buffer is on a doubleword boundary.
+ * @size: Size of the buffer, must be 64 bytes or greater.
+ * @constants: CRC-32 constant pool base pointer.
+ *
+ * Register usage:
+ * V0: Initial CRC value and intermediate constants and results.
+ * V1..V4: Data for CRC computation.
+ * V5..V8: Next data chunks that are fetched from the input buffer.
+ * V9: Constant for BE->LE conversion and shift operations
+ * V10..V14: CRC-32 constants.
+ */
+static u32 crc32_le_vgfm_generic(u32 crc, unsigned char const *buf,
+ size_t size, unsigned long *constants)
+{
+ /* Load CRC-32 constants */
+ fpu_vlm(CONST_PERM_LE2BE, CONST_CRC_POLY, constants);
+
+ /*
+ * Load the initial CRC value.
+ *
+ * The CRC value is loaded into the rightmost word of the
+ * vector register and is later XORed with the LSB portion
+ * of the loaded input data.
+ */
+ fpu_vzero(0); /* Clear V0 */
+ fpu_vlvgf(0, crc, 3); /* Load CRC into rightmost word */
+
+ /* Load a 64-byte data chunk and XOR with CRC */
+ fpu_vlm(1, 4, buf);
+ fpu_vperm(1, 1, 1, CONST_PERM_LE2BE);
+ fpu_vperm(2, 2, 2, CONST_PERM_LE2BE);
+ fpu_vperm(3, 3, 3, CONST_PERM_LE2BE);
+ fpu_vperm(4, 4, 4, CONST_PERM_LE2BE);
+
+ fpu_vx(1, 0, 1); /* V1 ^= CRC */
+ buf += 64;
+ size -= 64;
+
+ while (size >= 64) {
+ fpu_vlm(5, 8, buf);
+ fpu_vperm(5, 5, 5, CONST_PERM_LE2BE);
+ fpu_vperm(6, 6, 6, CONST_PERM_LE2BE);
+ fpu_vperm(7, 7, 7, CONST_PERM_LE2BE);
+ fpu_vperm(8, 8, 8, CONST_PERM_LE2BE);
+ /*
+ * Perform a GF(2) multiplication of the doublewords in V1 with
+ * the R1 and R2 reduction constants in V0. The intermediate
+ * result is then folded (accumulated) with the next data chunk
+ * in V5 and stored in V1. Repeat this step for the register
+ * contents in V2, V3, and V4 respectively.
+ */
+ fpu_vgfmag(1, CONST_R2R1, 1, 5);
+ fpu_vgfmag(2, CONST_R2R1, 2, 6);
+ fpu_vgfmag(3, CONST_R2R1, 3, 7);
+ fpu_vgfmag(4, CONST_R2R1, 4, 8);
+ buf += 64;
+ size -= 64;
+ }
+
+ /*
+ * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
+ * and R4, accumulating the next 128-bit chunk, until a single 128-bit
+ * value remains.
+ */
+ fpu_vgfmag(1, CONST_R4R3, 1, 2);
+ fpu_vgfmag(1, CONST_R4R3, 1, 3);
+ fpu_vgfmag(1, CONST_R4R3, 1, 4);
+
+ while (size >= 16) {
+ fpu_vl(2, buf);
+ fpu_vperm(2, 2, 2, CONST_PERM_LE2BE);
+ fpu_vgfmag(1, CONST_R4R3, 1, 2);
+ buf += 16;
+ size -= 16;
+ }
+
+ /*
+ * Set up a vector register for byte shifts. The shift value must
+ * be loaded in bits 1-4 in byte element 7 of a vector register.
+ * Shift by 8 bytes: 0x40
+ * Shift by 4 bytes: 0x20
+ */
+ fpu_vleib(9, 0x40, 7);
+
+ /*
+ * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
+ * to move R4 into the rightmost doubleword and set the leftmost
+ * doubleword to 0x1.
+ */
+ fpu_vsrlb(0, CONST_R4R3, 9);
+ fpu_vleig(0, 1, 0);
+
+ /*
+ * Compute the GF(2) product of V1 and V0. The rightmost doubleword
+ * of V1 is multiplied with R4. The leftmost doubleword of V1 is
+ * multiplied by 0x1 and is then XORed with the rightmost product.
+ * Implicitly, the intermediate leftmost product becomes padded with zeros.
+ */
+ fpu_vgfmg(1, 0, 1);
+
+ /*
+ * Now do the final 32-bit fold by multiplying the rightmost word
+ * in V1 with R5 and XOR the result with the remaining bits in V1.
+ *
+ * To achieve this with a single VGFMAG, right shift V1 by a word
+ * and store the result in V2 which is then accumulated. Use the
+ * vector unpack instruction to load the rightmost half of the
+ * doubleword into the rightmost doubleword element of V1; the other
+ * half is loaded in the leftmost doubleword.
+ * The vector register with CONST_R5 contains the R5 constant in the
+ * rightmost doubleword and the leftmost doubleword is zero to ignore
+ * the leftmost product of V1.
+ */
+ fpu_vleib(9, 0x20, 7); /* Shift by words */
+ fpu_vsrlb(2, 1, 9); /* Store remaining bits in V2 */
+ fpu_vupllf(1, 1); /* Split rightmost doubleword */
+ fpu_vgfmag(1, CONST_R5, 1, 2); /* V1 = (V1 * R5) XOR V2 */
+
+ /*
+ * Apply a Barrett reduction to compute the final 32-bit CRC value.
+ *
+ * The input values to the Barrett reduction are the degree-63 polynomial
+ * in V1 (R(x)), the degree-32 generator polynomial, and the reduction
+ * constant u. The Barrett reduction result is the CRC value of R(x) mod
+ * P(x).
+ *
+ * The Barrett reduction algorithm is defined as:
+ *
+ * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+ * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+ * 3. C(x) = R(x) XOR T2(x) mod x^32
+ *
+ * Note: The leftmost doubleword of the vector register containing
+ * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
+ * is zero and does not contribute to the final result.
+ */
+
+ /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
+ fpu_vupllf(2, 1);
+ fpu_vgfmg(2, CONST_RU_POLY, 2);
+
+ /*
+ * Compute the GF(2) product of the CRC polynomial with T1(x) in
+ * V2 and XOR the intermediate result, T2(x), with the value in V1.
+ * The final result is stored in word element 2 of V2.
+ */
+ fpu_vupllf(2, 2);
+ fpu_vgfmag(2, CONST_CRC_POLY, 2, 1);
+
+ return fpu_vlgvf(2, 2);
+}
+
+u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
+{
+ return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32_LE[0]);
+}
+
+u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
+{
+ return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32C_LE[0]);
+}
diff --git a/lib/crc/sparc/crc32.h b/lib/crc/sparc/crc32.h
new file mode 100644
index 000000000000..60f2765ac015
--- /dev/null
+++ b/lib/crc/sparc/crc32.h
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* CRC32c (Castagnoli), sparc64 crc32c opcode accelerated
+ *
+ * This is based largely upon arch/x86/crypto/crc32c-intel.c
+ *
+ * Copyright (C) 2008 Intel Corporation
+ * Authors: Austin Zhang <austin_zhang@linux.intel.com>
+ * Kent Liu <kent.liu@intel.com>
+ */
+
+#include <asm/pstate.h>
+#include <asm/elf.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32c_opcode);
+
+#define crc32_le_arch crc32_le_base /* not implemented on this arch */
+#define crc32_be_arch crc32_be_base /* not implemented on this arch */
+
+void crc32c_sparc64(u32 *crcp, const u64 *data, size_t len);
+
+static inline u32 crc32c_arch(u32 crc, const u8 *data, size_t len)
+{
+ size_t n = -(uintptr_t)data & 7;
+
+ if (!static_branch_likely(&have_crc32c_opcode))
+ return crc32c_base(crc, data, len);
+
+ if (n) {
+ /* Data isn't 8-byte aligned. Align it. */
+ n = min(n, len);
+ crc = crc32c_base(crc, data, n);
+ data += n;
+ len -= n;
+ }
+ n = len & ~7U;
+ if (n) {
+ crc32c_sparc64(&crc, (const u64 *)data, n);
+ data += n;
+ len -= n;
+ }
+ if (len)
+ crc = crc32c_base(crc, data, len);
+ return crc;
+}
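+
+/*
+ * Worked example for the alignment step above: for data at an address
+ * ending in ...5, n = -5 & 7 = 3, so three bytes go through crc32c_base()
+ * before the 8-byte-aligned bulk is handed to the crc32c opcode.
+ */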
+
+#define crc32_mod_init_arch crc32_mod_init_arch
+static inline void crc32_mod_init_arch(void)
+{
+ unsigned long cfr;
+
+ if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+ return;
+
+ __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+ if (!(cfr & CFR_CRC32C))
+ return;
+
+ static_branch_enable(&have_crc32c_opcode);
+ pr_info("Using sparc64 crc32c opcode optimized CRC32C implementation\n");
+}
+
+static inline u32 crc32_optimizations_arch(void)
+{
+ if (static_key_enabled(&have_crc32c_opcode))
+ return CRC32C_OPTIMIZATION;
+ return 0;
+}
diff --git a/lib/crc/sparc/crc32c_asm.S b/lib/crc/sparc/crc32c_asm.S
new file mode 100644
index 000000000000..4db873850f44
--- /dev/null
+++ b/lib/crc/sparc/crc32c_asm.S
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/opcodes.h>
+#include <asm/visasm.h>
+#include <asm/asi.h>
+
+ENTRY(crc32c_sparc64)
+ /* %o0=crc32p, %o1=data_ptr, %o2=len */
+ VISEntryHalf
+ lda [%o0] ASI_PL, %f1
+1: ldd [%o1], %f2
+ CRC32C(0,2,0)
+ subcc %o2, 8, %o2
+ bne,pt %icc, 1b
+ add %o1, 0x8, %o1
+ sta %f1, [%o0] ASI_PL
+ VISExitHalf
+2: retl
+ nop
+ENDPROC(crc32c_sparc64)
diff --git a/lib/crc/tests/Makefile b/lib/crc/tests/Makefile
new file mode 100644
index 000000000000..65f63c318ef5
--- /dev/null
+++ b/lib/crc/tests/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_CRC_KUNIT_TEST) += crc_kunit.o
diff --git a/lib/crc/tests/crc_kunit.c b/lib/crc/tests/crc_kunit.c
new file mode 100644
index 000000000000..f08d985d8860
--- /dev/null
+++ b/lib/crc/tests/crc_kunit.c
@@ -0,0 +1,452 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Unit tests and benchmarks for the CRC library functions
+ *
+ * Copyright 2024 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+#include <kunit/test.h>
+#include <linux/crc7.h>
+#include <linux/crc16.h>
+#include <linux/crc-t10dif.h>
+#include <linux/crc32.h>
+#include <linux/crc32c.h>
+#include <linux/crc64.h>
+#include <linux/prandom.h>
+#include <linux/vmalloc.h>
+
+#define CRC_KUNIT_SEED 42
+#define CRC_KUNIT_MAX_LEN 16384
+#define CRC_KUNIT_NUM_TEST_ITERS 1000
+
+static struct rnd_state rng;
+static u8 *test_buffer;
+static size_t test_buflen;
+
+/**
+ * struct crc_variant - describes a CRC variant
+ * @bits: Number of bits in the CRC, 1 <= @bits <= 64.
+ * @le: true if it's a "little endian" CRC (reversed mapping between bits and
+ * polynomial coefficients in each byte), false if it's a "big endian" CRC
+ * (natural mapping between bits and polynomial coefficients in each byte)
+ * @poly: The generator polynomial with the highest-order term omitted.
+ * Bit-reversed if @le is true.
+ * @func: The function to compute a CRC. The type signature uses u64 so that it
+ * can fit any CRC up to CRC-64. The CRC is passed in, and is expected
+ * to be returned in, the least significant bits of the u64. The
+ * function is expected to *not* invert the CRC at the beginning and end.
+ */
+struct crc_variant {
+ int bits;
+ bool le;
+ u64 poly;
+ u64 (*func)(u64 crc, const u8 *p, size_t len);
+};
+
+static u32 rand32(void)
+{
+ return prandom_u32_state(&rng);
+}
+
+static u64 rand64(void)
+{
+ u32 n = rand32();
+
+ return ((u64)n << 32) | rand32();
+}
+
+static u64 crc_mask(const struct crc_variant *v)
+{
+ return (u64)-1 >> (64 - v->bits);
+}
+
+/* Reference implementation of any CRC variant */
+static u64 crc_ref(const struct crc_variant *v,
+ u64 crc, const u8 *p, size_t len)
+{
+ size_t i, j;
+
+ for (i = 0; i < len; i++) {
+ for (j = 0; j < 8; j++) {
+ if (v->le) {
+ crc ^= (p[i] >> j) & 1;
+ crc = (crc >> 1) ^ ((crc & 1) ? v->poly : 0);
+ } else {
+ crc ^= (u64)((p[i] >> (7 - j)) & 1) <<
+ (v->bits - 1);
+ if (crc & (1ULL << (v->bits - 1)))
+ crc = ((crc << 1) ^ v->poly) &
+ crc_mask(v);
+ else
+ crc <<= 1;
+ }
+ }
+ }
+ return crc;
+}
+
+static int crc_suite_init(struct kunit_suite *suite)
+{
+ /*
+ * Allocate the test buffer using vmalloc() with a page-aligned length
+ * so that it is immediately followed by a guard page. This allows
+ * buffer overreads to be detected, even in assembly code.
+ */
+ test_buflen = round_up(CRC_KUNIT_MAX_LEN, PAGE_SIZE);
+ test_buffer = vmalloc(test_buflen);
+ if (!test_buffer)
+ return -ENOMEM;
+
+ prandom_seed_state(&rng, CRC_KUNIT_SEED);
+ prandom_bytes_state(&rng, test_buffer, test_buflen);
+ return 0;
+}
+
+static void crc_suite_exit(struct kunit_suite *suite)
+{
+ vfree(test_buffer);
+ test_buffer = NULL;
+}
+
+/* Generate a random initial CRC. */
+static u64 generate_random_initial_crc(const struct crc_variant *v)
+{
+ switch (rand32() % 4) {
+ case 0:
+ return 0;
+ case 1:
+ return crc_mask(v); /* All 1 bits */
+ default:
+ return rand64() & crc_mask(v);
+ }
+}
+
+/* Generate a random length, preferring small lengths. */
+static size_t generate_random_length(size_t max_length)
+{
+ size_t len;
+
+ switch (rand32() % 3) {
+ case 0:
+ len = rand32() % 128;
+ break;
+ case 1:
+ len = rand32() % 3072;
+ break;
+ default:
+ len = rand32();
+ break;
+ }
+ return len % (max_length + 1);
+}
+
+/* Test that v->func gives the same CRCs as a reference implementation. */
+static void crc_test(struct kunit *test, const struct crc_variant *v)
+{
+ size_t i;
+
+ for (i = 0; i < CRC_KUNIT_NUM_TEST_ITERS; i++) {
+ u64 init_crc, expected_crc, actual_crc;
+ size_t len, offset;
+ bool nosimd;
+
+ init_crc = generate_random_initial_crc(v);
+ len = generate_random_length(CRC_KUNIT_MAX_LEN);
+
+ /* Generate a random offset. */
+ if (rand32() % 2 == 0) {
+ /* Use a random alignment mod 64 */
+ offset = rand32() % 64;
+ offset = min(offset, CRC_KUNIT_MAX_LEN - len);
+ } else {
+ /* Go up to the guard page, to catch buffer overreads */
+ offset = test_buflen - len;
+ }
+
+ if (rand32() % 8 == 0)
+ /* Refresh the data occasionally. */
+ prandom_bytes_state(&rng, &test_buffer[offset], len);
+
+ nosimd = rand32() % 8 == 0;
+
+ /*
+ * Compute the CRC, and verify that it equals the CRC computed
+ * by a simple bit-at-a-time reference implementation.
+ */
+ expected_crc = crc_ref(v, init_crc, &test_buffer[offset], len);
+ if (nosimd)
+ local_irq_disable();
+ actual_crc = v->func(init_crc, &test_buffer[offset], len);
+ if (nosimd)
+ local_irq_enable();
+ KUNIT_EXPECT_EQ_MSG(test, expected_crc, actual_crc,
+ "Wrong result with len=%zu offset=%zu nosimd=%d",
+ len, offset, nosimd);
+ }
+}
+
+static __always_inline void
+crc_benchmark(struct kunit *test,
+ u64 (*crc_func)(u64 crc, const u8 *p, size_t len))
+{
+ static const size_t lens_to_test[] = {
+ 1, 16, 64, 127, 128, 200, 256, 511, 512, 1024, 3173, 4096, 16384,
+ };
+ size_t len, i, j, num_iters;
+ /*
+ * The CRC value that this function computes in a series of calls to
+ * crc_func is never actually used, so use volatile to ensure that the
+ * computations are done as intended and don't all get optimized out.
+ */
+ volatile u64 crc = 0;
+ u64 t;
+
+ if (!IS_ENABLED(CONFIG_CRC_BENCHMARK))
+ kunit_skip(test, "not enabled");
+
+ /* warm-up */
+ for (i = 0; i < 10000000; i += CRC_KUNIT_MAX_LEN)
+ crc = crc_func(crc, test_buffer, CRC_KUNIT_MAX_LEN);
+
+ for (i = 0; i < ARRAY_SIZE(lens_to_test); i++) {
+ len = lens_to_test[i];
+ KUNIT_ASSERT_LE(test, len, CRC_KUNIT_MAX_LEN);
+ num_iters = 10000000 / (len + 128);
+ preempt_disable();
+ t = ktime_get_ns();
+ for (j = 0; j < num_iters; j++)
+ crc = crc_func(crc, test_buffer, len);
+ t = ktime_get_ns() - t;
+ preempt_enable();
+ kunit_info(test, "len=%zu: %llu MB/s\n",
+ len, div64_u64((u64)len * num_iters * 1000, t));
+ }
+}
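+
+/*
+ * Unit check for the rate printed above: (len * num_iters) bytes in t
+ * nanoseconds is bytes * 1e9 / t bytes per second, i.e. bytes * 1000 / t
+ * MB/s in decimal megabytes, matching the div64_u64() expression.
+ */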
+
+/* crc7_be */
+
+static u64 crc7_be_wrapper(u64 crc, const u8 *p, size_t len)
+{
+ /*
+ * crc7_be() left-aligns the 7-bit CRC in a u8, whereas the test wants a
+ * right-aligned CRC (in a u64). Convert between the conventions.
+ */
+ return crc7_be(crc << 1, p, len) >> 1;
+}
+
+static const struct crc_variant crc_variant_crc7_be = {
+ .bits = 7,
+ .poly = 0x9,
+ .func = crc7_be_wrapper,
+};
+
+static void crc7_be_test(struct kunit *test)
+{
+ crc_test(test, &crc_variant_crc7_be);
+}
+
+static void crc7_be_benchmark(struct kunit *test)
+{
+ crc_benchmark(test, crc7_be_wrapper);
+}
+
+/* crc16 */
+
+static u64 crc16_wrapper(u64 crc, const u8 *p, size_t len)
+{
+ return crc16(crc, p, len);
+}
+
+static const struct crc_variant crc_variant_crc16 = {
+ .bits = 16,
+ .le = true,
+ .poly = 0xa001,
+ .func = crc16_wrapper,
+};
+
+static void crc16_test(struct kunit *test)
+{
+ crc_test(test, &crc_variant_crc16);
+}
+
+static void crc16_benchmark(struct kunit *test)
+{
+ crc_benchmark(test, crc16_wrapper);
+}
+
+/* crc_t10dif */
+
+static u64 crc_t10dif_wrapper(u64 crc, const u8 *p, size_t len)
+{
+ return crc_t10dif_update(crc, p, len);
+}
+
+static const struct crc_variant crc_variant_crc_t10dif = {
+ .bits = 16,
+ .le = false,
+ .poly = 0x8bb7,
+ .func = crc_t10dif_wrapper,
+};
+
+static void crc_t10dif_test(struct kunit *test)
+{
+ crc_test(test, &crc_variant_crc_t10dif);
+}
+
+static void crc_t10dif_benchmark(struct kunit *test)
+{
+ crc_benchmark(test, crc_t10dif_wrapper);
+}
+
+/* crc32_le */
+
+static u64 crc32_le_wrapper(u64 crc, const u8 *p, size_t len)
+{
+ return crc32_le(crc, p, len);
+}
+
+static const struct crc_variant crc_variant_crc32_le = {
+ .bits = 32,
+ .le = true,
+ .poly = 0xedb88320,
+ .func = crc32_le_wrapper,
+};
+
+static void crc32_le_test(struct kunit *test)
+{
+ crc_test(test, &crc_variant_crc32_le);
+}
+
+static void crc32_le_benchmark(struct kunit *test)
+{
+ crc_benchmark(test, crc32_le_wrapper);
+}
+
+/* crc32_be */
+
+static u64 crc32_be_wrapper(u64 crc, const u8 *p, size_t len)
+{
+ return crc32_be(crc, p, len);
+}
+
+static const struct crc_variant crc_variant_crc32_be = {
+ .bits = 32,
+ .le = false,
+ .poly = 0x04c11db7,
+ .func = crc32_be_wrapper,
+};
+
+static void crc32_be_test(struct kunit *test)
+{
+ crc_test(test, &crc_variant_crc32_be);
+}
+
+static void crc32_be_benchmark(struct kunit *test)
+{
+ crc_benchmark(test, crc32_be_wrapper);
+}
+
+/* crc32c */
+
+static u64 crc32c_wrapper(u64 crc, const u8 *p, size_t len)
+{
+ return crc32c(crc, p, len);
+}
+
+static const struct crc_variant crc_variant_crc32c = {
+ .bits = 32,
+ .le = true,
+ .poly = 0x82f63b78,
+ .func = crc32c_wrapper,
+};
+
+static void crc32c_test(struct kunit *test)
+{
+ crc_test(test, &crc_variant_crc32c);
+}
+
+static void crc32c_benchmark(struct kunit *test)
+{
+ crc_benchmark(test, crc32c_wrapper);
+}
+
+/* crc64_be */
+
+static u64 crc64_be_wrapper(u64 crc, const u8 *p, size_t len)
+{
+ return crc64_be(crc, p, len);
+}
+
+static const struct crc_variant crc_variant_crc64_be = {
+ .bits = 64,
+ .le = false,
+ .poly = 0x42f0e1eba9ea3693,
+ .func = crc64_be_wrapper,
+};
+
+static void crc64_be_test(struct kunit *test)
+{
+ crc_test(test, &crc_variant_crc64_be);
+}
+
+static void crc64_be_benchmark(struct kunit *test)
+{
+ crc_benchmark(test, crc64_be_wrapper);
+}
+
+/* crc64_nvme */
+
+static u64 crc64_nvme_wrapper(u64 crc, const u8 *p, size_t len)
+{
+ /* The inversions that crc64_nvme() performs have to be undone here. */
+ return ~crc64_nvme(~crc, p, len);
+}
+
+static const struct crc_variant crc_variant_crc64_nvme = {
+ .bits = 64,
+ .le = true,
+ .poly = 0x9a6c9329ac4bc9b5,
+ .func = crc64_nvme_wrapper,
+};
+
+static void crc64_nvme_test(struct kunit *test)
+{
+ crc_test(test, &crc_variant_crc64_nvme);
+}
+
+static void crc64_nvme_benchmark(struct kunit *test)
+{
+ crc_benchmark(test, crc64_nvme_wrapper);
+}
+
+static struct kunit_case crc_test_cases[] = {
+ KUNIT_CASE(crc7_be_test),
+ KUNIT_CASE(crc7_be_benchmark),
+ KUNIT_CASE(crc16_test),
+ KUNIT_CASE(crc16_benchmark),
+ KUNIT_CASE(crc_t10dif_test),
+ KUNIT_CASE(crc_t10dif_benchmark),
+ KUNIT_CASE(crc32_le_test),
+ KUNIT_CASE(crc32_le_benchmark),
+ KUNIT_CASE(crc32_be_test),
+ KUNIT_CASE(crc32_be_benchmark),
+ KUNIT_CASE(crc32c_test),
+ KUNIT_CASE(crc32c_benchmark),
+ KUNIT_CASE(crc64_be_test),
+ KUNIT_CASE(crc64_be_benchmark),
+ KUNIT_CASE(crc64_nvme_test),
+ KUNIT_CASE(crc64_nvme_benchmark),
+ {},
+};
+
+static struct kunit_suite crc_test_suite = {
+ .name = "crc",
+ .test_cases = crc_test_cases,
+ .suite_init = crc_suite_init,
+ .suite_exit = crc_suite_exit,
+};
+kunit_test_suite(crc_test_suite);
+
+MODULE_DESCRIPTION("Unit tests and benchmarks for the CRC library functions");
+MODULE_LICENSE("GPL");
diff --git a/lib/crc/x86/crc-pclmul-consts.h b/lib/crc/x86/crc-pclmul-consts.h
new file mode 100644
index 000000000000..6ae94158fca2
--- /dev/null
+++ b/lib/crc/x86/crc-pclmul-consts.h
@@ -0,0 +1,240 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * CRC constants generated by:
+ *
+ * ./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc32_lsb_0x82f63b78,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
+ *
+ * Do not edit manually.
+ */
+
+/*
+ * CRC folding constants generated for most-significant-bit-first CRC-16 using
+ * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+ */
+static const struct {
+ u8 bswap_mask[16];
+ u64 fold_across_2048_bits_consts[2];
+ u64 fold_across_1024_bits_consts[2];
+ u64 fold_across_512_bits_consts[2];
+ u64 fold_across_256_bits_consts[2];
+ u64 fold_across_128_bits_consts[2];
+ u8 shuf_table[48];
+ u64 barrett_reduction_consts[2];
+} crc16_msb_0x8bb7_consts ____cacheline_aligned __maybe_unused = {
+ .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+ .fold_across_2048_bits_consts = {
+ 0xdccf000000000000, /* LO64_TERMS: (x^2000 mod G) * x^48 */
+ 0x4b0b000000000000, /* HI64_TERMS: (x^2064 mod G) * x^48 */
+ },
+ .fold_across_1024_bits_consts = {
+ 0x9d9d000000000000, /* LO64_TERMS: (x^976 mod G) * x^48 */
+ 0x7cf5000000000000, /* HI64_TERMS: (x^1040 mod G) * x^48 */
+ },
+ .fold_across_512_bits_consts = {
+ 0x044c000000000000, /* LO64_TERMS: (x^464 mod G) * x^48 */
+ 0xe658000000000000, /* HI64_TERMS: (x^528 mod G) * x^48 */
+ },
+ .fold_across_256_bits_consts = {
+ 0x6ee3000000000000, /* LO64_TERMS: (x^208 mod G) * x^48 */
+ 0xe7b5000000000000, /* HI64_TERMS: (x^272 mod G) * x^48 */
+ },
+ .fold_across_128_bits_consts = {
+ 0x2d56000000000000, /* LO64_TERMS: (x^80 mod G) * x^48 */
+ 0x06df000000000000, /* HI64_TERMS: (x^144 mod G) * x^48 */
+ },
+ .shuf_table = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ },
+ .barrett_reduction_consts = {
+ 0x8bb7000000000000, /* LO64_TERMS: (G - x^16) * x^48 */
+ 0xf65a57f81d33a48a, /* HI64_TERMS: (floor(x^79 / G) * x) - x^64 */
+ },
+};
+
+/*
+ * CRC folding constants generated for least-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
+ * x^5 + x^4 + x^2 + x^1 + x^0
+ */
+static const struct {
+ u64 fold_across_2048_bits_consts[2];
+ u64 fold_across_1024_bits_consts[2];
+ u64 fold_across_512_bits_consts[2];
+ u64 fold_across_256_bits_consts[2];
+ u64 fold_across_128_bits_consts[2];
+ u8 shuf_table[48];
+ u64 barrett_reduction_consts[2];
+} crc32_lsb_0xedb88320_consts ____cacheline_aligned __maybe_unused = {
+ .fold_across_2048_bits_consts = {
+ 0x00000000ce3371cb, /* HI64_TERMS: (x^2079 mod G) * x^32 */
+ 0x00000000e95c1271, /* LO64_TERMS: (x^2015 mod G) * x^32 */
+ },
+ .fold_across_1024_bits_consts = {
+ 0x0000000033fff533, /* HI64_TERMS: (x^1055 mod G) * x^32 */
+ 0x00000000910eeec1, /* LO64_TERMS: (x^991 mod G) * x^32 */
+ },
+ .fold_across_512_bits_consts = {
+ 0x000000008f352d95, /* HI64_TERMS: (x^543 mod G) * x^32 */
+ 0x000000001d9513d7, /* LO64_TERMS: (x^479 mod G) * x^32 */
+ },
+ .fold_across_256_bits_consts = {
+ 0x00000000f1da05aa, /* HI64_TERMS: (x^287 mod G) * x^32 */
+ 0x0000000081256527, /* LO64_TERMS: (x^223 mod G) * x^32 */
+ },
+ .fold_across_128_bits_consts = {
+ 0x00000000ae689191, /* HI64_TERMS: (x^159 mod G) * x^32 */
+ 0x00000000ccaa009e, /* LO64_TERMS: (x^95 mod G) * x^32 */
+ },
+ .shuf_table = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ },
+ .barrett_reduction_consts = {
+ 0xb4e5b025f7011641, /* HI64_TERMS: floor(x^95 / G) */
+ 0x00000001db710640, /* LO64_TERMS: (G - x^32) * x^31 */
+ },
+};
+
+/*
+ * CRC folding constants generated for least-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^28 + x^27 + x^26 + x^25 + x^23 + x^22 + x^20 + x^19 + x^18 +
+ * x^14 + x^13 + x^11 + x^10 + x^9 + x^8 + x^6 + x^0
+ */
+static const struct {
+ u64 fold_across_2048_bits_consts[2];
+ u64 fold_across_1024_bits_consts[2];
+ u64 fold_across_512_bits_consts[2];
+ u64 fold_across_256_bits_consts[2];
+ u64 fold_across_128_bits_consts[2];
+ u8 shuf_table[48];
+ u64 barrett_reduction_consts[2];
+} crc32_lsb_0x82f63b78_consts ____cacheline_aligned __maybe_unused = {
+ .fold_across_2048_bits_consts = {
+ 0x00000000dcb17aa4, /* HI64_TERMS: (x^2079 mod G) * x^32 */
+ 0x00000000b9e02b86, /* LO64_TERMS: (x^2015 mod G) * x^32 */
+ },
+ .fold_across_1024_bits_consts = {
+ 0x000000006992cea2, /* HI64_TERMS: (x^1055 mod G) * x^32 */
+ 0x000000000d3b6092, /* LO64_TERMS: (x^991 mod G) * x^32 */
+ },
+ .fold_across_512_bits_consts = {
+ 0x00000000740eef02, /* HI64_TERMS: (x^543 mod G) * x^32 */
+ 0x000000009e4addf8, /* LO64_TERMS: (x^479 mod G) * x^32 */
+ },
+ .fold_across_256_bits_consts = {
+ 0x000000003da6d0cb, /* HI64_TERMS: (x^287 mod G) * x^32 */
+ 0x00000000ba4fc28e, /* LO64_TERMS: (x^223 mod G) * x^32 */
+ },
+ .fold_across_128_bits_consts = {
+ 0x00000000f20c0dfe, /* HI64_TERMS: (x^159 mod G) * x^32 */
+ 0x00000000493c7d27, /* LO64_TERMS: (x^95 mod G) * x^32 */
+ },
+ .shuf_table = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ },
+ .barrett_reduction_consts = {
+ 0x4869ec38dea713f1, /* HI64_TERMS: floor(x^95 / G) */
+ 0x0000000105ec76f0, /* LO64_TERMS: (G - x^32) * x^31 */
+ },
+};
+
+/*
+ * CRC folding constants generated for most-significant-bit-first CRC-64 using
+ * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
+ * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
+ * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
+ * x^7 + x^4 + x^1 + x^0
+ */
+static const struct {
+ u8 bswap_mask[16];
+ u64 fold_across_2048_bits_consts[2];
+ u64 fold_across_1024_bits_consts[2];
+ u64 fold_across_512_bits_consts[2];
+ u64 fold_across_256_bits_consts[2];
+ u64 fold_across_128_bits_consts[2];
+ u8 shuf_table[48];
+ u64 barrett_reduction_consts[2];
+} crc64_msb_0x42f0e1eba9ea3693_consts ____cacheline_aligned __maybe_unused = {
+ .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+ .fold_across_2048_bits_consts = {
+ 0x7f52691a60ddc70d, /* LO64_TERMS: (x^2048 mod G) * x^0 */
+ 0x7036b0389f6a0c82, /* HI64_TERMS: (x^2112 mod G) * x^0 */
+ },
+ .fold_across_1024_bits_consts = {
+ 0x05cf79dea9ac37d6, /* LO64_TERMS: (x^1024 mod G) * x^0 */
+ 0x001067e571d7d5c2, /* HI64_TERMS: (x^1088 mod G) * x^0 */
+ },
+ .fold_across_512_bits_consts = {
+ 0x5f6843ca540df020, /* LO64_TERMS: (x^512 mod G) * x^0 */
+ 0xddf4b6981205b83f, /* HI64_TERMS: (x^576 mod G) * x^0 */
+ },
+ .fold_across_256_bits_consts = {
+ 0x571bee0a227ef92b, /* LO64_TERMS: (x^256 mod G) * x^0 */
+ 0x44bef2a201b5200c, /* HI64_TERMS: (x^320 mod G) * x^0 */
+ },
+ .fold_across_128_bits_consts = {
+ 0x05f5c3c7eb52fab6, /* LO64_TERMS: (x^128 mod G) * x^0 */
+ 0x4eb938a7d257740e, /* HI64_TERMS: (x^192 mod G) * x^0 */
+ },
+ .shuf_table = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ },
+ .barrett_reduction_consts = {
+ 0x42f0e1eba9ea3693, /* LO64_TERMS: (G - x^64) * x^0 */
+ 0x578d29d06cc4f872, /* HI64_TERMS: (floor(x^127 / G) * x) - x^64 */
+ },
+};
+
+/*
+ * CRC folding constants generated for least-significant-bit-first CRC-64 using
+ * G(x) = x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 +
+ * x^47 + x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 +
+ * x^26 + x^23 + x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 +
+ * x^4 + x^3 + x^0
+ */
+static const struct {
+ u64 fold_across_2048_bits_consts[2];
+ u64 fold_across_1024_bits_consts[2];
+ u64 fold_across_512_bits_consts[2];
+ u64 fold_across_256_bits_consts[2];
+ u64 fold_across_128_bits_consts[2];
+ u8 shuf_table[48];
+ u64 barrett_reduction_consts[2];
+} crc64_lsb_0x9a6c9329ac4bc9b5_consts ____cacheline_aligned __maybe_unused = {
+ .fold_across_2048_bits_consts = {
+ 0x37ccd3e14069cabc, /* HI64_TERMS: (x^2111 mod G) * x^0 */
+ 0xa043808c0f782663, /* LO64_TERMS: (x^2047 mod G) * x^0 */
+ },
+ .fold_across_1024_bits_consts = {
+ 0xa1ca681e733f9c40, /* HI64_TERMS: (x^1087 mod G) * x^0 */
+ 0x5f852fb61e8d92dc, /* LO64_TERMS: (x^1023 mod G) * x^0 */
+ },
+ .fold_across_512_bits_consts = {
+ 0x0c32cdb31e18a84a, /* HI64_TERMS: (x^575 mod G) * x^0 */
+ 0x62242240ace5045a, /* LO64_TERMS: (x^511 mod G) * x^0 */
+ },
+ .fold_across_256_bits_consts = {
+ 0xb0bc2e589204f500, /* HI64_TERMS: (x^319 mod G) * x^0 */
+ 0xe1e0bb9d45d7a44c, /* LO64_TERMS: (x^255 mod G) * x^0 */
+ },
+ .fold_across_128_bits_consts = {
+ 0xeadc41fd2ba3d420, /* HI64_TERMS: (x^191 mod G) * x^0 */
+ 0x21e9761e252621ac, /* LO64_TERMS: (x^127 mod G) * x^0 */
+ },
+ .shuf_table = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ },
+ .barrett_reduction_consts = {
+ 0x27ecfa329aef9f77, /* HI64_TERMS: floor(x^127 / G) */
+ 0x34d926535897936a, /* LO64_TERMS: (G - x^64 - x^0) / x */
+ },
+};
diff --git a/lib/crc/x86/crc-pclmul-template.S b/lib/crc/x86/crc-pclmul-template.S
new file mode 100644
index 000000000000..a02f7dc8053e
--- /dev/null
+++ b/lib/crc/x86/crc-pclmul-template.S
@@ -0,0 +1,575 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+//
+// Template to generate [V]PCLMULQDQ-based CRC functions for x86
+//
+// Copyright 2025 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+
+#include <linux/linkage.h>
+#include <linux/objtool.h>
+
+// Offsets within the generated constants table
+.set OFFSETOF_BSWAP_MASK, -5*16 // msb-first CRCs only
+.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS, -4*16 // must precede next
+.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, -3*16 // must precede next
+.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS, -2*16 // must precede next
+.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS, -1*16 // must precede next
+.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS, 0*16 // must be 0
+.set OFFSETOF_SHUF_TABLE, 1*16
+.set OFFSETOF_BARRETT_REDUCTION_CONSTS, 4*16
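+//
+// (The consts pointer passed to the generated functions points at the
+// fold_across_128_bits_consts field of the constants struct, so the earlier
+// fields are reached via negative offsets.)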
+
+// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the
+// corresponding non-VEX instruction plus any needed moves. The supported
+// instruction formats are:
+//
+// - Two-arg [src, dst], where the non-VEX format is the same.
+// - Three-arg [src1, src2, dst] where the non-VEX format is
+// [src1, src2_and_dst]. If src2 != dst, then src1 must != dst too.
+//
+// \insn gives the instruction without a "v" prefix and including any immediate
+// argument if needed to make the instruction follow one of the above formats.
+// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to
+// it first; this is needed when \arg1 is an unaligned mem operand.
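+//
+// For example (an illustrative expansion, assuming src2 != dst and no
+// \unaligned_mem_tmp): "_cond_vex pxor, %xmm1, %xmm2, %xmm3" emits
+// "vpxor %xmm1, %xmm2, %xmm3" when AVX_LEVEL > 0, and otherwise emits
+// "movdqa %xmm2, %xmm3" followed by "pxor %xmm1, %xmm3".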
+.macro _cond_vex insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp
+.if AVX_LEVEL == 0
+ // VEX not allowed. Emulate it.
+ .ifnb \arg3 // Three-arg [src1, src2, dst]
+ .ifc "\arg2", "\arg3" // src2 == dst?
+ .ifnb \unaligned_mem_tmp
+ movdqu \arg1, \unaligned_mem_tmp
+ \insn \unaligned_mem_tmp, \arg3
+ .else
+ \insn \arg1, \arg3
+ .endif
+ .else // src2 != dst
+ .ifc "\arg1", "\arg3"
+ .error "Can't have src1 == dst when src2 != dst"
+ .endif
+ .ifnb \unaligned_mem_tmp
+ movdqu \arg1, \unaligned_mem_tmp
+ movdqa \arg2, \arg3
+ \insn \unaligned_mem_tmp, \arg3
+ .else
+ movdqa \arg2, \arg3
+ \insn \arg1, \arg3
+ .endif
+ .endif
+ .else // Two-arg [src, dst]
+ .ifnb \unaligned_mem_tmp
+ movdqu \arg1, \unaligned_mem_tmp
+ \insn \unaligned_mem_tmp, \arg2
+ .else
+ \insn \arg1, \arg2
+ .endif
+ .endif
+.else
+ // VEX is allowed. Emit the desired instruction directly.
+ .ifnb \arg3
+ v\insn \arg1, \arg2, \arg3
+ .else
+ v\insn \arg1, \arg2
+ .endif
+.endif
+.endm
+
+// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
+// register of length VL.
+.macro _vbroadcast src, dst
+.if VL == 16
+ _cond_vex movdqa, \src, \dst
+.elseif VL == 32
+ vbroadcasti128 \src, \dst
+.else
+ vbroadcasti32x4 \src, \dst
+.endif
+.endm
+
+// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC
+// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane.
+.macro _load_data vl, src, bswap_mask, dst
+.if \vl < 64
+ _cond_vex movdqu, "\src", \dst
+.else
+ vmovdqu8 \src, \dst
+.endif
+.if !LSB_CRC
+ _cond_vex pshufb, \bswap_mask, \dst, \dst
+.endif
+.endm
+
+.macro _prepare_v0 vl, v0, v1, bswap_mask
+.if LSB_CRC
+ .if \vl < 64
+ _cond_vex pxor, (BUF), \v0, \v0, unaligned_mem_tmp=\v1
+ .else
+ vpxorq (BUF), \v0, \v0
+ .endif
+.else
+ _load_data \vl, (BUF), \bswap_mask, \v1
+ .if \vl < 64
+ _cond_vex pxor, \v1, \v0, \v0
+ .else
+ vpxorq \v1, \v0, \v0
+ .endif
+.endif
+.endm
+
+// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
+// msb-first order or the physically high qword for lsb-first order
+#define LO64_TERMS 0
+
+// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
+// qword for msb-first order or the physically low qword for lsb-first order
+#define HI64_TERMS 1
+
+// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
+// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
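+// For example, with LSB_CRC=1, selecting HI64_TERMS of both sources passes
+// immediate 0x00 to [v]pclmulqdq, i.e. it multiplies the physically low
+// qwords, which is where the x^64..x^127 terms live in lsb-first order.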
+.macro _pclmulqdq src1, src1_terms, src2, src2_terms, dst
+ _cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \
+ \src1, \src2, \dst
+.endm
+
+// Fold \acc into \data and store the result back into \acc. \data can be an
+// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no
+// byte-reflection is needed; otherwise it must be a vector register. \consts
+// is a vector register containing the needed fold constants, and \tmp is a
+// temporary vector register. All arguments must be the same length.
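+// The new \acc, per 128-bit lane, is (HI64_TERMS(\consts) * HI64_TERMS(\acc))
+// xor (LO64_TERMS(\consts) * LO64_TERMS(\acc)) xor \data, where * denotes
+// carryless multiplication.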
+.macro _fold_vec acc, data, consts, tmp
+ _pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp
+ _pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc
+.if AVX_LEVEL <= 2
+ _cond_vex pxor, \data, \tmp, \tmp
+ _cond_vex pxor, \tmp, \acc, \acc
+.else
+ vpternlogq $0x96, \data, \tmp, \acc
+.endif
+.endm
+
+// Fold \acc into \data and store the result back into \acc. \data is an
+// unaligned mem operand, \consts is a vector register containing the needed
+// fold constants, \bswap_mask is a vector register containing the
+// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are
+// temporary vector registers. All arguments must have length \vl.
+.macro _fold_vec_mem vl, acc, data, consts, bswap_mask, tmp1, tmp2
+.if AVX_LEVEL == 0 || !LSB_CRC
+ _load_data \vl, \data, \bswap_mask, \tmp1
+ _fold_vec \acc, \tmp1, \consts, \tmp2
+.else
+ _fold_vec \acc, \data, \consts, \tmp1
+.endif
+.endm
+
+// Load the constants for folding across 2**i vectors of length VL at a time
+// into all 128-bit lanes of the vector register CONSTS.
+.macro _load_vec_folding_consts i
+ _vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \
+ CONSTS
+.endm
+
+// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store
+// the result back into \v0. If the remaining length mod \vl is nonzero, also
+// fold \vl data bytes from BUF. For both operations the fold distance is \vl.
+// \consts must be a register of length \vl containing the fold constants.
+.macro _fold_vec_final vl, v0, v1, consts, bswap_mask, tmp1, tmp2
+ _fold_vec \v0, \v1, \consts, \tmp1
+ test $\vl, LEN8
+ jz .Lfold_vec_final_done\@
+ _fold_vec_mem \vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2
+ add $\vl, BUF
+.Lfold_vec_final_done\@:
+.endm
+
+// This macro generates the body of a CRC function with the following prototype:
+//
+// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts);
+//
+// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it.
+// |buf| is the data to checksum. |len| is the data length in bytes, which must
+// be at least 16. |consts| is a pointer to the fold_across_128_bits_consts
+// field of the constants struct that was generated for the chosen CRC variant.
+//
+// Moving on to the macro parameters, \n is the number of bits in the CRC, e.g.
+// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If
+// the file is compiled in i386 mode, then the maximum supported value is 32.
+//
+// \lsb_crc is 1 if the CRC processes the least significant bit of each byte
+// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0. \lsb_crc is 0
+// if the CRC processes the most significant bit of each byte first, i.e. maps
+// bit0 to x^0, bit1 to x^1, ..., bit7 to x^7.
+//
+// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
+//
+// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or
+// 512 for AVX512.
+//
+// If \vl == 16 && \avx_level == 0, the generated code requires:
+// PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.)
+//
+// If \vl == 32 && \avx_level == 2, the generated code requires:
+// VPCLMULQDQ && AVX2.
+//
+// If \vl == 64 && \avx_level == 512, the generated code requires:
+// VPCLMULQDQ && AVX512BW && AVX512VL.
+//
+// Other \vl and \avx_level combinations are either not supported or not useful.
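+//
+// For example, "_crc_pclmul n=32, lsb_crc=1, vl=16, avx_level=0" generates
+// the body of an SSE-only lsb-first CRC-32 function; see
+// DEFINE_CRC_PCLMUL_FUNCS() at the end of this file for the combinations
+// that are actually instantiated.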
+.macro _crc_pclmul n, lsb_crc, vl, avx_level
+ .set LSB_CRC, \lsb_crc
+ .set VL, \vl
+ .set AVX_LEVEL, \avx_level
+
+ // Define aliases for the xmm, ymm, or zmm registers according to VL.
+.irp i, 0,1,2,3,4,5,6,7
+ .if VL == 16
+ .set V\i, %xmm\i
+ .set LOG2_VL, 4
+ .elseif VL == 32
+ .set V\i, %ymm\i
+ .set LOG2_VL, 5
+ .elseif VL == 64
+ .set V\i, %zmm\i
+ .set LOG2_VL, 6
+ .else
+ .error "Unsupported vector length"
+ .endif
+.endr
+ // Define aliases for the function parameters.
+ // Note: when crc_t is shorter than u32, zero-extension to 32 bits is
+ // guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed
+ // when crc_t is shorter than u64.
+#ifdef __x86_64__
+.if \n <= 32
+ .set CRC, %edi
+.else
+ .set CRC, %rdi
+.endif
+ .set BUF, %rsi
+ .set LEN, %rdx
+ .set LEN32, %edx
+ .set LEN8, %dl
+ .set CONSTS_PTR, %rcx
+#else
+ // 32-bit support, assuming -mregparm=3 and not including support for
+ // CRC-64 (which would use both eax and edx to pass the crc parameter).
+ .set CRC, %eax
+ .set BUF, %edx
+ .set LEN, %ecx
+ .set LEN32, %ecx
+ .set LEN8, %cl
+ .set CONSTS_PTR, %ebx // Passed on stack
+#endif
+
+ // Define aliases for some local variables. V0-V5 are used without
+ // aliases (for accumulators, data, temporary values, etc). Staying
+ // within the first 8 vector registers keeps the code 32-bit SSE
+ // compatible and reduces the size of 64-bit SSE code slightly.
+ .set BSWAP_MASK, V6
+ .set BSWAP_MASK_YMM, %ymm6
+ .set BSWAP_MASK_XMM, %xmm6
+ .set CONSTS, V7
+ .set CONSTS_YMM, %ymm7
+ .set CONSTS_XMM, %xmm7
+
+ // Use ANNOTATE_NOENDBR to suppress an objtool warning, since the
+ // functions generated by this macro are called only by static_call.
+ ANNOTATE_NOENDBR
+
+#ifdef __i386__
+ push CONSTS_PTR
+ mov 8(%esp), CONSTS_PTR
+#endif
+
+	// Create a 128-bit vector that contains the initial CRC at the end
+	// representing the high-order polynomial coefficients, with all other
+	// bits 0.
+ // If the CRC is msb-first, also load the byte-reflection table.
+.if \n <= 32
+ _cond_vex movd, CRC, %xmm0
+.else
+ _cond_vex movq, CRC, %xmm0
+.endif
+.if !LSB_CRC
+ _cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0
+ _vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK
+.endif
+
+ // Load the first vector of data and XOR the initial CRC into the
+ // appropriate end of the first 128-bit lane of data. If LEN < VL, then
+ // use a short vector and jump ahead to the final reduction. (LEN >= 16
+ // is guaranteed here but not necessarily LEN >= VL.)
+.if VL >= 32
+ cmp $VL, LEN
+ jae .Lat_least_1vec\@
+ .if VL == 64
+ cmp $32, LEN32
+ jb .Lless_than_32bytes\@
+ _prepare_v0 32, %ymm0, %ymm1, BSWAP_MASK_YMM
+ add $32, BUF
+ jmp .Lreduce_256bits_to_128bits\@
+.Lless_than_32bytes\@:
+ .endif
+ _prepare_v0 16, %xmm0, %xmm1, BSWAP_MASK_XMM
+ add $16, BUF
+ vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
+ jmp .Lcheck_for_partial_block\@
+.Lat_least_1vec\@:
+.endif
+ _prepare_v0 VL, V0, V1, BSWAP_MASK
+
+ // Handle VL <= LEN < 4*VL.
+ cmp $4*VL-1, LEN
+ ja .Lat_least_4vecs\@
+ add $VL, BUF
+ // If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector.
+ // If VL==16 then load fold_across_128_bits_consts first, as the final
+ // reduction depends on it and it won't be loaded anywhere else.
+ cmp $2*VL-1, LEN32
+.if VL == 16
+ _cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
+.endif
+ jbe .Lreduce_1vec_to_128bits\@
+ // Otherwise 2*VL <= LEN < 4*VL. Load one more vector and jump ahead to
+ // the reduction from 2 vectors.
+ _load_data VL, (BUF), BSWAP_MASK, V1
+ add $VL, BUF
+ jmp .Lreduce_2vecs_to_1\@
+
+.Lat_least_4vecs\@:
+ // Load 3 more vectors of data.
+ _load_data VL, 1*VL(BUF), BSWAP_MASK, V1
+ _load_data VL, 2*VL(BUF), BSWAP_MASK, V2
+ _load_data VL, 3*VL(BUF), BSWAP_MASK, V3
+ sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32
+ add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32
+
+ // Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next
+ // 4 vectors of data and write the result back to V0-V3.
+ cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32
+ jbe .Lreduce_4vecs_to_2\@
+ _load_vec_folding_consts 2
+.Lfold_4vecs_loop\@:
+ _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+ _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+ _fold_vec_mem VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+ _fold_vec_mem VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+ sub $-4*VL, BUF
+ add $-4*VL, LEN
+ cmp $4*VL-1, LEN
+ ja .Lfold_4vecs_loop\@
+
+ // Fold V0,V1 into V2,V3 and write the result back to V0,V1. Then fold
+ // two more vectors of data from BUF, if at least that much remains.
+.Lreduce_4vecs_to_2\@:
+ _load_vec_folding_consts 1
+ _fold_vec V0, V2, CONSTS, V4
+ _fold_vec V1, V3, CONSTS, V4
+ test $2*VL, LEN8
+ jz .Lreduce_2vecs_to_1\@
+ _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+ _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+ sub $-2*VL, BUF
+
+ // Fold V0 into V1 and write the result back to V0. Then fold one more
+ // vector of data from BUF, if at least that much remains.
+.Lreduce_2vecs_to_1\@:
+ _load_vec_folding_consts 0
+ _fold_vec_final VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5
+
+.Lreduce_1vec_to_128bits\@:
+.if VL == 64
+ // Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of
+ // data from BUF, if at least that much remains.
+ vbroadcasti128 OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM
+ vextracti64x4 $1, %zmm0, %ymm1
+ _fold_vec_final 32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5
+.Lreduce_256bits_to_128bits\@:
+.endif
+.if VL >= 32
+ // Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of
+ // data from BUF, if at least that much remains.
+ vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
+ vextracti128 $1, %ymm0, %xmm1
+ _fold_vec_final 16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5
+.Lcheck_for_partial_block\@:
+.endif
+ and $15, LEN32
+ jz .Lreduce_128bits_to_crc\@
+
+ // 1 <= LEN <= 15 data bytes remain in BUF. The polynomial is now
+ // A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
+ // and B is the polynomial of the remaining LEN data bytes. To reduce
+ // this to 128 bits without needing fold constants for each possible
+ // LEN, rearrange this expression into C1*(x^128) + C2, where
+	// C1 = floor(A / x^(128 - 8*LEN)) and C2 = (A*x^(8*LEN) + B) mod x^128.
+ // Then fold C1 into C2, which is just another fold across 128 bits.
+
+.if !LSB_CRC || AVX_LEVEL == 0
+ // Load the last 16 data bytes. Note that originally LEN was >= 16.
+ _load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2
+.endif // Else will use vpblendvb mem operand later.
+.if !LSB_CRC
+ neg LEN // Needed for indexing shuf_table
+.endif
+
+ // tmp = A*x^(8*LEN) mod x^128
+ // lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1]
+ // i.e. right-shift by LEN bytes.
+ // msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN]
+ // i.e. left-shift by LEN bytes.
+ _cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3
+ _cond_vex pshufb, %xmm3, %xmm0, %xmm1
+
+ // C1 = floor(A / x^(128 - 8*LEN))
+ // lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1]
+ // i.e. left-shift by 16-LEN bytes.
+ // msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1]
+ // i.e. right-shift by 16-LEN bytes.
+ _cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \
+ %xmm0, %xmm0, unaligned_mem_tmp=%xmm4
+
+ // C2 = tmp + B. This is just a blend of tmp with the last 16 data
+ // bytes (reflected if msb-first). The blend mask is the shuffle table
+	// that was used to create tmp: 0 selects tmp, and 1 selects the
+	// corresponding byte of the last 16 data bytes.
+.if AVX_LEVEL == 0
+ movdqa %xmm0, %xmm4
+ movdqa %xmm3, %xmm0
+ pblendvb %xmm2, %xmm1 // uses %xmm0 as implicit operand
+ movdqa %xmm4, %xmm0
+.elseif LSB_CRC
+ vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1
+.else
+ vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+.endif
+
+ // Fold C1 into C2 and store the 128-bit result in %xmm0.
+ _fold_vec %xmm0, %xmm1, CONSTS_XMM, %xmm4
+
+.Lreduce_128bits_to_crc\@:
+ // Compute the CRC as %xmm0 * x^n mod G. Here %xmm0 means the 128-bit
+ // polynomial stored in %xmm0 (using either lsb-first or msb-first bit
+ // order according to LSB_CRC), and G is the CRC's generator polynomial.
+
+ // First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
+ //
+ // t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
+ // x^n * (%xmm0 mod x^64)
+ //
+ // Store t0 * x^(64-n) in %xmm0. I.e., actually do:
+ //
+ // %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
+ // x^64 * (%xmm0 mod x^64)
+ //
+ // The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
+ // to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily
+ // select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
+ // msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
+ // (considering the extra factor of x that gets implicitly introduced by
+ // each pclmulqdq when using lsb-first order), is identical to the
+ // constant that was used earlier for folding the LO64_TERMS across 128
+ // bits. Thus it's already available in LO64_TERMS of CONSTS_XMM.
+ _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1
+.if LSB_CRC
+ _cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
+.else
+ _cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
+.endif
+ _cond_vex pxor, %xmm1, %xmm0, %xmm0
+ // The HI64_TERMS of %xmm0 now contain floor(t0 / x^n).
+ // The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).
+
+ // First step of Barrett reduction: Compute floor(t0 / G). This is the
+ // polynomial by which G needs to be multiplied to cancel out the x^n
+ // and higher terms of t0, i.e. to reduce t0 mod G. First do:
+ //
+ // t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n)
+ //
+ // Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in
+ // x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest
+	// value that ensures enough precision is carried through the calculation.
+ //
+ // The '* x' makes it so the result is floor(t1 / x^64) rather than
+ // floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it
+ // can be extracted much more easily in the next step. In the lsb-first
+ // case the '* x' happens implicitly. In the msb-first case it must be
+ // done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
+ // constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
+ // the multiplication by the x^64 term is handled using a pxor. The
+ // pxor causes the low 64 terms of t1 to be wrong, but they are unused.
+ _cond_vex movdqa, OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM
+ _pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1
+.if !LSB_CRC
+ _cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n)
+.endif
+ // The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).
+
+ // Second step of Barrett reduction: Cancel out the x^n and higher terms
+ // of t0 by subtracting the needed multiple of G. This gives the CRC:
+ //
+ // crc := t0 - (G * floor(t0 / G))
+ //
+ // But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
+ //
+ // crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
+ //
+ // Furthermore, since the resulting CRC is n-bit, if mod x^n is
+ // explicitly applied to it then the x^n term of G makes no difference
+ // in the result and can be omitted. This helps keep the constant
+ // multiplier in 64 bits in most cases. This gives the following:
+ //
+ // %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
+ // crc := (%xmm0 / x^(64-n)) mod x^n
+ //
+ // In the lsb-first case, each pclmulqdq implicitly introduces
+ // an extra factor of x, so in that case the constant that needs to be
+ // passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63.
+ // For lsb-first CRCs where n=64, the extra factor of x cannot be as
+ // easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to
+ // pclmulqdq and handle the x^0 term (i.e. 1) separately. (All CRC
+ // polynomials have nonzero x^n and x^0 terms.) It works out as: the
+	// CRC has to be XORed with the physically low qword of %xmm1, which
+	// represents floor(t0 / G). The most efficient way to do that is to
+	// move it to the physically high qword and use a ternlog to combine
+	// the two XORs.
+.if LSB_CRC && \n == 64
+ _cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2
+ _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
+ .if AVX_LEVEL <= 2
+ _cond_vex pxor, %xmm2, %xmm0, %xmm0
+ _cond_vex pxor, %xmm1, %xmm0, %xmm0
+ .else
+ vpternlogq $0x96, %xmm2, %xmm1, %xmm0
+ .endif
+ _cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64
+.else
+ _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
+ _cond_vex pxor, %xmm1, %xmm0, %xmm0
+ .if \n == 8
+ _cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8
+ .elseif \n == 16
+ _cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16
+ .elseif \n == 32
+ _cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32
+ .else // \n == 64 && !LSB_CRC
+ _cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64
+ .endif
+.endif
+
+.if VL > 16
+ vzeroupper // Needed when ymm or zmm registers may have been used.
+.endif
+#ifdef __i386__
+ pop CONSTS_PTR
+#endif
+ RET
+.endm
+
+#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
+SYM_FUNC_START(prefix##_pclmul_sse); \
+ _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
+SYM_FUNC_END(prefix##_pclmul_sse); \
+ \
+SYM_FUNC_START(prefix##_vpclmul_avx2); \
+ _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=2; \
+SYM_FUNC_END(prefix##_vpclmul_avx2); \
+ \
+SYM_FUNC_START(prefix##_vpclmul_avx512); \
+ _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \
+SYM_FUNC_END(prefix##_vpclmul_avx512);
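+
+// For example, crc32-pclmul.S invokes DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, 32, 1),
+// which defines crc32_lsb_pclmul_sse(), crc32_lsb_vpclmul_avx2(), and
+// crc32_lsb_vpclmul_avx512().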
diff --git a/lib/crc/x86/crc-pclmul-template.h b/lib/crc/x86/crc-pclmul-template.h
new file mode 100644
index 000000000000..35c950d7010c
--- /dev/null
+++ b/lib/crc/x86/crc-pclmul-template.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are
+ * instantiated by crc-pclmul-template.S
+ *
+ * Copyright 2025 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+#ifndef _CRC_PCLMUL_TEMPLATE_H
+#define _CRC_PCLMUL_TEMPLATE_H
+
+#include <asm/cpufeatures.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <linux/static_call.h>
+#include "crc-pclmul-consts.h"
+
+#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t) \
+crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len, \
+ const void *consts_ptr); \
+crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len, \
+ const void *consts_ptr); \
+crc_t prefix##_vpclmul_avx512(crc_t crc, const u8 *p, size_t len, \
+ const void *consts_ptr); \
+DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse)
+
+static inline bool have_vpclmul(void)
+{
+ return boot_cpu_has(X86_FEATURE_VPCLMULQDQ) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL);
+}
+
+static inline bool have_avx512(void)
+{
+ return boot_cpu_has(X86_FEATURE_AVX512BW) &&
+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ !boot_cpu_has(X86_FEATURE_PREFER_YMM) &&
+ cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL);
+}
+
+/*
+ * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16
+ * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD.
+ *
+ * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions.
+ * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(),
+ * varying by CPU and factors such as which parts of the "FPU" state userspace
+ * has touched, which could result in a larger cutoff being better. Indeed, a
+ * larger cutoff is usually better for a *single* message. However, the
+ * overhead of the FPU section gets amortized if multiple FPU sections get
+ * executed before returning to userspace, since the XSAVE and XRSTOR occur only
+ * once. Considering that and the fact that the [V]PCLMULQDQ code is lighter on
+ * the dcache than the table-based code is, a 16-byte cutoff seems to work well.
+ */
+#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq) \
+do { \
+ if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) && \
+ crypto_simd_usable()) { \
+ const void *consts_ptr; \
+ \
+ consts_ptr = (consts).fold_across_128_bits_consts; \
+ kernel_fpu_begin(); \
+ crc = static_call(prefix##_pclmul)((crc), (p), (len), \
+ consts_ptr); \
+ kernel_fpu_end(); \
+ return crc; \
+ } \
+} while (0)
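+
+/*
+ * Note that CRC_PCLMUL() 'return's when the accelerated path is taken, so
+ * callers are expected to invoke it at the top of an arch CRC function and
+ * fall through to the generic implementation; see e.g. crc_t10dif_arch()
+ * in crc-t10dif.h.
+ */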
+
+#endif /* _CRC_PCLMUL_TEMPLATE_H */
diff --git a/lib/crc/x86/crc-t10dif.h b/lib/crc/x86/crc-t10dif.h
new file mode 100644
index 000000000000..2a02a3026f3f
--- /dev/null
+++ b/lib/crc/x86/crc-t10dif.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * CRC-T10DIF using [V]PCLMULQDQ instructions
+ *
+ * Copyright 2024 Google LLC
+ */
+
+#include "crc-pclmul-template.h"
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+
+DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16);
+
+static inline u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len)
+{
+ CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts,
+ have_pclmulqdq);
+ return crc_t10dif_generic(crc, p, len);
+}
+
+#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch
+static inline void crc_t10dif_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+ static_branch_enable(&have_pclmulqdq);
+ if (have_vpclmul()) {
+ if (have_avx512())
+ static_call_update(crc16_msb_pclmul,
+ crc16_msb_vpclmul_avx512);
+ else
+ static_call_update(crc16_msb_pclmul,
+ crc16_msb_vpclmul_avx2);
+ }
+ }
+}
diff --git a/lib/crc/x86/crc16-msb-pclmul.S b/lib/crc/x86/crc16-msb-pclmul.S
new file mode 100644
index 000000000000..e9fe248093a8
--- /dev/null
+++ b/lib/crc/x86/crc16-msb-pclmul.S
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+// Copyright 2025 Google LLC
+
+#include "crc-pclmul-template.S"
+
+DEFINE_CRC_PCLMUL_FUNCS(crc16_msb, /* bits= */ 16, /* lsb= */ 0)
diff --git a/lib/crc/x86/crc32-pclmul.S b/lib/crc/x86/crc32-pclmul.S
new file mode 100644
index 000000000000..f20f40fb0172
--- /dev/null
+++ b/lib/crc/x86/crc32-pclmul.S
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+// Copyright 2025 Google LLC
+
+#include "crc-pclmul-template.S"
+
+DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)
diff --git a/lib/crc/x86/crc32.h b/lib/crc/x86/crc32.h
new file mode 100644
index 000000000000..cea2c96d08d0
--- /dev/null
+++ b/lib/crc/x86/crc32.h
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * x86-optimized CRC32 functions
+ *
+ * Copyright (C) 2008 Intel Corporation
+ * Copyright 2012 Xyratex Technology Limited
+ * Copyright 2024 Google LLC
+ */
+
+#include "crc-pclmul-template.h"
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);
+
+DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
+
+static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+ CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
+ have_pclmulqdq);
+ return crc32_le_base(crc, p, len);
+}
+
+#ifdef CONFIG_X86_64
+#define CRC32_INST "crc32q %1, %q0"
+#else
+#define CRC32_INST "crc32l %1, %0"
+#endif
+
+/*
+ * Use the carryless multiply version of crc32c only when the buffer size is
+ * at least 512 bytes; below that, the FPU state save/restore overhead tends
+ * to outweigh the benefit.
+ */
+#define CRC32C_PCLMUL_BREAKEVEN 512
+
+asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
+
+static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
+{
+ size_t num_longs;
+
+ if (!static_branch_likely(&have_crc32))
+ return crc32c_base(crc, p, len);
+
+ if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
+ static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
+ /*
+ * Long length, the vector registers are usable, and the CPU is
+ * 64-bit and supports both CRC32 and PCLMULQDQ instructions.
+ * It is worthwhile to divide the data into multiple streams,
+ * CRC them independently, and combine them using PCLMULQDQ.
+ * crc32c_x86_3way() does this using 3 streams, which is the
+ * most that x86_64 CPUs have traditionally been capable of.
+ *
+ * However, due to improved VPCLMULQDQ performance on newer
+ * CPUs, use crc32_lsb_vpclmul_avx512() instead of
+ * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
+ * "good" implementation of AVX-512.
+ *
+ * Future work: the optimal strategy on Zen 3--5 is actually to
+ * use both crc32q and VPCLMULQDQ in parallel. Unfortunately,
+ * different numbers of streams and vector lengths are optimal
+ * on each CPU microarchitecture, making it challenging to take
+ * advantage of this. (Zen 5 even supports 7 parallel crc32q, a
+ * major upgrade.) For now, just choose between
+ * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512(). The latter
+ * is needed anyway for crc32_le(), so we just reuse it here.
+ */
+ kernel_fpu_begin();
+ if (static_branch_likely(&have_vpclmul_avx512))
+ crc = crc32_lsb_vpclmul_avx512(crc, p, len,
+ crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
+ else
+ crc = crc32c_x86_3way(crc, p, len);
+ kernel_fpu_end();
+ return crc;
+ }
+
+ /*
+ * Short length, XMM registers unusable, or the CPU is 32-bit; but the
+ * CPU supports CRC32 instructions. Just issue a single stream of CRC32
+ * instructions inline. While this doesn't use the CPU's CRC32
+	 * throughput very well, it avoids the need to combine streams, which
+	 * would be inefficient here.
+ */
+
+ for (num_longs = len / sizeof(unsigned long);
+ num_longs != 0; num_longs--, p += sizeof(unsigned long))
+ asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));
+
+ if (sizeof(unsigned long) > 4 && (len & 4)) {
+ asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
+ p += 4;
+ }
+ if (len & 2) {
+ asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
+ p += 2;
+ }
+ if (len & 1)
+ asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));
+
+ return crc;
+}
+
+#define crc32_be_arch crc32_be_base /* not implemented on this arch */
+
+#define crc32_mod_init_arch crc32_mod_init_arch
+static inline void crc32_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_XMM4_2))
+ static_branch_enable(&have_crc32);
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+ static_branch_enable(&have_pclmulqdq);
+ if (have_vpclmul()) {
+ if (have_avx512()) {
+ static_call_update(crc32_lsb_pclmul,
+ crc32_lsb_vpclmul_avx512);
+ static_branch_enable(&have_vpclmul_avx512);
+ } else {
+ static_call_update(crc32_lsb_pclmul,
+ crc32_lsb_vpclmul_avx2);
+ }
+ }
+ }
+}
+
+static inline u32 crc32_optimizations_arch(void)
+{
+ u32 optimizations = 0;
+
+ if (static_key_enabled(&have_crc32))
+ optimizations |= CRC32C_OPTIMIZATION;
+ if (static_key_enabled(&have_pclmulqdq))
+ optimizations |= CRC32_LE_OPTIMIZATION;
+ return optimizations;
+}
diff --git a/lib/crc/x86/crc32c-3way.S b/lib/crc/x86/crc32c-3way.S
new file mode 100644
index 000000000000..9b8770503bbc
--- /dev/null
+++ b/lib/crc/x86/crc32c-3way.S
@@ -0,0 +1,360 @@
+/*
+ * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
+ *
+ * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
+ * downloaded from:
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
+ *
+ * Copyright (C) 2012 Intel Corporation.
+ * Copyright 2024 Google LLC
+ *
+ * Authors:
+ * Wajdi Feghali <wajdi.k.feghali@intel.com>
+ * James Guilford <james.guilford@intel.com>
+ * David Cote <david.m.cote@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/linkage.h>
+
+## iSCSI CRC32 implementation with crc32 and pclmulqdq instructions
+
+# Define threshold below which buffers are considered "small" and routed to
+# regular CRC code that does not interleave the CRC instructions.
+#define SMALL_SIZE 200
+
+# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
+
+.text
+SYM_FUNC_START(crc32c_x86_3way)
+#define crc0 %edi
+#define crc0_q %rdi
+#define bufp %rsi
+#define bufp_d %esi
+#define len %rdx
+#define len_dw %edx
+#define n_misaligned %ecx /* overlaps chunk_bytes! */
+#define n_misaligned_q %rcx
+#define chunk_bytes %ecx /* overlaps n_misaligned! */
+#define chunk_bytes_q %rcx
+#define crc1 %r8
+#define crc2 %r9
+
+ cmp $SMALL_SIZE, len
+ jb .Lsmall
+
+ ################################################################
+ ## 1) ALIGN:
+ ################################################################
+ mov bufp_d, n_misaligned
+ neg n_misaligned
+ and $7, n_misaligned # calculate the misalignment amount of
+ # the address
+ je .Laligned # Skip if aligned
+
+ # Process 1 <= n_misaligned <= 7 bytes individually in order to align
+ # the remaining data to an 8-byte boundary.
+.Ldo_align:
+ movq (bufp), %rax
+ add n_misaligned_q, bufp
+ sub n_misaligned_q, len
+.Lalign_loop:
+	crc32b %al, crc0 # compute crc32 of 1 byte
+ shr $8, %rax # get next byte
+ dec n_misaligned
+ jne .Lalign_loop
+.Laligned:
+
+ ################################################################
+ ## 2) PROCESS BLOCK:
+ ################################################################
+
+ cmp $128*24, len
+ jae .Lfull_block
+
+.Lpartial_block:
+ # Compute floor(len / 24) to get num qwords to process from each lane.
+ imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24)
+ shr $16, %eax
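+	# (E.g. len = 3000: 3000*2731 = 8193000, and 8193000 >> 16 = 125 =
+	# floor(3000/24). The approximation is exact for all len < 128*24.)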
+ jmp .Lcrc_3lanes
+
+.Lfull_block:
+	# Process 128 qwords from each lane.
+ mov $128, %eax
+
+ ################################################################
+ ## 3) CRC each of three lanes:
+ ################################################################
+
+.Lcrc_3lanes:
+ xor crc1,crc1
+ xor crc2,crc2
+ mov %eax, chunk_bytes
+ shl $3, chunk_bytes # num bytes to process from each lane
+ sub $5, %eax # 4 for 4x_loop, 1 for special last iter
+ jl .Lcrc_3lanes_4x_done
+
+ # Unroll the loop by a factor of 4 to reduce the overhead of the loop
+ # bookkeeping instructions, which can compete with crc32q for the ALUs.
+.Lcrc_3lanes_4x_loop:
+ crc32q (bufp), crc0_q
+ crc32q (bufp,chunk_bytes_q), crc1
+ crc32q (bufp,chunk_bytes_q,2), crc2
+ crc32q 8(bufp), crc0_q
+ crc32q 8(bufp,chunk_bytes_q), crc1
+ crc32q 8(bufp,chunk_bytes_q,2), crc2
+ crc32q 16(bufp), crc0_q
+ crc32q 16(bufp,chunk_bytes_q), crc1
+ crc32q 16(bufp,chunk_bytes_q,2), crc2
+ crc32q 24(bufp), crc0_q
+ crc32q 24(bufp,chunk_bytes_q), crc1
+ crc32q 24(bufp,chunk_bytes_q,2), crc2
+ add $32, bufp
+ sub $4, %eax
+ jge .Lcrc_3lanes_4x_loop
+
+.Lcrc_3lanes_4x_done:
+ add $4, %eax
+ jz .Lcrc_3lanes_last_qword
+
+.Lcrc_3lanes_1x_loop:
+ crc32q (bufp), crc0_q
+ crc32q (bufp,chunk_bytes_q), crc1
+ crc32q (bufp,chunk_bytes_q,2), crc2
+ add $8, bufp
+ dec %eax
+ jnz .Lcrc_3lanes_1x_loop
+
+.Lcrc_3lanes_last_qword:
+ crc32q (bufp), crc0_q
+ crc32q (bufp,chunk_bytes_q), crc1
+# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
+
+ ################################################################
+ ## 4) Combine three results:
+ ################################################################
+
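+	# The CRCs of the first two lanes must be carried across the data
+	# covered by the later lane(s), which is done by multiplying each of
+	# them by the appropriate power of x mod G; the K_table entry for this
+	# chunk size supplies those two constants (K1:K2), applied below via
+	# pclmulqdq. The third lane's CRC is then folded in directly with
+	# crc32q.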
+ lea (K_table-8)(%rip), %rax # first entry is for idx 1
+ pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
+ lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
+ sub %rax, len # len -= chunk_bytes * 3
+
+ movq crc0_q, %xmm1 # CRC for block 1
+ pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
+
+ movq crc1, %xmm2 # CRC for block 2
+ pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
+
+ pxor %xmm2,%xmm1
+ movq %xmm1, %rax
+ xor (bufp,chunk_bytes_q,2), %rax
+ mov crc2, crc0_q
+ crc32 %rax, crc0_q
+ lea 8(bufp,chunk_bytes_q,2), bufp
+
+ ################################################################
+ ## 5) If more blocks remain, goto (2):
+ ################################################################
+
+ cmp $128*24, len
+ jae .Lfull_block
+ cmp $SMALL_SIZE, len
+ jae .Lpartial_block
+
+ #######################################################################
+ ## 6) Process any remainder without interleaving:
+ #######################################################################
+.Lsmall:
+ test len_dw, len_dw
+ jz .Ldone
+ mov len_dw, %eax
+ shr $3, %eax
+ jz .Ldo_dword
+.Ldo_qwords:
+ crc32q (bufp), crc0_q
+ add $8, bufp
+ dec %eax
+ jnz .Ldo_qwords
+.Ldo_dword:
+ test $4, len_dw
+ jz .Ldo_word
+ crc32l (bufp), crc0
+ add $4, bufp
+.Ldo_word:
+ test $2, len_dw
+ jz .Ldo_byte
+ crc32w (bufp), crc0
+ add $2, bufp
+.Ldo_byte:
+ test $1, len_dw
+ jz .Ldone
+ crc32b (bufp), crc0
+.Ldone:
+ mov crc0, %eax
+ RET
+SYM_FUNC_END(crc32c_x86_3way)
+
+.section .rodata, "a", @progbits
+ ################################################################
+ ## PCLMULQDQ tables
+ ## Table is 128 entries x 2 words (8 bytes) each
+ ################################################################
+.align 8
+K_table:
+ .long 0x493c7d27, 0x00000001
+ .long 0xba4fc28e, 0x493c7d27
+ .long 0xddc0152b, 0xf20c0dfe
+ .long 0x9e4addf8, 0xba4fc28e
+ .long 0x39d3b296, 0x3da6d0cb
+ .long 0x0715ce53, 0xddc0152b
+ .long 0x47db8317, 0x1c291d04
+ .long 0x0d3b6092, 0x9e4addf8
+ .long 0xc96cfdc0, 0x740eef02
+ .long 0x878a92a7, 0x39d3b296
+ .long 0xdaece73e, 0x083a6eec
+ .long 0xab7aff2a, 0x0715ce53
+ .long 0x2162d385, 0xc49f4f67
+ .long 0x83348832, 0x47db8317
+ .long 0x299847d5, 0x2ad91c30
+ .long 0xb9e02b86, 0x0d3b6092
+ .long 0x18b33a4e, 0x6992cea2
+ .long 0xb6dd949b, 0xc96cfdc0
+ .long 0x78d9ccb7, 0x7e908048
+ .long 0xbac2fd7b, 0x878a92a7
+ .long 0xa60ce07b, 0x1b3d8f29
+ .long 0xce7f39f4, 0xdaece73e
+ .long 0x61d82e56, 0xf1d0f55e
+ .long 0xd270f1a2, 0xab7aff2a
+ .long 0xc619809d, 0xa87ab8a8
+ .long 0x2b3cac5d, 0x2162d385
+ .long 0x65863b64, 0x8462d800
+ .long 0x1b03397f, 0x83348832
+ .long 0xebb883bd, 0x71d111a8
+ .long 0xb3e32c28, 0x299847d5
+ .long 0x064f7f26, 0xffd852c6
+ .long 0xdd7e3b0c, 0xb9e02b86
+ .long 0xf285651c, 0xdcb17aa4
+ .long 0x10746f3c, 0x18b33a4e
+ .long 0xc7a68855, 0xf37c5aee
+ .long 0x271d9844, 0xb6dd949b
+ .long 0x8e766a0c, 0x6051d5a2
+ .long 0x93a5f730, 0x78d9ccb7
+ .long 0x6cb08e5c, 0x18b0d4ff
+ .long 0x6b749fb2, 0xbac2fd7b
+ .long 0x1393e203, 0x21f3d99c
+ .long 0xcec3662e, 0xa60ce07b
+ .long 0x96c515bb, 0x8f158014
+ .long 0xe6fc4e6a, 0xce7f39f4
+ .long 0x8227bb8a, 0xa00457f7
+ .long 0xb0cd4768, 0x61d82e56
+ .long 0x39c7ff35, 0x8d6d2c43
+ .long 0xd7a4825c, 0xd270f1a2
+ .long 0x0ab3844b, 0x00ac29cf
+ .long 0x0167d312, 0xc619809d
+ .long 0xf6076544, 0xe9adf796
+ .long 0x26f6a60a, 0x2b3cac5d
+ .long 0xa741c1bf, 0x96638b34
+ .long 0x98d8d9cb, 0x65863b64
+ .long 0x49c3cc9c, 0xe0e9f351
+ .long 0x68bce87a, 0x1b03397f
+ .long 0x57a3d037, 0x9af01f2d
+ .long 0x6956fc3b, 0xebb883bd
+ .long 0x42d98888, 0x2cff42cf
+ .long 0x3771e98f, 0xb3e32c28
+ .long 0xb42ae3d9, 0x88f25a3a
+ .long 0x2178513a, 0x064f7f26
+ .long 0xe0ac139e, 0x4e36f0b0
+ .long 0x170076fa, 0xdd7e3b0c
+ .long 0x444dd413, 0xbd6f81f8
+ .long 0x6f345e45, 0xf285651c
+ .long 0x41d17b64, 0x91c9bd4b
+ .long 0xff0dba97, 0x10746f3c
+ .long 0xa2b73df1, 0x885f087b
+ .long 0xf872e54c, 0xc7a68855
+ .long 0x1e41e9fc, 0x4c144932
+ .long 0x86d8e4d2, 0x271d9844
+ .long 0x651bd98b, 0x52148f02
+ .long 0x5bb8f1bc, 0x8e766a0c
+ .long 0xa90fd27a, 0xa3c6f37a
+ .long 0xb3af077a, 0x93a5f730
+ .long 0x4984d782, 0xd7c0557f
+ .long 0xca6ef3ac, 0x6cb08e5c
+ .long 0x234e0b26, 0x63ded06a
+ .long 0xdd66cbbb, 0x6b749fb2
+ .long 0x4597456a, 0x4d56973c
+ .long 0xe9e28eb4, 0x1393e203
+ .long 0x7b3ff57a, 0x9669c9df
+ .long 0xc9c8b782, 0xcec3662e
+ .long 0x3f70cc6f, 0xe417f38a
+ .long 0x93e106a4, 0x96c515bb
+ .long 0x62ec6c6d, 0x4b9e0f71
+ .long 0xd813b325, 0xe6fc4e6a
+ .long 0x0df04680, 0xd104b8fc
+ .long 0x2342001e, 0x8227bb8a
+ .long 0x0a2a8d7e, 0x5b397730
+ .long 0x6d9a4957, 0xb0cd4768
+ .long 0xe8b6368b, 0xe78eb416
+ .long 0xd2c3ed1a, 0x39c7ff35
+ .long 0x995a5724, 0x61ff0e01
+ .long 0x9ef68d35, 0xd7a4825c
+ .long 0x0c139b31, 0x8d96551c
+ .long 0xf2271e60, 0x0ab3844b
+ .long 0x0b0bf8ca, 0x0bf80dd2
+ .long 0x2664fd8b, 0x0167d312
+ .long 0xed64812d, 0x8821abed
+ .long 0x02ee03b2, 0xf6076544
+ .long 0x8604ae0f, 0x6a45d2b2
+ .long 0x363bd6b3, 0x26f6a60a
+ .long 0x135c83fd, 0xd8d26619
+ .long 0x5fabe670, 0xa741c1bf
+ .long 0x35ec3279, 0xde87806c
+ .long 0x00bcf5f6, 0x98d8d9cb
+ .long 0x8ae00689, 0x14338754
+ .long 0x17f27698, 0x49c3cc9c
+ .long 0x58ca5f00, 0x5bd2011f
+ .long 0xaa7c7ad5, 0x68bce87a
+ .long 0xb5cfca28, 0xdd07448e
+ .long 0xded288f8, 0x57a3d037
+ .long 0x59f229bc, 0xdde8f5b9
+ .long 0x6d390dec, 0x6956fc3b
+ .long 0x37170390, 0xa3e3e02c
+ .long 0x6353c1cc, 0x42d98888
+ .long 0xc4584f5c, 0xd73c7bea
+ .long 0xf48642e9, 0x3771e98f
+ .long 0x531377e2, 0x80ff0093
+ .long 0xdd35bc8d, 0xb42ae3d9
+ .long 0xb25b29f2, 0x8fe4c34d
+ .long 0x9a5ede41, 0x2178513a
+ .long 0xa563905d, 0xdf99fc11
+ .long 0x45cddf4e, 0xe0ac139e
+ .long 0xacfa3103, 0x6c23e841
+ .long 0xa51b6135, 0x170076fa
diff --git a/lib/crc/x86/crc64-pclmul.S b/lib/crc/x86/crc64-pclmul.S
new file mode 100644
index 000000000000..4173051b5197
--- /dev/null
+++ b/lib/crc/x86/crc64-pclmul.S
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+// Copyright 2025 Google LLC
+
+#include "crc-pclmul-template.S"
+
+DEFINE_CRC_PCLMUL_FUNCS(crc64_msb, /* bits= */ 64, /* lsb= */ 0)
+DEFINE_CRC_PCLMUL_FUNCS(crc64_lsb, /* bits= */ 64, /* lsb= */ 1)
diff --git a/lib/crc/x86/crc64.h b/lib/crc/x86/crc64.h
new file mode 100644
index 000000000000..fde1222c4c58
--- /dev/null
+++ b/lib/crc/x86/crc64.h
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * CRC64 using [V]PCLMULQDQ instructions
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include "crc-pclmul-template.h"
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+
+DECLARE_CRC_PCLMUL_FUNCS(crc64_msb, u64);
+DECLARE_CRC_PCLMUL_FUNCS(crc64_lsb, u64);
+
+static inline u64 crc64_be_arch(u64 crc, const u8 *p, size_t len)
+{
+ CRC_PCLMUL(crc, p, len, crc64_msb, crc64_msb_0x42f0e1eba9ea3693_consts,
+ have_pclmulqdq);
+ return crc64_be_generic(crc, p, len);
+}
+
+static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
+{
+ CRC_PCLMUL(crc, p, len, crc64_lsb, crc64_lsb_0x9a6c9329ac4bc9b5_consts,
+ have_pclmulqdq);
+ return crc64_nvme_generic(crc, p, len);
+}
+
+#define crc64_mod_init_arch crc64_mod_init_arch
+static inline void crc64_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+ static_branch_enable(&have_pclmulqdq);
+ if (have_vpclmul()) {
+ if (have_avx512()) {
+ static_call_update(crc64_msb_pclmul,
+ crc64_msb_vpclmul_avx512);
+ static_call_update(crc64_lsb_pclmul,
+ crc64_lsb_vpclmul_avx512);
+ } else {
+ static_call_update(crc64_msb_pclmul,
+ crc64_msb_vpclmul_avx2);
+ static_call_update(crc64_lsb_pclmul,
+ crc64_lsb_vpclmul_avx2);
+ }
+ }
+ }
+}