diff options
Diffstat (limited to 'lib/crc')
58 files changed, 8918 insertions, 0 deletions
diff --git a/lib/crc/.gitignore b/lib/crc/.gitignore new file mode 100644 index 000000000000..a9e48103c9fb --- /dev/null +++ b/lib/crc/.gitignore @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +/crc32table.h +/crc64table.h +/gen_crc32table +/gen_crc64table diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig new file mode 100644 index 000000000000..70e7a6016de3 --- /dev/null +++ b/lib/crc/Kconfig @@ -0,0 +1,119 @@ +# SPDX-License-Identifier: GPL-2.0-only + +# Kconfig for the kernel's cyclic redundancy check (CRC) library code + +config CRC4 + tristate + help + The CRC4 library functions. Select this if your module uses any of + the functions from <linux/crc4.h>. + +config CRC7 + tristate + help + The CRC7 library functions. Select this if your module uses any of + the functions from <linux/crc7.h>. + +config CRC8 + tristate + help + The CRC8 library functions. Select this if your module uses any of + the functions from <linux/crc8.h>. + +config CRC16 + tristate + help + The CRC16 library functions. Select this if your module uses any of + the functions from <linux/crc16.h>. + +config CRC_CCITT + tristate + help + The CRC-CCITT library functions. Select this if your module uses any + of the functions from <linux/crc-ccitt.h>. + +config CRC_ITU_T + tristate + help + The CRC-ITU-T library functions. Select this if your module uses + any of the functions from <linux/crc-itu-t.h>. + +config CRC_T10DIF + tristate + help + The CRC-T10DIF library functions. Select this if your module uses + any of the functions from <linux/crc-t10dif.h>. + +config CRC_T10DIF_ARCH + bool + depends on CRC_T10DIF && CRC_OPTIMIZATIONS + default y if ARM && KERNEL_MODE_NEON + default y if ARM64 && KERNEL_MODE_NEON + default y if PPC64 && ALTIVEC + default y if RISCV && RISCV_ISA_ZBC + default y if X86 + +config CRC32 + tristate + select BITREVERSE + help + The CRC32 library functions. Select this if your module uses any of + the functions from <linux/crc32.h> or <linux/crc32c.h>. 
+ +config CRC32_ARCH + bool + depends on CRC32 && CRC_OPTIMIZATIONS + default y if ARM && KERNEL_MODE_NEON + default y if ARM64 + default y if LOONGARCH + default y if MIPS && CPU_MIPSR6 + default y if PPC64 && ALTIVEC + default y if RISCV && RISCV_ISA_ZBC + default y if S390 + default y if SPARC64 + default y if X86 + +config CRC64 + tristate + help + The CRC64 library functions. Select this if your module uses any of + the functions from <linux/crc64.h>. + +config CRC64_ARCH + bool + depends on CRC64 && CRC_OPTIMIZATIONS + default y if RISCV && RISCV_ISA_ZBC && 64BIT + default y if X86_64 + +config CRC_OPTIMIZATIONS + bool "Enable optimized CRC implementations" if EXPERT + depends on !UML + default y + help + Disabling this option reduces code size slightly by disabling the + architecture-optimized implementations of any CRC variants that are + enabled. CRC checksumming performance may get much slower. + + Keep this enabled unless you're really trying to minimize the size of + the kernel. + +config CRC_KUNIT_TEST + tristate "KUnit tests for CRC functions" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + select CRC7 + select CRC16 + select CRC_T10DIF + select CRC32 + select CRC64 + help + Unit tests for the CRC library functions. + + This is intended to help people writing architecture-specific + optimized versions. If unsure, say N. + +config CRC_BENCHMARK + bool "Benchmark for the CRC functions" + depends on CRC_KUNIT_TEST + help + Include benchmarks in the KUnit test suite for the CRC functions. 
diff --git a/lib/crc/Makefile b/lib/crc/Makefile new file mode 100644 index 000000000000..7543ad295ab6 --- /dev/null +++ b/lib/crc/Makefile @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: GPL-2.0-only + +# Makefile for the kernel's cyclic redundancy check (CRC) library code + +obj-$(CONFIG_CRC4) += crc4.o +obj-$(CONFIG_CRC7) += crc7.o +obj-$(CONFIG_CRC8) += crc8.o +obj-$(CONFIG_CRC16) += crc16.o +obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o +obj-$(CONFIG_CRC_ITU_T) += crc-itu-t.o + +obj-$(CONFIG_CRC_T10DIF) += crc-t10dif.o +crc-t10dif-y := crc-t10dif-main.o +ifeq ($(CONFIG_CRC_T10DIF_ARCH),y) +CFLAGS_crc-t10dif-main.o += -I$(src)/$(SRCARCH) +crc-t10dif-$(CONFIG_ARM) += arm/crc-t10dif-core.o +crc-t10dif-$(CONFIG_ARM64) += arm64/crc-t10dif-core.o +crc-t10dif-$(CONFIG_PPC) += powerpc/crct10dif-vpmsum_asm.o +crc-t10dif-$(CONFIG_RISCV) += riscv/crc16_msb.o +crc-t10dif-$(CONFIG_X86) += x86/crc16-msb-pclmul.o +endif + +obj-$(CONFIG_CRC32) += crc32.o +crc32-y := crc32-main.o +ifeq ($(CONFIG_CRC32_ARCH),y) +CFLAGS_crc32-main.o += -I$(src)/$(SRCARCH) +crc32-$(CONFIG_ARM) += arm/crc32-core.o +crc32-$(CONFIG_ARM64) += arm64/crc32-core.o +crc32-$(CONFIG_PPC) += powerpc/crc32c-vpmsum_asm.o +crc32-$(CONFIG_RISCV) += riscv/crc32_lsb.o riscv/crc32_msb.o +crc32-$(CONFIG_S390) += s390/crc32le-vx.o s390/crc32be-vx.o +crc32-$(CONFIG_SPARC) += sparc/crc32c_asm.o +crc32-$(CONFIG_X86) += x86/crc32-pclmul.o +crc32-$(CONFIG_X86_64) += x86/crc32c-3way.o +endif + +obj-$(CONFIG_CRC64) += crc64.o +crc64-y := crc64-main.o +ifeq ($(CONFIG_CRC64_ARCH),y) +CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH) +crc64-$(CONFIG_RISCV) += riscv/crc64_lsb.o riscv/crc64_msb.o +crc64-$(CONFIG_X86) += x86/crc64-pclmul.o +endif + +obj-y += tests/ + +hostprogs := gen_crc32table gen_crc64table +clean-files := crc32table.h crc64table.h + +$(obj)/crc32-main.o: $(obj)/crc32table.h +$(obj)/crc64-main.o: $(obj)/crc64table.h + +quiet_cmd_crc32 = GEN $@ + cmd_crc32 = $< > $@ + +quiet_cmd_crc64 = GEN $@ + cmd_crc64 = $< > $@ + 
+$(obj)/crc32table.h: $(obj)/gen_crc32table + $(call cmd,crc32) + +$(obj)/crc64table.h: $(obj)/gen_crc64table + $(call cmd,crc64) diff --git a/lib/crc/arm/crc-t10dif-core.S b/lib/crc/arm/crc-t10dif-core.S new file mode 100644 index 000000000000..2bbf2df9c1e2 --- /dev/null +++ b/lib/crc/arm/crc-t10dif-core.S @@ -0,0 +1,468 @@ +// +// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions +// +// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> +// Copyright (C) 2019 Google LLC <ebiggers@google.com> +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License version 2 as +// published by the Free Software Foundation. +// + +// Derived from the x86 version: +// +// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions +// +// Copyright (c) 2013, Intel Corporation +// +// Authors: +// Erdinc Ozturk <erdinc.ozturk@intel.com> +// Vinodh Gopal <vinodh.gopal@intel.com> +// James Guilford <james.guilford@intel.com> +// Tim Chen <tim.c.chen@linux.intel.com> +// +// This software is available to you under a choice of one of two +// licenses. You may choose to be licensed under the terms of the GNU +// General Public License (GPL) Version 2, available from the file +// COPYING in the main directory of this source tree, or the +// OpenIB.org BSD license below: +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. 
+// +// * Neither the name of the Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// +// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Reference paper titled "Fast CRC Computation for Generic +// Polynomials Using PCLMULQDQ Instruction" +// URL: http://www.intel.com/content/dam/www/public/us/en/documents +// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +// + +#include <linux/linkage.h> +#include <asm/assembler.h> + +#ifdef CONFIG_CPU_ENDIAN_BE8 +#define CPU_LE(code...) +#else +#define CPU_LE(code...) 
code +#endif + + .text + .arch armv8-a + .fpu crypto-neon-fp-armv8 + + init_crc .req r0 + buf .req r1 + len .req r2 + + fold_consts_ptr .req ip + + q0l .req d0 + q0h .req d1 + q1l .req d2 + q1h .req d3 + q2l .req d4 + q2h .req d5 + q3l .req d6 + q3h .req d7 + q4l .req d8 + q4h .req d9 + q5l .req d10 + q5h .req d11 + q6l .req d12 + q6h .req d13 + q7l .req d14 + q7h .req d15 + q8l .req d16 + q8h .req d17 + q9l .req d18 + q9h .req d19 + q10l .req d20 + q10h .req d21 + q11l .req d22 + q11h .req d23 + q12l .req d24 + q12h .req d25 + + FOLD_CONSTS .req q10 + FOLD_CONST_L .req q10l + FOLD_CONST_H .req q10h + + /* + * Pairwise long polynomial multiplication of two 16-bit values + * + * { w0, w1 }, { y0, y1 } + * + * by two 64-bit values + * + * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 } + * + * where each vector element is a byte, ordered from least to most + * significant. The resulting 80-bit vectors are XOR'ed together. + * + * This can be implemented using 8x8 long polynomial multiplication, by + * reorganizing the input so that each pairwise 8x8 multiplication + * produces one of the terms from the decomposition below, and + * combining the results of each rank and shifting them into place. 
+ * + * Rank + * 0 w0*x0 ^ | y0*z0 ^ + * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^ + * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^ + * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^ + * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^ + * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^ + * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^ + * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^ + * 8 w1*x7 << 64 | y1*z7 << 64 + * + * The inputs can be reorganized into + * + * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 } + * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 } + * + * and after performing 8x8->16 bit long polynomial multiplication of + * each of the halves of the first vector with those of the second one, + * we obtain the following four vectors of 16-bit elements: + * + * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 } + * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 } + * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 } + * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 } + * + * Results b and c can be XORed together, as the vector elements have + * matching ranks. Then, the final XOR can be pulled forward, and + * applied between the halves of each of the remaining three vectors, + * which are then shifted into place, and XORed together to produce the + * final 80-bit result. 
+ */ + .macro pmull16x64_p8, v16, v64 + vext.8 q11, \v64, \v64, #1 + vld1.64 {q12}, [r4, :128] + vuzp.8 q11, \v64 + vtbl.8 d24, {\v16\()_L-\v16\()_H}, d24 + vtbl.8 d25, {\v16\()_L-\v16\()_H}, d25 + bl __pmull16x64_p8 + veor \v64, q12, q14 + .endm + +__pmull16x64_p8: + vmull.p8 q13, d23, d24 + vmull.p8 q14, d23, d25 + vmull.p8 q15, d22, d24 + vmull.p8 q12, d22, d25 + + veor q14, q14, q15 + veor d24, d24, d25 + veor d26, d26, d27 + veor d28, d28, d29 + vmov.i32 d25, #0 + vmov.i32 d29, #0 + vext.8 q12, q12, q12, #14 + vext.8 q14, q14, q14, #15 + veor d24, d24, d26 + bx lr +ENDPROC(__pmull16x64_p8) + + .macro pmull16x64_p64, v16, v64 + vmull.p64 q11, \v64\()l, \v16\()_L + vmull.p64 \v64, \v64\()h, \v16\()_H + veor \v64, \v64, q11 + .endm + + // Fold reg1, reg2 into the next 32 data bytes, storing the result back + // into reg1, reg2. + .macro fold_32_bytes, reg1, reg2, p + vld1.64 {q8-q9}, [buf]! + + pmull16x64_\p FOLD_CONST, \reg1 + pmull16x64_\p FOLD_CONST, \reg2 + +CPU_LE( vrev64.8 q8, q8 ) +CPU_LE( vrev64.8 q9, q9 ) + vswp q8l, q8h + vswp q9l, q9h + + veor.8 \reg1, \reg1, q8 + veor.8 \reg2, \reg2, q9 + .endm + + // Fold src_reg into dst_reg, optionally loading the next fold constants + .macro fold_16_bytes, src_reg, dst_reg, p, load_next_consts + pmull16x64_\p FOLD_CONST, \src_reg + .ifnb \load_next_consts + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! + .endif + veor.8 \dst_reg, \dst_reg, \src_reg + .endm + + .macro crct10dif, p + // For sizes less than 256 bytes, we can't fold 128 bytes at a time. + cmp len, #256 + blt .Lless_than_256_bytes\@ + + mov_l fold_consts_ptr, .Lfold_across_128_bytes_consts + + // Load the first 128 data bytes. Byte swapping is necessary to make + // the bit order match the polynomial coefficient order. + vld1.64 {q0-q1}, [buf]! + vld1.64 {q2-q3}, [buf]! + vld1.64 {q4-q5}, [buf]! + vld1.64 {q6-q7}, [buf]! 
+CPU_LE( vrev64.8 q0, q0 ) +CPU_LE( vrev64.8 q1, q1 ) +CPU_LE( vrev64.8 q2, q2 ) +CPU_LE( vrev64.8 q3, q3 ) +CPU_LE( vrev64.8 q4, q4 ) +CPU_LE( vrev64.8 q5, q5 ) +CPU_LE( vrev64.8 q6, q6 ) +CPU_LE( vrev64.8 q7, q7 ) + vswp q0l, q0h + vswp q1l, q1h + vswp q2l, q2h + vswp q3l, q3h + vswp q4l, q4h + vswp q5l, q5h + vswp q6l, q6h + vswp q7l, q7h + + // XOR the first 16 data *bits* with the initial CRC value. + vmov.i8 q8h, #0 + vmov.u16 q8h[3], init_crc + veor q0h, q0h, q8h + + // Load the constants for folding across 128 bytes. + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! + + // Subtract 128 for the 128 data bytes just consumed. Subtract another + // 128 to simplify the termination condition of the following loop. + sub len, len, #256 + + // While >= 128 data bytes remain (not counting q0-q7), fold the 128 + // bytes q0-q7 into them, storing the result back into q0-q7. +.Lfold_128_bytes_loop\@: + fold_32_bytes q0, q1, \p + fold_32_bytes q2, q3, \p + fold_32_bytes q4, q5, \p + fold_32_bytes q6, q7, \p + subs len, len, #128 + bge .Lfold_128_bytes_loop\@ + + // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7. + + // Fold across 64 bytes. + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! + fold_16_bytes q0, q4, \p + fold_16_bytes q1, q5, \p + fold_16_bytes q2, q6, \p + fold_16_bytes q3, q7, \p, 1 + // Fold across 32 bytes. + fold_16_bytes q4, q6, \p + fold_16_bytes q5, q7, \p, 1 + // Fold across 16 bytes. + fold_16_bytes q6, q7, \p + + // Add 128 to get the correct number of data bytes remaining in 0...127 + // (not counting q7), following the previous extra subtraction by 128. + // Then subtract 16 to simplify the termination condition of the + // following loop. + adds len, len, #(128-16) + + // While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7 + // into them, storing the result back into q7. + blt .Lfold_16_bytes_loop_done\@ +.Lfold_16_bytes_loop\@: + pmull16x64_\p FOLD_CONST, q7 + vld1.64 {q0}, [buf]! 
+CPU_LE( vrev64.8 q0, q0 ) + vswp q0l, q0h + veor.8 q7, q7, q0 + subs len, len, #16 + bge .Lfold_16_bytes_loop\@ + +.Lfold_16_bytes_loop_done\@: + // Add 16 to get the correct number of data bytes remaining in 0...15 + // (not counting q7), following the previous extra subtraction by 16. + adds len, len, #16 + beq .Lreduce_final_16_bytes\@ + +.Lhandle_partial_segment\@: + // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first + // 16 bytes are in q7 and the rest are the remaining data in 'buf'. To + // do this without needing a fold constant for each possible 'len', + // redivide the bytes into a first chunk of 'len' bytes and a second + // chunk of 16 bytes, then fold the first chunk into the second. + + // q0 = last 16 original data bytes + add buf, buf, len + sub buf, buf, #16 + vld1.64 {q0}, [buf] +CPU_LE( vrev64.8 q0, q0 ) + vswp q0l, q0h + + // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes. + mov_l r1, .Lbyteshift_table + 16 + sub r1, r1, len + vld1.8 {q2}, [r1] + vtbl.8 q1l, {q7l-q7h}, q2l + vtbl.8 q1h, {q7l-q7h}, q2h + + // q3 = first chunk: q7 right-shifted by '16-len' bytes. + vmov.i8 q3, #0x80 + veor.8 q2, q2, q3 + vtbl.8 q3l, {q7l-q7h}, q2l + vtbl.8 q3h, {q7l-q7h}, q2h + + // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes. + vshr.s8 q2, q2, #7 + + // q2 = second chunk: 'len' bytes from q0 (low-order bytes), + // then '16-len' bytes from q1 (high-order bytes). + vbsl.8 q2, q1, q0 + + // Fold the first chunk into the second chunk, storing the result in q7. + pmull16x64_\p FOLD_CONST, q3 + veor.8 q7, q3, q2 + b .Lreduce_final_16_bytes\@ + +.Lless_than_256_bytes\@: + // Checksumming a buffer of length 16...255 bytes + + mov_l fold_consts_ptr, .Lfold_across_16_bytes_consts + + // Load the first 16 data bytes. + vld1.64 {q7}, [buf]! +CPU_LE( vrev64.8 q7, q7 ) + vswp q7l, q7h + + // XOR the first 16 data *bits* with the initial CRC value. 
+ vmov.i8 q0h, #0 + vmov.u16 q0h[3], init_crc + veor.8 q7h, q7h, q0h + + // Load the fold-across-16-bytes constants. + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! + + cmp len, #16 + beq .Lreduce_final_16_bytes\@ // len == 16 + subs len, len, #32 + addlt len, len, #16 + blt .Lhandle_partial_segment\@ // 17 <= len <= 31 + b .Lfold_16_bytes_loop\@ // 32 <= len <= 255 + +.Lreduce_final_16_bytes\@: + .endm + +// +// u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len); +// +// Assumes len >= 16. +// +ENTRY(crc_t10dif_pmull64) + crct10dif p64 + + // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC. + + // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]! + + // Fold the high 64 bits into the low 64 bits, while also multiplying by + // x^64. This produces a 128-bit value congruent to x^64 * M(x) and + // whose low 48 bits are 0. + vmull.p64 q0, q7h, FOLD_CONST_H // high bits * x^48 * (x^80 mod G(x)) + veor.8 q0h, q0h, q7l // + low bits * x^64 + + // Fold the high 32 bits into the low 96 bits. This produces a 96-bit + // value congruent to x^64 * M(x) and whose low 48 bits are 0. + vmov.i8 q1, #0 + vmov s4, s3 // extract high 32 bits + vmov s3, s5 // zero high 32 bits + vmull.p64 q1, q1l, FOLD_CONST_L // high 32 bits * x^48 * (x^48 mod G(x)) + veor.8 q0, q0, q1 // + low bits + + // Load G(x) and floor(x^48 / G(x)). + vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128] + + // Use Barrett reduction to compute the final CRC value. + vmull.p64 q1, q0h, FOLD_CONST_H // high 32 bits * floor(x^48 / G(x)) + vshr.u64 q1l, q1l, #32 // /= x^32 + vmull.p64 q1, q1l, FOLD_CONST_L // *= G(x) + vshr.u64 q0l, q0l, #48 + veor.8 q0l, q0l, q1l // + low 16 nonzero bits + // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of q0. 
+ + vmov.u16 r0, q0l[0] + bx lr +ENDPROC(crc_t10dif_pmull64) + +ENTRY(crc_t10dif_pmull8) + push {r4, lr} + mov_l r4, .L16x64perm + + crct10dif p8 + +CPU_LE( vrev64.8 q7, q7 ) + vswp q7l, q7h + vst1.64 {q7}, [r3, :128] + pop {r4, pc} +ENDPROC(crc_t10dif_pmull8) + + .section ".rodata", "a" + .align 4 + +// Fold constants precomputed from the polynomial 0x18bb7 +// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 +.Lfold_across_128_bytes_consts: + .quad 0x0000000000006123 // x^(8*128) mod G(x) + .quad 0x0000000000002295 // x^(8*128+64) mod G(x) +// .Lfold_across_64_bytes_consts: + .quad 0x0000000000001069 // x^(4*128) mod G(x) + .quad 0x000000000000dd31 // x^(4*128+64) mod G(x) +// .Lfold_across_32_bytes_consts: + .quad 0x000000000000857d // x^(2*128) mod G(x) + .quad 0x0000000000007acc // x^(2*128+64) mod G(x) +.Lfold_across_16_bytes_consts: + .quad 0x000000000000a010 // x^(1*128) mod G(x) + .quad 0x0000000000001faa // x^(1*128+64) mod G(x) +// .Lfinal_fold_consts: + .quad 0x1368000000000000 // x^48 * (x^48 mod G(x)) + .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x)) +// .Lbarrett_reduction_consts: + .quad 0x0000000000018bb7 // G(x) + .quad 0x00000001f65a57f8 // floor(x^48 / G(x)) + +// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - +// len] is the index vector to shift left by 'len' bytes, and is also {0x80, +// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes. 
+.Lbyteshift_table: + .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 + .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 + +.L16x64perm: + .quad 0x808080800000000, 0x909090901010101 diff --git a/lib/crc/arm/crc-t10dif.h b/lib/crc/arm/crc-t10dif.h new file mode 100644 index 000000000000..2edf7e9681d0 --- /dev/null +++ b/lib/crc/arm/crc-t10dif.h @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <crypto/internal/simd.h> + +#include <asm/neon.h> +#include <asm/simd.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull); + +#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U + +asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len); +asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len, + u8 out[16]); + +static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) +{ + if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) { + if (static_branch_likely(&have_pmull)) { + if (crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc_t10dif_pmull64(crc, data, length); + kernel_neon_end(); + return crc; + } + } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && + static_branch_likely(&have_neon) && + crypto_simd_usable()) { + u8 buf[16] __aligned(16); + + kernel_neon_begin(); + crc_t10dif_pmull8(crc, data, length, buf); + kernel_neon_end(); + + return crc_t10dif_generic(0, buf, sizeof(buf)); + } + } + return crc_t10dif_generic(crc, data, length); +} + +#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch +static inline void crc_t10dif_mod_init_arch(void) +{ + if (elf_hwcap & HWCAP_NEON) { + static_branch_enable(&have_neon); + if (elf_hwcap2 & HWCAP2_PMULL) + static_branch_enable(&have_pmull); + } +} 
diff --git a/lib/crc/arm/crc32-core.S b/lib/crc/arm/crc32-core.S new file mode 100644 index 000000000000..6f674f30c70b --- /dev/null +++ b/lib/crc/arm/crc32-core.S @@ -0,0 +1,306 @@ +/* + * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32 + * calculation. 
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) + * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found + * at: + * https://www.intel.com/products/processor/manuals/ + * Intel(R) 64 and IA-32 Architectures Software Developer's Manual + * Volume 2B: Instruction Set Reference, N-Z + * + * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com> + * Alexander Boyko <Alexander_Boyko@xyratex.com> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .align 6 + .arch armv8-a + .arch_extension crc + .fpu crypto-neon-fp-armv8 + +.Lcrc32_constants: + /* + * [(x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 + * #define CONSTANT_R1 0x154442bd4LL + * + * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596 + * #define CONSTANT_R2 0x1c6e41596LL + */ + .quad 0x0000000154442bd4 + .quad 0x00000001c6e41596 + + /* + * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0 + * #define CONSTANT_R3 0x1751997d0LL + * + * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e + * #define CONSTANT_R4 0x0ccaa009eLL + */ + .quad 0x00000001751997d0 + .quad 0x00000000ccaa009e + + /* + * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124 + * #define CONSTANT_R5 0x163cd6124LL + */ + .quad 0x0000000163cd6124 + .quad 0x00000000FFFFFFFF + + /* + * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL + * + * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` + * = 0x1F7011641LL + * #define CONSTANT_RU 0x1F7011641LL + */ + .quad 0x00000001DB710641 + .quad 0x00000001F7011641 + +.Lcrc32c_constants: + .quad 0x00000000740eef02 + .quad 0x000000009e4addf8 + .quad 0x00000000f20c0dfe + .quad 0x000000014cd00bd6 + .quad 0x00000000dd45aab8 + .quad 0x00000000FFFFFFFF + .quad 0x0000000105ec76f0 + .quad 0x00000000dea713f1 + + dCONSTANTl .req d0 + dCONSTANTh .req d1 + qCONSTANT .req q0 + + BUF .req r0 + LEN .req r1 + CRC .req r2 + + qzr .req q9 + + /** + * Calculate crc32 + * BUF - buffer + * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63 + * CRC - initial crc32 + * return r0 crc32 
+ * uint crc32_pmull_le(unsigned char const *buffer, + * size_t len, uint crc32) + */ +SYM_FUNC_START(crc32_pmull_le) + adr r3, .Lcrc32_constants + b 0f +SYM_FUNC_END(crc32_pmull_le) + +SYM_FUNC_START(crc32c_pmull_le) + adr r3, .Lcrc32c_constants + +0: bic LEN, LEN, #15 + vld1.8 {q1-q2}, [BUF, :128]! + vld1.8 {q3-q4}, [BUF, :128]! + vmov.i8 qzr, #0 + vmov.i8 qCONSTANT, #0 + vmov.32 dCONSTANTl[0], CRC + veor.8 d2, d2, dCONSTANTl + sub LEN, LEN, #0x40 + cmp LEN, #0x40 + blt less_64 + + vld1.64 {qCONSTANT}, [r3] + +loop_64: /* 64 bytes Full cache line folding */ + sub LEN, LEN, #0x40 + + vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q6, d5, dCONSTANTh + vmull.p64 q7, d7, dCONSTANTh + vmull.p64 q8, d9, dCONSTANTh + + vmull.p64 q1, d2, dCONSTANTl + vmull.p64 q2, d4, dCONSTANTl + vmull.p64 q3, d6, dCONSTANTl + vmull.p64 q4, d8, dCONSTANTl + + veor.8 q1, q1, q5 + vld1.8 {q5}, [BUF, :128]! + veor.8 q2, q2, q6 + vld1.8 {q6}, [BUF, :128]! + veor.8 q3, q3, q7 + vld1.8 {q7}, [BUF, :128]! + veor.8 q4, q4, q8 + vld1.8 {q8}, [BUF, :128]! + + veor.8 q1, q1, q5 + veor.8 q2, q2, q6 + veor.8 q3, q3, q7 + veor.8 q4, q4, q8 + + cmp LEN, #0x40 + bge loop_64 + +less_64: /* Folding cache line into 128bit */ + vldr dCONSTANTl, [r3, #16] + vldr dCONSTANTh, [r3, #24] + + vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q5 + veor.8 q1, q1, q2 + + vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q5 + veor.8 q1, q1, q3 + + vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q5 + veor.8 q1, q1, q4 + + teq LEN, #0 + beq fold_64 + +loop_16: /* Folding rest buffer into 128bit */ + subs LEN, LEN, #0x10 + + vld1.8 {q2}, [BUF, :128]! 
+ vmull.p64 q5, d3, dCONSTANTh + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q5 + veor.8 q1, q1, q2 + + bne loop_16 + +fold_64: + /* perform the last 64 bit fold, also adds 32 zeroes + * to the input stream */ + vmull.p64 q2, d2, dCONSTANTh + vext.8 q1, q1, qzr, #8 + veor.8 q1, q1, q2 + + /* final 32-bit fold */ + vldr dCONSTANTl, [r3, #32] + vldr d6, [r3, #40] + vmov.i8 d7, #0 + + vext.8 q2, q1, qzr, #4 + vand.8 d2, d2, d6 + vmull.p64 q1, d2, dCONSTANTl + veor.8 q1, q1, q2 + + /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ + vldr dCONSTANTl, [r3, #48] + vldr dCONSTANTh, [r3, #56] + + vand.8 q2, q1, q3 + vext.8 q2, qzr, q2, #8 + vmull.p64 q2, d5, dCONSTANTh + vand.8 q2, q2, q3 + vmull.p64 q2, d4, dCONSTANTl + veor.8 q1, q1, q2 + vmov r0, s5 + + bx lr +SYM_FUNC_END(crc32c_pmull_le) + + .macro __crc32, c + subs ip, r2, #8 + bmi .Ltail\c + + tst r1, #3 + bne .Lunaligned\c + + teq ip, #0 +.Laligned8\c: + ldrd r2, r3, [r1], #8 +ARM_BE8(rev r2, r2 ) +ARM_BE8(rev r3, r3 ) + crc32\c\()w r0, r0, r2 + crc32\c\()w r0, r0, r3 + bxeq lr + subs ip, ip, #8 + bpl .Laligned8\c + +.Ltail\c: + tst ip, #4 + beq 2f + ldr r3, [r1], #4 +ARM_BE8(rev r3, r3 ) + crc32\c\()w r0, r0, r3 + +2: tst ip, #2 + beq 1f + ldrh r3, [r1], #2 +ARM_BE8(rev16 r3, r3 ) + crc32\c\()h r0, r0, r3 + +1: tst ip, #1 + bxeq lr + ldrb r3, [r1] + crc32\c\()b r0, r0, r3 + bx lr + +.Lunaligned\c: + tst r1, #1 + beq 2f + ldrb r3, [r1], #1 + subs r2, r2, #1 + crc32\c\()b r0, r0, r3 + + tst r1, #2 + beq 0f +2: ldrh r3, [r1], #2 + subs r2, r2, #2 +ARM_BE8(rev16 r3, r3 ) + crc32\c\()h r0, r0, r3 + +0: subs ip, r2, #8 + bpl .Laligned8\c + b .Ltail\c + .endm + + .align 5 +SYM_FUNC_START(crc32_armv8_le) + __crc32 +SYM_FUNC_END(crc32_armv8_le) + + .align 5 +SYM_FUNC_START(crc32c_armv8_le) + __crc32 c +SYM_FUNC_END(crc32c_armv8_le) diff --git a/lib/crc/arm/crc32.h b/lib/crc/arm/crc32.h new file mode 100644 index 000000000000..018007e162a2 --- /dev/null +++ b/lib/crc/arm/crc32.h @@ -0,0 +1,101 @@ +// 
SPDX-License-Identifier: GPL-2.0-only +/* + * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/cpufeature.h> + +#include <crypto/internal/simd.h> + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull); + +#define PMULL_MIN_LEN 64 /* min size of buffer for pmull functions */ + +asmlinkage u32 crc32_pmull_le(const u8 buf[], u32 len, u32 init_crc); +asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], u32 len); + +asmlinkage u32 crc32c_pmull_le(const u8 buf[], u32 len, u32 init_crc); +asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], u32 len); + +static inline u32 crc32_le_scalar(u32 crc, const u8 *p, size_t len) +{ + if (static_branch_likely(&have_crc32)) + return crc32_armv8_le(crc, p, len); + return crc32_le_base(crc, p, len); +} + +static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) +{ + if (len >= PMULL_MIN_LEN + 15 && + static_branch_likely(&have_pmull) && crypto_simd_usable()) { + size_t n = -(uintptr_t)p & 15; + + /* align p to 16-byte boundary */ + if (n) { + crc = crc32_le_scalar(crc, p, n); + p += n; + len -= n; + } + n = round_down(len, 16); + kernel_neon_begin(); + crc = crc32_pmull_le(p, n, crc); + kernel_neon_end(); + p += n; + len -= n; + } + return crc32_le_scalar(crc, p, len); +} + +static inline u32 crc32c_scalar(u32 crc, const u8 *p, size_t len) +{ + if (static_branch_likely(&have_crc32)) + return crc32c_armv8_le(crc, p, len); + return crc32c_base(crc, p, len); +} + +static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) +{ + if (len >= PMULL_MIN_LEN + 15 && + static_branch_likely(&have_pmull) && crypto_simd_usable()) { + size_t n = -(uintptr_t)p & 15; + + /* align p to 16-byte boundary */ + if (n) { + crc = crc32c_scalar(crc, p, n); + p += n; + len -= n; 
+ } + n = round_down(len, 16); + kernel_neon_begin(); + crc = crc32c_pmull_le(p, n, crc); + kernel_neon_end(); + p += n; + len -= n; + } + return crc32c_scalar(crc, p, len); +} + +#define crc32_be_arch crc32_be_base /* not implemented on this arch */ + +#define crc32_mod_init_arch crc32_mod_init_arch +static inline void crc32_mod_init_arch(void) +{ + if (elf_hwcap2 & HWCAP2_CRC32) + static_branch_enable(&have_crc32); + if (elf_hwcap2 & HWCAP2_PMULL) + static_branch_enable(&have_pmull); +} + +static inline u32 crc32_optimizations_arch(void) +{ + if (elf_hwcap2 & (HWCAP2_CRC32 | HWCAP2_PMULL)) + return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION; + return 0; +} diff --git a/lib/crc/arm64/crc-t10dif-core.S b/lib/crc/arm64/crc-t10dif-core.S new file mode 100644 index 000000000000..87dd6d46224d --- /dev/null +++ b/lib/crc/arm64/crc-t10dif-core.S @@ -0,0 +1,469 @@ +// +// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions +// +// Copyright (C) 2016 Linaro Ltd +// Copyright (C) 2019-2024 Google LLC +// +// Authors: Ard Biesheuvel <ardb@google.com> +// Eric Biggers <ebiggers@google.com> +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License version 2 as +// published by the Free Software Foundation. +// + +// Derived from the x86 version: +// +// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions +// +// Copyright (c) 2013, Intel Corporation +// +// Authors: +// Erdinc Ozturk <erdinc.ozturk@intel.com> +// Vinodh Gopal <vinodh.gopal@intel.com> +// James Guilford <james.guilford@intel.com> +// Tim Chen <tim.c.chen@linux.intel.com> +// +// This software is available to you under a choice of one of two +// licenses. 
You may choose to be licensed under the terms of the GNU +// General Public License (GPL) Version 2, available from the file +// COPYING in the main directory of this source tree, or the +// OpenIB.org BSD license below: +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// * Neither the name of the Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// +// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Reference paper titled "Fast CRC Computation for Generic +// Polynomials Using PCLMULQDQ Instruction" +// URL: http://www.intel.com/content/dam/www/public/us/en/documents +// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +// + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .arch armv8-a+crypto + + init_crc .req w0 + buf .req x1 + len .req x2 + fold_consts_ptr .req x5 + + fold_consts .req v10 + + t3 .req v17 + t4 .req v18 + t5 .req v19 + t6 .req v20 + t7 .req v21 + t8 .req v22 + + perm .req v27 + + .macro pmull16x64_p64, a16, b64, c64 + pmull2 \c64\().1q, \a16\().2d, \b64\().2d + pmull \b64\().1q, \a16\().1d, \b64\().1d + .endm + + /* + * Pairwise long polynomial multiplication of two 16-bit values + * + * { w0, w1 }, { y0, y1 } + * + * by two 64-bit values + * + * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 } + * + * where each vector element is a byte, ordered from least to most + * significant. + * + * This can be implemented using 8x8 long polynomial multiplication, by + * reorganizing the input so that each pairwise 8x8 multiplication + * produces one of the terms from the decomposition below, and + * combining the results of each rank and shifting them into place. 
+ * + * Rank + * 0 w0*x0 ^ | y0*z0 ^ + * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^ + * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^ + * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^ + * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^ + * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^ + * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^ + * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^ + * 8 w1*x7 << 64 | y1*z7 << 64 + * + * The inputs can be reorganized into + * + * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 } + * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 } + * + * and after performing 8x8->16 bit long polynomial multiplication of + * each of the halves of the first vector with those of the second one, + * we obtain the following four vectors of 16-bit elements: + * + * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 } + * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 } + * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 } + * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 } + * + * Results b and c can be XORed together, as the vector elements have + * matching ranks. Then, the final XOR (*) can be pulled forward, and + * applied between the halves of each of the remaining three vectors, + * which are then shifted into place, and combined to produce two + * 80-bit results. + * + * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent + * to the 64x64 bit one above, but XOR'ing the outputs together will + * produce the expected result, and this is sufficient in the context of + * this algorithm. 
+ */ + .macro pmull16x64_p8, a16, b64, c64 + ext t7.16b, \b64\().16b, \b64\().16b, #1 + tbl t5.16b, {\a16\().16b}, perm.16b + uzp1 t7.16b, \b64\().16b, t7.16b + bl __pmull_p8_16x64 + ext \b64\().16b, t4.16b, t4.16b, #15 + eor \c64\().16b, t8.16b, t5.16b + .endm + +SYM_FUNC_START_LOCAL(__pmull_p8_16x64) + ext t6.16b, t5.16b, t5.16b, #8 + + pmull t3.8h, t7.8b, t5.8b + pmull t4.8h, t7.8b, t6.8b + pmull2 t5.8h, t7.16b, t5.16b + pmull2 t6.8h, t7.16b, t6.16b + + ext t8.16b, t3.16b, t3.16b, #8 + eor t4.16b, t4.16b, t6.16b + ext t7.16b, t5.16b, t5.16b, #8 + ext t6.16b, t4.16b, t4.16b, #8 + eor t8.8b, t8.8b, t3.8b + eor t5.8b, t5.8b, t7.8b + eor t4.8b, t4.8b, t6.8b + ext t5.16b, t5.16b, t5.16b, #14 + ret +SYM_FUNC_END(__pmull_p8_16x64) + + + // Fold reg1, reg2 into the next 32 data bytes, storing the result back + // into reg1, reg2. + .macro fold_32_bytes, p, reg1, reg2 + ldp q11, q12, [buf], #0x20 + + pmull16x64_\p fold_consts, \reg1, v8 + +CPU_LE( rev64 v11.16b, v11.16b ) +CPU_LE( rev64 v12.16b, v12.16b ) + + pmull16x64_\p fold_consts, \reg2, v9 + +CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 ) +CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) + + eor \reg1\().16b, \reg1\().16b, v8.16b + eor \reg2\().16b, \reg2\().16b, v9.16b + eor \reg1\().16b, \reg1\().16b, v11.16b + eor \reg2\().16b, \reg2\().16b, v12.16b + .endm + + // Fold src_reg into dst_reg, optionally loading the next fold constants + .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts + pmull16x64_\p fold_consts, \src_reg, v8 + .ifnb \load_next_consts + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + .endif + eor \dst_reg\().16b, \dst_reg\().16b, v8.16b + eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b + .endm + + .macro crc_t10dif_pmull, p + + // For sizes less than 256 bytes, we can't fold 128 bytes at a time. + cmp len, #256 + b.lt .Lless_than_256_bytes_\@ + + adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts + + // Load the first 128 data bytes. 
Byte swapping is necessary to make + // the bit order match the polynomial coefficient order. + ldp q0, q1, [buf] + ldp q2, q3, [buf, #0x20] + ldp q4, q5, [buf, #0x40] + ldp q6, q7, [buf, #0x60] + add buf, buf, #0x80 +CPU_LE( rev64 v0.16b, v0.16b ) +CPU_LE( rev64 v1.16b, v1.16b ) +CPU_LE( rev64 v2.16b, v2.16b ) +CPU_LE( rev64 v3.16b, v3.16b ) +CPU_LE( rev64 v4.16b, v4.16b ) +CPU_LE( rev64 v5.16b, v5.16b ) +CPU_LE( rev64 v6.16b, v6.16b ) +CPU_LE( rev64 v7.16b, v7.16b ) +CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) +CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 ) +CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 ) +CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 ) +CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 ) +CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 ) +CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 ) +CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + + // XOR the first 16 data *bits* with the initial CRC value. + movi v8.16b, #0 + mov v8.h[7], init_crc + eor v0.16b, v0.16b, v8.16b + + // Load the constants for folding across 128 bytes. + ld1 {fold_consts.2d}, [fold_consts_ptr] + + // Subtract 128 for the 128 data bytes just consumed. Subtract another + // 128 to simplify the termination condition of the following loop. + sub len, len, #256 + + // While >= 128 data bytes remain (not counting v0-v7), fold the 128 + // bytes v0-v7 into them, storing the result back into v0-v7. +.Lfold_128_bytes_loop_\@: + fold_32_bytes \p, v0, v1 + fold_32_bytes \p, v2, v3 + fold_32_bytes \p, v4, v5 + fold_32_bytes \p, v6, v7 + + subs len, len, #128 + b.ge .Lfold_128_bytes_loop_\@ + + // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7. + + // Fold across 64 bytes. + add fold_consts_ptr, fold_consts_ptr, #16 + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + fold_16_bytes \p, v0, v4 + fold_16_bytes \p, v1, v5 + fold_16_bytes \p, v2, v6 + fold_16_bytes \p, v3, v7, 1 + // Fold across 32 bytes. + fold_16_bytes \p, v4, v6 + fold_16_bytes \p, v5, v7, 1 + // Fold across 16 bytes. 
+ fold_16_bytes \p, v6, v7 + + // Add 128 to get the correct number of data bytes remaining in 0...127 + // (not counting v7), following the previous extra subtraction by 128. + // Then subtract 16 to simplify the termination condition of the + // following loop. + adds len, len, #(128-16) + + // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7 + // into them, storing the result back into v7. + b.lt .Lfold_16_bytes_loop_done_\@ +.Lfold_16_bytes_loop_\@: + pmull16x64_\p fold_consts, v7, v8 + eor v7.16b, v7.16b, v8.16b + ldr q0, [buf], #16 +CPU_LE( rev64 v0.16b, v0.16b ) +CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) + eor v7.16b, v7.16b, v0.16b + subs len, len, #16 + b.ge .Lfold_16_bytes_loop_\@ + +.Lfold_16_bytes_loop_done_\@: + // Add 16 to get the correct number of data bytes remaining in 0...15 + // (not counting v7), following the previous extra subtraction by 16. + adds len, len, #16 + b.eq .Lreduce_final_16_bytes_\@ + +.Lhandle_partial_segment_\@: + // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first + // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To + // do this without needing a fold constant for each possible 'len', + // redivide the bytes into a first chunk of 'len' bytes and a second + // chunk of 16 bytes, then fold the first chunk into the second. + + // v0 = last 16 original data bytes + add buf, buf, len + ldr q0, [buf, #-16] +CPU_LE( rev64 v0.16b, v0.16b ) +CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) + + // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes. + adr_l x4, .Lbyteshift_table + 16 + sub x4, x4, len + ld1 {v2.16b}, [x4] + tbl v1.16b, {v7.16b}, v2.16b + + // v3 = first chunk: v7 right-shifted by '16-len' bytes. + movi v3.16b, #0x80 + eor v2.16b, v2.16b, v3.16b + tbl v3.16b, {v7.16b}, v2.16b + + // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes. 
+ sshr v2.16b, v2.16b, #7 + + // v2 = second chunk: 'len' bytes from v0 (low-order bytes), + // then '16-len' bytes from v1 (high-order bytes). + bsl v2.16b, v1.16b, v0.16b + + // Fold the first chunk into the second chunk, storing the result in v7. + pmull16x64_\p fold_consts, v3, v0 + eor v7.16b, v3.16b, v0.16b + eor v7.16b, v7.16b, v2.16b + b .Lreduce_final_16_bytes_\@ + +.Lless_than_256_bytes_\@: + // Checksumming a buffer of length 16...255 bytes + + adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts + + // Load the first 16 data bytes. + ldr q7, [buf], #0x10 +CPU_LE( rev64 v7.16b, v7.16b ) +CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + + // XOR the first 16 data *bits* with the initial CRC value. + movi v0.16b, #0 + mov v0.h[7], init_crc + eor v7.16b, v7.16b, v0.16b + + // Load the fold-across-16-bytes constants. + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + + cmp len, #16 + b.eq .Lreduce_final_16_bytes_\@ // len == 16 + subs len, len, #32 + b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255 + add len, len, #16 + b .Lhandle_partial_segment_\@ // 17 <= len <= 31 + +.Lreduce_final_16_bytes_\@: + .endm + +// +// void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len, +// u8 out[16]); +// +// Assumes len >= 16. No final reduction to a 16-bit CRC is done here: the +// folded 128-bit value left in v7 is stored to the 16 bytes addressed by x3. +// +SYM_FUNC_START(crc_t10dif_pmull_p8) + frame_push 1 + + // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 } + movi perm.4h, #8, lsl #8 + orr perm.2s, #1, lsl #16 + orr perm.2s, #1, lsl #24 + zip1 perm.16b, perm.16b, perm.16b + zip1 perm.16b, perm.16b, perm.16b + + crc_t10dif_pmull p8 + +CPU_LE( rev64 v7.16b, v7.16b ) +CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + str q7, [x3] + + frame_pop + ret +SYM_FUNC_END(crc_t10dif_pmull_p8) + + .align 5 +// +// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); +// +// Assumes len >= 16. +// +SYM_FUNC_START(crc_t10dif_pmull_p64) + crc_t10dif_pmull p64 + + // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC. 
+ + movi v2.16b, #0 // init zero register + + // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + + // Fold the high 64 bits into the low 64 bits, while also multiplying by + // x^64. This produces a 128-bit value congruent to x^64 * M(x) and + // whose low 48 bits are 0. + ext v0.16b, v2.16b, v7.16b, #8 + pmull2 v7.1q, v7.2d, fold_consts.2d // high bits * x^48 * (x^80 mod G(x)) + eor v0.16b, v0.16b, v7.16b // + low bits * x^64 + + // Fold the high 32 bits into the low 96 bits. This produces a 96-bit + // value congruent to x^64 * M(x) and whose low 48 bits are 0. + ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits + mov v0.s[3], v2.s[0] // zero high 32 bits + pmull v1.1q, v1.1d, fold_consts.1d // high 32 bits * x^48 * (x^48 mod G(x)) + eor v0.16b, v0.16b, v1.16b // + low bits + + // Load G(x) and floor(x^48 / G(x)). + ld1 {fold_consts.2d}, [fold_consts_ptr] + + // Use Barrett reduction to compute the final CRC value. + pmull2 v1.1q, v0.2d, fold_consts.2d // high 32 bits * floor(x^48 / G(x)) + ushr v1.2d, v1.2d, #32 // /= x^32 + pmull v1.1q, v1.1d, fold_consts.1d // *= G(x) + ushr v0.2d, v0.2d, #48 + eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits + // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0. 
+ + umov w0, v0.h[0] + ret +SYM_FUNC_END(crc_t10dif_pmull_p64) + + .section ".rodata", "a" + .align 4 + +// Fold constants precomputed from the polynomial 0x18bb7 +// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 +.Lfold_across_128_bytes_consts: + .quad 0x0000000000006123 // x^(8*128) mod G(x) + .quad 0x0000000000002295 // x^(8*128+64) mod G(x) +// .Lfold_across_64_bytes_consts: + .quad 0x0000000000001069 // x^(4*128) mod G(x) + .quad 0x000000000000dd31 // x^(4*128+64) mod G(x) +// .Lfold_across_32_bytes_consts: + .quad 0x000000000000857d // x^(2*128) mod G(x) + .quad 0x0000000000007acc // x^(2*128+64) mod G(x) +.Lfold_across_16_bytes_consts: + .quad 0x000000000000a010 // x^(1*128) mod G(x) + .quad 0x0000000000001faa // x^(1*128+64) mod G(x) +// .Lfinal_fold_consts: + .quad 0x1368000000000000 // x^48 * (x^48 mod G(x)) + .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x)) +// .Lbarrett_reduction_consts: + .quad 0x0000000000018bb7 // G(x) + .quad 0x00000001f65a57f8 // floor(x^48 / G(x)) + +// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - +// len] is the index vector to shift left by 'len' bytes, and is also {0x80, +// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes. 
+.Lbyteshift_table: + .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 + .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 diff --git a/lib/crc/arm64/crc-t10dif.h b/lib/crc/arm64/crc-t10dif.h new file mode 100644 index 000000000000..c4521a7f1ee9 --- /dev/null +++ b/lib/crc/arm64/crc-t10dif.h @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/cpufeature.h> + +#include <crypto/internal/simd.h> + +#include <asm/neon.h> +#include <asm/simd.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_asimd); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull); + +#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U + +asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len, + u8 out[16]); +asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); + +static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) +{ + if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) { + if (static_branch_likely(&have_pmull)) { + if (crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc_t10dif_pmull_p64(crc, data, length); + kernel_neon_end(); + return crc; + } + } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && + static_branch_likely(&have_asimd) && + crypto_simd_usable()) { + u8 buf[16]; + + kernel_neon_begin(); + crc_t10dif_pmull_p8(crc, data, length, buf); + kernel_neon_end(); + + return crc_t10dif_generic(0, buf, sizeof(buf)); + } + } + return crc_t10dif_generic(crc, data, length); +} + +#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch +static inline void crc_t10dif_mod_init_arch(void) +{ + if (cpu_have_named_feature(ASIMD)) { + static_branch_enable(&have_asimd); + if (cpu_have_named_feature(PMULL)) + static_branch_enable(&have_pmull); + } +} diff 
--git a/lib/crc/arm64/crc32-core.S b/lib/crc/arm64/crc32-core.S new file mode 100644 index 000000000000..68825317460f --- /dev/null +++ b/lib/crc/arm64/crc32-core.S @@ -0,0 +1,362 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Accelerated CRC32(C) using AArch64 CRC and PMULL instructions + * + * Copyright (C) 2016 - 2018 Linaro Ltd. + * Copyright (C) 2024 Google LLC + * + * Author: Ard Biesheuvel <ardb@kernel.org> + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .cpu generic+crc+crypto + + .macro bitle, reg + .endm + + .macro bitbe, reg + rbit \reg, \reg + .endm + + .macro bytele, reg + .endm + + .macro bytebe, reg + rbit \reg, \reg + lsr \reg, \reg, #24 + .endm + + .macro hwordle, reg +CPU_BE( rev16 \reg, \reg ) + .endm + + .macro hwordbe, reg +CPU_LE( rev \reg, \reg ) + rbit \reg, \reg +CPU_BE( lsr \reg, \reg, #16 ) + .endm + + .macro le, regs:vararg + .irp r, \regs +CPU_BE( rev \r, \r ) + .endr + .endm + + .macro be, regs:vararg + .irp r, \regs +CPU_LE( rev \r, \r ) + .endr + .irp r, \regs + rbit \r, \r + .endr + .endm + + .macro __crc32, c, order=le + bit\order w0 + cmp x2, #16 + b.lt 8f // less than 16 bytes + + and x7, x2, #0x1f + and x2, x2, #~0x1f + cbz x7, 32f // multiple of 32 bytes + + and x8, x7, #0xf + ldp x3, x4, [x1] + add x8, x8, x1 + add x1, x1, x7 + ldp x5, x6, [x8] + \order x3, x4, x5, x6 + + tst x7, #8 + crc32\c\()x w8, w0, x3 + csel x3, x3, x4, eq + csel w0, w0, w8, eq + tst x7, #4 + lsr x4, x3, #32 + crc32\c\()w w8, w0, w3 + csel x3, x3, x4, eq + csel w0, w0, w8, eq + tst x7, #2 + lsr w4, w3, #16 + crc32\c\()h w8, w0, w3 + csel w3, w3, w4, eq + csel w0, w0, w8, eq + tst x7, #1 + crc32\c\()b w8, w0, w3 + csel w0, w0, w8, eq + tst x7, #16 + crc32\c\()x w8, w0, x5 + crc32\c\()x w8, w8, x6 + csel w0, w0, w8, eq + cbz x2, 0f + +32: ldp x3, x4, [x1], #32 + sub x2, x2, #32 + ldp x5, x6, [x1, #-16] + \order x3, x4, x5, x6 + crc32\c\()x w0, w0, x3 + crc32\c\()x w0, w0, x4 + crc32\c\()x w0, w0, x5 + crc32\c\()x w0, w0, x6 + 
cbnz x2, 32b +0: bit\order w0 + ret + +8: tbz x2, #3, 4f + ldr x3, [x1], #8 + \order x3 + crc32\c\()x w0, w0, x3 +4: tbz x2, #2, 2f + ldr w3, [x1], #4 + \order w3 + crc32\c\()w w0, w0, w3 +2: tbz x2, #1, 1f + ldrh w3, [x1], #2 + hword\order w3 + crc32\c\()h w0, w0, w3 +1: tbz x2, #0, 0f + ldrb w3, [x1] + byte\order w3 + crc32\c\()b w0, w0, w3 +0: bit\order w0 + ret + .endm + + .align 5 +SYM_FUNC_START(crc32_le_arm64) + __crc32 +SYM_FUNC_END(crc32_le_arm64) + + .align 5 +SYM_FUNC_START(crc32c_le_arm64) + __crc32 c +SYM_FUNC_END(crc32c_le_arm64) + + .align 5 +SYM_FUNC_START(crc32_be_arm64) + __crc32 order=be +SYM_FUNC_END(crc32_be_arm64) + + in .req x1 + len .req x2 + + /* + * w0: input CRC at entry, output CRC at exit + * x1: pointer to input buffer + * x2: length of input in bytes + */ + .macro crc4way, insn, table, order=le + bit\order w0 + lsr len, len, #6 // len := # of 64-byte blocks + + /* Process up to 64 blocks of 64 bytes at a time */ +.La\@: mov x3, #64 + cmp len, #64 + csel x3, x3, len, hi // x3 := min(len, 64) + sub len, len, x3 + + /* Divide the input into 4 contiguous blocks */ + add x4, x3, x3, lsl #1 // x4 := 3 * x3 + add x7, in, x3, lsl #4 // x7 := in + 16 * x3 + add x8, in, x3, lsl #5 // x8 := in + 32 * x3 + add x9, in, x4, lsl #4 // x9 := in + 16 * x4 + + /* Load the folding coefficients from the lookup table */ + adr_l x5, \table - 12 // entry 0 omitted + add x5, x5, x4, lsl #2 // x5 += 12 * x3 + ldp s0, s1, [x5] + ldr s2, [x5, #8] + + /* Zero init partial CRCs for this iteration */ + mov w4, wzr + mov w5, wzr + mov w6, wzr + mov x17, xzr + +.Lb\@: sub x3, x3, #1 + \insn w6, w6, x17 + ldp x10, x11, [in], #16 + ldp x12, x13, [x7], #16 + ldp x14, x15, [x8], #16 + ldp x16, x17, [x9], #16 + + \order x10, x11, x12, x13, x14, x15, x16, x17 + + /* Apply the CRC transform to 4 16-byte blocks in parallel */ + \insn w0, w0, x10 + \insn w4, w4, x12 + \insn w5, w5, x14 + \insn w6, w6, x16 + \insn w0, w0, x11 + \insn w4, w4, x13 + \insn w5, w5, x15 + cbnz x3, 
.Lb\@ + + /* Combine the 4 partial results into w0 */ + mov v3.d[0], x0 + mov v4.d[0], x4 + mov v5.d[0], x5 + pmull v0.1q, v0.1d, v3.1d + pmull v1.1q, v1.1d, v4.1d + pmull v2.1q, v2.1d, v5.1d + eor v0.8b, v0.8b, v1.8b + eor v0.8b, v0.8b, v2.8b + mov x5, v0.d[0] + eor x5, x5, x17 + \insn w0, w6, x5 + + mov in, x9 + cbnz len, .La\@ + + bit\order w0 + ret + .endm + + .align 5 +SYM_FUNC_START(crc32c_le_arm64_4way) + crc4way crc32cx, .L0 +SYM_FUNC_END(crc32c_le_arm64_4way) + + .align 5 +SYM_FUNC_START(crc32_le_arm64_4way) + crc4way crc32x, .L1 +SYM_FUNC_END(crc32_le_arm64_4way) + + .align 5 +SYM_FUNC_START(crc32_be_arm64_4way) + crc4way crc32x, .L1, be +SYM_FUNC_END(crc32_be_arm64_4way) + + .section .rodata, "a", %progbits + .align 6 +.L0: .long 0xddc0152b, 0xba4fc28e, 0x493c7d27 + .long 0x0715ce53, 0x9e4addf8, 0xba4fc28e + .long 0xc96cfdc0, 0x0715ce53, 0xddc0152b + .long 0xab7aff2a, 0x0d3b6092, 0x9e4addf8 + .long 0x299847d5, 0x878a92a7, 0x39d3b296 + .long 0xb6dd949b, 0xab7aff2a, 0x0715ce53 + .long 0xa60ce07b, 0x83348832, 0x47db8317 + .long 0xd270f1a2, 0xb9e02b86, 0x0d3b6092 + .long 0x65863b64, 0xb6dd949b, 0xc96cfdc0 + .long 0xb3e32c28, 0xbac2fd7b, 0x878a92a7 + .long 0xf285651c, 0xce7f39f4, 0xdaece73e + .long 0x271d9844, 0xd270f1a2, 0xab7aff2a + .long 0x6cb08e5c, 0x2b3cac5d, 0x2162d385 + .long 0xcec3662e, 0x1b03397f, 0x83348832 + .long 0x8227bb8a, 0xb3e32c28, 0x299847d5 + .long 0xd7a4825c, 0xdd7e3b0c, 0xb9e02b86 + .long 0xf6076544, 0x10746f3c, 0x18b33a4e + .long 0x98d8d9cb, 0x271d9844, 0xb6dd949b + .long 0x57a3d037, 0x93a5f730, 0x78d9ccb7 + .long 0x3771e98f, 0x6b749fb2, 0xbac2fd7b + .long 0xe0ac139e, 0xcec3662e, 0xa60ce07b + .long 0x6f345e45, 0xe6fc4e6a, 0xce7f39f4 + .long 0xa2b73df1, 0xb0cd4768, 0x61d82e56 + .long 0x86d8e4d2, 0xd7a4825c, 0xd270f1a2 + .long 0xa90fd27a, 0x0167d312, 0xc619809d + .long 0xca6ef3ac, 0x26f6a60a, 0x2b3cac5d + .long 0x4597456a, 0x98d8d9cb, 0x65863b64 + .long 0xc9c8b782, 0x68bce87a, 0x1b03397f + .long 0x62ec6c6d, 0x6956fc3b, 0xebb883bd + .long 
0x2342001e, 0x3771e98f, 0xb3e32c28 + .long 0xe8b6368b, 0x2178513a, 0x064f7f26 + .long 0x9ef68d35, 0x170076fa, 0xdd7e3b0c + .long 0x0b0bf8ca, 0x6f345e45, 0xf285651c + .long 0x02ee03b2, 0xff0dba97, 0x10746f3c + .long 0x135c83fd, 0xf872e54c, 0xc7a68855 + .long 0x00bcf5f6, 0x86d8e4d2, 0x271d9844 + .long 0x58ca5f00, 0x5bb8f1bc, 0x8e766a0c + .long 0xded288f8, 0xb3af077a, 0x93a5f730 + .long 0x37170390, 0xca6ef3ac, 0x6cb08e5c + .long 0xf48642e9, 0xdd66cbbb, 0x6b749fb2 + .long 0xb25b29f2, 0xe9e28eb4, 0x1393e203 + .long 0x45cddf4e, 0xc9c8b782, 0xcec3662e + .long 0xdfd94fb2, 0x93e106a4, 0x96c515bb + .long 0x021ac5ef, 0xd813b325, 0xe6fc4e6a + .long 0x8e1450f7, 0x2342001e, 0x8227bb8a + .long 0xe0cdcf86, 0x6d9a4957, 0xb0cd4768 + .long 0x613eee91, 0xd2c3ed1a, 0x39c7ff35 + .long 0xbedc6ba1, 0x9ef68d35, 0xd7a4825c + .long 0x0cd1526a, 0xf2271e60, 0x0ab3844b + .long 0xd6c3a807, 0x2664fd8b, 0x0167d312 + .long 0x1d31175f, 0x02ee03b2, 0xf6076544 + .long 0x4be7fd90, 0x363bd6b3, 0x26f6a60a + .long 0x6eeed1c9, 0x5fabe670, 0xa741c1bf + .long 0xb3a6da94, 0x00bcf5f6, 0x98d8d9cb + .long 0x2e7d11a7, 0x17f27698, 0x49c3cc9c + .long 0x889774e1, 0xaa7c7ad5, 0x68bce87a + .long 0x8a074012, 0xded288f8, 0x57a3d037 + .long 0xbd0bb25f, 0x6d390dec, 0x6956fc3b + .long 0x3be3c09b, 0x6353c1cc, 0x42d98888 + .long 0x465a4eee, 0xf48642e9, 0x3771e98f + .long 0x2e5f3c8c, 0xdd35bc8d, 0xb42ae3d9 + .long 0xa52f58ec, 0x9a5ede41, 0x2178513a + .long 0x47972100, 0x45cddf4e, 0xe0ac139e + .long 0x359674f7, 0xa51b6135, 0x170076fa + +.L1: .long 0xaf449247, 0x81256527, 0xccaa009e + .long 0x57c54819, 0x1d9513d7, 0x81256527 + .long 0x3f41287a, 0x57c54819, 0xaf449247 + .long 0xf5e48c85, 0x910eeec1, 0x1d9513d7 + .long 0x1f0c2cdd, 0x9026d5b1, 0xae0b5394 + .long 0x71d54a59, 0xf5e48c85, 0x57c54819 + .long 0x1c63267b, 0xfe807bbd, 0x0cbec0ed + .long 0xd31343ea, 0xe95c1271, 0x910eeec1 + .long 0xf9d9c7ee, 0x71d54a59, 0x3f41287a + .long 0x9ee62949, 0xcec97417, 0x9026d5b1 + .long 0xa55d1514, 0xf183c71b, 0xd1df2327 + .long 0x21aa2b26, 
0xd31343ea, 0xf5e48c85 + .long 0x9d842b80, 0xeea395c4, 0x3c656ced + .long 0xd8110ff1, 0xcd669a40, 0xfe807bbd + .long 0x3f9e9356, 0x9ee62949, 0x1f0c2cdd + .long 0x1d6708a0, 0x0c30f51d, 0xe95c1271 + .long 0xef82aa68, 0xdb3935ea, 0xb918a347 + .long 0xd14bcc9b, 0x21aa2b26, 0x71d54a59 + .long 0x99cce860, 0x356d209f, 0xff6f2fc2 + .long 0xd8af8e46, 0xc352f6de, 0xcec97417 + .long 0xf1996890, 0xd8110ff1, 0x1c63267b + .long 0x631bc508, 0xe95c7216, 0xf183c71b + .long 0x8511c306, 0x8e031a19, 0x9b9bdbd0 + .long 0xdb3839f3, 0x1d6708a0, 0xd31343ea + .long 0x7a92fffb, 0xf7003835, 0x4470ac44 + .long 0x6ce68f2a, 0x00eba0c8, 0xeea395c4 + .long 0x4caaa263, 0xd14bcc9b, 0xf9d9c7ee + .long 0xb46f7cff, 0x9a1b53c8, 0xcd669a40 + .long 0x60290934, 0x81b6f443, 0x6d40f445 + .long 0x8e976a7d, 0xd8af8e46, 0x9ee62949 + .long 0xdcf5088a, 0x9dbdc100, 0x145575d5 + .long 0x1753ab84, 0xbbf2f6d6, 0x0c30f51d + .long 0x255b139e, 0x631bc508, 0xa55d1514 + .long 0xd784eaa8, 0xce26786c, 0xdb3935ea + .long 0x6d2c864a, 0x8068c345, 0x2586d334 + .long 0x02072e24, 0xdb3839f3, 0x21aa2b26 + .long 0x06689b0a, 0x5efd72f5, 0xe0575528 + .long 0x1e52f5ea, 0x4117915b, 0x356d209f + .long 0x1d3d1db6, 0x6ce68f2a, 0x9d842b80 + .long 0x3796455c, 0xb8e0e4a8, 0xc352f6de + .long 0xdf3a4eb3, 0xc55a2330, 0xb84ffa9c + .long 0x28ae0976, 0xb46f7cff, 0xd8110ff1 + .long 0x9764bc8d, 0xd7e7a22c, 0x712510f0 + .long 0x13a13e18, 0x3e9a43cd, 0xe95c7216 + .long 0xb8ee242e, 0x8e976a7d, 0x3f9e9356 + .long 0x0c540e7b, 0x753c81ff, 0x8e031a19 + .long 0x9924c781, 0xb9220208, 0x3edcde65 + .long 0x3954de39, 0x1753ab84, 0x1d6708a0 + .long 0xf32238b5, 0xbec81497, 0x9e70b943 + .long 0xbbd2cd2c, 0x0925d861, 0xf7003835 + .long 0xcc401304, 0xd784eaa8, 0xef82aa68 + .long 0x4987e684, 0x6044fbb0, 0x00eba0c8 + .long 0x3aa11427, 0x18fe3b4a, 0x87441142 + .long 0x297aad60, 0x02072e24, 0xd14bcc9b + .long 0xf60c5e51, 0x6ef6f487, 0x5b7fdd0a + .long 0x632d78c5, 0x3fc33de4, 0x9a1b53c8 + .long 0x25b8822a, 0x1e52f5ea, 0x99cce860 + .long 0xd4fc84bc, 0x1af62fb8, 
0x81b6f443 + .long 0x5690aa32, 0xa91fdefb, 0x688a110e + .long 0x1357a093, 0x3796455c, 0xd8af8e46 + .long 0x798fdd33, 0xaaa18a37, 0x357b9517 + .long 0xc2815395, 0x54d42691, 0x9dbdc100 + .long 0x21cfc0f7, 0x28ae0976, 0xf1996890 + .long 0xa0decef3, 0x7b4aa8b7, 0xbbf2f6d6 diff --git a/lib/crc/arm64/crc32.h b/lib/crc/arm64/crc32.h new file mode 100644 index 000000000000..6e5dec45f05d --- /dev/null +++ b/lib/crc/arm64/crc32.h @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <asm/alternative.h> +#include <asm/cpufeature.h> +#include <asm/neon.h> +#include <asm/simd.h> + +#include <crypto/internal/simd.h> + +// The minimum input length to consider the 4-way interleaved code path +static const size_t min_len = 1024; + +asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len); + +asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len); + +static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return crc32_le_base(crc, p, len); + + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32_le_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + + return crc32_le_arm64(crc, p, len); +} + +static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return crc32c_base(crc, p, len); + + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32c_le_arm64_4way(crc, p, len); + kernel_neon_end(); + 
+ p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + + return crc32c_le_arm64(crc, p, len); +} + +static inline u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return crc32_be_base(crc, p, len); + + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32_be_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + + return crc32_be_arm64(crc, p, len); +} + +static inline u32 crc32_optimizations_arch(void) +{ + if (alternative_has_cap_likely(ARM64_HAS_CRC32)) + return CRC32_LE_OPTIMIZATION | + CRC32_BE_OPTIMIZATION | + CRC32C_OPTIMIZATION; + return 0; +} diff --git a/lib/crc/crc-ccitt.c b/lib/crc/crc-ccitt.c new file mode 100644 index 000000000000..f8692c3de101 --- /dev/null +++ b/lib/crc/crc-ccitt.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/crc-ccitt.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/types.h> + +/* + * This mysterious table is just the CRC of each possible byte. It can be + * computed using the standard bit-at-a-time methods. The polynomial can + * be seen in entry 128, 0x8408. This corresponds to x^0 + x^5 + x^12. + * Add the implicit x^16, and you have the standard CRC-CCITT. 
+ */ +u16 const crc_ccitt_table[256] = { + 0x0000, 0x1189, 0x2312, 0x329b, 0x4624, 0x57ad, 0x6536, 0x74bf, + 0x8c48, 0x9dc1, 0xaf5a, 0xbed3, 0xca6c, 0xdbe5, 0xe97e, 0xf8f7, + 0x1081, 0x0108, 0x3393, 0x221a, 0x56a5, 0x472c, 0x75b7, 0x643e, + 0x9cc9, 0x8d40, 0xbfdb, 0xae52, 0xdaed, 0xcb64, 0xf9ff, 0xe876, + 0x2102, 0x308b, 0x0210, 0x1399, 0x6726, 0x76af, 0x4434, 0x55bd, + 0xad4a, 0xbcc3, 0x8e58, 0x9fd1, 0xeb6e, 0xfae7, 0xc87c, 0xd9f5, + 0x3183, 0x200a, 0x1291, 0x0318, 0x77a7, 0x662e, 0x54b5, 0x453c, + 0xbdcb, 0xac42, 0x9ed9, 0x8f50, 0xfbef, 0xea66, 0xd8fd, 0xc974, + 0x4204, 0x538d, 0x6116, 0x709f, 0x0420, 0x15a9, 0x2732, 0x36bb, + 0xce4c, 0xdfc5, 0xed5e, 0xfcd7, 0x8868, 0x99e1, 0xab7a, 0xbaf3, + 0x5285, 0x430c, 0x7197, 0x601e, 0x14a1, 0x0528, 0x37b3, 0x263a, + 0xdecd, 0xcf44, 0xfddf, 0xec56, 0x98e9, 0x8960, 0xbbfb, 0xaa72, + 0x6306, 0x728f, 0x4014, 0x519d, 0x2522, 0x34ab, 0x0630, 0x17b9, + 0xef4e, 0xfec7, 0xcc5c, 0xddd5, 0xa96a, 0xb8e3, 0x8a78, 0x9bf1, + 0x7387, 0x620e, 0x5095, 0x411c, 0x35a3, 0x242a, 0x16b1, 0x0738, + 0xffcf, 0xee46, 0xdcdd, 0xcd54, 0xb9eb, 0xa862, 0x9af9, 0x8b70, + 0x8408, 0x9581, 0xa71a, 0xb693, 0xc22c, 0xd3a5, 0xe13e, 0xf0b7, + 0x0840, 0x19c9, 0x2b52, 0x3adb, 0x4e64, 0x5fed, 0x6d76, 0x7cff, + 0x9489, 0x8500, 0xb79b, 0xa612, 0xd2ad, 0xc324, 0xf1bf, 0xe036, + 0x18c1, 0x0948, 0x3bd3, 0x2a5a, 0x5ee5, 0x4f6c, 0x7df7, 0x6c7e, + 0xa50a, 0xb483, 0x8618, 0x9791, 0xe32e, 0xf2a7, 0xc03c, 0xd1b5, + 0x2942, 0x38cb, 0x0a50, 0x1bd9, 0x6f66, 0x7eef, 0x4c74, 0x5dfd, + 0xb58b, 0xa402, 0x9699, 0x8710, 0xf3af, 0xe226, 0xd0bd, 0xc134, + 0x39c3, 0x284a, 0x1ad1, 0x0b58, 0x7fe7, 0x6e6e, 0x5cf5, 0x4d7c, + 0xc60c, 0xd785, 0xe51e, 0xf497, 0x8028, 0x91a1, 0xa33a, 0xb2b3, + 0x4a44, 0x5bcd, 0x6956, 0x78df, 0x0c60, 0x1de9, 0x2f72, 0x3efb, + 0xd68d, 0xc704, 0xf59f, 0xe416, 0x90a9, 0x8120, 0xb3bb, 0xa232, + 0x5ac5, 0x4b4c, 0x79d7, 0x685e, 0x1ce1, 0x0d68, 0x3ff3, 0x2e7a, + 0xe70e, 0xf687, 0xc41c, 0xd595, 0xa12a, 0xb0a3, 0x8238, 0x93b1, + 0x6b46, 0x7acf, 0x4854, 0x59dd, 0x2d62, 
0x3ceb, 0x0e70, 0x1ff9, + 0xf78f, 0xe606, 0xd49d, 0xc514, 0xb1ab, 0xa022, 0x92b9, 0x8330, + 0x7bc7, 0x6a4e, 0x58d5, 0x495c, 0x3de3, 0x2c6a, 0x1ef1, 0x0f78 +}; +EXPORT_SYMBOL(crc_ccitt_table); + +/** + * crc_ccitt - recompute the CRC (CRC-CCITT variant) for the data + * buffer + * @crc: previous CRC value + * @buffer: data pointer + * @len: number of bytes in the buffer + */ +u16 crc_ccitt(u16 crc, u8 const *buffer, size_t len) +{ + while (len--) + crc = crc_ccitt_byte(crc, *buffer++); + return crc; +} +EXPORT_SYMBOL(crc_ccitt); + +MODULE_DESCRIPTION("CRC-CCITT calculations"); +MODULE_LICENSE("GPL"); diff --git a/lib/crc/crc-itu-t.c b/lib/crc/crc-itu-t.c new file mode 100644 index 000000000000..6e413a290f54 --- /dev/null +++ b/lib/crc/crc-itu-t.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * crc-itu-t.c + */ + +#include <linux/crc-itu-t.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/types.h> + +/* CRC table for the CRC ITU-T V.41 0x1021 (x^16 + x^12 + x^5 + 1) */ +const u16 crc_itu_t_table[256] = { + 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7, + 0x8108, 0x9129, 0xa14a, 0xb16b, 0xc18c, 0xd1ad, 0xe1ce, 0xf1ef, + 0x1231, 0x0210, 0x3273, 0x2252, 0x52b5, 0x4294, 0x72f7, 0x62d6, + 0x9339, 0x8318, 0xb37b, 0xa35a, 0xd3bd, 0xc39c, 0xf3ff, 0xe3de, + 0x2462, 0x3443, 0x0420, 0x1401, 0x64e6, 0x74c7, 0x44a4, 0x5485, + 0xa56a, 0xb54b, 0x8528, 0x9509, 0xe5ee, 0xf5cf, 0xc5ac, 0xd58d, + 0x3653, 0x2672, 0x1611, 0x0630, 0x76d7, 0x66f6, 0x5695, 0x46b4, + 0xb75b, 0xa77a, 0x9719, 0x8738, 0xf7df, 0xe7fe, 0xd79d, 0xc7bc, + 0x48c4, 0x58e5, 0x6886, 0x78a7, 0x0840, 0x1861, 0x2802, 0x3823, + 0xc9cc, 0xd9ed, 0xe98e, 0xf9af, 0x8948, 0x9969, 0xa90a, 0xb92b, + 0x5af5, 0x4ad4, 0x7ab7, 0x6a96, 0x1a71, 0x0a50, 0x3a33, 0x2a12, + 0xdbfd, 0xcbdc, 0xfbbf, 0xeb9e, 0x9b79, 0x8b58, 0xbb3b, 0xab1a, + 0x6ca6, 0x7c87, 0x4ce4, 0x5cc5, 0x2c22, 0x3c03, 0x0c60, 0x1c41, + 0xedae, 0xfd8f, 0xcdec, 0xddcd, 0xad2a, 0xbd0b, 0x8d68, 0x9d49, + 0x7e97, 
0x6eb6, 0x5ed5, 0x4ef4, 0x3e13, 0x2e32, 0x1e51, 0x0e70, + 0xff9f, 0xefbe, 0xdfdd, 0xcffc, 0xbf1b, 0xaf3a, 0x9f59, 0x8f78, + 0x9188, 0x81a9, 0xb1ca, 0xa1eb, 0xd10c, 0xc12d, 0xf14e, 0xe16f, + 0x1080, 0x00a1, 0x30c2, 0x20e3, 0x5004, 0x4025, 0x7046, 0x6067, + 0x83b9, 0x9398, 0xa3fb, 0xb3da, 0xc33d, 0xd31c, 0xe37f, 0xf35e, + 0x02b1, 0x1290, 0x22f3, 0x32d2, 0x4235, 0x5214, 0x6277, 0x7256, + 0xb5ea, 0xa5cb, 0x95a8, 0x8589, 0xf56e, 0xe54f, 0xd52c, 0xc50d, + 0x34e2, 0x24c3, 0x14a0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405, + 0xa7db, 0xb7fa, 0x8799, 0x97b8, 0xe75f, 0xf77e, 0xc71d, 0xd73c, + 0x26d3, 0x36f2, 0x0691, 0x16b0, 0x6657, 0x7676, 0x4615, 0x5634, + 0xd94c, 0xc96d, 0xf90e, 0xe92f, 0x99c8, 0x89e9, 0xb98a, 0xa9ab, + 0x5844, 0x4865, 0x7806, 0x6827, 0x18c0, 0x08e1, 0x3882, 0x28a3, + 0xcb7d, 0xdb5c, 0xeb3f, 0xfb1e, 0x8bf9, 0x9bd8, 0xabbb, 0xbb9a, + 0x4a75, 0x5a54, 0x6a37, 0x7a16, 0x0af1, 0x1ad0, 0x2ab3, 0x3a92, + 0xfd2e, 0xed0f, 0xdd6c, 0xcd4d, 0xbdaa, 0xad8b, 0x9de8, 0x8dc9, + 0x7c26, 0x6c07, 0x5c64, 0x4c45, 0x3ca2, 0x2c83, 0x1ce0, 0x0cc1, + 0xef1f, 0xff3e, 0xcf5d, 0xdf7c, 0xaf9b, 0xbfba, 0x8fd9, 0x9ff8, + 0x6e17, 0x7e36, 0x4e55, 0x5e74, 0x2e93, 0x3eb2, 0x0ed1, 0x1ef0 +}; + +EXPORT_SYMBOL(crc_itu_t_table); + +/** + * crc_itu_t - Compute the CRC-ITU-T for the data buffer + * + * @crc: previous CRC value + * @buffer: data pointer + * @len: number of bytes in the buffer + * + * Returns the updated CRC value + */ +u16 crc_itu_t(u16 crc, const u8 *buffer, size_t len) +{ + while (len--) + crc = crc_itu_t_byte(crc, *buffer++); + return crc; +} +EXPORT_SYMBOL(crc_itu_t); + +MODULE_DESCRIPTION("CRC ITU-T V.41 calculations"); +MODULE_LICENSE("GPL"); + diff --git a/lib/crc/crc-t10dif-main.c b/lib/crc/crc-t10dif-main.c new file mode 100644 index 000000000000..08dde238e89f --- /dev/null +++ b/lib/crc/crc-t10dif-main.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * T10 Data Integrity Field CRC16 calculation + * + * Copyright (c) 2007 Oracle Corporation. 
All rights reserved. + * Written by Martin K. Petersen <martin.petersen@oracle.com> + */ + +#include <linux/crc-t10dif.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/types.h> + +/* + * Table generated using the following polynomial: + * x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1 + * gt: 0x8bb7 + */ +static const u16 t10_dif_crc_table[256] = { + 0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B, + 0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6, + 0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6, + 0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B, + 0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1, + 0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C, + 0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C, + 0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781, + 0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8, + 0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255, + 0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925, + 0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698, + 0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472, + 0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF, + 0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF, + 0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02, + 0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA, + 0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067, + 0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17, + 0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA, + 0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640, + 0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD, + 0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D, + 0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30, + 0xE232, 0x6985, 
0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759, + 0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4, + 0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394, + 0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29, + 0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3, + 0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E, + 0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E, + 0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3 +}; + +static inline u16 __maybe_unused +crc_t10dif_generic(u16 crc, const u8 *p, size_t len) +{ + while (len--) + crc = (crc << 8) ^ t10_dif_crc_table[(crc >> 8) ^ *p++]; + return crc; +} + +#ifdef CONFIG_CRC_T10DIF_ARCH +#include "crc-t10dif.h" /* $(SRCARCH)/crc-t10dif.h */ +#else +#define crc_t10dif_arch crc_t10dif_generic +#endif + +u16 crc_t10dif_update(u16 crc, const u8 *p, size_t len) +{ + return crc_t10dif_arch(crc, p, len); +} +EXPORT_SYMBOL(crc_t10dif_update); + +#ifdef crc_t10dif_mod_init_arch +static int __init crc_t10dif_mod_init(void) +{ + crc_t10dif_mod_init_arch(); + return 0; +} +subsys_initcall(crc_t10dif_mod_init); + +static void __exit crc_t10dif_mod_exit(void) +{ +} +module_exit(crc_t10dif_mod_exit); +#endif + +MODULE_DESCRIPTION("CRC-T10DIF library functions"); +MODULE_LICENSE("GPL"); diff --git a/lib/crc/crc16.c b/lib/crc/crc16.c new file mode 100644 index 000000000000..931660a8cbaa --- /dev/null +++ b/lib/crc/crc16.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * crc16.c + */ + +#include <linux/crc16.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/types.h> + +/** CRC table for the CRC-16. 
The poly is 0x8005 (x^16 + x^15 + x^2 + 1) */ +static const u16 crc16_table[256] = { + 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, + 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, + 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, + 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, + 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, + 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, + 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, + 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, + 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, + 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, + 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, + 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, + 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, + 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, + 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, + 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, + 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, + 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, + 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, + 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, + 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, + 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, + 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, + 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, + 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, + 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, + 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, + 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, + 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 
+ 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, + 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, + 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 +}; + +/** + * crc16 - compute the CRC-16 for the data buffer + * @crc: previous CRC value + * @p: data pointer + * @len: number of bytes in the buffer + * + * Returns the updated CRC value. + */ +u16 crc16(u16 crc, const u8 *p, size_t len) +{ + while (len--) + crc = (crc >> 8) ^ crc16_table[(crc & 0xff) ^ *p++]; + return crc; +} +EXPORT_SYMBOL(crc16); + +MODULE_DESCRIPTION("CRC16 calculations"); +MODULE_LICENSE("GPL"); + diff --git a/lib/crc/crc32-main.c b/lib/crc/crc32-main.c new file mode 100644 index 000000000000..fbb90c9006e5 --- /dev/null +++ b/lib/crc/crc32-main.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin + * cleaned up code to current version of sparse and added the slicing-by-8 + * algorithm to the closely similar existing slicing-by-4 algorithm. + * + * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com> + * Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks! + * Code was from the public domain, copyright abandoned. Code was + * subsequently included in the kernel, thus was re-licensed under the + * GNU GPL v2. + * + * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com> + * Same crc32 function was used in 5 other places in the kernel. + * I made one version, and deleted the others. + * There are various incantations of crc32(). Some use a seed of 0 or ~0. + * Some xor at the end with ~0. The generic crc32() function takes + * seed as an argument, and doesn't xor at the end. Then individual + * users can do whatever they need. + * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0. + * fs/jffs2 uses seed 0, doesn't xor with ~0. + * fs/partitions/efi.c uses seed ~0, xor's with ~0. 
+ */ + +/* see: Documentation/staging/crc32.rst for a description of algorithms */ + +#include <linux/crc32.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/types.h> + +#include "crc32table.h" + +static inline u32 __maybe_unused +crc32_le_base(u32 crc, const u8 *p, size_t len) +{ + while (len--) + crc = (crc >> 8) ^ crc32table_le[(crc & 255) ^ *p++]; + return crc; +} + +static inline u32 __maybe_unused +crc32_be_base(u32 crc, const u8 *p, size_t len) +{ + while (len--) + crc = (crc << 8) ^ crc32table_be[(crc >> 24) ^ *p++]; + return crc; +} + +static inline u32 __maybe_unused +crc32c_base(u32 crc, const u8 *p, size_t len) +{ + while (len--) + crc = (crc >> 8) ^ crc32ctable_le[(crc & 255) ^ *p++]; + return crc; +} + +#ifdef CONFIG_CRC32_ARCH +#include "crc32.h" /* $(SRCARCH)/crc32.h */ + +u32 crc32_optimizations(void) +{ + return crc32_optimizations_arch(); +} +EXPORT_SYMBOL(crc32_optimizations); +#else +#define crc32_le_arch crc32_le_base +#define crc32_be_arch crc32_be_base +#define crc32c_arch crc32c_base +#endif + +u32 crc32_le(u32 crc, const void *p, size_t len) +{ + return crc32_le_arch(crc, p, len); +} +EXPORT_SYMBOL(crc32_le); + +u32 crc32_be(u32 crc, const void *p, size_t len) +{ + return crc32_be_arch(crc, p, len); +} +EXPORT_SYMBOL(crc32_be); + +u32 crc32c(u32 crc, const void *p, size_t len) +{ + return crc32c_arch(crc, p, len); +} +EXPORT_SYMBOL(crc32c); + +#ifdef crc32_mod_init_arch +static int __init crc32_mod_init(void) +{ + crc32_mod_init_arch(); + return 0; +} +subsys_initcall(crc32_mod_init); + +static void __exit crc32_mod_exit(void) +{ +} +module_exit(crc32_mod_exit); +#endif + +MODULE_DESCRIPTION("CRC32 library functions"); +MODULE_LICENSE("GPL"); diff --git a/lib/crc/crc4.c b/lib/crc/crc4.c new file mode 100644 index 000000000000..8e83fbe60bdc --- /dev/null +++ b/lib/crc/crc4.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * crc4.c - simple crc-4 calculations. 
/*
 * One 4-bit step of the CRC: crc4_tab[crc ^ nibble] is the CRC after
 * absorbing that nibble.
 */
static const uint8_t crc4_tab[] = {
	0x0, 0x7, 0xe, 0x9, 0xb, 0xc, 0x5, 0x2,
	0x1, 0x6, 0xf, 0x8, 0xa, 0xd, 0x4, 0x3,
};

/**
 * crc4 - calculate the 4-bit crc of a value.
 * @c:    starting crc4 (only the low four bits select a table entry)
 * @x:    value to checksum
 * @bits: number of bits in @x to checksum
 *
 * Returns the crc4 value of @x, using polynomial 0b10111.
 *
 * Bits of @x above @bits are ignored.  When @bits is not a multiple of
 * four, the value is zero-extended at the top to the next multiple of four
 * and then consumed one nibble at a time, most significant nibble first.
 *
 * @bits <= 0 checksums nothing and returns @c unchanged; @bits above 64 is
 * treated as 64.
 */
uint8_t crc4(uint8_t c, uint64_t x, int bits)
{
	int i;

	if (bits <= 0)
		return c;
	if (bits > 64)
		bits = 64;

	/*
	 * Mask off anything above the top bit.  A shift by the full width of
	 * the type is undefined, so all 64 bits are kept as they are.
	 */
	if (bits < 64)
		x &= (1ull << bits) - 1;

	/* Align to 4-bits */
	bits = (bits + 3) & ~0x3;

	/*
	 * Calculate crc4 over four-bit nibbles, starting at the MSbit.  The
	 * index is masked so a start value above 0xf cannot read past the
	 * 16-entry table.
	 */
	for (i = bits - 4; i >= 0; i -= 4)
		c = crc4_tab[(c ^ ((x >> i) & 0xf)) & 0xf];

	return c;
}
The polynomial of crc64 arithmetic is from ECMA-182 specification + * as well, which is defined as, + * + * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + + * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + + * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + + * x^7 + x^4 + x + 1 + * + * crc64nvmetable[256] uses the CRC64 polynomial from the NVME NVM Command Set + * Specification and uses least-significant-bit first bit order: + * + * x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 + x^47 + + * x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 + x^26 + x^23 + + * x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 + x^4 + x^3 + 1 + * + * Copyright 2018 SUSE Linux. + * Author: Coly Li <colyli@suse.de> + */ + +#include <linux/crc64.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/types.h> + +#include "crc64table.h" + +static inline u64 __maybe_unused +crc64_be_generic(u64 crc, const u8 *p, size_t len) +{ + while (len--) + crc = (crc << 8) ^ crc64table[(crc >> 56) ^ *p++]; + return crc; +} + +static inline u64 __maybe_unused +crc64_nvme_generic(u64 crc, const u8 *p, size_t len) +{ + while (len--) + crc = (crc >> 8) ^ crc64nvmetable[(crc & 0xff) ^ *p++]; + return crc; +} + +#ifdef CONFIG_CRC64_ARCH +#include "crc64.h" /* $(SRCARCH)/crc64.h */ +#else +#define crc64_be_arch crc64_be_generic +#define crc64_nvme_arch crc64_nvme_generic +#endif + +u64 crc64_be(u64 crc, const void *p, size_t len) +{ + return crc64_be_arch(crc, p, len); +} +EXPORT_SYMBOL_GPL(crc64_be); + +u64 crc64_nvme(u64 crc, const void *p, size_t len) +{ + return ~crc64_nvme_arch(~crc, p, len); +} +EXPORT_SYMBOL_GPL(crc64_nvme); + +#ifdef crc64_mod_init_arch +static int __init crc64_mod_init(void) +{ + crc64_mod_init_arch(); + return 0; +} +subsys_initcall(crc64_mod_init); + +static void __exit crc64_mod_exit(void) +{ +} +module_exit(crc64_mod_exit); +#endif + 
+MODULE_DESCRIPTION("CRC64 library functions"); +MODULE_LICENSE("GPL"); diff --git a/lib/crc/crc7.c b/lib/crc/crc7.c new file mode 100644 index 000000000000..46b95d7ac6ce --- /dev/null +++ b/lib/crc/crc7.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * crc7.c + */ + +#include <linux/crc7.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/types.h> + +/* + * Table for CRC-7 (polynomial x^7 + x^3 + 1). + * This is a big-endian CRC (msbit is highest power of x), + * aligned so the msbit of the byte is the x^6 coefficient + * and the lsbit is not used. + */ +static const u8 crc7_be_syndrome_table[256] = { + 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, + 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee, + 0x32, 0x20, 0x16, 0x04, 0x7a, 0x68, 0x5e, 0x4c, + 0xa2, 0xb0, 0x86, 0x94, 0xea, 0xf8, 0xce, 0xdc, + 0x64, 0x76, 0x40, 0x52, 0x2c, 0x3e, 0x08, 0x1a, + 0xf4, 0xe6, 0xd0, 0xc2, 0xbc, 0xae, 0x98, 0x8a, + 0x56, 0x44, 0x72, 0x60, 0x1e, 0x0c, 0x3a, 0x28, + 0xc6, 0xd4, 0xe2, 0xf0, 0x8e, 0x9c, 0xaa, 0xb8, + 0xc8, 0xda, 0xec, 0xfe, 0x80, 0x92, 0xa4, 0xb6, + 0x58, 0x4a, 0x7c, 0x6e, 0x10, 0x02, 0x34, 0x26, + 0xfa, 0xe8, 0xde, 0xcc, 0xb2, 0xa0, 0x96, 0x84, + 0x6a, 0x78, 0x4e, 0x5c, 0x22, 0x30, 0x06, 0x14, + 0xac, 0xbe, 0x88, 0x9a, 0xe4, 0xf6, 0xc0, 0xd2, + 0x3c, 0x2e, 0x18, 0x0a, 0x74, 0x66, 0x50, 0x42, + 0x9e, 0x8c, 0xba, 0xa8, 0xd6, 0xc4, 0xf2, 0xe0, + 0x0e, 0x1c, 0x2a, 0x38, 0x46, 0x54, 0x62, 0x70, + 0x82, 0x90, 0xa6, 0xb4, 0xca, 0xd8, 0xee, 0xfc, + 0x12, 0x00, 0x36, 0x24, 0x5a, 0x48, 0x7e, 0x6c, + 0xb0, 0xa2, 0x94, 0x86, 0xf8, 0xea, 0xdc, 0xce, + 0x20, 0x32, 0x04, 0x16, 0x68, 0x7a, 0x4c, 0x5e, + 0xe6, 0xf4, 0xc2, 0xd0, 0xae, 0xbc, 0x8a, 0x98, + 0x76, 0x64, 0x52, 0x40, 0x3e, 0x2c, 0x1a, 0x08, + 0xd4, 0xc6, 0xf0, 0xe2, 0x9c, 0x8e, 0xb8, 0xaa, + 0x44, 0x56, 0x60, 0x72, 0x0c, 0x1e, 0x28, 0x3a, + 0x4a, 0x58, 0x6e, 0x7c, 0x02, 0x10, 0x26, 0x34, + 0xda, 0xc8, 0xfe, 0xec, 0x92, 0x80, 0xb6, 0xa4, + 0x78, 0x6a, 0x5c, 0x4e, 0x30, 0x22, 0x14, 
0x06, + 0xe8, 0xfa, 0xcc, 0xde, 0xa0, 0xb2, 0x84, 0x96, + 0x2e, 0x3c, 0x0a, 0x18, 0x66, 0x74, 0x42, 0x50, + 0xbe, 0xac, 0x9a, 0x88, 0xf6, 0xe4, 0xd2, 0xc0, + 0x1c, 0x0e, 0x38, 0x2a, 0x54, 0x46, 0x70, 0x62, + 0x8c, 0x9e, 0xa8, 0xba, 0xc4, 0xd6, 0xe0, 0xf2 +}; + +/** + * crc7_be - update the CRC7 for the data buffer + * @crc: previous CRC7 value + * @buffer: data pointer + * @len: number of bytes in the buffer + * Context: any + * + * Returns the updated CRC7 value. + * The CRC7 is left-aligned in the byte (the lsbit is always 0), as that + * makes the computation easier, and all callers want it in that form. + * + */ +u8 crc7_be(u8 crc, const u8 *buffer, size_t len) +{ + while (len--) + crc = crc7_be_syndrome_table[crc ^ *buffer++]; + return crc; +} +EXPORT_SYMBOL(crc7_be); + +MODULE_DESCRIPTION("CRC7 calculations"); +MODULE_LICENSE("GPL"); diff --git a/lib/crc/crc8.c b/lib/crc/crc8.c new file mode 100644 index 000000000000..329c52158c45 --- /dev/null +++ b/lib/crc/crc8.c @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2011 Broadcom Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/crc8.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/printk.h> + +/** + * crc8_populate_msb - fill crc table for given polynomial in reverse bit order. + * + * @table: table to be filled. + * @polynomial: polynomial for which table is to be filled. + */ +void crc8_populate_msb(u8 table[CRC8_TABLE_SIZE], u8 polynomial) +{ + int i, j; + const u8 msbit = 0x80; + u8 t = msbit; + + table[0] = 0; + + for (i = 1; i < CRC8_TABLE_SIZE; i *= 2) { + t = (t << 1) ^ (t & msbit ? polynomial : 0); + for (j = 0; j < i; j++) + table[i+j] = table[j] ^ t; + } +} +EXPORT_SYMBOL(crc8_populate_msb); + +/** + * crc8_populate_lsb - fill crc table for given polynomial in regular bit order. + * + * @table: table to be filled. + * @polynomial: polynomial for which table is to be filled. + */ +void crc8_populate_lsb(u8 table[CRC8_TABLE_SIZE], u8 polynomial) +{ + int i, j; + u8 t = 1; + + table[0] = 0; + + for (i = (CRC8_TABLE_SIZE >> 1); i; i >>= 1) { + t = (t >> 1) ^ (t & 1 ? polynomial : 0); + for (j = 0; j < CRC8_TABLE_SIZE; j += 2*i) + table[i+j] = table[j] ^ t; + } +} +EXPORT_SYMBOL(crc8_populate_lsb); + +/** + * crc8 - calculate a crc8 over the given input data. + * + * @table: crc table used for calculation. + * @pdata: pointer to data buffer. + * @nbytes: number of bytes in data buffer. + * @crc: previous returned crc8 value. + */ +u8 crc8(const u8 table[CRC8_TABLE_SIZE], const u8 *pdata, size_t nbytes, u8 crc) +{ + /* loop over the buffer data */ + while (nbytes-- > 0) + crc = table[(crc ^ *pdata++) & 0xff]; + + return crc; +} +EXPORT_SYMBOL(crc8); + +MODULE_DESCRIPTION("CRC8 (by Williams, Ross N.) 
function"); +MODULE_AUTHOR("Broadcom Corporation"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/lib/crc/gen_crc32table.c b/lib/crc/gen_crc32table.c new file mode 100644 index 000000000000..9a7f31658e35 --- /dev/null +++ b/lib/crc/gen_crc32table.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdio.h> +#include "../../include/linux/crc32poly.h" +#include "../../include/generated/autoconf.h" +#include <inttypes.h> + +static uint32_t crc32table_le[256]; +static uint32_t crc32table_be[256]; +static uint32_t crc32ctable_le[256]; + +/** + * crc32init_le() - allocate and initialize LE table data + * + * crc is the crc of the byte i; other entries are filled in based on the + * fact that crctable[i^j] = crctable[i] ^ crctable[j]. + * + */ +static void crc32init_le_generic(const uint32_t polynomial, uint32_t tab[256]) +{ + unsigned i, j; + uint32_t crc = 1; + + tab[0] = 0; + + for (i = 128; i; i >>= 1) { + crc = (crc >> 1) ^ ((crc & 1) ? polynomial : 0); + for (j = 0; j < 256; j += 2 * i) + tab[i + j] = crc ^ tab[j]; + } +} + +static void crc32init_le(void) +{ + crc32init_le_generic(CRC32_POLY_LE, crc32table_le); +} + +static void crc32cinit_le(void) +{ + crc32init_le_generic(CRC32C_POLY_LE, crc32ctable_le); +} + +/** + * crc32init_be() - allocate and initialize BE table data + */ +static void crc32init_be(void) +{ + unsigned i, j; + uint32_t crc = 0x80000000; + + crc32table_be[0] = 0; + + for (i = 1; i < 256; i <<= 1) { + crc = (crc << 1) ^ ((crc & 0x80000000) ? 
CRC32_POLY_BE : 0); + for (j = 0; j < i; j++) + crc32table_be[i + j] = crc ^ crc32table_be[j]; + } +} + +static void output_table(const uint32_t table[256]) +{ + int i; + + for (i = 0; i < 256; i += 4) { + printf("\t0x%08x, 0x%08x, 0x%08x, 0x%08x,\n", + table[i], table[i + 1], table[i + 2], table[i + 3]); + } +} + +int main(int argc, char** argv) +{ + printf("/* this file is generated - do not edit */\n\n"); + + crc32init_le(); + printf("static const u32 ____cacheline_aligned crc32table_le[256] = {\n"); + output_table(crc32table_le); + printf("};\n"); + + crc32init_be(); + printf("static const u32 ____cacheline_aligned crc32table_be[256] = {\n"); + output_table(crc32table_be); + printf("};\n"); + + crc32cinit_le(); + printf("static const u32 ____cacheline_aligned crc32ctable_le[256] = {\n"); + output_table(crc32ctable_le); + printf("};\n"); + + return 0; +} diff --git a/lib/crc/gen_crc64table.c b/lib/crc/gen_crc64table.c new file mode 100644 index 000000000000..f2be9f62bab7 --- /dev/null +++ b/lib/crc/gen_crc64table.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This host program runs at kernel build time and generates the lookup tables + * used by the generic CRC64 code. + * + * Copyright 2018 SUSE Linux. 
/*
 * This host program runs at kernel build time and generates the lookup tables
 * used by the generic CRC64 code.
 *
 * Author: Coly Li <colyli@suse.de>
 */
#include <inttypes.h>
#include <stdio.h>

#define CRC64_ECMA182_POLY 0x42F0E1EBA9EA3693ULL
#define CRC64_NVME_POLY 0x9A6C9329AC4BC9B5ULL

static uint64_t crc64_table[256] = {0};
static uint64_t crc64_nvme_table[256] = {0};

/*
 * Right-shifting table: for every byte value, consume its bits from bit 0
 * upwards, folding in @poly whenever the bit leaving the CRC differs from
 * the data bit.
 */
static void generate_reflected_crc64_table(uint64_t table[256], uint64_t poly)
{
	uint64_t byte, bit;

	for (byte = 0; byte < 256; byte++) {
		uint64_t crc = 0ULL;

		for (bit = 0; bit < 8; bit++) {
			uint64_t feedback = (crc ^ (byte >> bit)) & 1;

			crc >>= 1;
			if (feedback)
				crc ^= poly;
		}
		table[byte] = crc;
	}
}

/*
 * Left-shifting table: the byte value is placed in the top eight bits and
 * consumed from bit 63 downwards.
 */
static void generate_crc64_table(uint64_t table[256], uint64_t poly)
{
	const uint64_t top_bit = 0x8000000000000000ULL;
	uint64_t byte, bit;

	for (byte = 0; byte < 256; byte++) {
		uint64_t crc = 0;
		uint64_t data = byte << 56;

		for (bit = 0; bit < 8; bit++, data <<= 1) {
			int feedback = ((crc ^ data) & top_bit) != 0;

			crc <<= 1;
			if (feedback)
				crc ^= poly;
		}

		table[byte] = crc;
	}
}

/* Emit the 256 entries two per line, then close the initializer. */
static void output_table(uint64_t table[256])
{
	int i;

	for (i = 0; i < 256; i++) {
		printf("\t0x%016" PRIx64 "ULL", table[i]);
		fputs((i & 0x1) ? ",\n" : ", ", stdout);
	}
	printf("};\n");
}

/* Print the generated header: includes first, then both tables. */
static void print_crc64_tables(void)
{
	printf("/* this file is generated - do not edit */\n\n");
	printf("#include <linux/types.h>\n");
	printf("#include <linux/cache.h>\n\n");
	printf("static const u64 ____cacheline_aligned crc64table[256] = {\n");
	output_table(crc64_table);

	printf("\nstatic const u64 ____cacheline_aligned crc64nvmetable[256] = {\n");
	output_table(crc64_nvme_table);
}

int main(int argc, char *argv[])
{
	generate_crc64_table(crc64_table, CRC64_ECMA182_POLY);
	generate_reflected_crc64_table(crc64_nvme_table, CRC64_NVME_POLY);
	print_crc64_tables();
	return 0;
}
+ * + * Module based on mips/crypto/crc32-mips.c + * + * Copyright (C) 2014 Linaro Ltd <yazen.ghannam@linaro.org> + * Copyright (C) 2018 MIPS Tech, LLC + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#include <asm/cpu-features.h> +#include <linux/unaligned.h> + +#define _CRC32(crc, value, size, type) \ +do { \ + __asm__ __volatile__( \ + #type ".w." #size ".w" " %0, %1, %0\n\t"\ + : "+r" (crc) \ + : "r" (value) \ + : "memory"); \ +} while (0) + +#define CRC32(crc, value, size) _CRC32(crc, value, size, crc) +#define CRC32C(crc, value, size) _CRC32(crc, value, size, crcc) + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); + +static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) +{ + if (!static_branch_likely(&have_crc32)) + return crc32_le_base(crc, p, len); + + while (len >= sizeof(u64)) { + u64 value = get_unaligned_le64(p); + + CRC32(crc, value, d); + p += sizeof(u64); + len -= sizeof(u64); + } + + if (len & sizeof(u32)) { + u32 value = get_unaligned_le32(p); + + CRC32(crc, value, w); + p += sizeof(u32); + } + + if (len & sizeof(u16)) { + u16 value = get_unaligned_le16(p); + + CRC32(crc, value, h); + p += sizeof(u16); + } + + if (len & sizeof(u8)) { + u8 value = *p++; + + CRC32(crc, value, b); + } + + return crc; +} + +static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) +{ + if (!static_branch_likely(&have_crc32)) + return crc32c_base(crc, p, len); + + while (len >= sizeof(u64)) { + u64 value = get_unaligned_le64(p); + + CRC32C(crc, value, d); + p += sizeof(u64); + len -= sizeof(u64); + } + + if (len & sizeof(u32)) { + u32 value = get_unaligned_le32(p); + + CRC32C(crc, value, w); + p += sizeof(u32); + } + + if (len & sizeof(u16)) { + u16 value = get_unaligned_le16(p); + + CRC32C(crc, value, h); + p += sizeof(u16); + } + + if (len & sizeof(u8)) { + u8 value = *p++; + + CRC32C(crc, value, b); + } + + return crc; +} + +#define crc32_be_arch crc32_be_base /* not implemented on this arch */ + +#define 
crc32_mod_init_arch crc32_mod_init_arch +static inline void crc32_mod_init_arch(void) +{ + if (cpu_has_crc32) + static_branch_enable(&have_crc32); +} + +static inline u32 crc32_optimizations_arch(void) +{ + if (static_key_enabled(&have_crc32)) + return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION; + return 0; +} diff --git a/lib/crc/mips/crc32.h b/lib/crc/mips/crc32.h new file mode 100644 index 000000000000..11cb272c63a6 --- /dev/null +++ b/lib/crc/mips/crc32.h @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * crc32-mips.c - CRC32 and CRC32C using optional MIPSr6 instructions + * + * Module based on arm64/crypto/crc32-arm.c + * + * Copyright (C) 2014 Linaro Ltd <yazen.ghannam@linaro.org> + * Copyright (C) 2018 MIPS Tech, LLC + */ + +#include <linux/cpufeature.h> +#include <asm/mipsregs.h> +#include <linux/unaligned.h> + +#ifndef TOOLCHAIN_SUPPORTS_CRC +#define _ASM_SET_CRC(OP, SZ, TYPE) \ +_ASM_MACRO_3R(OP, rt, rs, rt2, \ + ".ifnc \\rt, \\rt2\n\t" \ + ".error \"invalid operands \\\"" #OP " \\rt,\\rs,\\rt2\\\"\"\n\t" \ + ".endif\n\t" \ + _ASM_INSN_IF_MIPS(0x7c00000f | (__rt << 16) | (__rs << 21) | \ + ((SZ) << 6) | ((TYPE) << 8)) \ + _ASM_INSN32_IF_MM(0x00000030 | (__rs << 16) | (__rt << 21) | \ + ((SZ) << 14) | ((TYPE) << 3))) +#define _ASM_UNSET_CRC(op, SZ, TYPE) ".purgem " #op "\n\t" +#else /* !TOOLCHAIN_SUPPORTS_CRC */ +#define _ASM_SET_CRC(op, SZ, TYPE) ".set\tcrc\n\t" +#define _ASM_UNSET_CRC(op, SZ, TYPE) +#endif + +#define __CRC32(crc, value, op, SZ, TYPE) \ +do { \ + __asm__ __volatile__( \ + ".set push\n\t" \ + _ASM_SET_CRC(op, SZ, TYPE) \ + #op " %0, %1, %0\n\t" \ + _ASM_UNSET_CRC(op, SZ, TYPE) \ + ".set pop" \ + : "+r" (crc) \ + : "r" (value)); \ +} while (0) + +#define _CRC32_crc32b(crc, value) __CRC32(crc, value, crc32b, 0, 0) +#define _CRC32_crc32h(crc, value) __CRC32(crc, value, crc32h, 1, 0) +#define _CRC32_crc32w(crc, value) __CRC32(crc, value, crc32w, 2, 0) +#define _CRC32_crc32d(crc, value) __CRC32(crc, value, crc32d, 3, 0) +#define 
_CRC32_crc32cb(crc, value) __CRC32(crc, value, crc32cb, 0, 1) +#define _CRC32_crc32ch(crc, value) __CRC32(crc, value, crc32ch, 1, 1) +#define _CRC32_crc32cw(crc, value) __CRC32(crc, value, crc32cw, 2, 1) +#define _CRC32_crc32cd(crc, value) __CRC32(crc, value, crc32cd, 3, 1) + +#define _CRC32(crc, value, size, op) \ + _CRC32_##op##size(crc, value) + +#define CRC32(crc, value, size) \ + _CRC32(crc, value, size, crc32) + +#define CRC32C(crc, value, size) \ + _CRC32(crc, value, size, crc32c) + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); + +static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) +{ + if (!static_branch_likely(&have_crc32)) + return crc32_le_base(crc, p, len); + + if (IS_ENABLED(CONFIG_64BIT)) { + for (; len >= sizeof(u64); p += sizeof(u64), len -= sizeof(u64)) { + u64 value = get_unaligned_le64(p); + + CRC32(crc, value, d); + } + + if (len & sizeof(u32)) { + u32 value = get_unaligned_le32(p); + + CRC32(crc, value, w); + p += sizeof(u32); + } + } else { + for (; len >= sizeof(u32); len -= sizeof(u32)) { + u32 value = get_unaligned_le32(p); + + CRC32(crc, value, w); + p += sizeof(u32); + } + } + + if (len & sizeof(u16)) { + u16 value = get_unaligned_le16(p); + + CRC32(crc, value, h); + p += sizeof(u16); + } + + if (len & sizeof(u8)) { + u8 value = *p++; + + CRC32(crc, value, b); + } + + return crc; +} + +static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) +{ + if (!static_branch_likely(&have_crc32)) + return crc32c_base(crc, p, len); + + if (IS_ENABLED(CONFIG_64BIT)) { + for (; len >= sizeof(u64); p += sizeof(u64), len -= sizeof(u64)) { + u64 value = get_unaligned_le64(p); + + CRC32C(crc, value, d); + } + + if (len & sizeof(u32)) { + u32 value = get_unaligned_le32(p); + + CRC32C(crc, value, w); + p += sizeof(u32); + } + } else { + for (; len >= sizeof(u32); len -= sizeof(u32)) { + u32 value = get_unaligned_le32(p); + + CRC32C(crc, value, w); + p += sizeof(u32); + } + } + + if (len & sizeof(u16)) { + u16 value = 
get_unaligned_le16(p); + + CRC32C(crc, value, h); + p += sizeof(u16); + } + + if (len & sizeof(u8)) { + u8 value = *p++; + + CRC32C(crc, value, b); + } + return crc; +} + +#define crc32_be_arch crc32_be_base /* not implemented on this arch */ + +#define crc32_mod_init_arch crc32_mod_init_arch +static inline void crc32_mod_init_arch(void) +{ + if (cpu_have_feature(cpu_feature(MIPS_CRC32))) + static_branch_enable(&have_crc32); +} + +static inline u32 crc32_optimizations_arch(void) +{ + if (static_key_enabled(&have_crc32)) + return CRC32_LE_OPTIMIZATION | CRC32C_OPTIMIZATION; + return 0; +} diff --git a/lib/crc/powerpc/crc-t10dif.h b/lib/crc/powerpc/crc-t10dif.h new file mode 100644 index 000000000000..59e16804a6ea --- /dev/null +++ b/lib/crc/powerpc/crc-t10dif.h @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Calculate a CRC T10-DIF with vpmsum acceleration + * + * Copyright 2017, Daniel Axtens, IBM Corporation. + * [based on crc32c-vpmsum_glue.c] + */ + +#include <asm/switch_to.h> +#include <crypto/internal/simd.h> +#include <linux/cpufeature.h> +#include <linux/jump_label.h> +#include <linux/preempt.h> +#include <linux/uaccess.h> + +#define VMX_ALIGN 16 +#define VMX_ALIGN_MASK (VMX_ALIGN-1) + +#define VECTOR_BREAKPOINT 64 + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto); + +u32 __crct10dif_vpmsum(u32 crc, unsigned char const *p, size_t len); + +static inline u16 crc_t10dif_arch(u16 crci, const u8 *p, size_t len) +{ + unsigned int prealign; + unsigned int tail; + u32 crc = crci; + + if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || + !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable()) + return crc_t10dif_generic(crc, p, len); + + if ((unsigned long)p & VMX_ALIGN_MASK) { + prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); + crc = crc_t10dif_generic(crc, p, prealign); + len -= prealign; + p += prealign; + } + + if (len & ~VMX_ALIGN_MASK) { + crc <<= 16; + preempt_disable(); + pagefault_disable(); + 
enable_kernel_altivec(); + crc = __crct10dif_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); + disable_kernel_altivec(); + pagefault_enable(); + preempt_enable(); + crc >>= 16; + } + + tail = len & VMX_ALIGN_MASK; + if (tail) { + p += len & ~VMX_ALIGN_MASK; + crc = crc_t10dif_generic(crc, p, tail); + } + + return crc & 0xffff; +} + +#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch +static inline void crc_t10dif_mod_init_arch(void) +{ + if (cpu_has_feature(CPU_FTR_ARCH_207S) && + (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO)) + static_branch_enable(&have_vec_crypto); +} diff --git a/lib/crc/powerpc/crc-vpmsum-template.S b/lib/crc/powerpc/crc-vpmsum-template.S new file mode 100644 index 000000000000..b0f87f595b26 --- /dev/null +++ b/lib/crc/powerpc/crc-vpmsum-template.S @@ -0,0 +1,746 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Core of the accelerated CRC algorithm. + * In your file, define the constants and CRC_FUNCTION_NAME + * Then include this file. + * + * Calculate the checksum of data that is 16 byte aligned and a multiple of + * 16 bytes. + * + * The first step is to reduce it to 1024 bits. We do this in 8 parallel + * chunks in order to mask the latency of the vpmsum instructions. If we + * have more than 32 kB of data to checksum we repeat this step multiple + * times, passing in the previous 1024 bits. + * + * The next step is to reduce the 1024 bits to 64 bits. This step adds + * 32 bits of 0s to the end - this matches what a CRC does. We just + * calculate constants that land the data in this 32 bits. + * + * We then use fixed point Barrett reduction to compute a mod n over GF(2) + * for n = CRC using POWER8 instructions. We use x = 32. 
+ * + * https://en.wikipedia.org/wiki/Barrett_reduction + * + * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM +*/ + +#include <asm/ppc_asm.h> +#include <asm/ppc-opcode.h> + +#define MAX_SIZE 32768 + + .text + +#if defined(__BIG_ENDIAN__) && defined(REFLECT) +#define BYTESWAP_DATA +#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) +#define BYTESWAP_DATA +#else +#undef BYTESWAP_DATA +#endif + +#define off16 r25 +#define off32 r26 +#define off48 r27 +#define off64 r28 +#define off80 r29 +#define off96 r30 +#define off112 r31 + +#define const1 v24 +#define const2 v25 + +#define byteswap v26 +#define mask_32bit v27 +#define mask_64bit v28 +#define zeroes v29 + +#ifdef BYTESWAP_DATA +#define VPERM(A, B, C, D) vperm A, B, C, D +#else +#define VPERM(A, B, C, D) +#endif + +/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */ +FUNC_START(CRC_FUNCTION_NAME) + std r31,-8(r1) + std r30,-16(r1) + std r29,-24(r1) + std r28,-32(r1) + std r27,-40(r1) + std r26,-48(r1) + std r25,-56(r1) + + li off16,16 + li off32,32 + li off48,48 + li off64,64 + li off80,80 + li off96,96 + li off112,112 + li r0,0 + + /* Enough room for saving 10 non volatile VMX registers */ + subi r6,r1,56+10*16 + subi r7,r1,56+2*16 + + stvx v20,0,r6 + stvx v21,off16,r6 + stvx v22,off32,r6 + stvx v23,off48,r6 + stvx v24,off64,r6 + stvx v25,off80,r6 + stvx v26,off96,r6 + stvx v27,off112,r6 + stvx v28,0,r7 + stvx v29,off16,r7 + + mr r10,r3 + + vxor zeroes,zeroes,zeroes + vspltisw v0,-1 + + vsldoi mask_32bit,zeroes,v0,4 + vsldoi mask_64bit,zeroes,v0,8 + + /* Get the initial value into v8 */ + vxor v8,v8,v8 + MTVRD(v8, R3) +#ifdef REFLECT + vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */ +#else + vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */ +#endif + +#ifdef BYTESWAP_DATA + LOAD_REG_ADDR(r3, .byteswap_constant) + lvx byteswap,0,r3 + addi r3,r3,16 +#endif + + cmpdi r5,256 + blt .Lshort + + rldicr r6,r5,0,56 + + /* Checksum in blocks of MAX_SIZE */ +1: lis 
r7,MAX_SIZE@h + ori r7,r7,MAX_SIZE@l + mr r9,r7 + cmpd r6,r7 + bgt 2f + mr r7,r6 +2: subf r6,r7,r6 + + /* our main loop does 128 bytes at a time */ + srdi r7,r7,7 + + /* + * Work out the offset into the constants table to start at. Each + * constant is 16 bytes, and it is used against 128 bytes of input + * data - 128 / 16 = 8 + */ + sldi r8,r7,4 + srdi r9,r9,3 + subf r8,r8,r9 + + /* We reduce our final 128 bytes in a separate step */ + addi r7,r7,-1 + mtctr r7 + + LOAD_REG_ADDR(r3, .constants) + + /* Find the start of our constants */ + add r3,r3,r8 + + /* zero v0-v7 which will contain our checksums */ + vxor v0,v0,v0 + vxor v1,v1,v1 + vxor v2,v2,v2 + vxor v3,v3,v3 + vxor v4,v4,v4 + vxor v5,v5,v5 + vxor v6,v6,v6 + vxor v7,v7,v7 + + lvx const1,0,r3 + + /* + * If we are looping back to consume more data we use the values + * already in v16-v23. + */ + cmpdi r0,1 + beq 2f + + /* First warm up pass */ + lvx v16,0,r4 + lvx v17,off16,r4 + VPERM(v16,v16,v16,byteswap) + VPERM(v17,v17,v17,byteswap) + lvx v18,off32,r4 + lvx v19,off48,r4 + VPERM(v18,v18,v18,byteswap) + VPERM(v19,v19,v19,byteswap) + lvx v20,off64,r4 + lvx v21,off80,r4 + VPERM(v20,v20,v20,byteswap) + VPERM(v21,v21,v21,byteswap) + lvx v22,off96,r4 + lvx v23,off112,r4 + VPERM(v22,v22,v22,byteswap) + VPERM(v23,v23,v23,byteswap) + addi r4,r4,8*16 + + /* xor in initial value */ + vxor v16,v16,v8 + +2: bdz .Lfirst_warm_up_done + + addi r3,r3,16 + lvx const2,0,r3 + + /* Second warm up pass */ + VPMSUMD(v8,v16,const1) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 + + VPMSUMD(v9,v17,const1) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + VPMSUMD(v10,v18,const1) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + VPMSUMD(v11,v19,const1) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + ori r2,r2,0 + + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori 
r2,r2,0 + + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdz .Lfirst_cool_down + + /* + * main loop. We modulo schedule it such that it takes three iterations + * to complete - first iteration load, second iteration vpmsum, third + * iteration xor. + */ + .balign 16 +4: lvx const1,0,r3 + addi r3,r3,16 + ori r2,r2,0 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const2) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const2) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const2) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const2) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + lvx const2,0,r3 + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdnz 4b + +.Lfirst_cool_down: + /* First cool down pass */ + lvx const1,0,r3 + addi r3,r3,16 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const1) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const1) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const1) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const1) + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + ori r2,r2,0 + +.Lsecond_cool_down: + /* Second cool down pass */ + vxor v0,v0,v8 + vxor v1,v1,v9 + vxor 
v2,v2,v10 + vxor v3,v3,v11 + vxor v4,v4,v12 + vxor v5,v5,v13 + vxor v6,v6,v14 + vxor v7,v7,v15 + +#ifdef REFLECT + /* + * vpmsumd produces a 96 bit result in the least significant bits + * of the register. Since we are bit reflected we have to shift it + * left 32 bits so it occupies the least significant bits in the + * bit reflected domain. + */ + vsldoi v0,v0,zeroes,4 + vsldoi v1,v1,zeroes,4 + vsldoi v2,v2,zeroes,4 + vsldoi v3,v3,zeroes,4 + vsldoi v4,v4,zeroes,4 + vsldoi v5,v5,zeroes,4 + vsldoi v6,v6,zeroes,4 + vsldoi v7,v7,zeroes,4 +#endif + + /* xor with last 1024 bits */ + lvx v8,0,r4 + lvx v9,off16,r4 + VPERM(v8,v8,v8,byteswap) + VPERM(v9,v9,v9,byteswap) + lvx v10,off32,r4 + lvx v11,off48,r4 + VPERM(v10,v10,v10,byteswap) + VPERM(v11,v11,v11,byteswap) + lvx v12,off64,r4 + lvx v13,off80,r4 + VPERM(v12,v12,v12,byteswap) + VPERM(v13,v13,v13,byteswap) + lvx v14,off96,r4 + lvx v15,off112,r4 + VPERM(v14,v14,v14,byteswap) + VPERM(v15,v15,v15,byteswap) + + addi r4,r4,8*16 + + vxor v16,v0,v8 + vxor v17,v1,v9 + vxor v18,v2,v10 + vxor v19,v3,v11 + vxor v20,v4,v12 + vxor v21,v5,v13 + vxor v22,v6,v14 + vxor v23,v7,v15 + + li r0,1 + cmpdi r6,0 + addi r6,r6,128 + bne 1b + + /* Work out how many bytes we have left */ + andi. 
r5,r5,127 + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,128 + add r3,r3,r6 + + /* How many 16 byte chunks are in the tail */ + srdi r7,r5,4 + mtctr r7 + + /* + * Reduce the previously calculated 1024 bits to 64 bits, shifting + * 32 bits to include the trailing 32 bits of zeros + */ + lvx v0,0,r3 + lvx v1,off16,r3 + lvx v2,off32,r3 + lvx v3,off48,r3 + lvx v4,off64,r3 + lvx v5,off80,r3 + lvx v6,off96,r3 + lvx v7,off112,r3 + addi r3,r3,8*16 + + VPMSUMW(v0,v16,v0) + VPMSUMW(v1,v17,v1) + VPMSUMW(v2,v18,v2) + VPMSUMW(v3,v19,v3) + VPMSUMW(v4,v20,v4) + VPMSUMW(v5,v21,v5) + VPMSUMW(v6,v22,v6) + VPMSUMW(v7,v23,v7) + + /* Now reduce the tail (0 - 112 bytes) */ + cmpdi r7,0 + beq 1f + + lvx v16,0,r4 + lvx v17,0,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off16,r4 + lvx v17,off16,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off32,r4 + lvx v17,off32,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off48,r4 + lvx v17,off48,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off64,r4 + lvx v17,off64,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off80,r4 + lvx v17,off80,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off96,r4 + lvx v17,off96,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + + /* Now xor all the parallel chunks together */ +1: vxor v0,v0,v1 + vxor v2,v2,v3 + vxor v4,v4,v5 + vxor v6,v6,v7 + + vxor v0,v0,v2 + vxor v4,v4,v6 + + vxor v0,v0,v4 + +.Lbarrett_reduction: + /* Barrett constants */ + LOAD_REG_ADDR(r3, .barrett_constants) + + lvx const1,0,r3 + lvx const2,off16,r3 + + vsldoi v1,v0,v0,8 + vxor v0,v0,v1 /* xor two 64 bit results together */ + +#ifdef REFLECT + /* shift left one bit */ + vspltisb v1,1 + vsl v0,v0,v1 +#endif + + 
vand v0,v0,mask_64bit +#ifndef REFLECT + /* + * Now for the Barrett reduction algorithm. The idea is to calculate q, + * the multiple of our polynomial that we need to subtract. By + * doing the computation 2x bits higher (ie 64 bits) and shifting the + * result back down 2x bits, we round down to the nearest multiple. + */ + VPMSUMD(v1,v0,const1) /* ma */ + vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */ + VPMSUMD(v1,v1,const2) /* qn */ + vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Get the result into r3. We need to shift it left 8 bytes: + * V0 [ 0 1 2 X ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */ +#else + /* + * The reflected version of Barrett reduction. Instead of bit + * reflecting our data (which is expensive to do), we bit reflect our + * constants and our algorithm, which means the intermediate data in + * our vector registers goes from 0-63 instead of 63-0. We can reflect + * the algorithm because we don't carry in mod 2 arithmetic. + */ + vand v1,v0,mask_32bit /* bottom 32 bits of a */ + VPMSUMD(v1,v1,const1) /* ma */ + vand v1,v1,mask_32bit /* bottom 32bits of ma */ + VPMSUMD(v1,v1,const2) /* qn */ + vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Since we are bit reflected, the result (ie the low 32 bits) is in + * the high 32 bits. 
We just need to shift it left 4 bytes + * V0 [ 0 1 X 3 ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */ +#endif + + /* Get it into r3 */ + MFVRD(R3, v0) + +.Lout: + subi r6,r1,56+10*16 + subi r7,r1,56+2*16 + + lvx v20,0,r6 + lvx v21,off16,r6 + lvx v22,off32,r6 + lvx v23,off48,r6 + lvx v24,off64,r6 + lvx v25,off80,r6 + lvx v26,off96,r6 + lvx v27,off112,r6 + lvx v28,0,r7 + lvx v29,off16,r7 + + ld r31,-8(r1) + ld r30,-16(r1) + ld r29,-24(r1) + ld r28,-32(r1) + ld r27,-40(r1) + ld r26,-48(r1) + ld r25,-56(r1) + + blr + +.Lfirst_warm_up_done: + lvx const1,0,r3 + addi r3,r3,16 + + VPMSUMD(v8,v16,const1) + VPMSUMD(v9,v17,const1) + VPMSUMD(v10,v18,const1) + VPMSUMD(v11,v19,const1) + VPMSUMD(v12,v20,const1) + VPMSUMD(v13,v21,const1) + VPMSUMD(v14,v22,const1) + VPMSUMD(v15,v23,const1) + + b .Lsecond_cool_down + +.Lshort: + cmpdi r5,0 + beq .Lzero + + LOAD_REG_ADDR(r3, .short_constants) + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,256 + add r3,r3,r6 + + /* How many 16 byte chunks? 
*/ + srdi r7,r5,4 + mtctr r7 + + vxor v19,v19,v19 + vxor v20,v20,v20 + + lvx v0,0,r4 + lvx v16,0,r3 + VPERM(v0,v0,v16,byteswap) + vxor v0,v0,v8 /* xor in initial value */ + VPMSUMW(v0,v0,v16) + bdz .Lv0 + + lvx v1,off16,r4 + lvx v17,off16,r3 + VPERM(v1,v1,v17,byteswap) + VPMSUMW(v1,v1,v17) + bdz .Lv1 + + lvx v2,off32,r4 + lvx v16,off32,r3 + VPERM(v2,v2,v16,byteswap) + VPMSUMW(v2,v2,v16) + bdz .Lv2 + + lvx v3,off48,r4 + lvx v17,off48,r3 + VPERM(v3,v3,v17,byteswap) + VPMSUMW(v3,v3,v17) + bdz .Lv3 + + lvx v4,off64,r4 + lvx v16,off64,r3 + VPERM(v4,v4,v16,byteswap) + VPMSUMW(v4,v4,v16) + bdz .Lv4 + + lvx v5,off80,r4 + lvx v17,off80,r3 + VPERM(v5,v5,v17,byteswap) + VPMSUMW(v5,v5,v17) + bdz .Lv5 + + lvx v6,off96,r4 + lvx v16,off96,r3 + VPERM(v6,v6,v16,byteswap) + VPMSUMW(v6,v6,v16) + bdz .Lv6 + + lvx v7,off112,r4 + lvx v17,off112,r3 + VPERM(v7,v7,v17,byteswap) + VPMSUMW(v7,v7,v17) + bdz .Lv7 + + addi r3,r3,128 + addi r4,r4,128 + + lvx v8,0,r4 + lvx v16,0,r3 + VPERM(v8,v8,v16,byteswap) + VPMSUMW(v8,v8,v16) + bdz .Lv8 + + lvx v9,off16,r4 + lvx v17,off16,r3 + VPERM(v9,v9,v17,byteswap) + VPMSUMW(v9,v9,v17) + bdz .Lv9 + + lvx v10,off32,r4 + lvx v16,off32,r3 + VPERM(v10,v10,v16,byteswap) + VPMSUMW(v10,v10,v16) + bdz .Lv10 + + lvx v11,off48,r4 + lvx v17,off48,r3 + VPERM(v11,v11,v17,byteswap) + VPMSUMW(v11,v11,v17) + bdz .Lv11 + + lvx v12,off64,r4 + lvx v16,off64,r3 + VPERM(v12,v12,v16,byteswap) + VPMSUMW(v12,v12,v16) + bdz .Lv12 + + lvx v13,off80,r4 + lvx v17,off80,r3 + VPERM(v13,v13,v17,byteswap) + VPMSUMW(v13,v13,v17) + bdz .Lv13 + + lvx v14,off96,r4 + lvx v16,off96,r3 + VPERM(v14,v14,v16,byteswap) + VPMSUMW(v14,v14,v16) + bdz .Lv14 + + lvx v15,off112,r4 + lvx v17,off112,r3 + VPERM(v15,v15,v17,byteswap) + VPMSUMW(v15,v15,v17) + +.Lv15: vxor v19,v19,v15 +.Lv14: vxor v20,v20,v14 +.Lv13: vxor v19,v19,v13 +.Lv12: vxor v20,v20,v12 +.Lv11: vxor v19,v19,v11 +.Lv10: vxor v20,v20,v10 +.Lv9: vxor v19,v19,v9 +.Lv8: vxor v20,v20,v8 +.Lv7: vxor v19,v19,v7 +.Lv6: vxor v20,v20,v6 +.Lv5: vxor 
v19,v19,v5 +.Lv4: vxor v20,v20,v4 +.Lv3: vxor v19,v19,v3 +.Lv2: vxor v20,v20,v2 +.Lv1: vxor v19,v19,v1 +.Lv0: vxor v20,v20,v0 + + vxor v0,v19,v20 + + b .Lbarrett_reduction + +.Lzero: + mr r3,r10 + b .Lout + +FUNC_END(CRC_FUNCTION_NAME) diff --git a/lib/crc/powerpc/crc32.h b/lib/crc/powerpc/crc32.h new file mode 100644 index 000000000000..811cc2e6ed24 --- /dev/null +++ b/lib/crc/powerpc/crc32.h @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <asm/switch_to.h> +#include <crypto/internal/simd.h> +#include <linux/cpufeature.h> +#include <linux/jump_label.h> +#include <linux/preempt.h> +#include <linux/uaccess.h> + +#define VMX_ALIGN 16 +#define VMX_ALIGN_MASK (VMX_ALIGN-1) + +#define VECTOR_BREAKPOINT 512 + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto); + +#define crc32_le_arch crc32_le_base /* not implemented on this arch */ +#define crc32_be_arch crc32_be_base /* not implemented on this arch */ + +u32 __crc32c_vpmsum(u32 crc, const u8 *p, size_t len); + +static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) +{ + unsigned int prealign; + unsigned int tail; + + if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || + !static_branch_likely(&have_vec_crypto) || !crypto_simd_usable()) + return crc32c_base(crc, p, len); + + if ((unsigned long)p & VMX_ALIGN_MASK) { + prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); + crc = crc32c_base(crc, p, prealign); + len -= prealign; + p += prealign; + } + + if (len & ~VMX_ALIGN_MASK) { + preempt_disable(); + pagefault_disable(); + enable_kernel_altivec(); + crc = __crc32c_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); + disable_kernel_altivec(); + pagefault_enable(); + preempt_enable(); + } + + tail = len & VMX_ALIGN_MASK; + if (tail) { + p += len & ~VMX_ALIGN_MASK; + crc = crc32c_base(crc, p, tail); + } + + return crc; +} + +#define crc32_mod_init_arch crc32_mod_init_arch +static inline void crc32_mod_init_arch(void) +{ + if (cpu_has_feature(CPU_FTR_ARCH_207S) && + 
(cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_VEC_CRYPTO)) + static_branch_enable(&have_vec_crypto); +} + +static inline u32 crc32_optimizations_arch(void) +{ + if (static_key_enabled(&have_vec_crypto)) + return CRC32C_OPTIMIZATION; + return 0; +} diff --git a/lib/crc/powerpc/crc32c-vpmsum_asm.S b/lib/crc/powerpc/crc32c-vpmsum_asm.S new file mode 100644 index 000000000000..1b35c55cce0a --- /dev/null +++ b/lib/crc/powerpc/crc32c-vpmsum_asm.S @@ -0,0 +1,842 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Calculate a crc32c with vpmsum acceleration + * + * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM + */ + .section .rodata +.balign 16 + +.byteswap_constant: + /* byte reverse permute constant */ + .octa 0x0F0E0D0C0B0A09080706050403020100 + +.constants: + + /* Reduce 262144 kbits to 1024 bits */ + /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */ + .octa 0x00000000b6ca9e20000000009c37c408 + + /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */ + .octa 0x00000000350249a800000001b51df26c + + /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */ + .octa 0x00000001862dac54000000000724b9d0 + + /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */ + .octa 0x00000001d87fb48c00000001c00532fe + + /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */ + .octa 0x00000001f39b699e00000000f05a9362 + + /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */ + .octa 0x0000000101da11b400000001e1007970 + + /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */ + .octa 0x00000001cab571e000000000a57366ee + + /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */ + .octa 0x00000000c7020cfe0000000192011284 + + /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */ + .octa 0x00000000cdaed1ae0000000162716d9a + + /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */ + .octa 0x00000001e804effc00000000cd97ecde + + /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */ + .octa 0x0000000077c3ea3a0000000058812bc0 + + /* x^249856 mod p(x)` << 1, x^249920 mod 
p(x)` << 1 */ + .octa 0x0000000068df31b40000000088b8c12e + + /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */ + .octa 0x00000000b059b6c200000001230b234c + + /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */ + .octa 0x0000000145fb8ed800000001120b416e + + /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */ + .octa 0x00000000cbc0916800000001974aecb0 + + /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */ + .octa 0x000000005ceeedc2000000008ee3f226 + + /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */ + .octa 0x0000000047d74e8600000001089aba9a + + /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */ + .octa 0x00000001407e9e220000000065113872 + + /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */ + .octa 0x00000001da967bda000000005c07ec10 + + /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */ + .octa 0x000000006c8983680000000187590924 + + /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */ + .octa 0x00000000f2d14c9800000000e35da7c6 + + /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */ + .octa 0x00000001993c6ad4000000000415855a + + /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */ + .octa 0x000000014683d1ac0000000073617758 + + /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */ + .octa 0x00000001a7c93e6c0000000176021d28 + + /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */ + .octa 0x000000010211e90a00000001c358fd0a + + /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */ + .octa 0x000000001119403e00000001ff7a2c18 + + /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */ + .octa 0x000000001c3261aa00000000f2d9f7e4 + + /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */ + .octa 0x000000014e37a634000000016cf1f9c8 + + /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */ + .octa 0x0000000073786c0c000000010af9279a + + /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */ + .octa 0x000000011dc037f80000000004f101e8 + + /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */ + .octa 0x0000000031433dfc0000000070bcf184 + + 
/* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */ + .octa 0x000000009cde8348000000000a8de642 + + /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */ + .octa 0x0000000038d3c2a60000000062ea130c + + /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */ + .octa 0x000000011b25f26000000001eb31cbb2 + + /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */ + .octa 0x000000001629e6f00000000170783448 + + /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */ + .octa 0x0000000160838b4c00000001a684b4c6 + + /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */ + .octa 0x000000007a44011c00000000253ca5b4 + + /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */ + .octa 0x00000000226f417a0000000057b4b1e2 + + /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */ + .octa 0x0000000045eb2eb400000000b6bd084c + + /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */ + .octa 0x000000014459d70c0000000123c2d592 + + /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */ + .octa 0x00000001d406ed8200000000159dafce + + /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */ + .octa 0x0000000160c8e1a80000000127e1a64e + + /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */ + .octa 0x0000000027ba80980000000056860754 + + /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */ + .octa 0x000000006d92d01800000001e661aae8 + + /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */ + .octa 0x000000012ed7e3f200000000f82c6166 + + /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */ + .octa 0x000000002dc8778800000000c4f9c7ae + + /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */ + .octa 0x0000000018240bb80000000074203d20 + + /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */ + .octa 0x000000001ad381580000000198173052 + + /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */ + .octa 0x00000001396b78f200000001ce8aba54 + + /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */ + .octa 0x000000011a68133400000001850d5d94 + + /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */ + 
.octa 0x000000012104732e00000001d609239c + + /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */ + .octa 0x00000000a140d90c000000001595f048 + + /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */ + .octa 0x00000001b7215eda0000000042ccee08 + + /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */ + .octa 0x00000001aaf1df3c000000010a389d74 + + /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */ + .octa 0x0000000029d15b8a000000012a840da6 + + /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */ + .octa 0x00000000f1a96922000000001d181c0c + + /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */ + .octa 0x00000001ac80d03c0000000068b7d1f6 + + /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */ + .octa 0x000000000f11d56a000000005b0f14fc + + /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */ + .octa 0x00000001f1c022a20000000179e9e730 + + /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */ + .octa 0x0000000173d00ae200000001ce1368d6 + + /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */ + .octa 0x00000001d4ffe4ac0000000112c3a84c + + /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */ + .octa 0x000000016edc5ae400000000de940fee + + /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */ + .octa 0x00000001f1a0214000000000fe896b7e + + /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */ + .octa 0x00000000ca0b28a000000001f797431c + + /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */ + .octa 0x00000001928e30a20000000053e989ba + + /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */ + .octa 0x0000000097b1b002000000003920cd16 + + /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */ + .octa 0x00000000b15bf90600000001e6f579b8 + + /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */ + .octa 0x00000000411c5d52000000007493cb0a + + /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */ + .octa 0x00000001c36f330000000001bdd376d8 + + /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */ + .octa 0x00000001119227e0000000016badfee6 + + /* x^189440 mod 
p(x)` << 1, x^189504 mod p(x)` << 1 */ + .octa 0x00000000114d47020000000071de5c58 + + /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */ + .octa 0x00000000458b5b9800000000453f317c + + /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */ + .octa 0x000000012e31fb8e0000000121675cce + + /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */ + .octa 0x000000005cf619d800000001f409ee92 + + /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */ + .octa 0x0000000063f4d8b200000000f36b9c88 + + /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */ + .octa 0x000000004138dc8a0000000036b398f4 + + /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */ + .octa 0x00000001d29ee8e000000001748f9adc + + /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */ + .octa 0x000000006a08ace800000001be94ec00 + + /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */ + .octa 0x0000000127d4201000000000b74370d6 + + /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */ + .octa 0x0000000019d76b6200000001174d0b98 + + /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */ + .octa 0x00000001b1471f6e00000000befc06a4 + + /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */ + .octa 0x00000001f64c19cc00000001ae125288 + + /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */ + .octa 0x00000000003c0ea00000000095c19b34 + + /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */ + .octa 0x000000014d73abf600000001a78496f2 + + /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */ + .octa 0x00000001620eb84400000001ac5390a0 + + /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */ + .octa 0x0000000147655048000000002a80ed6e + + /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */ + .octa 0x0000000067b5077e00000001fa9b0128 + + /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */ + .octa 0x0000000010ffe20600000001ea94929e + + /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */ + .octa 0x000000000fee8f1e0000000125f4305c + + /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */ + .octa 
0x00000001da26fbae00000001471e2002 + + /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */ + .octa 0x00000001b3a8bd880000000132d2253a + + /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */ + .octa 0x00000000e8f3898e00000000f26b3592 + + /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */ + .octa 0x00000000b0d0d28c00000000bc8b67b0 + + /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */ + .octa 0x0000000030f2a798000000013a826ef2 + + /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */ + .octa 0x000000000fba10020000000081482c84 + + /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */ + .octa 0x00000000bdb9bd7200000000e77307c2 + + /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */ + .octa 0x0000000075d3bf5a00000000d4a07ec8 + + /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */ + .octa 0x00000000ef1f98a00000000017102100 + + /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */ + .octa 0x00000000689c760200000000db406486 + + /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */ + .octa 0x000000016d5fa5fe0000000192db7f88 + + /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */ + .octa 0x00000001d0d2b9ca000000018bf67b1e + + /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */ + .octa 0x0000000041e7b470000000007c09163e + + /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */ + .octa 0x00000001cbb6495e000000000adac060 + + /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */ + .octa 0x000000010052a0b000000000bd8316ae + + /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */ + .octa 0x00000001d8effb5c000000019f09ab54 + + /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */ + .octa 0x00000001d969853c0000000125155542 + + /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */ + .octa 0x00000000523ccce2000000018fdb5882 + + /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */ + .octa 0x000000001e2436bc00000000e794b3f4 + + /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */ + .octa 0x00000000ddd1c3a2000000016f9bb022 + + /* x^149504 mod p(x)` 
<< 1, x^149568 mod p(x)` << 1 */ + .octa 0x0000000019fcfe3800000000290c9978 + + /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */ + .octa 0x00000001ce95db640000000083c0f350 + + /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */ + .octa 0x00000000af5828060000000173ea6628 + + /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */ + .octa 0x00000001006388f600000001c8b4e00a + + /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */ + .octa 0x0000000179eca00a00000000de95d6aa + + /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */ + .octa 0x0000000122410a6a000000010b7f7248 + + /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */ + .octa 0x000000004288e87c00000001326e3a06 + + /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */ + .octa 0x000000016c5490da00000000bb62c2e6 + + /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */ + .octa 0x00000000d1c71f6e0000000156a4b2c2 + + /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */ + .octa 0x00000001b4ce08a6000000011dfe763a + + /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */ + .octa 0x00000001466ba60c000000007bcca8e2 + + /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */ + .octa 0x00000001f6c488a40000000186118faa + + /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */ + .octa 0x000000013bfb06820000000111a65a88 + + /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */ + .octa 0x00000000690e9e54000000003565e1c4 + + /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */ + .octa 0x00000000281346b6000000012ed02a82 + + /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */ + .octa 0x000000015646402400000000c486ecfc + + /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */ + .octa 0x000000016063a8dc0000000001b951b2 + + /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */ + .octa 0x0000000116a663620000000048143916 + + /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */ + .octa 0x000000017e8aa4d200000001dc2ae124 + + /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */ + .octa 
0x00000001728eb10c00000001416c58d6 + + /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */ + .octa 0x00000001b08fd7fa00000000a479744a + + /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */ + .octa 0x00000001092a16e80000000096ca3a26 + + /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */ + .octa 0x00000000a505637c00000000ff223d4e + + /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */ + .octa 0x00000000d94869b2000000010e84da42 + + /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */ + .octa 0x00000001c8b203ae00000001b61ba3d0 + + /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */ + .octa 0x000000005704aea000000000680f2de8 + + /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */ + .octa 0x000000012e295fa2000000008772a9a8 + + /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */ + .octa 0x000000011d0908bc0000000155f295bc + + /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */ + .octa 0x0000000193ed97ea00000000595f9282 + + /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */ + .octa 0x000000013a0f1c520000000164b1c25a + + /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */ + .octa 0x000000010c2c40c000000000fbd67c50 + + /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */ + .octa 0x00000000ff6fac3e0000000096076268 + + /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */ + .octa 0x000000017b3609c000000001d288e4cc + + /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */ + .octa 0x0000000088c8c92200000001eaac1bdc + + /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */ + .octa 0x00000001751baae600000001f1ea39e2 + + /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */ + .octa 0x000000010795297200000001eb6506fc + + /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */ + .octa 0x0000000162b00abe000000010f806ffe + + /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */ + .octa 0x000000000d7b404c000000010408481e + + /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */ + .octa 0x00000000763b13d40000000188260534 + + /* x^109568 mod p(x)` 
<< 1, x^109632 mod p(x)` << 1 */ + .octa 0x00000000f6dc22d80000000058fc73e0 + + /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */ + .octa 0x000000007daae06000000000391c59b8 + + /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */ + .octa 0x000000013359ab7c000000018b638400 + + /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */ + .octa 0x000000008add438a000000011738f5c4 + + /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */ + .octa 0x00000001edbefdea000000008cf7c6da + + /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */ + .octa 0x000000004104e0f800000001ef97fb16 + + /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */ + .octa 0x00000000b48a82220000000102130e20 + + /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */ + .octa 0x00000001bcb4684400000000db968898 + + /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */ + .octa 0x000000013293ce0a00000000b5047b5e + + /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */ + .octa 0x00000001710d0844000000010b90fdb2 + + /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */ + .octa 0x0000000117907f6e000000004834a32e + + /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */ + .octa 0x0000000087ddf93e0000000059c8f2b0 + + /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */ + .octa 0x000000005970e9b00000000122cec508 + + /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */ + .octa 0x0000000185b2b7d0000000000a330cda + + /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */ + .octa 0x00000001dcee0efc000000014a47148c + + /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */ + .octa 0x0000000030da27220000000042c61cb8 + + /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */ + .octa 0x000000012f925a180000000012fe6960 + + /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */ + .octa 0x00000000dd2e357c00000000dbda2c20 + + /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */ + .octa 0x00000000071c80de000000011122410c + + /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */ + .octa 0x000000011513140a00000000977b2070 + + 
/* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */ + .octa 0x00000001df876e8e000000014050438e + + /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */ + .octa 0x000000015f81d6ce0000000147c840e8 + + /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */ + .octa 0x000000019dd94dbe00000001cc7c88ce + + /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */ + .octa 0x00000001373d206e00000001476b35a4 + + /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */ + .octa 0x00000000668ccade000000013d52d508 + + /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */ + .octa 0x00000001b192d268000000008e4be32e + + /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */ + .octa 0x00000000e30f3a7800000000024120fe + + /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */ + .octa 0x000000010ef1f7bc00000000ddecddb4 + + /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */ + .octa 0x00000001f5ac738000000000d4d403bc + + /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */ + .octa 0x000000011822ea7000000001734b89aa + + /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */ + .octa 0x00000000c3a33848000000010e7a58d6 + + /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */ + .octa 0x00000001bd151c2400000001f9f04e9c + + /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */ + .octa 0x0000000056002d7600000000b692225e + + /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */ + .octa 0x000000014657c4f4000000019b8d3f3e + + /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */ + .octa 0x0000000113742d7c00000001a874f11e + + /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */ + .octa 0x000000019c5920ba000000010d5a4254 + + /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */ + .octa 0x000000005216d2d600000000bbb2f5d6 + + /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */ + .octa 0x0000000136f5ad8a0000000179cc0e36 + + /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */ + .octa 0x000000018b07beb600000001dca1da4a + + /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */ + .octa 0x00000000db1e93b000000000feb1a192 + + 
/* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */ + .octa 0x000000000b96fa3a00000000d1eeedd6 + + /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */ + .octa 0x00000001d9968af0000000008fad9bb4 + + /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */ + .octa 0x000000000e4a77a200000001884938e4 + + /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */ + .octa 0x00000000508c2ac800000001bc2e9bc0 + + /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */ + .octa 0x0000000021572a8000000001f9658a68 + + /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */ + .octa 0x00000001b859daf2000000001b9224fc + + /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */ + .octa 0x000000016f7884740000000055b2fb84 + + /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */ + .octa 0x00000001b438810e000000018b090348 + + /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */ + .octa 0x0000000095ddc6f2000000011ccbd5ea + + /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */ + .octa 0x00000001d977c20c0000000007ae47f8 + + /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */ + .octa 0x00000000ebedb99a0000000172acbec0 + + /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */ + .octa 0x00000001df9e9e9200000001c6e3ff20 + + /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */ + .octa 0x00000001a4a3f95200000000e1b38744 + + /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */ + .octa 0x00000000e2f5122000000000791585b2 + + /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */ + .octa 0x000000004aa01f3e00000000ac53b894 + + /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */ + .octa 0x00000000b3e90a5800000001ed5f2cf4 + + /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */ + .octa 0x000000000c9ca2aa00000001df48b2e0 + + /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */ + .octa 0x000000015168231600000000049c1c62 + + /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */ + .octa 0x0000000036fce78c000000017c460c12 + + /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */ + .octa 0x000000009037dc10000000015be4da7e + + 
/* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */ + .octa 0x00000000d3298582000000010f38f668 + + /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */ + .octa 0x00000001b42e8ad60000000039f40a00 + + /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */ + .octa 0x00000000142a983800000000bd4c10c4 + + /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */ + .octa 0x0000000109c7f1900000000042db1d98 + + /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */ + .octa 0x0000000056ff931000000001c905bae6 + + /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */ + .octa 0x00000001594513aa00000000069d40ea + + /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */ + .octa 0x00000001e3b5b1e8000000008e4fbad0 + + /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */ + .octa 0x000000011dd5fc080000000047bedd46 + + /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */ + .octa 0x00000001675f0cc20000000026396bf8 + + /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */ + .octa 0x00000000d1c8dd4400000000379beb92 + + /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */ + .octa 0x0000000115ebd3d8000000000abae54a + + /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */ + .octa 0x00000001ecbd0dac0000000007e6a128 + + /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */ + .octa 0x00000000cdf67af2000000000ade29d2 + + /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */ + .octa 0x000000004c01ff4c00000000f974c45c + + /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */ + .octa 0x00000000f2d8657e00000000e77ac60a + + /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */ + .octa 0x000000006bae74c40000000145895816 + + /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */ + .octa 0x0000000152af8aa00000000038e362be + + /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */ + .octa 0x0000000004663802000000007f991a64 + + /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */ + .octa 0x00000001ab2f5afc00000000fa366d3a + + /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */ + .octa 0x0000000074a4ebd400000001a2bb34f0 + + 
/* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */ + .octa 0x00000001d7ab3a4c0000000028a9981e + + /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */ + .octa 0x00000001a8da60c600000001dbc672be + + /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */ + .octa 0x000000013cf6382000000000b04d77f6 + + /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */ + .octa 0x00000000bec12e1e0000000124400d96 + + /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */ + .octa 0x00000001c6368010000000014ca4b414 + + /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */ + .octa 0x00000001e6e78758000000012fe2c938 + + /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */ + .octa 0x000000008d7f2b3c00000001faed01e6 + + /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */ + .octa 0x000000016b4a156e000000007e80ecfe + + /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */ + .octa 0x00000001c63cfeb60000000098daee94 + + /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */ + .octa 0x000000015f902670000000010a04edea + + /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */ + .octa 0x00000001cd5de11e00000001c00b4524 + + /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */ + .octa 0x000000001acaec540000000170296550 + + /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */ + .octa 0x000000002bd0ca780000000181afaa48 + + /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */ + .octa 0x0000000032d63d5c0000000185a31ffa + + /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */ + .octa 0x000000001c6d4e4c000000002469f608 + + /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */ + .octa 0x0000000106a60b92000000006980102a + + /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */ + .octa 0x00000000d3855e120000000111ea9ca8 + + /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */ + .octa 0x00000000e312563600000001bd1d29ce + + /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */ + .octa 0x000000009e8f7ea400000001b34b9580 + + /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */ + .octa 0x00000001c82e562c000000003076054e + + /* 
x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */ + .octa 0x00000000ca9f09ce000000012a608ea4 + + /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */ + .octa 0x00000000c63764e600000000784d05fe + + /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */ + .octa 0x0000000168d2e49e000000016ef0d82a + + /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */ + .octa 0x00000000e986c1480000000075bda454 + + /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */ + .octa 0x00000000cfb65894000000003dc0a1c4 + + /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */ + .octa 0x0000000111cadee400000000e9a5d8be + + /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */ + .octa 0x0000000171fb63ce00000001609bc4b4 + +.short_constants: + + /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */ + /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */ + .octa 0x7fec2963e5bf80485cf015c388e56f72 + + /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */ + .octa 0x38e888d4844752a9963a18920246e2e6 + + /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */ + .octa 0x42316c00730206ad419a441956993a31 + + /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */ + .octa 0x543d5c543e65ddf9924752ba2b830011 + + /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */ + .octa 0x78e87aaf56767c9255bd7f9518e4a304 + + /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */ + .octa 0x8f68fcec1903da7f6d76739fe0553f1e + + /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */ + .octa 0x3f4840246791d588c133722b1fe0b5c3 + + /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */ + .octa 0x34c96751b04de25a64b67ee0e55ef1f3 + + /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */ + .octa 0x156c8e180b4a395b069db049b8fdb1e7 + + /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */ + 
.octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e + + /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */ + .octa 0x041d37768cd75659817cdc5119b29a35 + + /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */ + .octa 0x3a0777818cfaa9651ce9d94b36c41f1c + + /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */ + .octa 0x0e148e8252377a554f256efcb82be955 + + /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */ + .octa 0x9c25531d19e65ddeec1631edb2dea967 + + /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */ + .octa 0x790606ff9957c0a65d27e147510ac59a + + /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */ + .octa 0x82f63b786ea2d55ca66805eb18b8ea18 + + +.barrett_constants: + /* 33 bit reflected Barrett constant m - (4^32)/n */ + .octa 0x000000000000000000000000dea713f1 /* x^64 div p(x)` */ + /* 33 bit reflected Barrett constant n */ + .octa 0x00000000000000000000000105ec76f1 + +#define CRC_FUNCTION_NAME __crc32c_vpmsum +#define REFLECT +#include "crc-vpmsum-template.S" diff --git a/lib/crc/powerpc/crct10dif-vpmsum_asm.S b/lib/crc/powerpc/crct10dif-vpmsum_asm.S new file mode 100644 index 000000000000..47a6266d89a8 --- /dev/null +++ b/lib/crc/powerpc/crct10dif-vpmsum_asm.S @@ -0,0 +1,845 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Calculate a CRC T10DIF with vpmsum acceleration + * + * Constants generated by crc32-vpmsum, available at + * https://github.com/antonblanchard/crc32-vpmsum + * + * crc32-vpmsum is + * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM + */ + .section .rodata +.balign 16 + +.byteswap_constant: + /* byte reverse permute constant */ + .octa 0x0F0E0D0C0B0A09080706050403020100 + +.constants: + + /* Reduce 262144 kbits to 1024 bits */ + /* x^261184 mod p(x), x^261120 mod p(x) */ + .octa 0x0000000056d300000000000052550000 + + /* x^260160 mod p(x), x^260096 mod p(x) */ + .octa 0x00000000ee67000000000000a1e40000 + + /* 
x^259136 mod p(x), x^259072 mod p(x) */ + .octa 0x0000000060830000000000004ad10000 + + /* x^258112 mod p(x), x^258048 mod p(x) */ + .octa 0x000000008cfe0000000000009ab40000 + + /* x^257088 mod p(x), x^257024 mod p(x) */ + .octa 0x000000003e93000000000000fdb50000 + + /* x^256064 mod p(x), x^256000 mod p(x) */ + .octa 0x000000003c2000000000000045480000 + + /* x^255040 mod p(x), x^254976 mod p(x) */ + .octa 0x00000000b1fc0000000000008d690000 + + /* x^254016 mod p(x), x^253952 mod p(x) */ + .octa 0x00000000f82b00000000000024ad0000 + + /* x^252992 mod p(x), x^252928 mod p(x) */ + .octa 0x0000000044420000000000009f1a0000 + + /* x^251968 mod p(x), x^251904 mod p(x) */ + .octa 0x00000000e88c00000000000066ec0000 + + /* x^250944 mod p(x), x^250880 mod p(x) */ + .octa 0x00000000385c000000000000c87d0000 + + /* x^249920 mod p(x), x^249856 mod p(x) */ + .octa 0x000000003227000000000000c8ff0000 + + /* x^248896 mod p(x), x^248832 mod p(x) */ + .octa 0x00000000a9a900000000000033440000 + + /* x^247872 mod p(x), x^247808 mod p(x) */ + .octa 0x00000000abaa00000000000066eb0000 + + /* x^246848 mod p(x), x^246784 mod p(x) */ + .octa 0x000000001ac3000000000000c4ef0000 + + /* x^245824 mod p(x), x^245760 mod p(x) */ + .octa 0x0000000063f000000000000056f30000 + + /* x^244800 mod p(x), x^244736 mod p(x) */ + .octa 0x0000000032cc00000000000002050000 + + /* x^243776 mod p(x), x^243712 mod p(x) */ + .octa 0x00000000f8b5000000000000568e0000 + + /* x^242752 mod p(x), x^242688 mod p(x) */ + .octa 0x000000008db100000000000064290000 + + /* x^241728 mod p(x), x^241664 mod p(x) */ + .octa 0x0000000059ca0000000000006b660000 + + /* x^240704 mod p(x), x^240640 mod p(x) */ + .octa 0x000000005f5c00000000000018f80000 + + /* x^239680 mod p(x), x^239616 mod p(x) */ + .octa 0x0000000061af000000000000b6090000 + + /* x^238656 mod p(x), x^238592 mod p(x) */ + .octa 0x00000000e29e000000000000099a0000 + + /* x^237632 mod p(x), x^237568 mod p(x) */ + .octa 0x000000000975000000000000a8360000 + + /* x^236608 mod p(x), 
x^236544 mod p(x) */ + .octa 0x0000000043900000000000004f570000 + + /* x^235584 mod p(x), x^235520 mod p(x) */ + .octa 0x00000000f9cd000000000000134c0000 + + /* x^234560 mod p(x), x^234496 mod p(x) */ + .octa 0x000000007c29000000000000ec380000 + + /* x^233536 mod p(x), x^233472 mod p(x) */ + .octa 0x000000004c6a000000000000b0d10000 + + /* x^232512 mod p(x), x^232448 mod p(x) */ + .octa 0x00000000e7290000000000007d3e0000 + + /* x^231488 mod p(x), x^231424 mod p(x) */ + .octa 0x00000000f1ab000000000000f0b20000 + + /* x^230464 mod p(x), x^230400 mod p(x) */ + .octa 0x0000000039db0000000000009c270000 + + /* x^229440 mod p(x), x^229376 mod p(x) */ + .octa 0x000000005e2800000000000092890000 + + /* x^228416 mod p(x), x^228352 mod p(x) */ + .octa 0x00000000d44e000000000000d5ee0000 + + /* x^227392 mod p(x), x^227328 mod p(x) */ + .octa 0x00000000cd0a00000000000041f50000 + + /* x^226368 mod p(x), x^226304 mod p(x) */ + .octa 0x00000000c5b400000000000010520000 + + /* x^225344 mod p(x), x^225280 mod p(x) */ + .octa 0x00000000fd2100000000000042170000 + + /* x^224320 mod p(x), x^224256 mod p(x) */ + .octa 0x000000002f2500000000000095c20000 + + /* x^223296 mod p(x), x^223232 mod p(x) */ + .octa 0x000000001b0100000000000001ce0000 + + /* x^222272 mod p(x), x^222208 mod p(x) */ + .octa 0x000000000d430000000000002aca0000 + + /* x^221248 mod p(x), x^221184 mod p(x) */ + .octa 0x0000000030a6000000000000385e0000 + + /* x^220224 mod p(x), x^220160 mod p(x) */ + .octa 0x00000000e37b0000000000006f7a0000 + + /* x^219200 mod p(x), x^219136 mod p(x) */ + .octa 0x00000000873600000000000024320000 + + /* x^218176 mod p(x), x^218112 mod p(x) */ + .octa 0x00000000e9fb000000000000bd9c0000 + + /* x^217152 mod p(x), x^217088 mod p(x) */ + .octa 0x000000003b9500000000000054bc0000 + + /* x^216128 mod p(x), x^216064 mod p(x) */ + .octa 0x00000000133e000000000000a4660000 + + /* x^215104 mod p(x), x^215040 mod p(x) */ + .octa 0x00000000784500000000000079930000 + + /* x^214080 mod p(x), x^214016 mod p(x) 
*/ + .octa 0x00000000b9800000000000001bb80000 + + /* x^213056 mod p(x), x^212992 mod p(x) */ + .octa 0x00000000687600000000000024400000 + + /* x^212032 mod p(x), x^211968 mod p(x) */ + .octa 0x00000000aff300000000000029e10000 + + /* x^211008 mod p(x), x^210944 mod p(x) */ + .octa 0x0000000024b50000000000005ded0000 + + /* x^209984 mod p(x), x^209920 mod p(x) */ + .octa 0x0000000017e8000000000000b12e0000 + + /* x^208960 mod p(x), x^208896 mod p(x) */ + .octa 0x00000000128400000000000026d20000 + + /* x^207936 mod p(x), x^207872 mod p(x) */ + .octa 0x000000002115000000000000a32a0000 + + /* x^206912 mod p(x), x^206848 mod p(x) */ + .octa 0x000000009595000000000000a1210000 + + /* x^205888 mod p(x), x^205824 mod p(x) */ + .octa 0x00000000281e000000000000ee8b0000 + + /* x^204864 mod p(x), x^204800 mod p(x) */ + .octa 0x0000000006010000000000003d0d0000 + + /* x^203840 mod p(x), x^203776 mod p(x) */ + .octa 0x00000000e2b600000000000034e90000 + + /* x^202816 mod p(x), x^202752 mod p(x) */ + .octa 0x000000001bd40000000000004cdb0000 + + /* x^201792 mod p(x), x^201728 mod p(x) */ + .octa 0x00000000df2800000000000030e90000 + + /* x^200768 mod p(x), x^200704 mod p(x) */ + .octa 0x0000000049c200000000000042590000 + + /* x^199744 mod p(x), x^199680 mod p(x) */ + .octa 0x000000009b97000000000000df950000 + + /* x^198720 mod p(x), x^198656 mod p(x) */ + .octa 0x000000006184000000000000da7b0000 + + /* x^197696 mod p(x), x^197632 mod p(x) */ + .octa 0x00000000461700000000000012510000 + + /* x^196672 mod p(x), x^196608 mod p(x) */ + .octa 0x000000009b40000000000000f37e0000 + + /* x^195648 mod p(x), x^195584 mod p(x) */ + .octa 0x00000000eeb2000000000000ecf10000 + + /* x^194624 mod p(x), x^194560 mod p(x) */ + .octa 0x00000000b2e800000000000050f20000 + + /* x^193600 mod p(x), x^193536 mod p(x) */ + .octa 0x00000000f59a000000000000e0b30000 + + /* x^192576 mod p(x), x^192512 mod p(x) */ + .octa 0x00000000467f0000000000004d5a0000 + + /* x^191552 mod p(x), x^191488 mod p(x) */ + .octa 
0x00000000da92000000000000bb010000 + + /* x^190528 mod p(x), x^190464 mod p(x) */ + .octa 0x000000001e1000000000000022a40000 + + /* x^189504 mod p(x), x^189440 mod p(x) */ + .octa 0x0000000058fe000000000000836f0000 + + /* x^188480 mod p(x), x^188416 mod p(x) */ + .octa 0x00000000b9ce000000000000d78d0000 + + /* x^187456 mod p(x), x^187392 mod p(x) */ + .octa 0x0000000022210000000000004f8d0000 + + /* x^186432 mod p(x), x^186368 mod p(x) */ + .octa 0x00000000744600000000000033760000 + + /* x^185408 mod p(x), x^185344 mod p(x) */ + .octa 0x000000001c2e000000000000a1e50000 + + /* x^184384 mod p(x), x^184320 mod p(x) */ + .octa 0x00000000dcc8000000000000a1a40000 + + /* x^183360 mod p(x), x^183296 mod p(x) */ + .octa 0x00000000910f00000000000019a20000 + + /* x^182336 mod p(x), x^182272 mod p(x) */ + .octa 0x0000000055d5000000000000f6ae0000 + + /* x^181312 mod p(x), x^181248 mod p(x) */ + .octa 0x00000000c8ba000000000000a7ac0000 + + /* x^180288 mod p(x), x^180224 mod p(x) */ + .octa 0x0000000031f8000000000000eea20000 + + /* x^179264 mod p(x), x^179200 mod p(x) */ + .octa 0x000000001966000000000000c4d90000 + + /* x^178240 mod p(x), x^178176 mod p(x) */ + .octa 0x00000000b9810000000000002b470000 + + /* x^177216 mod p(x), x^177152 mod p(x) */ + .octa 0x000000008303000000000000f7cf0000 + + /* x^176192 mod p(x), x^176128 mod p(x) */ + .octa 0x000000002ce500000000000035b30000 + + /* x^175168 mod p(x), x^175104 mod p(x) */ + .octa 0x000000002fae0000000000000c7c0000 + + /* x^174144 mod p(x), x^174080 mod p(x) */ + .octa 0x00000000f50c0000000000009edf0000 + + /* x^173120 mod p(x), x^173056 mod p(x) */ + .octa 0x00000000714f00000000000004cd0000 + + /* x^172096 mod p(x), x^172032 mod p(x) */ + .octa 0x00000000c161000000000000541b0000 + + /* x^171072 mod p(x), x^171008 mod p(x) */ + .octa 0x0000000021c8000000000000e2700000 + + /* x^170048 mod p(x), x^169984 mod p(x) */ + .octa 0x00000000b93d00000000000009a60000 + + /* x^169024 mod p(x), x^168960 mod p(x) */ + .octa 
0x00000000fbcf000000000000761c0000 + + /* x^168000 mod p(x), x^167936 mod p(x) */ + .octa 0x0000000026350000000000009db30000 + + /* x^166976 mod p(x), x^166912 mod p(x) */ + .octa 0x00000000b64f0000000000003e9f0000 + + /* x^165952 mod p(x), x^165888 mod p(x) */ + .octa 0x00000000bd0e00000000000078590000 + + /* x^164928 mod p(x), x^164864 mod p(x) */ + .octa 0x00000000d9360000000000008bc80000 + + /* x^163904 mod p(x), x^163840 mod p(x) */ + .octa 0x000000002f140000000000008c9f0000 + + /* x^162880 mod p(x), x^162816 mod p(x) */ + .octa 0x000000006a270000000000006af70000 + + /* x^161856 mod p(x), x^161792 mod p(x) */ + .octa 0x000000006685000000000000e5210000 + + /* x^160832 mod p(x), x^160768 mod p(x) */ + .octa 0x0000000062da00000000000008290000 + + /* x^159808 mod p(x), x^159744 mod p(x) */ + .octa 0x00000000bb4b000000000000e4d00000 + + /* x^158784 mod p(x), x^158720 mod p(x) */ + .octa 0x00000000d2490000000000004ae10000 + + /* x^157760 mod p(x), x^157696 mod p(x) */ + .octa 0x00000000c85b00000000000000e70000 + + /* x^156736 mod p(x), x^156672 mod p(x) */ + .octa 0x00000000c37a00000000000015650000 + + /* x^155712 mod p(x), x^155648 mod p(x) */ + .octa 0x0000000018530000000000001c2f0000 + + /* x^154688 mod p(x), x^154624 mod p(x) */ + .octa 0x00000000b46600000000000037bd0000 + + /* x^153664 mod p(x), x^153600 mod p(x) */ + .octa 0x00000000439b00000000000012190000 + + /* x^152640 mod p(x), x^152576 mod p(x) */ + .octa 0x00000000b1260000000000005ece0000 + + /* x^151616 mod p(x), x^151552 mod p(x) */ + .octa 0x00000000d8110000000000002a5e0000 + + /* x^150592 mod p(x), x^150528 mod p(x) */ + .octa 0x00000000099f00000000000052330000 + + /* x^149568 mod p(x), x^149504 mod p(x) */ + .octa 0x00000000f9f9000000000000f9120000 + + /* x^148544 mod p(x), x^148480 mod p(x) */ + .octa 0x000000005cc00000000000000ddc0000 + + /* x^147520 mod p(x), x^147456 mod p(x) */ + .octa 0x00000000343b00000000000012200000 + + /* x^146496 mod p(x), x^146432 mod p(x) */ + .octa 
0x000000009222000000000000d12b0000 + + /* x^145472 mod p(x), x^145408 mod p(x) */ + .octa 0x00000000d781000000000000eb2d0000 + + /* x^144448 mod p(x), x^144384 mod p(x) */ + .octa 0x000000000bf400000000000058970000 + + /* x^143424 mod p(x), x^143360 mod p(x) */ + .octa 0x00000000094200000000000013690000 + + /* x^142400 mod p(x), x^142336 mod p(x) */ + .octa 0x00000000d55100000000000051950000 + + /* x^141376 mod p(x), x^141312 mod p(x) */ + .octa 0x000000008f11000000000000954b0000 + + /* x^140352 mod p(x), x^140288 mod p(x) */ + .octa 0x00000000140f000000000000b29e0000 + + /* x^139328 mod p(x), x^139264 mod p(x) */ + .octa 0x00000000c6db000000000000db5d0000 + + /* x^138304 mod p(x), x^138240 mod p(x) */ + .octa 0x00000000715b000000000000dfaf0000 + + /* x^137280 mod p(x), x^137216 mod p(x) */ + .octa 0x000000000dea000000000000e3b60000 + + /* x^136256 mod p(x), x^136192 mod p(x) */ + .octa 0x000000006f94000000000000ddaf0000 + + /* x^135232 mod p(x), x^135168 mod p(x) */ + .octa 0x0000000024e1000000000000e4f70000 + + /* x^134208 mod p(x), x^134144 mod p(x) */ + .octa 0x000000008810000000000000aa110000 + + /* x^133184 mod p(x), x^133120 mod p(x) */ + .octa 0x0000000030c2000000000000a8e60000 + + /* x^132160 mod p(x), x^132096 mod p(x) */ + .octa 0x00000000e6d0000000000000ccf30000 + + /* x^131136 mod p(x), x^131072 mod p(x) */ + .octa 0x000000004da000000000000079bf0000 + + /* x^130112 mod p(x), x^130048 mod p(x) */ + .octa 0x000000007759000000000000b3a30000 + + /* x^129088 mod p(x), x^129024 mod p(x) */ + .octa 0x00000000597400000000000028790000 + + /* x^128064 mod p(x), x^128000 mod p(x) */ + .octa 0x000000007acd000000000000b5820000 + + /* x^127040 mod p(x), x^126976 mod p(x) */ + .octa 0x00000000e6e400000000000026ad0000 + + /* x^126016 mod p(x), x^125952 mod p(x) */ + .octa 0x000000006d49000000000000985b0000 + + /* x^124992 mod p(x), x^124928 mod p(x) */ + .octa 0x000000000f0800000000000011520000 + + /* x^123968 mod p(x), x^123904 mod p(x) */ + .octa 
0x000000002c7f000000000000846c0000 + + /* x^122944 mod p(x), x^122880 mod p(x) */ + .octa 0x000000005ce7000000000000ae1d0000 + + /* x^121920 mod p(x), x^121856 mod p(x) */ + .octa 0x00000000d4cb000000000000e21d0000 + + /* x^120896 mod p(x), x^120832 mod p(x) */ + .octa 0x000000003a2300000000000019bb0000 + + /* x^119872 mod p(x), x^119808 mod p(x) */ + .octa 0x000000000e1700000000000095290000 + + /* x^118848 mod p(x), x^118784 mod p(x) */ + .octa 0x000000006e6400000000000050d20000 + + /* x^117824 mod p(x), x^117760 mod p(x) */ + .octa 0x000000008d5c0000000000000cd10000 + + /* x^116800 mod p(x), x^116736 mod p(x) */ + .octa 0x00000000ef310000000000007b570000 + + /* x^115776 mod p(x), x^115712 mod p(x) */ + .octa 0x00000000645d00000000000053d60000 + + /* x^114752 mod p(x), x^114688 mod p(x) */ + .octa 0x0000000018fc00000000000077510000 + + /* x^113728 mod p(x), x^113664 mod p(x) */ + .octa 0x000000000cb3000000000000a7b70000 + + /* x^112704 mod p(x), x^112640 mod p(x) */ + .octa 0x00000000991b000000000000d0780000 + + /* x^111680 mod p(x), x^111616 mod p(x) */ + .octa 0x00000000845a000000000000be3c0000 + + /* x^110656 mod p(x), x^110592 mod p(x) */ + .octa 0x00000000d3a9000000000000df020000 + + /* x^109632 mod p(x), x^109568 mod p(x) */ + .octa 0x0000000017d7000000000000063e0000 + + /* x^108608 mod p(x), x^108544 mod p(x) */ + .octa 0x000000007a860000000000008ab40000 + + /* x^107584 mod p(x), x^107520 mod p(x) */ + .octa 0x00000000fd7c000000000000c7bd0000 + + /* x^106560 mod p(x), x^106496 mod p(x) */ + .octa 0x00000000a56b000000000000efd60000 + + /* x^105536 mod p(x), x^105472 mod p(x) */ + .octa 0x0000000010e400000000000071380000 + + /* x^104512 mod p(x), x^104448 mod p(x) */ + .octa 0x00000000994500000000000004d30000 + + /* x^103488 mod p(x), x^103424 mod p(x) */ + .octa 0x00000000b83c0000000000003b0e0000 + + /* x^102464 mod p(x), x^102400 mod p(x) */ + .octa 0x00000000d6c10000000000008b020000 + + /* x^101440 mod p(x), x^101376 mod p(x) */ + .octa 
0x000000009efc000000000000da940000 + + /* x^100416 mod p(x), x^100352 mod p(x) */ + .octa 0x000000005e87000000000000f9f70000 + + /* x^99392 mod p(x), x^99328 mod p(x) */ + .octa 0x000000006c9b00000000000045e40000 + + /* x^98368 mod p(x), x^98304 mod p(x) */ + .octa 0x00000000178a00000000000083940000 + + /* x^97344 mod p(x), x^97280 mod p(x) */ + .octa 0x00000000f0c8000000000000f0a00000 + + /* x^96320 mod p(x), x^96256 mod p(x) */ + .octa 0x00000000f699000000000000b74b0000 + + /* x^95296 mod p(x), x^95232 mod p(x) */ + .octa 0x00000000316d000000000000c1cf0000 + + /* x^94272 mod p(x), x^94208 mod p(x) */ + .octa 0x00000000987e00000000000072680000 + + /* x^93248 mod p(x), x^93184 mod p(x) */ + .octa 0x00000000acff000000000000e0ab0000 + + /* x^92224 mod p(x), x^92160 mod p(x) */ + .octa 0x00000000a1f6000000000000c5a80000 + + /* x^91200 mod p(x), x^91136 mod p(x) */ + .octa 0x0000000061bd000000000000cf690000 + + /* x^90176 mod p(x), x^90112 mod p(x) */ + .octa 0x00000000c9f2000000000000cbcc0000 + + /* x^89152 mod p(x), x^89088 mod p(x) */ + .octa 0x000000005a33000000000000de050000 + + /* x^88128 mod p(x), x^88064 mod p(x) */ + .octa 0x00000000e416000000000000ccd70000 + + /* x^87104 mod p(x), x^87040 mod p(x) */ + .octa 0x0000000058930000000000002f670000 + + /* x^86080 mod p(x), x^86016 mod p(x) */ + .octa 0x00000000a9d3000000000000152f0000 + + /* x^85056 mod p(x), x^84992 mod p(x) */ + .octa 0x00000000c114000000000000ecc20000 + + /* x^84032 mod p(x), x^83968 mod p(x) */ + .octa 0x00000000b9270000000000007c890000 + + /* x^83008 mod p(x), x^82944 mod p(x) */ + .octa 0x000000002e6000000000000006ee0000 + + /* x^81984 mod p(x), x^81920 mod p(x) */ + .octa 0x00000000dfc600000000000009100000 + + /* x^80960 mod p(x), x^80896 mod p(x) */ + .octa 0x000000004911000000000000ad4e0000 + + /* x^79936 mod p(x), x^79872 mod p(x) */ + .octa 0x00000000ae1b000000000000b04d0000 + + /* x^78912 mod p(x), x^78848 mod p(x) */ + .octa 0x0000000005fa000000000000e9900000 + + /* x^77888 mod p(x), 
x^77824 mod p(x) */ + .octa 0x0000000004a1000000000000cc6f0000 + + /* x^76864 mod p(x), x^76800 mod p(x) */ + .octa 0x00000000af73000000000000ed110000 + + /* x^75840 mod p(x), x^75776 mod p(x) */ + .octa 0x0000000082530000000000008f7e0000 + + /* x^74816 mod p(x), x^74752 mod p(x) */ + .octa 0x00000000cfdc000000000000594f0000 + + /* x^73792 mod p(x), x^73728 mod p(x) */ + .octa 0x00000000a6b6000000000000a8750000 + + /* x^72768 mod p(x), x^72704 mod p(x) */ + .octa 0x00000000fd76000000000000aa0c0000 + + /* x^71744 mod p(x), x^71680 mod p(x) */ + .octa 0x0000000006f500000000000071db0000 + + /* x^70720 mod p(x), x^70656 mod p(x) */ + .octa 0x0000000037ca000000000000ab0c0000 + + /* x^69696 mod p(x), x^69632 mod p(x) */ + .octa 0x00000000d7ab000000000000b7a00000 + + /* x^68672 mod p(x), x^68608 mod p(x) */ + .octa 0x00000000440800000000000090d30000 + + /* x^67648 mod p(x), x^67584 mod p(x) */ + .octa 0x00000000186100000000000054730000 + + /* x^66624 mod p(x), x^66560 mod p(x) */ + .octa 0x000000007368000000000000a3a20000 + + /* x^65600 mod p(x), x^65536 mod p(x) */ + .octa 0x0000000026d0000000000000f9040000 + + /* x^64576 mod p(x), x^64512 mod p(x) */ + .octa 0x00000000fe770000000000009c0a0000 + + /* x^63552 mod p(x), x^63488 mod p(x) */ + .octa 0x000000002cba000000000000d1e70000 + + /* x^62528 mod p(x), x^62464 mod p(x) */ + .octa 0x00000000f8bd0000000000005ac10000 + + /* x^61504 mod p(x), x^61440 mod p(x) */ + .octa 0x000000007372000000000000d68d0000 + + /* x^60480 mod p(x), x^60416 mod p(x) */ + .octa 0x00000000f37f00000000000089f60000 + + /* x^59456 mod p(x), x^59392 mod p(x) */ + .octa 0x00000000078400000000000008a90000 + + /* x^58432 mod p(x), x^58368 mod p(x) */ + .octa 0x00000000d3e400000000000042360000 + + /* x^57408 mod p(x), x^57344 mod p(x) */ + .octa 0x00000000eba800000000000092d50000 + + /* x^56384 mod p(x), x^56320 mod p(x) */ + .octa 0x00000000afbe000000000000b4d50000 + + /* x^55360 mod p(x), x^55296 mod p(x) */ + .octa 0x00000000d8ca000000000000c9060000 
+ + /* x^54336 mod p(x), x^54272 mod p(x) */ + .octa 0x00000000c2d00000000000008f4f0000 + + /* x^53312 mod p(x), x^53248 mod p(x) */ + .octa 0x00000000373200000000000028690000 + + /* x^52288 mod p(x), x^52224 mod p(x) */ + .octa 0x0000000046ae000000000000c3b30000 + + /* x^51264 mod p(x), x^51200 mod p(x) */ + .octa 0x00000000b243000000000000f8700000 + + /* x^50240 mod p(x), x^50176 mod p(x) */ + .octa 0x00000000f7f500000000000029eb0000 + + /* x^49216 mod p(x), x^49152 mod p(x) */ + .octa 0x000000000c7e000000000000fe730000 + + /* x^48192 mod p(x), x^48128 mod p(x) */ + .octa 0x00000000c38200000000000096000000 + + /* x^47168 mod p(x), x^47104 mod p(x) */ + .octa 0x000000008956000000000000683c0000 + + /* x^46144 mod p(x), x^46080 mod p(x) */ + .octa 0x00000000422d0000000000005f1e0000 + + /* x^45120 mod p(x), x^45056 mod p(x) */ + .octa 0x00000000ac0f0000000000006f810000 + + /* x^44096 mod p(x), x^44032 mod p(x) */ + .octa 0x00000000ce30000000000000031f0000 + + /* x^43072 mod p(x), x^43008 mod p(x) */ + .octa 0x000000003d43000000000000455a0000 + + /* x^42048 mod p(x), x^41984 mod p(x) */ + .octa 0x000000007ebe000000000000a6050000 + + /* x^41024 mod p(x), x^40960 mod p(x) */ + .octa 0x00000000976e00000000000077eb0000 + + /* x^40000 mod p(x), x^39936 mod p(x) */ + .octa 0x000000000872000000000000389c0000 + + /* x^38976 mod p(x), x^38912 mod p(x) */ + .octa 0x000000008979000000000000c7b20000 + + /* x^37952 mod p(x), x^37888 mod p(x) */ + .octa 0x000000005c1e0000000000001d870000 + + /* x^36928 mod p(x), x^36864 mod p(x) */ + .octa 0x00000000aebb00000000000045810000 + + /* x^35904 mod p(x), x^35840 mod p(x) */ + .octa 0x000000004f7e0000000000006d4a0000 + + /* x^34880 mod p(x), x^34816 mod p(x) */ + .octa 0x00000000ea98000000000000b9200000 + + /* x^33856 mod p(x), x^33792 mod p(x) */ + .octa 0x00000000f39600000000000022f20000 + + /* x^32832 mod p(x), x^32768 mod p(x) */ + .octa 0x000000000bc500000000000041ca0000 + + /* x^31808 mod p(x), x^31744 mod p(x) */ + .octa 
0x00000000786400000000000078500000 + + /* x^30784 mod p(x), x^30720 mod p(x) */ + .octa 0x00000000be970000000000009e7e0000 + + /* x^29760 mod p(x), x^29696 mod p(x) */ + .octa 0x00000000dd6d000000000000a53c0000 + + /* x^28736 mod p(x), x^28672 mod p(x) */ + .octa 0x000000004c3f00000000000039340000 + + /* x^27712 mod p(x), x^27648 mod p(x) */ + .octa 0x0000000093a4000000000000b58e0000 + + /* x^26688 mod p(x), x^26624 mod p(x) */ + .octa 0x0000000050fb00000000000062d40000 + + /* x^25664 mod p(x), x^25600 mod p(x) */ + .octa 0x00000000f505000000000000a26f0000 + + /* x^24640 mod p(x), x^24576 mod p(x) */ + .octa 0x0000000064f900000000000065e60000 + + /* x^23616 mod p(x), x^23552 mod p(x) */ + .octa 0x00000000e8c2000000000000aad90000 + + /* x^22592 mod p(x), x^22528 mod p(x) */ + .octa 0x00000000720b000000000000a3b00000 + + /* x^21568 mod p(x), x^21504 mod p(x) */ + .octa 0x00000000e992000000000000d2680000 + + /* x^20544 mod p(x), x^20480 mod p(x) */ + .octa 0x000000009132000000000000cf4c0000 + + /* x^19520 mod p(x), x^19456 mod p(x) */ + .octa 0x00000000608a00000000000076610000 + + /* x^18496 mod p(x), x^18432 mod p(x) */ + .octa 0x000000009948000000000000fb9f0000 + + /* x^17472 mod p(x), x^17408 mod p(x) */ + .octa 0x00000000173000000000000003770000 + + /* x^16448 mod p(x), x^16384 mod p(x) */ + .octa 0x000000006fe300000000000004880000 + + /* x^15424 mod p(x), x^15360 mod p(x) */ + .octa 0x00000000e15300000000000056a70000 + + /* x^14400 mod p(x), x^14336 mod p(x) */ + .octa 0x0000000092d60000000000009dfd0000 + + /* x^13376 mod p(x), x^13312 mod p(x) */ + .octa 0x0000000002fd00000000000074c80000 + + /* x^12352 mod p(x), x^12288 mod p(x) */ + .octa 0x00000000c78b000000000000a3ec0000 + + /* x^11328 mod p(x), x^11264 mod p(x) */ + .octa 0x000000009262000000000000b3530000 + + /* x^10304 mod p(x), x^10240 mod p(x) */ + .octa 0x0000000084f200000000000047bf0000 + + /* x^9280 mod p(x), x^9216 mod p(x) */ + .octa 0x0000000067ee000000000000e97c0000 + + /* x^8256 mod p(x), x^8192 
mod p(x) */ + .octa 0x00000000535b00000000000091e10000 + + /* x^7232 mod p(x), x^7168 mod p(x) */ + .octa 0x000000007ebb00000000000055060000 + + /* x^6208 mod p(x), x^6144 mod p(x) */ + .octa 0x00000000c6a1000000000000fd360000 + + /* x^5184 mod p(x), x^5120 mod p(x) */ + .octa 0x000000001be500000000000055860000 + + /* x^4160 mod p(x), x^4096 mod p(x) */ + .octa 0x00000000ae0e0000000000005bd00000 + + /* x^3136 mod p(x), x^3072 mod p(x) */ + .octa 0x0000000022040000000000008db20000 + + /* x^2112 mod p(x), x^2048 mod p(x) */ + .octa 0x00000000c9eb000000000000efe20000 + + /* x^1088 mod p(x), x^1024 mod p(x) */ + .octa 0x0000000039b400000000000051d10000 + +.short_constants: + + /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */ + /* x^2048 mod p(x), x^2016 mod p(x), x^1984 mod p(x), x^1952 mod p(x) */ + .octa 0xefe20000dccf00009440000033590000 + + /* x^1920 mod p(x), x^1888 mod p(x), x^1856 mod p(x), x^1824 mod p(x) */ + .octa 0xee6300002f3f000062180000e0ed0000 + + /* x^1792 mod p(x), x^1760 mod p(x), x^1728 mod p(x), x^1696 mod p(x) */ + .octa 0xcf5f000017ef0000ccbe000023d30000 + + /* x^1664 mod p(x), x^1632 mod p(x), x^1600 mod p(x), x^1568 mod p(x) */ + .octa 0x6d0c0000a30e00000920000042630000 + + /* x^1536 mod p(x), x^1504 mod p(x), x^1472 mod p(x), x^1440 mod p(x) */ + .octa 0x21d30000932b0000a7a00000efcc0000 + + /* x^1408 mod p(x), x^1376 mod p(x), x^1344 mod p(x), x^1312 mod p(x) */ + .octa 0x10be00000b310000666f00000d1c0000 + + /* x^1280 mod p(x), x^1248 mod p(x), x^1216 mod p(x), x^1184 mod p(x) */ + .octa 0x1f240000ce9e0000caad0000589e0000 + + /* x^1152 mod p(x), x^1120 mod p(x), x^1088 mod p(x), x^1056 mod p(x) */ + .octa 0x29610000d02b000039b400007cf50000 + + /* x^1024 mod p(x), x^992 mod p(x), x^960 mod p(x), x^928 mod p(x) */ + .octa 0x51d100009d9d00003c0e0000bfd60000 + + /* x^896 mod p(x), x^864 mod p(x), x^832 mod p(x), x^800 mod p(x) */ + .octa 0xda390000ceae000013830000713c0000 + + /* x^768 mod p(x), 
x^736 mod p(x), x^704 mod p(x), x^672 mod p(x) */ + .octa 0xb67800001e16000085c0000080a60000 + + /* x^640 mod p(x), x^608 mod p(x), x^576 mod p(x), x^544 mod p(x) */ + .octa 0x0db40000f7f90000371d0000e6580000 + + /* x^512 mod p(x), x^480 mod p(x), x^448 mod p(x), x^416 mod p(x) */ + .octa 0x87e70000044c0000aadb0000a4970000 + + /* x^384 mod p(x), x^352 mod p(x), x^320 mod p(x), x^288 mod p(x) */ + .octa 0x1f990000ad180000d8b30000e7b50000 + + /* x^256 mod p(x), x^224 mod p(x), x^192 mod p(x), x^160 mod p(x) */ + .octa 0xbe6c00006ee300004c1a000006df0000 + + /* x^128 mod p(x), x^96 mod p(x), x^64 mod p(x), x^32 mod p(x) */ + .octa 0xfb0b00002d560000136800008bb70000 + + +.barrett_constants: + /* Barrett constant m - (4^32)/n */ + .octa 0x000000000000000000000001f65a57f8 /* x^64 div p(x) */ + /* Barrett constant n */ + .octa 0x0000000000000000000000018bb70000 + +#define CRC_FUNCTION_NAME __crct10dif_vpmsum +#include "crc-vpmsum-template.S" diff --git a/lib/crc/riscv/crc-clmul-consts.h b/lib/crc/riscv/crc-clmul-consts.h new file mode 100644 index 000000000000..8d73449235ef --- /dev/null +++ b/lib/crc/riscv/crc-clmul-consts.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * CRC constants generated by: + * + * ./scripts/gen-crc-consts.py riscv_clmul crc16_msb_0x8bb7,crc32_msb_0x04c11db7,crc32_lsb_0xedb88320,crc32_lsb_0x82f63b78,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5 + * + * Do not edit manually. 
+ */ + +struct crc_clmul_consts { + unsigned long fold_across_2_longs_const_hi; + unsigned long fold_across_2_longs_const_lo; + unsigned long barrett_reduction_const_1; + unsigned long barrett_reduction_const_2; +}; + +/* + * Constants generated for most-significant-bit-first CRC-16 using + * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 + */ +static const struct crc_clmul_consts crc16_msb_0x8bb7_consts __maybe_unused = { +#ifdef CONFIG_64BIT + .fold_across_2_longs_const_hi = 0x0000000000001faa, /* x^192 mod G */ + .fold_across_2_longs_const_lo = 0x000000000000a010, /* x^128 mod G */ + .barrett_reduction_const_1 = 0xfb2d2bfc0e99d245, /* floor(x^79 / G) */ + .barrett_reduction_const_2 = 0x0000000000008bb7, /* G - x^16 */ +#else + .fold_across_2_longs_const_hi = 0x00005890, /* x^96 mod G */ + .fold_across_2_longs_const_lo = 0x0000f249, /* x^64 mod G */ + .barrett_reduction_const_1 = 0xfb2d2bfc, /* floor(x^47 / G) */ + .barrett_reduction_const_2 = 0x00008bb7, /* G - x^16 */ +#endif +}; + +/* + * Constants generated for most-significant-bit-first CRC-32 using + * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 + + * x^5 + x^4 + x^2 + x^1 + x^0 + */ +static const struct crc_clmul_consts crc32_msb_0x04c11db7_consts __maybe_unused = { +#ifdef CONFIG_64BIT + .fold_across_2_longs_const_hi = 0x00000000c5b9cd4c, /* x^192 mod G */ + .fold_across_2_longs_const_lo = 0x00000000e8a45605, /* x^128 mod G */ + .barrett_reduction_const_1 = 0x826880efa40da72d, /* floor(x^95 / G) */ + .barrett_reduction_const_2 = 0x0000000004c11db7, /* G - x^32 */ +#else + .fold_across_2_longs_const_hi = 0xf200aa66, /* x^96 mod G */ + .fold_across_2_longs_const_lo = 0x490d678d, /* x^64 mod G */ + .barrett_reduction_const_1 = 0x826880ef, /* floor(x^63 / G) */ + .barrett_reduction_const_2 = 0x04c11db7, /* G - x^32 */ +#endif +}; + +/* + * Constants generated for least-significant-bit-first CRC-32 using + * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 
+ x^11 + x^10 + x^8 + x^7 + + * x^5 + x^4 + x^2 + x^1 + x^0 + */ +static const struct crc_clmul_consts crc32_lsb_0xedb88320_consts __maybe_unused = { +#ifdef CONFIG_64BIT + .fold_across_2_longs_const_hi = 0x65673b4600000000, /* x^191 mod G */ + .fold_across_2_longs_const_lo = 0x9ba54c6f00000000, /* x^127 mod G */ + .barrett_reduction_const_1 = 0xb4e5b025f7011641, /* floor(x^95 / G) */ + .barrett_reduction_const_2 = 0x00000000edb88320, /* (G - x^32) * x^32 */ +#else + .fold_across_2_longs_const_hi = 0xccaa009e, /* x^95 mod G */ + .fold_across_2_longs_const_lo = 0xb8bc6765, /* x^63 mod G */ + .barrett_reduction_const_1 = 0xf7011641, /* floor(x^63 / G) */ + .barrett_reduction_const_2 = 0xedb88320, /* (G - x^32) * x^0 */ +#endif +}; + +/* + * Constants generated for least-significant-bit-first CRC-32 using + * G(x) = x^32 + x^28 + x^27 + x^26 + x^25 + x^23 + x^22 + x^20 + x^19 + x^18 + + * x^14 + x^13 + x^11 + x^10 + x^9 + x^8 + x^6 + x^0 + */ +static const struct crc_clmul_consts crc32_lsb_0x82f63b78_consts __maybe_unused = { +#ifdef CONFIG_64BIT + .fold_across_2_longs_const_hi = 0x3743f7bd00000000, /* x^191 mod G */ + .fold_across_2_longs_const_lo = 0x3171d43000000000, /* x^127 mod G */ + .barrett_reduction_const_1 = 0x4869ec38dea713f1, /* floor(x^95 / G) */ + .barrett_reduction_const_2 = 0x0000000082f63b78, /* (G - x^32) * x^32 */ +#else + .fold_across_2_longs_const_hi = 0x493c7d27, /* x^95 mod G */ + .fold_across_2_longs_const_lo = 0xdd45aab8, /* x^63 mod G */ + .barrett_reduction_const_1 = 0xdea713f1, /* floor(x^63 / G) */ + .barrett_reduction_const_2 = 0x82f63b78, /* (G - x^32) * x^0 */ +#endif +}; + +/* + * Constants generated for most-significant-bit-first CRC-64 using + * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + + * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + + * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + + * x^7 + x^4 + x^1 + x^0 + */ +#ifdef CONFIG_64BIT +static const 
struct crc_clmul_consts crc64_msb_0x42f0e1eba9ea3693_consts __maybe_unused = { + .fold_across_2_longs_const_hi = 0x4eb938a7d257740e, /* x^192 mod G */ + .fold_across_2_longs_const_lo = 0x05f5c3c7eb52fab6, /* x^128 mod G */ + .barrett_reduction_const_1 = 0xabc694e836627c39, /* floor(x^127 / G) */ + .barrett_reduction_const_2 = 0x42f0e1eba9ea3693, /* G - x^64 */ +}; +#endif + +/* + * Constants generated for least-significant-bit-first CRC-64 using + * G(x) = x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 + + * x^47 + x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 + + * x^26 + x^23 + x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 + + * x^4 + x^3 + x^0 + */ +#ifdef CONFIG_64BIT +static const struct crc_clmul_consts crc64_lsb_0x9a6c9329ac4bc9b5_consts __maybe_unused = { + .fold_across_2_longs_const_hi = 0xeadc41fd2ba3d420, /* x^191 mod G */ + .fold_across_2_longs_const_lo = 0x21e9761e252621ac, /* x^127 mod G */ + .barrett_reduction_const_1 = 0x27ecfa329aef9f77, /* floor(x^127 / G) */ + .barrett_reduction_const_2 = 0x9a6c9329ac4bc9b5, /* (G - x^64) * x^0 */ +}; +#endif diff --git a/lib/crc/riscv/crc-clmul-template.h b/lib/crc/riscv/crc-clmul-template.h new file mode 100644 index 000000000000..77187e7f1762 --- /dev/null +++ b/lib/crc/riscv/crc-clmul-template.h @@ -0,0 +1,265 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright 2025 Google LLC */ + +/* + * This file is a "template" that generates a CRC function optimized using the + * RISC-V Zbc (scalar carryless multiplication) extension. The includer of this + * file must define the following parameters to specify the type of CRC: + * + * crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC + * LSB_CRC: 0 for a msb (most-significant-bit) first CRC, i.e. natural + * mapping between bits and polynomial coefficients + * 1 for a lsb (least-significant-bit) first CRC, i.e. 
reflected + * mapping between bits and polynomial coefficients + */ + +#include <asm/byteorder.h> +#include <linux/minmax.h> + +#define CRC_BITS (8 * sizeof(crc_t)) /* a.k.a. 'n' */ + /* Carryless multiply using the Zbc clmul instruction: returns the physically low BITS_PER_LONG bits of the double-length carryless product of @a and @b. The .option push/pop pair limits the zbc arch setting to this asm block. */ +static inline unsigned long clmul(unsigned long a, unsigned long b) +{ + unsigned long res; + + asm(".option push\n" + ".option arch,+zbc\n" + "clmul %0, %1, %2\n" + ".option pop\n" + : "=r" (res) : "r" (a), "r" (b)); + return res; +} + /* Like clmul(), but uses the clmulh instruction, which returns the physically high BITS_PER_LONG bits of the double-length carryless product. */ +static inline unsigned long clmulh(unsigned long a, unsigned long b) +{ + unsigned long res; + + asm(".option push\n" + ".option arch,+zbc\n" + "clmulh %0, %1, %2\n" + ".option pop\n" + : "=r" (res) : "r" (a), "r" (b)); + return res; +} + /* Like clmul(), but uses the clmulr instruction, which returns the double-length carryless product shifted right by BITS_PER_LONG - 1 bits. crc_clmul_long() relies on this in its Barrett reduction steps. */ +static inline unsigned long clmulr(unsigned long a, unsigned long b) +{ + unsigned long res; + + asm(".option push\n" + ".option arch,+zbc\n" + "clmulr %0, %1, %2\n" + ".option pop\n" + : "=r" (res) : "r" (a), "r" (b)); + return res; +} + +/* + * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a + * polynomial whose bit order matches the CRC's bit order. + */ +#ifdef CONFIG_64BIT +# if LSB_CRC +# define crc_load_long(x) le64_to_cpup(x) +# else +# define crc_load_long(x) be64_to_cpup(x) +# endif +#else +# if LSB_CRC +# define crc_load_long(x) le32_to_cpup(x) +# else +# define crc_load_long(x) be32_to_cpup(x) +# endif +#endif + +/* XOR @crc into the end of @msgpoly that represents the high-order terms. For an lsb-first CRC those are the physically low bits, so @crc is XORed in unshifted; for an msb-first CRC they are the physically high bits, so @crc is first shifted left by BITS_PER_LONG - CRC_BITS. */ +static inline unsigned long +crc_clmul_prep(crc_t crc, unsigned long msgpoly) +{ +#if LSB_CRC + return msgpoly ^ crc; +#else + return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS)); +#endif +} + +/* + * Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it + * modulo the generator polynomial G. This gives the CRC of @msgpoly. 
+ */ +static inline crc_t +crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts) +{ + unsigned long tmp; + + /* + * First step of Barrett reduction with integrated multiplication by + * x^n: calculate floor((msgpoly * x^n) / G). This is the value by + * which G needs to be multiplied to cancel out the x^n and higher terms + * of msgpoly * x^n. Do it using the following formula: + * + * msb-first: + * floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1)) + * lsb-first: + * floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG) + * + * barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G), + * which fits a long exactly. Using any lower power of x there would + * not carry enough precision through the calculation, while using any + * higher power of x would require extra instructions to handle a wider + * multiplication. In the msb-first case, using this power of x results + * in needing a floored division by x^(BITS_PER_LONG-1), which matches + * what clmulr produces. In the lsb-first case, a factor of x gets + * implicitly introduced by each carryless multiplication (shown as + * '* x' above), and the floored division instead needs to be by + * x^BITS_PER_LONG which matches what clmul produces. + */ +#if LSB_CRC + tmp = clmul(msgpoly, consts->barrett_reduction_const_1); +#else + tmp = clmulr(msgpoly, consts->barrett_reduction_const_1); +#endif + + /* + * Second step of Barrett reduction: + * + * crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G)) + * + * This reduces (msgpoly * x^n) modulo G by adding the appropriate + * multiple of G to it. The result uses only the x^0..x^(n-1) terms. 
+ * HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those + * terms in the first place, it is more efficient to do the equivalent: + * + * crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n + * + * In the lsb-first case further modify it to the following which avoids + * a shift, as the crc ends up in the physically low n bits from clmulr: + * + * product := ((G - x^n) * x^(BITS_PER_LONG - n)) * floor((msgpoly * x^n) / G) * x + * crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n + * + * barrett_reduction_const_2 contains the constant multiplier (G - x^n) + * or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above. The + * cast of the result to crc_t is essential, as it applies the mod x^n! + */ +#if LSB_CRC + return clmulr(tmp, consts->barrett_reduction_const_2); +#else + return clmul(tmp, consts->barrett_reduction_const_2); +#endif +} + +/* Update @crc with the data from @msgpoly. */ +static inline crc_t +crc_clmul_update_long(crc_t crc, unsigned long msgpoly, + const struct crc_clmul_consts *consts) +{ + return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts); +} + +/* Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. 
*/ +static inline crc_t +crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len, + const struct crc_clmul_consts *consts) +{ + unsigned long msgpoly; + size_t i; + /* Pack the @len data bytes into msgpoly in the CRC's bit order. lsb-first: the bytes end up in the physically high 8*len bits, with p[0] in the lowest of those bytes. msb-first: the bytes end up in the physically low 8*len bits, with p[0] in the highest of those bytes. */ +#if LSB_CRC + msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8); + for (i = 1; i < len; i++) + msgpoly = (msgpoly >> 8) ^ ((unsigned long)p[i] << (BITS_PER_LONG - 8)); +#else + msgpoly = p[0]; + for (i = 1; i < len; i++) + msgpoly = (msgpoly << 8) ^ p[i]; +#endif + /* Case 1: the data is at least as long as the CRC. Shift the whole @crc so that it lines up with the first sizeof(crc_t) data bytes, XOR it in, and reduce the result. */ + if (len >= sizeof(crc_t)) { + #if LSB_CRC + msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len); + #else + msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS); + #endif + return crc_clmul_long(msgpoly, consts); + } /* Case 2: the data is shorter than the CRC. Only 8*len bits of @crc are XORed with the data and reduced; the remaining bits of @crc are shifted by 8*len and XORed into the reduced value (the conversion of the return value to crc_t drops any bits shifted beyond CRC_BITS). */ +#if LSB_CRC + msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len); + return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len)); +#else + msgpoly ^= crc >> (CRC_BITS - 8*len); + return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len)); +#endif +} + +static inline crc_t +crc_clmul(crc_t crc, const void *p, size_t len, + const struct crc_clmul_consts *consts) +{ + size_t align; + + /* This implementation assumes that the CRC fits in an unsigned long. */ + BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long)); + + /* If the buffer is not long-aligned, align it. */ + align = (unsigned long)p % sizeof(unsigned long); + if (align && len) { + align = min(sizeof(unsigned long) - align, len); + crc = crc_clmul_update_partial(crc, p, align, consts); + p += align; + len -= align; + } + + if (len >= 4 * sizeof(unsigned long)) { + unsigned long m0, m1; + + m0 = crc_clmul_prep(crc, crc_load_long(p)); + m1 = crc_load_long(p + sizeof(unsigned long)); + p += 2 * sizeof(unsigned long); + len -= 2 * sizeof(unsigned long); + /* + * Main loop. 
Each iteration starts with a message polynomial + * (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two + * more longs of data to form x^(3*BITS_PER_LONG)*m0 + + * x^(2*BITS_PER_LONG)*m1 + x^BITS_PER_LONG*m2 + m3, then + * "folds" that back into a congruent (modulo G) value that uses + * just m0 and m1 again. This is done by multiplying m0 by the + * precomputed constant (x^(3*BITS_PER_LONG) mod G) and m1 by + * the precomputed constant (x^(2*BITS_PER_LONG) mod G), then + * adding the results to m2 and m3 as appropriate. Each such + * multiplication produces a result twice the length of a long, + * which in RISC-V is two instructions clmul and clmulh. + * + * This could be changed to fold across more than 2 longs at a + * time if there is a CPU that can take advantage of it. + */ + do { + unsigned long p0, p1, p2, p3; + + p0 = clmulh(m0, consts->fold_across_2_longs_const_hi); + p1 = clmul(m0, consts->fold_across_2_longs_const_hi); + p2 = clmulh(m1, consts->fold_across_2_longs_const_lo); + p3 = clmul(m1, consts->fold_across_2_longs_const_lo); + m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p); + m1 = (LSB_CRC ? 
p0 ^ p2 : p1 ^ p3) ^ + crc_load_long(p + sizeof(unsigned long)); + + p += 2 * sizeof(unsigned long); + len -= 2 * sizeof(unsigned long); + } while (len >= 2 * sizeof(unsigned long)); + + crc = crc_clmul_long(m0, consts); + crc = crc_clmul_update_long(crc, m1, consts); + } + + while (len >= sizeof(unsigned long)) { + crc = crc_clmul_update_long(crc, crc_load_long(p), consts); + p += sizeof(unsigned long); + len -= sizeof(unsigned long); + } + + if (len) + crc = crc_clmul_update_partial(crc, p, len, consts); + + return crc; +} diff --git a/lib/crc/riscv/crc-clmul.h b/lib/crc/riscv/crc-clmul.h new file mode 100644 index 000000000000..dd1736245815 --- /dev/null +++ b/lib/crc/riscv/crc-clmul.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright 2025 Google LLC */ + +#ifndef _RISCV_CRC_CLMUL_H +#define _RISCV_CRC_CLMUL_H + +#include <linux/types.h> +#include "crc-clmul-consts.h" + +u16 crc16_msb_clmul(u16 crc, const void *p, size_t len, + const struct crc_clmul_consts *consts); +u32 crc32_msb_clmul(u32 crc, const void *p, size_t len, + const struct crc_clmul_consts *consts); +u32 crc32_lsb_clmul(u32 crc, const void *p, size_t len, + const struct crc_clmul_consts *consts); +#ifdef CONFIG_64BIT +u64 crc64_msb_clmul(u64 crc, const void *p, size_t len, + const struct crc_clmul_consts *consts); +u64 crc64_lsb_clmul(u64 crc, const void *p, size_t len, + const struct crc_clmul_consts *consts); +#endif + +#endif /* _RISCV_CRC_CLMUL_H */ diff --git a/lib/crc/riscv/crc-t10dif.h b/lib/crc/riscv/crc-t10dif.h new file mode 100644 index 000000000000..cd6136cbfda1 --- /dev/null +++ b/lib/crc/riscv/crc-t10dif.h @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RISC-V optimized CRC-T10DIF function + * + * Copyright 2025 Google LLC + */ + +#include <asm/hwcap.h> +#include <asm/alternative-macros.h> + +#include "crc-clmul.h" + +static inline u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len) +{ + if 
(riscv_has_extension_likely(RISCV_ISA_EXT_ZBC)) + return crc16_msb_clmul(crc, p, len, &crc16_msb_0x8bb7_consts); + return crc_t10dif_generic(crc, p, len); +} diff --git a/lib/crc/riscv/crc16_msb.c b/lib/crc/riscv/crc16_msb.c new file mode 100644 index 000000000000..554d295e95f5 --- /dev/null +++ b/lib/crc/riscv/crc16_msb.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RISC-V optimized most-significant-bit-first CRC16 + * + * Copyright 2025 Google LLC + */ + +#include "crc-clmul.h" + +typedef u16 crc_t; +#define LSB_CRC 0 +#include "crc-clmul-template.h" + +u16 crc16_msb_clmul(u16 crc, const void *p, size_t len, + const struct crc_clmul_consts *consts) +{ + return crc_clmul(crc, p, len, consts); +} diff --git a/lib/crc/riscv/crc32.h b/lib/crc/riscv/crc32.h new file mode 100644 index 000000000000..3ec6eee98afa --- /dev/null +++ b/lib/crc/riscv/crc32.h @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RISC-V optimized CRC32 functions + * + * Copyright 2025 Google LLC + */ + +#include <asm/hwcap.h> +#include <asm/alternative-macros.h> + +#include "crc-clmul.h" + +static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) +{ + if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC)) + return crc32_lsb_clmul(crc, p, len, + &crc32_lsb_0xedb88320_consts); + return crc32_le_base(crc, p, len); +} + +static inline u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) +{ + if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC)) + return crc32_msb_clmul(crc, p, len, + &crc32_msb_0x04c11db7_consts); + return crc32_be_base(crc, p, len); +} + +static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) +{ + if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC)) + return crc32_lsb_clmul(crc, p, len, + &crc32_lsb_0x82f63b78_consts); + return crc32c_base(crc, p, len); +} + +static inline u32 crc32_optimizations_arch(void) +{ + if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC)) + return CRC32_LE_OPTIMIZATION | + CRC32_BE_OPTIMIZATION | + 
CRC32C_OPTIMIZATION; + return 0; +} diff --git a/lib/crc/riscv/crc32_lsb.c b/lib/crc/riscv/crc32_lsb.c new file mode 100644 index 000000000000..72fd67e7470c --- /dev/null +++ b/lib/crc/riscv/crc32_lsb.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RISC-V optimized least-significant-bit-first CRC32 + * + * Copyright 2025 Google LLC + */ + +#include "crc-clmul.h" + +typedef u32 crc_t; +#define LSB_CRC 1 +#include "crc-clmul-template.h" + +u32 crc32_lsb_clmul(u32 crc, const void *p, size_t len, + const struct crc_clmul_consts *consts) +{ + return crc_clmul(crc, p, len, consts); +} diff --git a/lib/crc/riscv/crc32_msb.c b/lib/crc/riscv/crc32_msb.c new file mode 100644 index 000000000000..fdbeaccc369f --- /dev/null +++ b/lib/crc/riscv/crc32_msb.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RISC-V optimized most-significant-bit-first CRC32 + * + * Copyright 2025 Google LLC + */ + +#include "crc-clmul.h" + +typedef u32 crc_t; +#define LSB_CRC 0 +#include "crc-clmul-template.h" + +u32 crc32_msb_clmul(u32 crc, const void *p, size_t len, + const struct crc_clmul_consts *consts) +{ + return crc_clmul(crc, p, len, consts); +} diff --git a/lib/crc/riscv/crc64.h b/lib/crc/riscv/crc64.h new file mode 100644 index 000000000000..a1b7873fde57 --- /dev/null +++ b/lib/crc/riscv/crc64.h @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RISC-V optimized CRC64 functions + * + * Copyright 2025 Google LLC + */ + +#include <asm/hwcap.h> +#include <asm/alternative-macros.h> + +#include "crc-clmul.h" + +static inline u64 crc64_be_arch(u64 crc, const u8 *p, size_t len) +{ + if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC)) + return crc64_msb_clmul(crc, p, len, + &crc64_msb_0x42f0e1eba9ea3693_consts); + return crc64_be_generic(crc, p, len); +} + +static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len) +{ + if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC)) + return crc64_lsb_clmul(crc, p, len, + 
&crc64_lsb_0x9a6c9329ac4bc9b5_consts); + return crc64_nvme_generic(crc, p, len); +} diff --git a/lib/crc/riscv/crc64_lsb.c b/lib/crc/riscv/crc64_lsb.c new file mode 100644 index 000000000000..c5371bb85d90 --- /dev/null +++ b/lib/crc/riscv/crc64_lsb.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RISC-V optimized least-significant-bit-first CRC64 + * + * Copyright 2025 Google LLC + */ + +#include "crc-clmul.h" + +typedef u64 crc_t; +#define LSB_CRC 1 +#include "crc-clmul-template.h" + +u64 crc64_lsb_clmul(u64 crc, const void *p, size_t len, + const struct crc_clmul_consts *consts) +{ + return crc_clmul(crc, p, len, consts); +} diff --git a/lib/crc/riscv/crc64_msb.c b/lib/crc/riscv/crc64_msb.c new file mode 100644 index 000000000000..1925d1dbe225 --- /dev/null +++ b/lib/crc/riscv/crc64_msb.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RISC-V optimized most-significant-bit-first CRC64 + * + * Copyright 2025 Google LLC + */ + +#include "crc-clmul.h" + +typedef u64 crc_t; +#define LSB_CRC 0 +#include "crc-clmul-template.h" + +u64 crc64_msb_clmul(u64 crc, const void *p, size_t len, + const struct crc_clmul_consts *consts) +{ + return crc_clmul(crc, p, len, consts); +} diff --git a/lib/crc/s390/crc32-vx.h b/lib/crc/s390/crc32-vx.h new file mode 100644 index 000000000000..652c96e1a822 --- /dev/null +++ b/lib/crc/s390/crc32-vx.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _CRC32_VX_S390_H +#define _CRC32_VX_S390_H + +#include <linux/types.h> + +u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size); +u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size); +u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size); + +#endif /* _CRC32_VX_S390_H */ diff --git a/lib/crc/s390/crc32.h b/lib/crc/s390/crc32.h new file mode 100644 index 000000000000..59c8983d428b --- /dev/null +++ b/lib/crc/s390/crc32.h @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * 
CRC-32 implemented with the z/Architecture Vector Extension Facility. + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ + +#include <linux/cpufeature.h> +#include <asm/fpu.h> +#include "crc32-vx.h" + +#define VX_MIN_LEN 64 +#define VX_ALIGNMENT 16L +#define VX_ALIGN_MASK (VX_ALIGNMENT - 1) + +/* + * DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension + * + * Creates a function to perform a particular CRC-32 computation. Depending + * on the message buffer, the hardware-accelerated or software implementation + * is used. Note that the message buffer is aligned to improve fetch + * operations of VECTOR LOAD MULTIPLE instructions. + */ +#define DEFINE_CRC32_VX(___fname, ___crc32_vx, ___crc32_sw) \ + static inline u32 ___fname(u32 crc, const u8 *data, size_t datalen) \ + { \ + unsigned long prealign, aligned, remaining; \ + DECLARE_KERNEL_FPU_ONSTACK16(vxstate); \ + \ + if (datalen < VX_MIN_LEN + VX_ALIGN_MASK || !cpu_has_vx()) \ + return ___crc32_sw(crc, data, datalen); \ + \ + if ((unsigned long)data & VX_ALIGN_MASK) { \ + prealign = VX_ALIGNMENT - \ + ((unsigned long)data & VX_ALIGN_MASK); \ + datalen -= prealign; \ + crc = ___crc32_sw(crc, data, prealign); \ + data = (void *)((unsigned long)data + prealign); \ + } \ + \ + aligned = datalen & ~VX_ALIGN_MASK; \ + remaining = datalen & VX_ALIGN_MASK; \ + \ + kernel_fpu_begin(&vxstate, KERNEL_VXR_LOW); \ + crc = ___crc32_vx(crc, data, aligned); \ + kernel_fpu_end(&vxstate, KERNEL_VXR_LOW); \ + \ + if (remaining) \ + crc = ___crc32_sw(crc, data + aligned, remaining); \ + \ + return crc; \ + } + +DEFINE_CRC32_VX(crc32_le_arch, crc32_le_vgfm_16, crc32_le_base) +DEFINE_CRC32_VX(crc32_be_arch, crc32_be_vgfm_16, crc32_be_base) +DEFINE_CRC32_VX(crc32c_arch, crc32c_le_vgfm_16, crc32c_base) + +static inline u32 crc32_optimizations_arch(void) +{ + if (cpu_has_vx()) { + return CRC32_LE_OPTIMIZATION | + CRC32_BE_OPTIMIZATION | + CRC32C_OPTIMIZATION; + } + 
return 0; +} diff --git a/lib/crc/s390/crc32be-vx.c b/lib/crc/s390/crc32be-vx.c new file mode 100644 index 000000000000..fed7c9c70d05 --- /dev/null +++ b/lib/crc/s390/crc32be-vx.c @@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Hardware-accelerated CRC-32 variants for Linux on z Systems + * + * Use the z/Architecture Vector Extension Facility to accelerate the + * computing of CRC-32 checksums. + * + * This CRC-32 implementation algorithm processes the most-significant + * bit first (BE). + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ + +#include <linux/types.h> +#include <asm/fpu.h> +#include "crc32-vx.h" + +/* Vector register range containing CRC-32 constants */ +#define CONST_R1R2 9 +#define CONST_R3R4 10 +#define CONST_R5 11 +#define CONST_R6 12 +#define CONST_RU_POLY 13 +#define CONST_CRC_POLY 14 + +/* + * The CRC-32 constant block contains reduction constants to fold and + * process particular chunks of the input data stream in parallel. + * + * For the CRC-32 variants, the constants are precomputed according to + * these definitions: + * + * R1 = x^(4*128+64) mod P(x) + * R2 = x^(4*128) mod P(x) + * R3 = x^(128+64) mod P(x) + * R4 = x^128 mod P(x) + * R5 = x^96 mod P(x) + * R6 = x^64 mod P(x) + * + * The Barrett reduction constant, u, is defined as floor(x^64 / P(x)), + * + * where P(x) is the polynomial in the normal domain and P'(x) is the + * polynomial in the reversed (bit-reflected) domain. + * + * Note that the constant definitions below are extended in order to compute + * intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction. + * The rightmost doubleword can be 0 to prevent contribution to the result or + * can be multiplied by 1 to perform an XOR without the need for a separate + * VECTOR EXCLUSIVE OR instruction. + * + * CRC-32 (IEEE 802.3 Ethernet, ...) 
polynomials: + * + * P(x) = 0x04C11DB7 + * P'(x) = 0xEDB88320 + */ + +static unsigned long constants_CRC_32_BE[] = { + 0x08833794c, 0x0e6228b11, /* R1, R2 */ + 0x0c5b9cd4c, 0x0e8a45605, /* R3, R4 */ + 0x0f200aa66, 1UL << 32, /* R5, x32 */ + 0x0490d678d, 1, /* R6, 1 */ + 0x104d101df, 0, /* u */ + 0x104C11DB7, 0, /* P(x) */ +}; + +/** + * crc32_be_vgfm_16 - Compute CRC-32 (BE variant) with vector registers + * @crc: Initial CRC value, typically ~0. + * @buf: Input buffer pointer, performance might be improved if the + * buffer is on a doubleword boundary. + * @size: Size of the buffer, must be 64 bytes or greater. + * + * Register usage: + * V0: Initial CRC value and intermediate constants and results. + * V1..V4: Data for CRC computation. + * V5..V8: Next data chunks that are fetched from the input buffer. + * V9..V14: CRC-32 constants. + */ +u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size) +{ + /* Load CRC-32 constants */ + fpu_vlm(CONST_R1R2, CONST_CRC_POLY, &constants_CRC_32_BE); + fpu_vzero(0); + + /* Load the initial CRC value into the leftmost word of V0. */ + fpu_vlvgf(0, crc, 0); + + /* Load a 64-byte data chunk and XOR with CRC */ + fpu_vlm(1, 4, buf); + fpu_vx(1, 0, 1); + buf += 64; + size -= 64; + + while (size >= 64) { + /* Load the next 64-byte data chunk into V5 to V8 */ + fpu_vlm(5, 8, buf); + + /* + * Perform a GF(2) multiplication of the doublewords in V1 with + * the reduction constants in V0. The intermediate result is + * then folded (accumulated) with the next data chunk in V5 and + * stored in V1. Repeat this step for the register contents + * in V2, V3, and V4 respectively. 
+ */ + fpu_vgfmag(1, CONST_R1R2, 1, 5); + fpu_vgfmag(2, CONST_R1R2, 2, 6); + fpu_vgfmag(3, CONST_R1R2, 3, 7); + fpu_vgfmag(4, CONST_R1R2, 4, 8); + buf += 64; + size -= 64; + } + + /* Fold V1 to V4 into a single 128-bit value in V1 */ + fpu_vgfmag(1, CONST_R3R4, 1, 2); + fpu_vgfmag(1, CONST_R3R4, 1, 3); + fpu_vgfmag(1, CONST_R3R4, 1, 4); + + while (size >= 16) { + fpu_vl(2, buf); + fpu_vgfmag(1, CONST_R3R4, 1, 2); + buf += 16; + size -= 16; + } + + /* + * The R5 constant is used to fold a 128-bit value into an 96-bit value + * that is XORed with the next 96-bit input data chunk. To use a single + * VGFMG instruction, multiply the rightmost 64-bit with x^32 (1<<32) to + * form an intermediate 96-bit value (with appended zeros) which is then + * XORed with the intermediate reduction result. + */ + fpu_vgfmg(1, CONST_R5, 1); + + /* + * Further reduce the remaining 96-bit value to a 64-bit value using a + * single VGFMG, the rightmost doubleword is multiplied with 0x1. The + * intermediate result is then XORed with the product of the leftmost + * doubleword with R6. The result is a 64-bit value and is subject to + * the Barret reduction. + */ + fpu_vgfmg(1, CONST_R6, 1); + + /* + * The input values to the Barret reduction are the degree-63 polynomial + * in V1 (R(x)), degree-32 generator polynomial, and the reduction + * constant u. The Barret reduction result is the CRC value of R(x) mod + * P(x). + * + * The Barret reduction algorithm is defined as: + * + * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u + * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) + * 3. C(x) = R(x) XOR T2(x) mod x^32 + * + * Note: To compensate the division by x^32, use the vector unpack + * instruction to move the leftmost word into the leftmost doubleword + * of the vector register. The rightmost doubleword is multiplied + * with zero to not contribute to the intermediate results. 
+ */ + + /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ + fpu_vupllf(2, 1); + fpu_vgfmg(2, CONST_RU_POLY, 2); + + /* + * Compute the GF(2) product of the CRC polynomial in VO with T1(x) in + * V2 and XOR the intermediate result, T2(x), with the value in V1. + * The final result is in the rightmost word of V2. + */ + fpu_vupllf(2, 2); + fpu_vgfmag(2, CONST_CRC_POLY, 2, 1); + return fpu_vlgvf(2, 3); +} diff --git a/lib/crc/s390/crc32le-vx.c b/lib/crc/s390/crc32le-vx.c new file mode 100644 index 000000000000..2f629f394df7 --- /dev/null +++ b/lib/crc/s390/crc32le-vx.c @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Hardware-accelerated CRC-32 variants for Linux on z Systems + * + * Use the z/Architecture Vector Extension Facility to accelerate the + * computing of bitreflected CRC-32 checksums for IEEE 802.3 Ethernet + * and Castagnoli. + * + * This CRC-32 implementation algorithm is bitreflected and processes + * the least-significant bit first (Little-Endian). + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ + +#include <linux/types.h> +#include <asm/fpu.h> +#include "crc32-vx.h" + +/* Vector register range containing CRC-32 constants */ +#define CONST_PERM_LE2BE 9 +#define CONST_R2R1 10 +#define CONST_R4R3 11 +#define CONST_R5 12 +#define CONST_RU_POLY 13 +#define CONST_CRC_POLY 14 + +/* + * The CRC-32 constant block contains reduction constants to fold and + * process particular chunks of the input data stream in parallel. + * + * For the CRC-32 variants, the constants are precomputed according to + * these definitions: + * + * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1 + * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1 + * R3 = [(x128+32 mod P'(x) << 32)]' << 1 + * R4 = [(x128-32 mod P'(x) << 32)]' << 1 + * R5 = [(x64 mod P'(x) << 32)]' << 1 + * R6 = [(x32 mod P'(x) << 32)]' << 1 + * + * The bitreflected Barret reduction constant, u', is defined as + * the bit reversal of floor(x**64 / P(x)). 
+ * + * where P(x) is the polynomial in the normal domain and the P'(x) is the + * polynomial in the reversed (bitreflected) domain. + * + * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials: + * + * P(x) = 0x04C11DB7 + * P'(x) = 0xEDB88320 + * + * CRC-32C (Castagnoli) polynomials: + * + * P(x) = 0x1EDC6F41 + * P'(x) = 0x82F63B78 + */ + +static unsigned long constants_CRC_32_LE[] = { + 0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */ + 0x1c6e41596, 0x154442bd4, /* R2, R1 */ + 0x0ccaa009e, 0x1751997d0, /* R4, R3 */ + 0x0, 0x163cd6124, /* R5 */ + 0x0, 0x1f7011641, /* u' */ + 0x0, 0x1db710641 /* P'(x) << 1 */ +}; + +static unsigned long constants_CRC_32C_LE[] = { + 0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */ + 0x09e4addf8, 0x740eef02, /* R2, R1 */ + 0x14cd00bd6, 0xf20c0dfe, /* R4, R3 */ + 0x0, 0x0dd45aab8, /* R5 */ + 0x0, 0x0dea713f1, /* u' */ + 0x0, 0x105ec76f0 /* P'(x) << 1 */ +}; + +/** + * crc32_le_vgfm_generic - Compute CRC-32 (LE variant) with vector registers + * @crc: Initial CRC value, typically ~0. + * @buf: Input buffer pointer, performance might be improved if the + * buffer is on a doubleword boundary. + * @size: Size of the buffer, must be 64 bytes or greater. + * @constants: CRC-32 constant pool base pointer. + * + * Register usage: + * V0: Initial CRC value and intermediate constants and results. + * V1..V4: Data for CRC computation. + * V5..V8: Next data chunks that are fetched from the input buffer. + * V9: Constant for BE->LE conversion and shift operations + * V10..V14: CRC-32 constants. + */ +static u32 crc32_le_vgfm_generic(u32 crc, unsigned char const *buf, size_t size, unsigned long *constants) +{ + /* Load CRC-32 constants */ + fpu_vlm(CONST_PERM_LE2BE, CONST_CRC_POLY, constants); + + /* + * Load the initial CRC value. + * + * The CRC value is loaded into the rightmost word of the + * vector register and is later XORed with the LSB portion + * of the loaded input data. 
+ */ + fpu_vzero(0); /* Clear V0 */ + fpu_vlvgf(0, crc, 3); /* Load CRC into rightmost word */ + + /* Load a 64-byte data chunk and XOR with CRC */ + fpu_vlm(1, 4, buf); + fpu_vperm(1, 1, 1, CONST_PERM_LE2BE); + fpu_vperm(2, 2, 2, CONST_PERM_LE2BE); + fpu_vperm(3, 3, 3, CONST_PERM_LE2BE); + fpu_vperm(4, 4, 4, CONST_PERM_LE2BE); + + fpu_vx(1, 0, 1); /* V1 ^= CRC */ + buf += 64; + size -= 64; + + while (size >= 64) { + fpu_vlm(5, 8, buf); + fpu_vperm(5, 5, 5, CONST_PERM_LE2BE); + fpu_vperm(6, 6, 6, CONST_PERM_LE2BE); + fpu_vperm(7, 7, 7, CONST_PERM_LE2BE); + fpu_vperm(8, 8, 8, CONST_PERM_LE2BE); + /* + * Perform a GF(2) multiplication of the doublewords in V1 with + * the R1 and R2 reduction constants in V0. The intermediate + * result is then folded (accumulated) with the next data chunk + * in V5 and stored in V1. Repeat this step for the register + * contents in V2, V3, and V4 respectively. + */ + fpu_vgfmag(1, CONST_R2R1, 1, 5); + fpu_vgfmag(2, CONST_R2R1, 2, 6); + fpu_vgfmag(3, CONST_R2R1, 3, 7); + fpu_vgfmag(4, CONST_R2R1, 4, 8); + buf += 64; + size -= 64; + } + + /* + * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3 + * and R4 and accumulating the next 128-bit chunk until a single 128-bit + * value remains. + */ + fpu_vgfmag(1, CONST_R4R3, 1, 2); + fpu_vgfmag(1, CONST_R4R3, 1, 3); + fpu_vgfmag(1, CONST_R4R3, 1, 4); + + while (size >= 16) { + fpu_vl(2, buf); + fpu_vperm(2, 2, 2, CONST_PERM_LE2BE); + fpu_vgfmag(1, CONST_R4R3, 1, 2); + buf += 16; + size -= 16; + } + + /* + * Set up a vector register for byte shifts. The shift value must + * be loaded in bits 1-4 in byte element 7 of a vector register. + * Shift by 8 bytes: 0x40 + * Shift by 4 bytes: 0x20 + */ + fpu_vleib(9, 0x40, 7); + + /* + * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes + * to move R4 into the rightmost doubleword and set the leftmost + * doubleword to 0x1. 
+ */ + fpu_vsrlb(0, CONST_R4R3, 9); + fpu_vleig(0, 1, 0); + + /* + * Compute GF(2) product of V1 and V0. The rightmost doubleword + * of V1 is multiplied with R4. The leftmost doubleword of V1 is + * multiplied by 0x1 and is then XORed with rightmost product. + * Implicitly, the intermediate leftmost product becomes padded + */ + fpu_vgfmg(1, 0, 1); + + /* + * Now do the final 32-bit fold by multiplying the rightmost word + * in V1 with R5 and XOR the result with the remaining bits in V1. + * + * To achieve this by a single VGFMAG, right shift V1 by a word + * and store the result in V2 which is then accumulated. Use the + * vector unpack instruction to load the rightmost half of the + * doubleword into the rightmost doubleword element of V1; the other + * half is loaded in the leftmost doubleword. + * The vector register with CONST_R5 contains the R5 constant in the + * rightmost doubleword and the leftmost doubleword is zero to ignore + * the leftmost product of V1. + */ + fpu_vleib(9, 0x20, 7); /* Shift by words */ + fpu_vsrlb(2, 1, 9); /* Store remaining bits in V2 */ + fpu_vupllf(1, 1); /* Split rightmost doubleword */ + fpu_vgfmag(1, CONST_R5, 1, 2); /* V1 = (V1 * R5) XOR V2 */ + + /* + * Apply a Barret reduction to compute the final 32-bit CRC value. + * + * The input values to the Barret reduction are the degree-63 polynomial + * in V1 (R(x)), degree-32 generator polynomial, and the reduction + * constant u. The Barret reduction result is the CRC value of R(x) mod + * P(x). + * + * The Barret reduction algorithm is defined as: + * + * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u + * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) + * 3. C(x) = R(x) XOR T2(x) mod x^32 + * + * Note: The leftmost doubleword of vector register containing + * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product + * is zero and does not contribute to the final result. 
+ */ + + /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ + fpu_vupllf(2, 1); + fpu_vgfmg(2, CONST_RU_POLY, 2); + + /* + * Compute the GF(2) product of the CRC polynomial with T1(x) in + * V2 and XOR the intermediate result, T2(x), with the value in V1. + * The final result is stored in word element 2 of V2. + */ + fpu_vupllf(2, 2); + fpu_vgfmag(2, CONST_CRC_POLY, 2, 1); + + return fpu_vlgvf(2, 2); +} + +u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size) +{ + return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32_LE[0]); +} + +u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size) +{ + return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32C_LE[0]); +} diff --git a/lib/crc/sparc/crc32.h b/lib/crc/sparc/crc32.h new file mode 100644 index 000000000000..60f2765ac015 --- /dev/null +++ b/lib/crc/sparc/crc32.h @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* CRC32c (Castagnoli), sparc64 crc32c opcode accelerated + * + * This is based largely upon arch/x86/crypto/crc32c-intel.c + * + * Copyright (C) 2008 Intel Corporation + * Authors: Austin Zhang <austin_zhang@linux.intel.com> + * Kent Liu <kent.liu@intel.com> + */ + +#include <asm/pstate.h> +#include <asm/elf.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32c_opcode); + +#define crc32_le_arch crc32_le_base /* not implemented on this arch */ +#define crc32_be_arch crc32_be_base /* not implemented on this arch */ + +void crc32c_sparc64(u32 *crcp, const u64 *data, size_t len); + +static inline u32 crc32c_arch(u32 crc, const u8 *data, size_t len) +{ + size_t n = -(uintptr_t)data & 7; + + if (!static_branch_likely(&have_crc32c_opcode)) + return crc32c_base(crc, data, len); + + if (n) { + /* Data isn't 8-byte aligned. Align it. 
*/ + n = min(n, len); + crc = crc32c_base(crc, data, n); + data += n; + len -= n; + } + n = len & ~7U; + if (n) { + crc32c_sparc64(&crc, (const u64 *)data, n); + data += n; + len -= n; + } + if (len) + crc = crc32c_base(crc, data, len); + return crc; +} + +#define crc32_mod_init_arch crc32_mod_init_arch +static inline void crc32_mod_init_arch(void) +{ + unsigned long cfr; + + if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO)) + return; + + __asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr)); + if (!(cfr & CFR_CRC32C)) + return; + + static_branch_enable(&have_crc32c_opcode); + pr_info("Using sparc64 crc32c opcode optimized CRC32C implementation\n"); +} + +static inline u32 crc32_optimizations_arch(void) +{ + if (static_key_enabled(&have_crc32c_opcode)) + return CRC32C_OPTIMIZATION; + return 0; +} diff --git a/lib/crc/sparc/crc32c_asm.S b/lib/crc/sparc/crc32c_asm.S new file mode 100644 index 000000000000..4db873850f44 --- /dev/null +++ b/lib/crc/sparc/crc32c_asm.S @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> +#include <asm/opcodes.h> +#include <asm/visasm.h> +#include <asm/asi.h> + +ENTRY(crc32c_sparc64) + /* %o0=crc32p, %o1=data_ptr, %o2=len */ + VISEntryHalf + lda [%o0] ASI_PL, %f1 +1: ldd [%o1], %f2 + CRC32C(0,2,0) + subcc %o2, 8, %o2 + bne,pt %icc, 1b + add %o1, 0x8, %o1 + sta %f1, [%o0] ASI_PL + VISExitHalf +2: retl + nop +ENDPROC(crc32c_sparc64) diff --git a/lib/crc/tests/Makefile b/lib/crc/tests/Makefile new file mode 100644 index 000000000000..65f63c318ef5 --- /dev/null +++ b/lib/crc/tests/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_CRC_KUNIT_TEST) += crc_kunit.o diff --git a/lib/crc/tests/crc_kunit.c b/lib/crc/tests/crc_kunit.c new file mode 100644 index 000000000000..f08d985d8860 --- /dev/null +++ b/lib/crc/tests/crc_kunit.c @@ -0,0 +1,452 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Unit tests and benchmarks for the CRC library functions + * + * Copyright 2024 Google LLC + 
* + * Author: Eric Biggers <ebiggers@google.com> + */ +#include <kunit/test.h> +#include <linux/crc7.h> +#include <linux/crc16.h> +#include <linux/crc-t10dif.h> +#include <linux/crc32.h> +#include <linux/crc32c.h> +#include <linux/crc64.h> +#include <linux/prandom.h> +#include <linux/vmalloc.h> + +#define CRC_KUNIT_SEED 42 +#define CRC_KUNIT_MAX_LEN 16384 +#define CRC_KUNIT_NUM_TEST_ITERS 1000 + +static struct rnd_state rng; +static u8 *test_buffer; +static size_t test_buflen; + +/** + * struct crc_variant - describes a CRC variant + * @bits: Number of bits in the CRC, 1 <= @bits <= 64. + * @le: true if it's a "little endian" CRC (reversed mapping between bits and + * polynomial coefficients in each byte), false if it's a "big endian" CRC + * (natural mapping between bits and polynomial coefficients in each byte) + * @poly: The generator polynomial with the highest-order term omitted. + * Bit-reversed if @le is true. + * @func: The function to compute a CRC. The type signature uses u64 so that it + * can fit any CRC up to CRC-64. The CRC is passed in, and is expected + * to be returned in, the least significant bits of the u64. The + * function is expected to *not* invert the CRC at the beginning and end. + */ +struct crc_variant { + int bits; + bool le; + u64 poly; + u64 (*func)(u64 crc, const u8 *p, size_t len); +}; + +static u32 rand32(void) +{ + return prandom_u32_state(&rng); +} + +static u64 rand64(void) +{ + u32 n = rand32(); + + return ((u64)n << 32) | rand32(); +} + +static u64 crc_mask(const struct crc_variant *v) +{ + return (u64)-1 >> (64 - v->bits); +} + +/* Reference implementation of any CRC variant */ +static u64 crc_ref(const struct crc_variant *v, + u64 crc, const u8 *p, size_t len) +{ + size_t i, j; + + for (i = 0; i < len; i++) { + for (j = 0; j < 8; j++) { + if (v->le) { + crc ^= (p[i] >> j) & 1; + crc = (crc >> 1) ^ ((crc & 1) ? 
v->poly : 0); + } else { + crc ^= (u64)((p[i] >> (7 - j)) & 1) << + (v->bits - 1); + if (crc & (1ULL << (v->bits - 1))) + crc = ((crc << 1) ^ v->poly) & + crc_mask(v); + else + crc <<= 1; + } + } + } + return crc; +} + +static int crc_suite_init(struct kunit_suite *suite) +{ + /* + * Allocate the test buffer using vmalloc() with a page-aligned length + * so that it is immediately followed by a guard page. This allows + * buffer overreads to be detected, even in assembly code. + */ + test_buflen = round_up(CRC_KUNIT_MAX_LEN, PAGE_SIZE); + test_buffer = vmalloc(test_buflen); + if (!test_buffer) + return -ENOMEM; + + prandom_seed_state(&rng, CRC_KUNIT_SEED); + prandom_bytes_state(&rng, test_buffer, test_buflen); + return 0; +} + +static void crc_suite_exit(struct kunit_suite *suite) +{ + vfree(test_buffer); + test_buffer = NULL; +} + +/* Generate a random initial CRC. */ +static u64 generate_random_initial_crc(const struct crc_variant *v) +{ + switch (rand32() % 4) { + case 0: + return 0; + case 1: + return crc_mask(v); /* All 1 bits */ + default: + return rand64() & crc_mask(v); + } +} + +/* Generate a random length, preferring small lengths. */ +static size_t generate_random_length(size_t max_length) +{ + size_t len; + + switch (rand32() % 3) { + case 0: + len = rand32() % 128; + break; + case 1: + len = rand32() % 3072; + break; + default: + len = rand32(); + break; + } + return len % (max_length + 1); +} + +/* Test that v->func gives the same CRCs as a reference implementation. */ +static void crc_test(struct kunit *test, const struct crc_variant *v) +{ + size_t i; + + for (i = 0; i < CRC_KUNIT_NUM_TEST_ITERS; i++) { + u64 init_crc, expected_crc, actual_crc; + size_t len, offset; + bool nosimd; + + init_crc = generate_random_initial_crc(v); + len = generate_random_length(CRC_KUNIT_MAX_LEN); + + /* Generate a random offset. 
*/ + if (rand32() % 2 == 0) { + /* Use a random alignment mod 64 */ + offset = rand32() % 64; + offset = min(offset, CRC_KUNIT_MAX_LEN - len); + } else { + /* Go up to the guard page, to catch buffer overreads */ + offset = test_buflen - len; + } + + if (rand32() % 8 == 0) + /* Refresh the data occasionally. */ + prandom_bytes_state(&rng, &test_buffer[offset], len); + + nosimd = rand32() % 8 == 0; + + /* + * Compute the CRC, and verify that it equals the CRC computed + * by a simple bit-at-a-time reference implementation. + */ + expected_crc = crc_ref(v, init_crc, &test_buffer[offset], len); + if (nosimd) + local_irq_disable(); + actual_crc = v->func(init_crc, &test_buffer[offset], len); + if (nosimd) + local_irq_enable(); + KUNIT_EXPECT_EQ_MSG(test, expected_crc, actual_crc, + "Wrong result with len=%zu offset=%zu nosimd=%d", + len, offset, nosimd); + } +} + +static __always_inline void +crc_benchmark(struct kunit *test, + u64 (*crc_func)(u64 crc, const u8 *p, size_t len)) +{ + static const size_t lens_to_test[] = { + 1, 16, 64, 127, 128, 200, 256, 511, 512, 1024, 3173, 4096, 16384, + }; + size_t len, i, j, num_iters; + /* + * The CRC value that this function computes in a series of calls to + * crc_func is never actually used, so use volatile to ensure that the + * computations are done as intended and don't all get optimized out. 
+ */ + volatile u64 crc = 0; + u64 t; + + if (!IS_ENABLED(CONFIG_CRC_BENCHMARK)) + kunit_skip(test, "not enabled"); + + /* warm-up */ + for (i = 0; i < 10000000; i += CRC_KUNIT_MAX_LEN) + crc = crc_func(crc, test_buffer, CRC_KUNIT_MAX_LEN); + + for (i = 0; i < ARRAY_SIZE(lens_to_test); i++) { + len = lens_to_test[i]; + KUNIT_ASSERT_LE(test, len, CRC_KUNIT_MAX_LEN); + num_iters = 10000000 / (len + 128); + preempt_disable(); + t = ktime_get_ns(); + for (j = 0; j < num_iters; j++) + crc = crc_func(crc, test_buffer, len); + t = ktime_get_ns() - t; + preempt_enable(); + kunit_info(test, "len=%zu: %llu MB/s\n", + len, div64_u64((u64)len * num_iters * 1000, t)); + } +} + +/* crc7_be */ + +static u64 crc7_be_wrapper(u64 crc, const u8 *p, size_t len) +{ + /* + * crc7_be() left-aligns the 7-bit CRC in a u8, whereas the test wants a + * right-aligned CRC (in a u64). Convert between the conventions. + */ + return crc7_be(crc << 1, p, len) >> 1; +} + +static const struct crc_variant crc_variant_crc7_be = { + .bits = 7, + .poly = 0x9, + .func = crc7_be_wrapper, +}; + +static void crc7_be_test(struct kunit *test) +{ + crc_test(test, &crc_variant_crc7_be); +} + +static void crc7_be_benchmark(struct kunit *test) +{ + crc_benchmark(test, crc7_be_wrapper); +} + +/* crc16 */ + +static u64 crc16_wrapper(u64 crc, const u8 *p, size_t len) +{ + return crc16(crc, p, len); +} + +static const struct crc_variant crc_variant_crc16 = { + .bits = 16, + .le = true, + .poly = 0xa001, + .func = crc16_wrapper, +}; + +static void crc16_test(struct kunit *test) +{ + crc_test(test, &crc_variant_crc16); +} + +static void crc16_benchmark(struct kunit *test) +{ + crc_benchmark(test, crc16_wrapper); +} + +/* crc_t10dif */ + +static u64 crc_t10dif_wrapper(u64 crc, const u8 *p, size_t len) +{ + return crc_t10dif_update(crc, p, len); +} + +static const struct crc_variant crc_variant_crc_t10dif = { + .bits = 16, + .le = false, + .poly = 0x8bb7, + .func = crc_t10dif_wrapper, +}; + +static void 
crc_t10dif_test(struct kunit *test) +{ + crc_test(test, &crc_variant_crc_t10dif); +} + +static void crc_t10dif_benchmark(struct kunit *test) +{ + crc_benchmark(test, crc_t10dif_wrapper); +} + +/* crc32_le */ + +static u64 crc32_le_wrapper(u64 crc, const u8 *p, size_t len) +{ + return crc32_le(crc, p, len); +} + +static const struct crc_variant crc_variant_crc32_le = { + .bits = 32, + .le = true, + .poly = 0xedb88320, + .func = crc32_le_wrapper, +}; + +static void crc32_le_test(struct kunit *test) +{ + crc_test(test, &crc_variant_crc32_le); +} + +static void crc32_le_benchmark(struct kunit *test) +{ + crc_benchmark(test, crc32_le_wrapper); +} + +/* crc32_be */ + +static u64 crc32_be_wrapper(u64 crc, const u8 *p, size_t len) +{ + return crc32_be(crc, p, len); +} + +static const struct crc_variant crc_variant_crc32_be = { + .bits = 32, + .le = false, + .poly = 0x04c11db7, + .func = crc32_be_wrapper, +}; + +static void crc32_be_test(struct kunit *test) +{ + crc_test(test, &crc_variant_crc32_be); +} + +static void crc32_be_benchmark(struct kunit *test) +{ + crc_benchmark(test, crc32_be_wrapper); +} + +/* crc32c */ + +static u64 crc32c_wrapper(u64 crc, const u8 *p, size_t len) +{ + return crc32c(crc, p, len); +} + +static const struct crc_variant crc_variant_crc32c = { + .bits = 32, + .le = true, + .poly = 0x82f63b78, + .func = crc32c_wrapper, +}; + +static void crc32c_test(struct kunit *test) +{ + crc_test(test, &crc_variant_crc32c); +} + +static void crc32c_benchmark(struct kunit *test) +{ + crc_benchmark(test, crc32c_wrapper); +} + +/* crc64_be */ + +static u64 crc64_be_wrapper(u64 crc, const u8 *p, size_t len) +{ + return crc64_be(crc, p, len); +} + +static const struct crc_variant crc_variant_crc64_be = { + .bits = 64, + .le = false, + .poly = 0x42f0e1eba9ea3693, + .func = crc64_be_wrapper, +}; + +static void crc64_be_test(struct kunit *test) +{ + crc_test(test, &crc_variant_crc64_be); +} + +static void crc64_be_benchmark(struct kunit *test) +{ + crc_benchmark(test, 
crc64_be_wrapper); +} + +/* crc64_nvme */ + +static u64 crc64_nvme_wrapper(u64 crc, const u8 *p, size_t len) +{ + /* The inversions that crc64_nvme() does have to be undone here. */ + return ~crc64_nvme(~crc, p, len); +} + +static const struct crc_variant crc_variant_crc64_nvme = { + .bits = 64, + .le = true, + .poly = 0x9a6c9329ac4bc9b5, + .func = crc64_nvme_wrapper, +}; + +static void crc64_nvme_test(struct kunit *test) +{ + crc_test(test, &crc_variant_crc64_nvme); +} + +static void crc64_nvme_benchmark(struct kunit *test) +{ + crc_benchmark(test, crc64_nvme_wrapper); +} + +static struct kunit_case crc_test_cases[] = { + KUNIT_CASE(crc7_be_test), + KUNIT_CASE(crc7_be_benchmark), + KUNIT_CASE(crc16_test), + KUNIT_CASE(crc16_benchmark), + KUNIT_CASE(crc_t10dif_test), + KUNIT_CASE(crc_t10dif_benchmark), + KUNIT_CASE(crc32_le_test), + KUNIT_CASE(crc32_le_benchmark), + KUNIT_CASE(crc32_be_test), + KUNIT_CASE(crc32_be_benchmark), + KUNIT_CASE(crc32c_test), + KUNIT_CASE(crc32c_benchmark), + KUNIT_CASE(crc64_be_test), + KUNIT_CASE(crc64_be_benchmark), + KUNIT_CASE(crc64_nvme_test), + KUNIT_CASE(crc64_nvme_benchmark), + {}, +}; + +static struct kunit_suite crc_test_suite = { + .name = "crc", + .test_cases = crc_test_cases, + .suite_init = crc_suite_init, + .suite_exit = crc_suite_exit, +}; +kunit_test_suite(crc_test_suite); + +MODULE_DESCRIPTION("Unit tests and benchmarks for the CRC library functions"); +MODULE_LICENSE("GPL"); diff --git a/lib/crc/x86/crc-pclmul-consts.h b/lib/crc/x86/crc-pclmul-consts.h new file mode 100644 index 000000000000..6ae94158fca2 --- /dev/null +++ b/lib/crc/x86/crc-pclmul-consts.h @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * CRC constants generated by: + * + * ./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc32_lsb_0x82f63b78,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5 + * + * Do not edit manually. 
+ */ + +/* + * CRC folding constants generated for most-significant-bit-first CRC-16 using + * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 + */ +static const struct { + u8 bswap_mask[16]; + u64 fold_across_2048_bits_consts[2]; + u64 fold_across_1024_bits_consts[2]; + u64 fold_across_512_bits_consts[2]; + u64 fold_across_256_bits_consts[2]; + u64 fold_across_128_bits_consts[2]; + u8 shuf_table[48]; + u64 barrett_reduction_consts[2]; +} crc16_msb_0x8bb7_consts ____cacheline_aligned __maybe_unused = { + .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, + .fold_across_2048_bits_consts = { + 0xdccf000000000000, /* LO64_TERMS: (x^2000 mod G) * x^48 */ + 0x4b0b000000000000, /* HI64_TERMS: (x^2064 mod G) * x^48 */ + }, + .fold_across_1024_bits_consts = { + 0x9d9d000000000000, /* LO64_TERMS: (x^976 mod G) * x^48 */ + 0x7cf5000000000000, /* HI64_TERMS: (x^1040 mod G) * x^48 */ + }, + .fold_across_512_bits_consts = { + 0x044c000000000000, /* LO64_TERMS: (x^464 mod G) * x^48 */ + 0xe658000000000000, /* HI64_TERMS: (x^528 mod G) * x^48 */ + }, + .fold_across_256_bits_consts = { + 0x6ee3000000000000, /* LO64_TERMS: (x^208 mod G) * x^48 */ + 0xe7b5000000000000, /* HI64_TERMS: (x^272 mod G) * x^48 */ + }, + .fold_across_128_bits_consts = { + 0x2d56000000000000, /* LO64_TERMS: (x^80 mod G) * x^48 */ + 0x06df000000000000, /* HI64_TERMS: (x^144 mod G) * x^48 */ + }, + .shuf_table = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + }, + .barrett_reduction_consts = { + 0x8bb7000000000000, /* LO64_TERMS: (G - x^16) * x^48 */ + 0xf65a57f81d33a48a, /* HI64_TERMS: (floor(x^79 / G) * x) - x^64 */ + }, +}; + +/* + * CRC folding constants generated for least-significant-bit-first CRC-32 using + * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 + + * x^5 + x^4 + x^2 + x^1 + x^0 + 
*/ +static const struct { + u64 fold_across_2048_bits_consts[2]; + u64 fold_across_1024_bits_consts[2]; + u64 fold_across_512_bits_consts[2]; + u64 fold_across_256_bits_consts[2]; + u64 fold_across_128_bits_consts[2]; + u8 shuf_table[48]; + u64 barrett_reduction_consts[2]; +} crc32_lsb_0xedb88320_consts ____cacheline_aligned __maybe_unused = { + .fold_across_2048_bits_consts = { + 0x00000000ce3371cb, /* HI64_TERMS: (x^2079 mod G) * x^32 */ + 0x00000000e95c1271, /* LO64_TERMS: (x^2015 mod G) * x^32 */ + }, + .fold_across_1024_bits_consts = { + 0x0000000033fff533, /* HI64_TERMS: (x^1055 mod G) * x^32 */ + 0x00000000910eeec1, /* LO64_TERMS: (x^991 mod G) * x^32 */ + }, + .fold_across_512_bits_consts = { + 0x000000008f352d95, /* HI64_TERMS: (x^543 mod G) * x^32 */ + 0x000000001d9513d7, /* LO64_TERMS: (x^479 mod G) * x^32 */ + }, + .fold_across_256_bits_consts = { + 0x00000000f1da05aa, /* HI64_TERMS: (x^287 mod G) * x^32 */ + 0x0000000081256527, /* LO64_TERMS: (x^223 mod G) * x^32 */ + }, + .fold_across_128_bits_consts = { + 0x00000000ae689191, /* HI64_TERMS: (x^159 mod G) * x^32 */ + 0x00000000ccaa009e, /* LO64_TERMS: (x^95 mod G) * x^32 */ + }, + .shuf_table = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + }, + .barrett_reduction_consts = { + 0xb4e5b025f7011641, /* HI64_TERMS: floor(x^95 / G) */ + 0x00000001db710640, /* LO64_TERMS: (G - x^32) * x^31 */ + }, +}; + +/* + * CRC folding constants generated for least-significant-bit-first CRC-32 using + * G(x) = x^32 + x^28 + x^27 + x^26 + x^25 + x^23 + x^22 + x^20 + x^19 + x^18 + + * x^14 + x^13 + x^11 + x^10 + x^9 + x^8 + x^6 + x^0 + */ +static const struct { + u64 fold_across_2048_bits_consts[2]; + u64 fold_across_1024_bits_consts[2]; + u64 fold_across_512_bits_consts[2]; + u64 fold_across_256_bits_consts[2]; + u64 fold_across_128_bits_consts[2]; + u8 shuf_table[48]; + u64 
barrett_reduction_consts[2]; +} crc32_lsb_0x82f63b78_consts ____cacheline_aligned __maybe_unused = { + .fold_across_2048_bits_consts = { + 0x00000000dcb17aa4, /* HI64_TERMS: (x^2079 mod G) * x^32 */ + 0x00000000b9e02b86, /* LO64_TERMS: (x^2015 mod G) * x^32 */ + }, + .fold_across_1024_bits_consts = { + 0x000000006992cea2, /* HI64_TERMS: (x^1055 mod G) * x^32 */ + 0x000000000d3b6092, /* LO64_TERMS: (x^991 mod G) * x^32 */ + }, + .fold_across_512_bits_consts = { + 0x00000000740eef02, /* HI64_TERMS: (x^543 mod G) * x^32 */ + 0x000000009e4addf8, /* LO64_TERMS: (x^479 mod G) * x^32 */ + }, + .fold_across_256_bits_consts = { + 0x000000003da6d0cb, /* HI64_TERMS: (x^287 mod G) * x^32 */ + 0x00000000ba4fc28e, /* LO64_TERMS: (x^223 mod G) * x^32 */ + }, + .fold_across_128_bits_consts = { + 0x00000000f20c0dfe, /* HI64_TERMS: (x^159 mod G) * x^32 */ + 0x00000000493c7d27, /* LO64_TERMS: (x^95 mod G) * x^32 */ + }, + .shuf_table = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + }, + .barrett_reduction_consts = { + 0x4869ec38dea713f1, /* HI64_TERMS: floor(x^95 / G) */ + 0x0000000105ec76f0, /* LO64_TERMS: (G - x^32) * x^31 */ + }, +}; + +/* + * CRC folding constants generated for most-significant-bit-first CRC-64 using + * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + + * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + + * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + + * x^7 + x^4 + x^1 + x^0 + */ +static const struct { + u8 bswap_mask[16]; + u64 fold_across_2048_bits_consts[2]; + u64 fold_across_1024_bits_consts[2]; + u64 fold_across_512_bits_consts[2]; + u64 fold_across_256_bits_consts[2]; + u64 fold_across_128_bits_consts[2]; + u8 shuf_table[48]; + u64 barrett_reduction_consts[2]; +} crc64_msb_0x42f0e1eba9ea3693_consts ____cacheline_aligned __maybe_unused = { + 
.bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, + .fold_across_2048_bits_consts = { + 0x7f52691a60ddc70d, /* LO64_TERMS: (x^2048 mod G) * x^0 */ + 0x7036b0389f6a0c82, /* HI64_TERMS: (x^2112 mod G) * x^0 */ + }, + .fold_across_1024_bits_consts = { + 0x05cf79dea9ac37d6, /* LO64_TERMS: (x^1024 mod G) * x^0 */ + 0x001067e571d7d5c2, /* HI64_TERMS: (x^1088 mod G) * x^0 */ + }, + .fold_across_512_bits_consts = { + 0x5f6843ca540df020, /* LO64_TERMS: (x^512 mod G) * x^0 */ + 0xddf4b6981205b83f, /* HI64_TERMS: (x^576 mod G) * x^0 */ + }, + .fold_across_256_bits_consts = { + 0x571bee0a227ef92b, /* LO64_TERMS: (x^256 mod G) * x^0 */ + 0x44bef2a201b5200c, /* HI64_TERMS: (x^320 mod G) * x^0 */ + }, + .fold_across_128_bits_consts = { + 0x05f5c3c7eb52fab6, /* LO64_TERMS: (x^128 mod G) * x^0 */ + 0x4eb938a7d257740e, /* HI64_TERMS: (x^192 mod G) * x^0 */ + }, + .shuf_table = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + }, + .barrett_reduction_consts = { + 0x42f0e1eba9ea3693, /* LO64_TERMS: (G - x^64) * x^0 */ + 0x578d29d06cc4f872, /* HI64_TERMS: (floor(x^127 / G) * x) - x^64 */ + }, +}; + +/* + * CRC folding constants generated for least-significant-bit-first CRC-64 using + * G(x) = x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 + + * x^47 + x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 + + * x^26 + x^23 + x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 + + * x^4 + x^3 + x^0 + */ +static const struct { + u64 fold_across_2048_bits_consts[2]; + u64 fold_across_1024_bits_consts[2]; + u64 fold_across_512_bits_consts[2]; + u64 fold_across_256_bits_consts[2]; + u64 fold_across_128_bits_consts[2]; + u8 shuf_table[48]; + u64 barrett_reduction_consts[2]; +} crc64_lsb_0x9a6c9329ac4bc9b5_consts ____cacheline_aligned __maybe_unused = { + .fold_across_2048_bits_consts = { + 
0x37ccd3e14069cabc, /* HI64_TERMS: (x^2111 mod G) * x^0 */ + 0xa043808c0f782663, /* LO64_TERMS: (x^2047 mod G) * x^0 */ + }, + .fold_across_1024_bits_consts = { + 0xa1ca681e733f9c40, /* HI64_TERMS: (x^1087 mod G) * x^0 */ + 0x5f852fb61e8d92dc, /* LO64_TERMS: (x^1023 mod G) * x^0 */ + }, + .fold_across_512_bits_consts = { + 0x0c32cdb31e18a84a, /* HI64_TERMS: (x^575 mod G) * x^0 */ + 0x62242240ace5045a, /* LO64_TERMS: (x^511 mod G) * x^0 */ + }, + .fold_across_256_bits_consts = { + 0xb0bc2e589204f500, /* HI64_TERMS: (x^319 mod G) * x^0 */ + 0xe1e0bb9d45d7a44c, /* LO64_TERMS: (x^255 mod G) * x^0 */ + }, + .fold_across_128_bits_consts = { + 0xeadc41fd2ba3d420, /* HI64_TERMS: (x^191 mod G) * x^0 */ + 0x21e9761e252621ac, /* LO64_TERMS: (x^127 mod G) * x^0 */ + }, + .shuf_table = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + }, + .barrett_reduction_consts = { + 0x27ecfa329aef9f77, /* HI64_TERMS: floor(x^127 / G) */ + 0x34d926535897936a, /* LO64_TERMS: (G - x^64 - x^0) / x */ + }, +}; diff --git a/lib/crc/x86/crc-pclmul-template.S b/lib/crc/x86/crc-pclmul-template.S new file mode 100644 index 000000000000..a02f7dc8053e --- /dev/null +++ b/lib/crc/x86/crc-pclmul-template.S @@ -0,0 +1,575 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +// +// Template to generate [V]PCLMULQDQ-based CRC functions for x86 +// +// Copyright 2025 Google LLC +// +// Author: Eric Biggers <ebiggers@google.com> + +#include <linux/linkage.h> +#include <linux/objtool.h> + +// Offsets within the generated constants table +.set OFFSETOF_BSWAP_MASK, -5*16 // msb-first CRCs only +.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS, -4*16 // must precede next +.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, -3*16 // must precede next +.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS, -2*16 // must precede next +.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS, -1*16 // must precede 
next +.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS, 0*16 // must be 0 +.set OFFSETOF_SHUF_TABLE, 1*16 +.set OFFSETOF_BARRETT_REDUCTION_CONSTS, 4*16 + +// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the +// corresponding non-VEX instruction plus any needed moves. The supported +// instruction formats are: +// +// - Two-arg [src, dst], where the non-VEX format is the same. +// - Three-arg [src1, src2, dst] where the non-VEX format is +// [src1, src2_and_dst]. If src2 != dst, then src1 must != dst too. +// +// \insn gives the instruction without a "v" prefix and including any immediate +// argument if needed to make the instruction follow one of the above formats. +// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to +// it first; this is needed when \arg1 is an unaligned mem operand. +.macro _cond_vex insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp +.if AVX_LEVEL == 0 + // VEX not allowed. Emulate it. + .ifnb \arg3 // Three-arg [src1, src2, dst] + .ifc "\arg2", "\arg3" // src2 == dst? + .ifnb \unaligned_mem_tmp + movdqu \arg1, \unaligned_mem_tmp + \insn \unaligned_mem_tmp, \arg3 + .else + \insn \arg1, \arg3 + .endif + .else // src2 != dst + .ifc "\arg1", "\arg3" + .error "Can't have src1 == dst when src2 != dst" + .endif + .ifnb \unaligned_mem_tmp + movdqu \arg1, \unaligned_mem_tmp + movdqa \arg2, \arg3 + \insn \unaligned_mem_tmp, \arg3 + .else + movdqa \arg2, \arg3 + \insn \arg1, \arg3 + .endif + .endif + .else // Two-arg [src, dst] + .ifnb \unaligned_mem_tmp + movdqu \arg1, \unaligned_mem_tmp + \insn \unaligned_mem_tmp, \arg2 + .else + \insn \arg1, \arg2 + .endif + .endif +.else + // VEX is allowed. Emit the desired instruction directly. + .ifnb \arg3 + v\insn \arg1, \arg2, \arg3 + .else + v\insn \arg1, \arg2 + .endif +.endif +.endm + +// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector +// register of length VL. 
+.macro _vbroadcast src, dst +.if VL == 16 + _cond_vex movdqa, \src, \dst +.elseif VL == 32 + vbroadcasti128 \src, \dst +.else + vbroadcasti32x4 \src, \dst +.endif +.endm + +// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC +// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane. +.macro _load_data vl, src, bswap_mask, dst +.if \vl < 64 + _cond_vex movdqu, "\src", \dst +.else + vmovdqu8 \src, \dst +.endif +.if !LSB_CRC + _cond_vex pshufb, \bswap_mask, \dst, \dst +.endif +.endm + +.macro _prepare_v0 vl, v0, v1, bswap_mask +.if LSB_CRC + .if \vl < 64 + _cond_vex pxor, (BUF), \v0, \v0, unaligned_mem_tmp=\v1 + .else + vpxorq (BUF), \v0, \v0 + .endif +.else + _load_data \vl, (BUF), \bswap_mask, \v1 + .if \vl < 64 + _cond_vex pxor, \v1, \v0, \v0 + .else + vpxorq \v1, \v0, \v0 + .endif +.endif +.endm + +// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for +// msb-first order or the physically high qword for lsb-first order +#define LO64_TERMS 0 + +// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high +// qword for msb-first order or the physically low qword for lsb-first order +#define HI64_TERMS 1 + +// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given +// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst. +.macro _pclmulqdq src1, src1_terms, src2, src2_terms, dst + _cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \ + \src1, \src2, \dst +.endm + +// Fold \acc into \data and store the result back into \acc. \data can be an +// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no +// byte-reflection is needed; otherwise it must be a vector register. \consts +// is a vector register containing the needed fold constants, and \tmp is a +// temporary vector register. All arguments must be the same length. 
+.macro _fold_vec acc, data, consts, tmp + _pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp + _pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc +.if AVX_LEVEL <= 2 + _cond_vex pxor, \data, \tmp, \tmp + _cond_vex pxor, \tmp, \acc, \acc +.else + vpternlogq $0x96, \data, \tmp, \acc +.endif +.endm + +// Fold \acc into \data and store the result back into \acc. \data is an +// unaligned mem operand, \consts is a vector register containing the needed +// fold constants, \bswap_mask is a vector register containing the +// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are +// temporary vector registers. All arguments must have length \vl. +.macro _fold_vec_mem vl, acc, data, consts, bswap_mask, tmp1, tmp2 +.if AVX_LEVEL == 0 || !LSB_CRC + _load_data \vl, \data, \bswap_mask, \tmp1 + _fold_vec \acc, \tmp1, \consts, \tmp2 +.else + _fold_vec \acc, \data, \consts, \tmp1 +.endif +.endm + +// Load the constants for folding across 2**i vectors of length VL at a time +// into all 128-bit lanes of the vector register CONSTS. +.macro _load_vec_folding_consts i + _vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \ + CONSTS +.endm + +// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store +// the result back into \v0. If the remaining length mod \vl is nonzero, also +// fold \vl data bytes from BUF. For both operations the fold distance is \vl. +// \consts must be a register of length \vl containing the fold constants. 
+.macro _fold_vec_final vl, v0, v1, consts, bswap_mask, tmp1, tmp2 + _fold_vec \v0, \v1, \consts, \tmp1 + test $\vl, LEN8 + jz .Lfold_vec_final_done\@ + _fold_vec_mem \vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2 + add $\vl, BUF +.Lfold_vec_final_done\@: +.endm + +// This macro generates the body of a CRC function with the following prototype: +// +// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts); +// +// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it. +// |buf| is the data to checksum. |len| is the data length in bytes, which must +// be at least 16. |consts| is a pointer to the fold_across_128_bits_consts +// field of the constants struct that was generated for the chosen CRC variant. +// +// Moving on to the macro parameters, \n is the number of bits in the CRC, e.g. +// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If +// the file is compiled in i386 mode, then the maximum supported value is 32. +// +// \lsb_crc is 1 if the CRC processes the least significant bit of each byte +// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0. \lsb_crc is 0 +// if the CRC processes the most significant bit of each byte first, i.e. maps +// bit0 to x^0, bit1 to x^1, ..., bit7 to x^7. +// +// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64. +// +// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or +// 512 for AVX512. +// +// If \vl == 16 && \avx_level == 0, the generated code requires: +// PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.) +// +// If \vl == 32 && \avx_level == 2, the generated code requires: +// VPCLMULQDQ && AVX2. +// +// If \vl == 64 && \avx_level == 512, the generated code requires: +// VPCLMULQDQ && AVX512BW && AVX512VL. +// +// Other \vl and \avx_level combinations are either not supported or not useful.
+.macro _crc_pclmul n, lsb_crc, vl, avx_level + .set LSB_CRC, \lsb_crc + .set VL, \vl + .set AVX_LEVEL, \avx_level + + // Define aliases for the xmm, ymm, or zmm registers according to VL. +.irp i, 0,1,2,3,4,5,6,7 + .if VL == 16 + .set V\i, %xmm\i + .set LOG2_VL, 4 + .elseif VL == 32 + .set V\i, %ymm\i + .set LOG2_VL, 5 + .elseif VL == 64 + .set V\i, %zmm\i + .set LOG2_VL, 6 + .else + .error "Unsupported vector length" + .endif +.endr + // Define aliases for the function parameters. + // Note: when crc_t is shorter than u32, zero-extension to 32 bits is + // guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed + // when crc_t is shorter than u64. +#ifdef __x86_64__ +.if \n <= 32 + .set CRC, %edi +.else + .set CRC, %rdi +.endif + .set BUF, %rsi + .set LEN, %rdx + .set LEN32, %edx + .set LEN8, %dl + .set CONSTS_PTR, %rcx +#else + // 32-bit support, assuming -mregparm=3 and not including support for + // CRC-64 (which would use both eax and edx to pass the crc parameter). + .set CRC, %eax + .set BUF, %edx + .set LEN, %ecx + .set LEN32, %ecx + .set LEN8, %cl + .set CONSTS_PTR, %ebx // Passed on stack +#endif + + // Define aliases for some local variables. V0-V5 are used without + // aliases (for accumulators, data, temporary values, etc). Staying + // within the first 8 vector registers keeps the code 32-bit SSE + // compatible and reduces the size of 64-bit SSE code slightly. + .set BSWAP_MASK, V6 + .set BSWAP_MASK_YMM, %ymm6 + .set BSWAP_MASK_XMM, %xmm6 + .set CONSTS, V7 + .set CONSTS_YMM, %ymm7 + .set CONSTS_XMM, %xmm7 + + // Use ANNOTATE_NOENDBR to suppress an objtool warning, since the + // functions generated by this macro are called only by static_call. + ANNOTATE_NOENDBR + +#ifdef __i386__ + push CONSTS_PTR + mov 8(%esp), CONSTS_PTR +#endif + + // Create a 128-bit vector that contains the initial CRC in the end + // representing the high-order polynomial coefficients, and the rest 0. 
+ // If the CRC is msb-first, also load the byte-reflection table. +.if \n <= 32 + _cond_vex movd, CRC, %xmm0 +.else + _cond_vex movq, CRC, %xmm0 +.endif +.if !LSB_CRC + _cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0 + _vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK +.endif + + // Load the first vector of data and XOR the initial CRC into the + // appropriate end of the first 128-bit lane of data. If LEN < VL, then + // use a short vector and jump ahead to the final reduction. (LEN >= 16 + // is guaranteed here but not necessarily LEN >= VL.) +.if VL >= 32 + cmp $VL, LEN + jae .Lat_least_1vec\@ + .if VL == 64 + cmp $32, LEN32 + jb .Lless_than_32bytes\@ + _prepare_v0 32, %ymm0, %ymm1, BSWAP_MASK_YMM + add $32, BUF + jmp .Lreduce_256bits_to_128bits\@ +.Lless_than_32bytes\@: + .endif + _prepare_v0 16, %xmm0, %xmm1, BSWAP_MASK_XMM + add $16, BUF + vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM + jmp .Lcheck_for_partial_block\@ +.Lat_least_1vec\@: +.endif + _prepare_v0 VL, V0, V1, BSWAP_MASK + + // Handle VL <= LEN < 4*VL. + cmp $4*VL-1, LEN + ja .Lat_least_4vecs\@ + add $VL, BUF + // If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector. + // If VL==16 then load fold_across_128_bits_consts first, as the final + // reduction depends on it and it won't be loaded anywhere else. + cmp $2*VL-1, LEN32 +.if VL == 16 + _cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM +.endif + jbe .Lreduce_1vec_to_128bits\@ + // Otherwise 2*VL <= LEN < 4*VL. Load one more vector and jump ahead to + // the reduction from 2 vectors. + _load_data VL, (BUF), BSWAP_MASK, V1 + add $VL, BUF + jmp .Lreduce_2vecs_to_1\@ + +.Lat_least_4vecs\@: + // Load 3 more vectors of data. 
+ _load_data VL, 1*VL(BUF), BSWAP_MASK, V1 + _load_data VL, 2*VL(BUF), BSWAP_MASK, V2 + _load_data VL, 3*VL(BUF), BSWAP_MASK, V3 + sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32 + add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32 + + // Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next + // 4 vectors of data and write the result back to V0-V3. + cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32 + jbe .Lreduce_4vecs_to_2\@ + _load_vec_folding_consts 2 +.Lfold_4vecs_loop\@: + _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 + _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 + _fold_vec_mem VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 + _fold_vec_mem VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 + sub $-4*VL, BUF + add $-4*VL, LEN + cmp $4*VL-1, LEN + ja .Lfold_4vecs_loop\@ + + // Fold V0,V1 into V2,V3 and write the result back to V0,V1. Then fold + // two more vectors of data from BUF, if at least that much remains. +.Lreduce_4vecs_to_2\@: + _load_vec_folding_consts 1 + _fold_vec V0, V2, CONSTS, V4 + _fold_vec V1, V3, CONSTS, V4 + test $2*VL, LEN8 + jz .Lreduce_2vecs_to_1\@ + _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 + _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 + sub $-2*VL, BUF + + // Fold V0 into V1 and write the result back to V0. Then fold one more + // vector of data from BUF, if at least that much remains. +.Lreduce_2vecs_to_1\@: + _load_vec_folding_consts 0 + _fold_vec_final VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5 + +.Lreduce_1vec_to_128bits\@: +.if VL == 64 + // Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of + // data from BUF, if at least that much remains. + vbroadcasti128 OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM + vextracti64x4 $1, %zmm0, %ymm1 + _fold_vec_final 32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5 +.Lreduce_256bits_to_128bits\@: +.endif +.if VL >= 32 + // Reduce 256-bit %ymm0 to 128-bit %xmm0. 
Then fold 128 more bits of + // data from BUF, if at least that much remains. + vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM + vextracti128 $1, %ymm0, %xmm1 + _fold_vec_final 16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5 +.Lcheck_for_partial_block\@: +.endif + and $15, LEN32 + jz .Lreduce_128bits_to_crc\@ + + // 1 <= LEN <= 15 data bytes remain in BUF. The polynomial is now + // A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0 + // and B is the polynomial of the remaining LEN data bytes. To reduce + // this to 128 bits without needing fold constants for each possible + // LEN, rearrange this expression into C1*(x^128) + C2, where + // C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128. + // Then fold C1 into C2, which is just another fold across 128 bits. + +.if !LSB_CRC || AVX_LEVEL == 0 + // Load the last 16 data bytes. Note that originally LEN was >= 16. + _load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2 +.endif // Else will use vpblendvb mem operand later. +.if !LSB_CRC + neg LEN // Needed for indexing shuf_table +.endif + + // tmp = A*x^(8*LEN) mod x^128 + // lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1] + // i.e. right-shift by LEN bytes. + // msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN] + // i.e. left-shift by LEN bytes. + _cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3 + _cond_vex pshufb, %xmm3, %xmm0, %xmm1 + + // C1 = floor(A / x^(128 - 8*LEN)) + // lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1] + // i.e. left-shift by 16-LEN bytes. + // msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1] + // i.e. right-shift by 16-LEN bytes. + _cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \ + %xmm0, %xmm0, unaligned_mem_tmp=%xmm4 + + // C2 = tmp + B. This is just a blend of tmp with the last 16 data + // bytes (reflected if msb-first). The blend mask is the shuffle table + // that was used to create tmp. 
0 selects tmp, and 1 last16databytes. +.if AVX_LEVEL == 0 + movdqa %xmm0, %xmm4 + movdqa %xmm3, %xmm0 + pblendvb %xmm2, %xmm1 // uses %xmm0 as implicit operand + movdqa %xmm4, %xmm0 +.elseif LSB_CRC + vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1 +.else + vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 +.endif + + // Fold C1 into C2 and store the 128-bit result in %xmm0. + _fold_vec %xmm0, %xmm1, CONSTS_XMM, %xmm4 + +.Lreduce_128bits_to_crc\@: + // Compute the CRC as %xmm0 * x^n mod G. Here %xmm0 means the 128-bit + // polynomial stored in %xmm0 (using either lsb-first or msb-first bit + // order according to LSB_CRC), and G is the CRC's generator polynomial. + + // First, multiply %xmm0 by x^n and reduce the result to 64+n bits: + // + // t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) + + // x^n * (%xmm0 mod x^64) + // + // Store t0 * x^(64-n) in %xmm0. I.e., actually do: + // + // %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) + + // x^64 * (%xmm0 mod x^64) + // + // The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned + // to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily + // select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the + // msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case + // (considering the extra factor of x that gets implicitly introduced by + // each pclmulqdq when using lsb-first order), is identical to the + // constant that was used earlier for folding the LO64_TERMS across 128 + // bits. Thus it's already available in LO64_TERMS of CONSTS_XMM. + _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1 +.if LSB_CRC + _cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64) +.else + _cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64) +.endif + _cond_vex pxor, %xmm1, %xmm0, %xmm0 + // The HI64_TERMS of %xmm0 now contain floor(t0 / x^n). + // The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n). + + // First step of Barrett reduction: Compute floor(t0 / G). 
This is the + // polynomial by which G needs to be multiplied to cancel out the x^n + // and higher terms of t0, i.e. to reduce t0 mod G. First do: + // + // t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n) + // + // Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in + // x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest + // value that makes enough precision be carried through the calculation. + // + // The '* x' makes it so the result is floor(t1 / x^64) rather than + // floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it + // can be extracted much more easily in the next step. In the lsb-first + // case the '* x' happens implicitly. In the msb-first case it must be + // done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the + // constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and + // the multiplication by the x^64 term is handled using a pxor. The + // pxor causes the low 64 terms of t1 to be wrong, but they are unused. + _cond_vex movdqa, OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM + _pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1 +.if !LSB_CRC + _cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n) +.endif + // The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G). + + // Second step of Barrett reduction: Cancel out the x^n and higher terms + // of t0 by subtracting the needed multiple of G. This gives the CRC: + // + // crc := t0 - (G * floor(t0 / G)) + // + // But %xmm0 contains t0 * x^(64-n), so it's more convenient to do: + // + // crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n) + // + // Furthermore, since the resulting CRC is n-bit, if mod x^n is + // explicitly applied to it then the x^n term of G makes no difference + // in the result and can be omitted. This helps keep the constant + // multiplier in 64 bits in most cases. 
This gives the following: + // + // %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G)) + // crc := (%xmm0 / x^(64-n)) mod x^n + // + // In the lsb-first case, each pclmulqdq implicitly introduces + // an extra factor of x, so in that case the constant that needs to be + // passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63. + // For lsb-first CRCs where n=64, the extra factor of x cannot be as + // easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to + // pclmulqdq and handle the x^0 term (i.e. 1) separately. (All CRC + // polynomials have nonzero x^n and x^0 terms.) It works out as: the + // CRC has to be XORed with the physically low qword of %xmm1, representing + // floor(t0 / G). The most efficient way to do that is to move it to + // the physically high qword and use a ternlog to combine the two XORs. +.if LSB_CRC && \n == 64 + _cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2 + _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1 + .if AVX_LEVEL <= 2 + _cond_vex pxor, %xmm2, %xmm0, %xmm0 + _cond_vex pxor, %xmm1, %xmm0, %xmm0 + .else + vpternlogq $0x96, %xmm2, %xmm1, %xmm0 + .endif + _cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64 +.else + _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1 + _cond_vex pxor, %xmm1, %xmm0, %xmm0 + .if \n == 8 + _cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8 + .elseif \n == 16 + _cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16 + .elseif \n == 32 + _cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32 + .else // \n == 64 && !LSB_CRC + _cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64 + .endif +.endif + +.if VL > 16 + vzeroupper // Needed when ymm or zmm registers may have been used.
+.endif +#ifdef __i386__ + pop CONSTS_PTR +#endif + RET +.endm + +#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \ +SYM_FUNC_START(prefix##_pclmul_sse); \ + _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \ +SYM_FUNC_END(prefix##_pclmul_sse); \ + \ +SYM_FUNC_START(prefix##_vpclmul_avx2); \ + _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=2; \ +SYM_FUNC_END(prefix##_vpclmul_avx2); \ + \ +SYM_FUNC_START(prefix##_vpclmul_avx512); \ + _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \ +SYM_FUNC_END(prefix##_vpclmul_avx512); diff --git a/lib/crc/x86/crc-pclmul-template.h b/lib/crc/x86/crc-pclmul-template.h new file mode 100644 index 000000000000..35c950d7010c --- /dev/null +++ b/lib/crc/x86/crc-pclmul-template.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are + * instantiated by crc-pclmul-template.S + * + * Copyright 2025 Google LLC + * + * Author: Eric Biggers <ebiggers@google.com> + */ +#ifndef _CRC_PCLMUL_TEMPLATE_H +#define _CRC_PCLMUL_TEMPLATE_H + +#include <asm/cpufeatures.h> +#include <asm/simd.h> +#include <crypto/internal/simd.h> +#include <linux/static_call.h> +#include "crc-pclmul-consts.h" + +#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t) \ +crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len, \ + const void *consts_ptr); \ +crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len, \ + const void *consts_ptr); \ +crc_t prefix##_vpclmul_avx512(crc_t crc, const u8 *p, size_t len, \ + const void *consts_ptr); \ +DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse) + +static inline bool have_vpclmul(void) +{ + return boot_cpu_has(X86_FEATURE_VPCLMULQDQ) && + boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL); +} + +static inline bool have_avx512(void) +{ + return boot_cpu_has(X86_FEATURE_AVX512BW) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + !boot_cpu_has(X86_FEATURE_PREFER_YMM) && + 
cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL); +} + +/* + * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16 + * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD. + * + * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions. + * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(), + * varying by CPU and factors such as which parts of the "FPU" state userspace + * has touched, which could result in a larger cutoff being better. Indeed, a + * larger cutoff is usually better for a *single* message. However, the + * overhead of the FPU section gets amortized if multiple FPU sections get + * executed before returning to userspace, since the XSAVE and XRSTOR occur only + * once. Considering that and the fact that the [V]PCLMULQDQ code is lighter on + * the dcache than the table-based code is, a 16-byte cutoff seems to work well. + */ +#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq) \ +do { \ + if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) && \ + crypto_simd_usable()) { \ + const void *consts_ptr; \ + \ + consts_ptr = (consts).fold_across_128_bits_consts; \ + kernel_fpu_begin(); \ + crc = static_call(prefix##_pclmul)((crc), (p), (len), \ + consts_ptr); \ + kernel_fpu_end(); \ + return crc; \ + } \ +} while (0) + +#endif /* _CRC_PCLMUL_TEMPLATE_H */ diff --git a/lib/crc/x86/crc-t10dif.h b/lib/crc/x86/crc-t10dif.h new file mode 100644 index 000000000000..2a02a3026f3f --- /dev/null +++ b/lib/crc/x86/crc-t10dif.h @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * CRC-T10DIF using [V]PCLMULQDQ instructions + * + * Copyright 2024 Google LLC + */ + +#include "crc-pclmul-template.h" + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); + +DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16); + +static inline u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len) +{ + CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts, + 
have_pclmulqdq);
+	return crc_t10dif_generic(crc, p, len);
+}
+
+/*
+ * Pick the x86 implementation behind the crc16_msb_pclmul static call from
+ * the CPU features:
+ *  - PCLMULQDQ present: enable the have_pclmulqdq static key.
+ *  - have_vpclmul() also true: retarget the static call to the VPCLMULQDQ
+ *    routine, the AVX-512 flavour if have_avx512() and the AVX2 one otherwise.
+ * Without PCLMULQDQ nothing is changed.
+ *
+ * NOTE(review): have_vpclmul()/have_avx512() are not defined in this view
+ * (presumably crc-pclmul-template.h), and the caller is not visible either;
+ * presumably this runs once from the library's init path -- confirm.
+ */
+#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch
+static inline void crc_t10dif_mod_init_arch(void)
+{
+	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+		static_branch_enable(&have_pclmulqdq);
+		if (have_vpclmul()) {
+			if (have_avx512())
+				static_call_update(crc16_msb_pclmul,
+						   crc16_msb_vpclmul_avx512);
+			else
+				static_call_update(crc16_msb_pclmul,
+						   crc16_msb_vpclmul_avx2);
+		}
+	}
+}
diff --git a/lib/crc/x86/crc16-msb-pclmul.S b/lib/crc/x86/crc16-msb-pclmul.S
new file mode 100644
index 000000000000..e9fe248093a8
--- /dev/null
+++ b/lib/crc/x86/crc16-msb-pclmul.S
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+// Copyright 2025 Google LLC
+
+#include "crc-pclmul-template.S"
+
+DEFINE_CRC_PCLMUL_FUNCS(crc16_msb, /* bits= */ 16, /* lsb= */ 0)
diff --git a/lib/crc/x86/crc32-pclmul.S b/lib/crc/x86/crc32-pclmul.S
new file mode 100644
index 000000000000..f20f40fb0172
--- /dev/null
+++ b/lib/crc/x86/crc32-pclmul.S
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+// Copyright 2025 Google LLC
+
+#include "crc-pclmul-template.S"
+
+DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)
diff --git a/lib/crc/x86/crc32.h b/lib/crc/x86/crc32.h
new file mode 100644
index 000000000000..cea2c96d08d0
--- /dev/null
+++ b/lib/crc/x86/crc32.h
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * x86-optimized CRC32 functions
+ *
+ * Copyright (C) 2008 Intel Corporation
+ * Copyright 2012 Xyratex Technology Limited
+ * Copyright 2024 Google LLC
+ */
+
+#include "crc-pclmul-template.h"
+
+/*
+ * CPU-feature static keys.  All start out false and are only ever enabled by
+ * crc32_mod_init_arch() below:
+ *  - have_crc32:          X86_FEATURE_XMM4_2 (the crc32 instruction)
+ *  - have_pclmulqdq:      X86_FEATURE_PCLMULQDQ
+ *  - have_vpclmul_avx512: have_vpclmul() && have_avx512(), on top of PCLMULQDQ
+ */
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);
+
+DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
+
+/*
+ * x86 implementation of the little-endian CRC-32.
+ *
+ * @crc: CRC value to continue from
+ * @p:   data to checksum
+ * @len: number of bytes at @p
+ *
+ * Hands the request to CRC_PCLMUL() with the crc32_lsb function family, the
+ * crc32_lsb_0xedb88320_consts constants and the have_pclmulqdq key; the
+ * statement after it returns the crc32_le_base() result.
+ *
+ * NOTE(review): CRC_PCLMUL() is defined in crc-pclmul-template.h, which is not
+ * visible here.  It presumably returns from this function itself whenever the
+ * accelerated routine is used, which makes crc32_le_base() the fallback --
+ * confirm against the macro.
+ */
+static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+	CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
+		   have_pclmulqdq);
+	return crc32_le_base(crc, p, len);
+}
+
+/*
+ * The word-sized crc32 instruction used by the inline loop in crc32c_arch():
+ * crc32q (8 bytes per instruction) on x86_64, crc32l (4 bytes) otherwise.
+ * The %q0 modifier prints operand 0 as a 64-bit register name, which is the
+ * destination form crc32q takes.
+ */
+#ifdef CONFIG_X86_64
+#define CRC32_INST "crc32q %1, %q0"
+#else
+#define CRC32_INST "crc32l %1, %0"
+#endif
+
+/*
+ * Use carryless multiply version of crc32c when buffer size is >= 512 to
+ * account for FPU state save/restore overhead.
+ */
+#define CRC32C_PCLMUL_BREAKEVEN	512
+
+asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
+
+/*
+ * x86 implementation of CRC32C.
+ *
+ * @crc: CRC value to continue from
+ * @p:   data to checksum
+ * @len: number of bytes at @p
+ *
+ * Returns the updated CRC.  One of three paths is taken:
+ *  1. have_crc32 is off: defer to crc32c_base().
+ *  2. 64-bit kernel, @len >= CRC32C_PCLMUL_BREAKEVEN, have_pclmulqdq on and
+ *     crypto_simd_usable(): inside kernel_fpu_begin()/kernel_fpu_end(), call
+ *     crc32_lsb_vpclmul_avx512() if have_vpclmul_avx512 is on, otherwise
+ *     crc32c_x86_3way().
+ *  3. Anything else: a single stream of inline crc32 instructions, one native
+ *     word at a time, followed by the 4-, 2- and 1-byte leftovers.
+ */
+static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
+{
+	size_t num_longs;
+
+	if (!static_branch_likely(&have_crc32))
+		return crc32c_base(crc, p, len);
+
+	if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
+	    static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
+		/*
+		 * Long length, the vector registers are usable, and the CPU is
+		 * 64-bit and supports both CRC32 and PCLMULQDQ instructions.
+		 * It is worthwhile to divide the data into multiple streams,
+		 * CRC them independently, and combine them using PCLMULQDQ.
+		 * crc32c_x86_3way() does this using 3 streams, which is the
+		 * most that x86_64 CPUs have traditionally been capable of.
+		 *
+		 * However, due to improved VPCLMULQDQ performance on newer
+		 * CPUs, use crc32_lsb_vpclmul_avx512() instead of
+		 * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
+		 * "good" implementation of AVX-512.
+		 *
+		 * Future work: the optimal strategy on Zen 3--5 is actually to
+		 * use both crc32q and VPCLMULQDQ in parallel. Unfortunately,
+		 * different numbers of streams and vector lengths are optimal
+		 * on each CPU microarchitecture, making it challenging to take
+		 * advantage of this. (Zen 5 even supports 7 parallel crc32q, a
+		 * major upgrade.) For now, just choose between
+		 * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512(). The latter
+		 * is needed anyway for crc32_le(), so we just reuse it here.
+		 */
+		kernel_fpu_begin();
+		if (static_branch_likely(&have_vpclmul_avx512))
+			/* Same routine as crc32_le, but with the 0x82f63b78 constants. */
+			crc = crc32_lsb_vpclmul_avx512(crc, p, len,
+					crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
+		else
+			crc = crc32c_x86_3way(crc, p, len);
+		kernel_fpu_end();
+		return crc;
+	}
+
+	/*
+	 * Short length, XMM registers unusable, or the CPU is 32-bit; but the
+	 * CPU supports CRC32 instructions. Just issue a single stream of CRC32
+	 * instructions inline. While this doesn't use the CPU's CRC32
+	 * throughput very well, it avoids the need to combine streams. Stream
+	 * combination would be inefficient here.
+	 */
+
+	/* Whole native words first; p moves forward one word per iteration. */
+	for (num_longs = len / sizeof(unsigned long);
+	     num_longs != 0; num_longs--, p += sizeof(unsigned long))
+		asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));
+
+	/*
+	 * The low bits of len describe what is left after the word loop.  The
+	 * sizeof() test is a compile-time constant: a 4-byte leftover is only
+	 * possible when the word loop consumed 8 bytes at a time.
+	 */
+	if (sizeof(unsigned long) > 4 && (len & 4)) {
+		asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
+		p += 4;
+	}
+	if (len & 2) {
+		asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
+		p += 2;
+	}
+	if (len & 1)
+		asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));
+
+	return crc;
+}
+
+#define crc32_be_arch crc32_be_base /* not implemented on this arch */
+
+/*
+ * Enable the static keys above according to the CPU features and, when
+ * have_vpclmul() is true, retarget the crc32_lsb_pclmul static call to the
+ * AVX-512 or AVX2 VPCLMULQDQ routine.  have_vpclmul_avx512 is only enabled on
+ * the AVX-512 branch, so crc32c_arch() keeps using crc32c_x86_3way() on
+ * AVX2-only CPUs.
+ *
+ * NOTE(review): the caller is not visible here; presumably this runs once
+ * from the library's init path -- confirm.
+ */
+#define crc32_mod_init_arch crc32_mod_init_arch
+static inline void crc32_mod_init_arch(void)
+{
+	if (boot_cpu_has(X86_FEATURE_XMM4_2))
+		static_branch_enable(&have_crc32);
+	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+		static_branch_enable(&have_pclmulqdq);
+		if (have_vpclmul()) {
+			if (have_avx512()) {
+				static_call_update(crc32_lsb_pclmul,
+						   crc32_lsb_vpclmul_avx512);
+				static_branch_enable(&have_vpclmul_avx512);
+			} else {
+				static_call_update(crc32_lsb_pclmul,
+						   crc32_lsb_vpclmul_avx2);
+			}
+		}
+	}
+}
+
+/*
+ * Report which CRC-32 variants have an accelerated path on this CPU, as a
+ * bitmask: CRC32C_OPTIMIZATION when have_crc32 is enabled and
+ * CRC32_LE_OPTIMIZATION when have_pclmulqdq is enabled.  Nothing is reported
+ * for the big-endian CRC-32, which is mapped to crc32_be_base above.
+ */
+static inline u32 crc32_optimizations_arch(void)
+{
+	u32 optimizations = 0;
+
+	if (static_key_enabled(&have_crc32))
+		optimizations |= CRC32C_OPTIMIZATION;
+	if (static_key_enabled(&have_pclmulqdq))
+		optimizations |= CRC32_LE_OPTIMIZATION;
+	return optimizations;
+}
diff --git a/lib/crc/x86/crc32c-3way.S
b/lib/crc/x86/crc32c-3way.S
new file mode 100644
index 000000000000..9b8770503bbc
--- /dev/null
+++ b/lib/crc/x86/crc32c-3way.S
@@ -0,0 +1,360 @@
+/*
+ * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
+ *
+ * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
+ * downloaded from:
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
+ *
+ * Copyright (C) 2012 Intel Corporation.
+ * Copyright 2024 Google LLC
+ *
+ * Authors:
+ *	Wajdi Feghali <wajdi.k.feghali@intel.com>
+ *	James Guilford <james.guilford@intel.com>
+ *	David Cote <david.m.cote@intel.com>
+ *	Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/linkage.h>
+
+## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
+
+# Define threshold below which buffers are considered "small" and routed to
+# regular CRC code that does not interleave the CRC instructions.
+#define SMALL_SIZE 200
+
+# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
+
+.text
+SYM_FUNC_START(crc32c_x86_3way)
+	/*
+	 * Register roles.  crc0, bufp and len are used as the incoming
+	 * arguments of the prototype above without being loaded first, and the
+	 * result is handed back in %eax.  crc1 and crc2 accumulate the CRCs of
+	 * the second and third lanes.  n_misaligned and chunk_bytes share
+	 * %ecx/%rcx: the former is only live during step 1, the latter from
+	 * step 3 onwards.
+	 */
+#define crc0		  %edi
+#define crc0_q		  %rdi
+#define bufp		  %rsi
+#define bufp_d		  %esi
+#define len		  %rdx
+#define len_dw		  %edx
+#define n_misaligned	  %ecx /* overlaps chunk_bytes! */
+#define n_misaligned_q	  %rcx
+#define chunk_bytes	  %ecx /* overlaps n_misaligned! */
+#define chunk_bytes_q	  %rcx
+#define crc1		  %r8
+#define crc2		  %r9
+
+	cmp	$SMALL_SIZE, len
+	jb	.Lsmall			# short input: skip straight to step 6
+
+	################################################################
+	## 1) ALIGN:
+	################################################################
+	mov	bufp_d, n_misaligned
+	neg	n_misaligned
+	and	$7, n_misaligned	# calculate the misalignment amount of
+					# the address
+	je	.Laligned		# Skip if aligned
+
+	# Process 1 <= n_misaligned <= 7 bytes individually in order to align
+	# the remaining data to an 8-byte boundary.
+.Ldo_align:
+	movq	(bufp), %rax		# one 8-byte load; len >= SMALL_SIZE
+					# here, so it stays inside the buffer
+	add	n_misaligned_q, bufp
+	sub	n_misaligned_q, len
+.Lalign_loop:
+	crc32b	%al, crc0		# compute crc32 of 1-byte
+	shr	$8, %rax		# get next byte
+	dec	n_misaligned
+	jne	.Lalign_loop
+.Laligned:
+
+	################################################################
+	## 2) PROCESS BLOCK:
+	################################################################
+
+	cmp	$128*24, len		# 128 qwords x 3 lanes x 8 bytes
+	jae	.Lfull_block
+
+.Lpartial_block:
+	# Compute floor(len / 24) to get num qwords to process from each lane.
+	# The multiply-and-shift below gives the exact quotient on this path,
+	# because every jump or fall-through to here has len < 128*24.
+	imul	$2731, len_dw, %eax	# 2731 = ceil(2^16 / 24)
+	shr	$16, %eax
+	jmp	.Lcrc_3lanes
+
+.Lfull_block:
+	# Processing 128 qwords from each lane.
+	mov	$128, %eax
+
+	################################################################
+	## 3) CRC each of three lanes:
+	################################################################
+
+.Lcrc_3lanes:
+	xor	crc1,crc1		# lanes 1 and 2 start from a zero CRC
+	xor     crc2,crc2
+	mov	%eax, chunk_bytes
+	shl	$3, chunk_bytes		# num bytes to process from each lane
+	sub	$5, %eax		# 4 for 4x_loop, 1 for special last iter
+	jl	.Lcrc_3lanes_4x_done
+
+	# Unroll the loop by a factor of 4 to reduce the overhead of the loop
+	# bookkeeping instructions, which can compete with crc32q for the ALUs.
+.Lcrc_3lanes_4x_loop:
+	crc32q	(bufp), crc0_q
+	crc32q	(bufp,chunk_bytes_q), crc1
+	crc32q	(bufp,chunk_bytes_q,2), crc2
+	crc32q	8(bufp), crc0_q
+	crc32q	8(bufp,chunk_bytes_q), crc1
+	crc32q	8(bufp,chunk_bytes_q,2), crc2
+	crc32q	16(bufp), crc0_q
+	crc32q	16(bufp,chunk_bytes_q), crc1
+	crc32q	16(bufp,chunk_bytes_q,2), crc2
+	crc32q	24(bufp), crc0_q
+	crc32q	24(bufp,chunk_bytes_q), crc1
+	crc32q	24(bufp,chunk_bytes_q,2), crc2
+	add	$32, bufp
+	sub	$4, %eax
+	jge	.Lcrc_3lanes_4x_loop
+
+.Lcrc_3lanes_4x_done:
+	add	$4, %eax		# qwords left before the last one: 0..3
+	jz	.Lcrc_3lanes_last_qword
+
+.Lcrc_3lanes_1x_loop:
+	crc32q	(bufp), crc0_q
+	crc32q	(bufp,chunk_bytes_q), crc1
+	crc32q	(bufp,chunk_bytes_q,2), crc2
+	add	$8, bufp
+	dec	%eax
+	jnz	.Lcrc_3lanes_1x_loop
+
+.Lcrc_3lanes_last_qword:
+	crc32q	(bufp), crc0_q
+	crc32q	(bufp,chunk_bytes_q), crc1
+# SKIP  crc32q	(bufp,chunk_bytes_q,2), crc2	; Don't do this one yet
+
+	################################################################
+	## 4) Combine three results:
+	################################################################
+
+	# chunk_bytes is 8 * (qwords per lane), which is also the byte offset
+	# of the K_table entry for that lane length (entries are 8 bytes each).
+	lea	(K_table-8)(%rip), %rax		# first entry is for idx 1
+	pmovzxdq (%rax,chunk_bytes_q), %xmm0	# 2 consts: K1:K2
+	lea	(chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
+	sub	%rax, len			# len -= chunk_bytes * 3
+
+	movq	crc0_q, %xmm1			# CRC for block 1
+	pclmulqdq $0x00, %xmm0, %xmm1		# Multiply by K2
+
+	movq    crc1, %xmm2			# CRC for block 2
+	pclmulqdq $0x10, %xmm0, %xmm2		# Multiply by K1
+
+	pxor	%xmm2,%xmm1			# merge the two scaled CRCs
+	movq	%xmm1, %rax
+	xor	(bufp,chunk_bytes_q,2), %rax	# fold in the lane 2 qword that
+						# was skipped above
+	mov	crc2, crc0_q			# continue from the lane 2 CRC
+	crc32	%rax, crc0_q
+	lea	8(bufp,chunk_bytes_q,2), bufp	# first byte after all 3 lanes
+
+	################################################################
+	## 5) If more blocks remain, goto (2):
+	################################################################
+
+	cmp	$128*24, len
+	jae	.Lfull_block
+	cmp	$SMALL_SIZE, len
+	jae	.Lpartial_block
+
+	#######################################################################
+	## 6) Process any remainder without interleaving:
+	#######################################################################
+.Lsmall:
+	# Both ways of getting here have len < SMALL_SIZE, so the 32-bit view
+	# len_dw holds the whole remaining length.
+	test	len_dw, len_dw
+	jz	.Ldone
+	mov	len_dw, %eax
+	shr	$3, %eax		# number of whole qwords left
+	jz	.Ldo_dword
+.Ldo_qwords:
+	crc32q	(bufp), crc0_q
+	add	$8, bufp
+	dec	%eax
+	jnz	.Ldo_qwords
+.Ldo_dword:
+	test	$4, len_dw
+	jz	.Ldo_word
+	crc32l	(bufp), crc0
+	add	$4, bufp
+.Ldo_word:
+	test	$2, len_dw
+	jz	.Ldo_byte
+	crc32w	(bufp), crc0
+	add	$2, bufp
+.Ldo_byte:
+	test	$1, len_dw
+	jz	.Ldone
+	crc32b	(bufp), crc0
+.Ldone:
+	mov	crc0, %eax		# return value
+	RET
+SYM_FUNC_END(crc32c_x86_3way)
+
+.section	.rodata, "a", @progbits
+	################################################################
+	## PCLMULQDQ tables
+	## Table is 128 entries x 2 words (8 bytes) each
+	## The entry for a lane length of n qwords is at K_table + 8*(n-1)
+	################################################################
+.align 8
+K_table:
+	.long 0x493c7d27, 0x00000001
+	.long 0xba4fc28e, 0x493c7d27
+	.long 0xddc0152b, 0xf20c0dfe
+	.long 0x9e4addf8, 0xba4fc28e
+	.long 0x39d3b296, 0x3da6d0cb
+	.long 0x0715ce53, 0xddc0152b
+	.long 0x47db8317, 0x1c291d04
+	.long 0x0d3b6092, 0x9e4addf8
+	.long 0xc96cfdc0, 0x740eef02
+	.long 0x878a92a7, 0x39d3b296
+	.long 0xdaece73e, 0x083a6eec
+	.long 0xab7aff2a, 0x0715ce53
+	.long 0x2162d385, 0xc49f4f67
+	.long 0x83348832, 0x47db8317
+	.long 0x299847d5, 0x2ad91c30
+	.long 0xb9e02b86, 0x0d3b6092
+	.long 0x18b33a4e, 0x6992cea2
+	.long 0xb6dd949b, 0xc96cfdc0
+	.long 0x78d9ccb7, 0x7e908048
+	.long 0xbac2fd7b, 0x878a92a7
+	.long 0xa60ce07b, 0x1b3d8f29
+	.long 0xce7f39f4, 0xdaece73e
+	.long 0x61d82e56, 0xf1d0f55e
+	.long 0xd270f1a2, 0xab7aff2a
+	.long 0xc619809d, 0xa87ab8a8
+	.long 0x2b3cac5d, 0x2162d385
+	.long 0x65863b64, 0x8462d800
+	.long 0x1b03397f, 0x83348832
+	.long 0xebb883bd, 0x71d111a8
+	.long 0xb3e32c28, 0x299847d5
+	.long 0x064f7f26, 0xffd852c6
+	.long 0xdd7e3b0c, 0xb9e02b86
+	.long 0xf285651c, 0xdcb17aa4
+	.long 0x10746f3c, 0x18b33a4e
+	.long 0xc7a68855, 0xf37c5aee
+	.long 0x271d9844, 0xb6dd949b
+	.long 0x8e766a0c, 0x6051d5a2
+	.long 0x93a5f730, 0x78d9ccb7
+	.long 0x6cb08e5c, 0x18b0d4ff
+	.long 0x6b749fb2, 0xbac2fd7b
+	.long 0x1393e203, 0x21f3d99c
+	.long 0xcec3662e, 0xa60ce07b
+	.long 0x96c515bb, 0x8f158014
+	.long 0xe6fc4e6a, 0xce7f39f4
+	.long 0x8227bb8a, 0xa00457f7
+	.long 0xb0cd4768, 0x61d82e56
+	.long 0x39c7ff35, 0x8d6d2c43
+	.long 0xd7a4825c, 0xd270f1a2
+	.long 0x0ab3844b, 0x00ac29cf
+	.long 0x0167d312, 0xc619809d
+	.long 0xf6076544, 0xe9adf796
+	.long 0x26f6a60a, 0x2b3cac5d
+	.long 0xa741c1bf, 0x96638b34
+	.long 0x98d8d9cb, 0x65863b64
+	.long 0x49c3cc9c, 0xe0e9f351
+	.long 0x68bce87a, 0x1b03397f
+	.long 0x57a3d037, 0x9af01f2d
+	.long 0x6956fc3b, 0xebb883bd
+	.long 0x42d98888, 0x2cff42cf
+	.long 0x3771e98f, 0xb3e32c28
+	.long 0xb42ae3d9, 0x88f25a3a
+	.long 0x2178513a, 0x064f7f26
+	.long 0xe0ac139e, 0x4e36f0b0
+	.long 0x170076fa, 0xdd7e3b0c
+	.long 0x444dd413, 0xbd6f81f8
+	.long 0x6f345e45, 0xf285651c
+	.long 0x41d17b64, 0x91c9bd4b
+	.long 0xff0dba97, 0x10746f3c
+	.long 0xa2b73df1, 0x885f087b
+	.long 0xf872e54c, 0xc7a68855
+	.long 0x1e41e9fc, 0x4c144932
+	.long 0x86d8e4d2, 0x271d9844
+	.long 0x651bd98b, 0x52148f02
+	.long 0x5bb8f1bc, 0x8e766a0c
+	.long 0xa90fd27a, 0xa3c6f37a
+	.long 0xb3af077a, 0x93a5f730
+	.long 0x4984d782, 0xd7c0557f
+	.long 0xca6ef3ac, 0x6cb08e5c
+	.long 0x234e0b26, 0x63ded06a
+	.long 0xdd66cbbb, 0x6b749fb2
+	.long 0x4597456a, 0x4d56973c
+	.long 0xe9e28eb4, 0x1393e203
+	.long 0x7b3ff57a, 0x9669c9df
+	.long 0xc9c8b782, 0xcec3662e
+	.long 0x3f70cc6f, 0xe417f38a
+	.long 0x93e106a4, 0x96c515bb
+	.long 0x62ec6c6d, 0x4b9e0f71
+	.long 0xd813b325, 0xe6fc4e6a
+	.long 0x0df04680, 0xd104b8fc
+	.long 0x2342001e, 0x8227bb8a
+	.long 0x0a2a8d7e, 0x5b397730
+	.long 0x6d9a4957, 0xb0cd4768
+	.long 0xe8b6368b, 0xe78eb416
+	.long 0xd2c3ed1a, 0x39c7ff35
+	.long 0x995a5724, 0x61ff0e01
+	.long 0x9ef68d35, 0xd7a4825c
+	.long 0x0c139b31, 0x8d96551c
+	.long 0xf2271e60, 0x0ab3844b
+	.long 0x0b0bf8ca, 0x0bf80dd2
+	.long 0x2664fd8b, 0x0167d312
+	.long 0xed64812d, 0x8821abed
+	.long 0x02ee03b2, 0xf6076544
+	.long 0x8604ae0f, 0x6a45d2b2
+	.long 0x363bd6b3, 0x26f6a60a
+	.long 0x135c83fd, 0xd8d26619
+	.long 0x5fabe670, 0xa741c1bf
+	.long 0x35ec3279, 0xde87806c
+	.long 0x00bcf5f6, 0x98d8d9cb
+	.long 0x8ae00689, 0x14338754
+	.long 0x17f27698, 0x49c3cc9c
+	.long 0x58ca5f00, 0x5bd2011f
+	.long 0xaa7c7ad5, 0x68bce87a
+	.long 0xb5cfca28, 0xdd07448e
+	.long 0xded288f8, 0x57a3d037
+	.long 0x59f229bc, 0xdde8f5b9
+	.long 0x6d390dec, 0x6956fc3b
+	.long 0x37170390, 0xa3e3e02c
+	.long 0x6353c1cc, 0x42d98888
+	.long 0xc4584f5c, 0xd73c7bea
+	.long 0xf48642e9, 0x3771e98f
+	.long 0x531377e2, 0x80ff0093
+	.long 0xdd35bc8d, 0xb42ae3d9
+	.long 0xb25b29f2, 0x8fe4c34d
+	.long 0x9a5ede41, 0x2178513a
+	.long 0xa563905d, 0xdf99fc11
+	.long 0x45cddf4e, 0xe0ac139e
+	.long 0xacfa3103, 0x6c23e841
+	.long 0xa51b6135, 0x170076fa
diff --git a/lib/crc/x86/crc64-pclmul.S b/lib/crc/x86/crc64-pclmul.S
new file mode 100644
index 000000000000..4173051b5197
--- /dev/null
+++ b/lib/crc/x86/crc64-pclmul.S
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+// Copyright 2025 Google LLC
+
+#include "crc-pclmul-template.S"
+
+DEFINE_CRC_PCLMUL_FUNCS(crc64_msb, /* bits= */ 64, /* lsb= */ 0)
+DEFINE_CRC_PCLMUL_FUNCS(crc64_lsb, /* bits= */ 64, /* lsb= */ 1)
diff --git a/lib/crc/x86/crc64.h b/lib/crc/x86/crc64.h
new file mode 100644
index
000000000000..fde1222c4c58
--- /dev/null
+++ b/lib/crc/x86/crc64.h
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * CRC64 using [V]PCLMULQDQ instructions
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include "crc-pclmul-template.h"
+
+/* Off by default; crc64_mod_init_arch() enables it on PCLMULQDQ CPUs. */
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+
+DECLARE_CRC_PCLMUL_FUNCS(crc64_msb, u64);
+DECLARE_CRC_PCLMUL_FUNCS(crc64_lsb, u64);
+
+/*
+ * x86 implementation of the MSB-first CRC64 (constants for polynomial
+ * 0x42f0e1eba9ea3693).  The request goes through CRC_PCLMUL() with the
+ * crc64_msb function family; the statement after it returns the
+ * crc64_be_generic() result.
+ *
+ * NOTE(review): CRC_PCLMUL() comes from crc-pclmul-template.h and is not
+ * visible here; it presumably returns on its own when the accelerated
+ * routine is used, leaving crc64_be_generic() as the fallback -- confirm.
+ */
+static inline u64 crc64_be_arch(u64 crc, const u8 *p, size_t len)
+{
+	CRC_PCLMUL(crc, p, len, crc64_msb,
+		   crc64_msb_0x42f0e1eba9ea3693_consts, have_pclmulqdq);
+	return crc64_be_generic(crc, p, len);
+}
+
+/*
+ * x86 implementation of the LSB-first CRC64 used for NVMe (constants for
+ * polynomial 0x9a6c9329ac4bc9b5).  Same shape as crc64_be_arch(), using the
+ * crc64_lsb function family and crc64_nvme_generic().
+ */
+static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
+{
+	CRC_PCLMUL(crc, p, len, crc64_lsb,
+		   crc64_lsb_0x9a6c9329ac4bc9b5_consts, have_pclmulqdq);
+	return crc64_nvme_generic(crc, p, len);
+}
+
+/*
+ * Choose the CRC64 routines for this CPU.  With PCLMULQDQ the have_pclmulqdq
+ * key is enabled; if have_vpclmul() is also true, both static calls
+ * (crc64_msb_pclmul and crc64_lsb_pclmul) are retargeted to the VPCLMULQDQ
+ * routines, AVX-512 when have_avx512() and AVX2 otherwise.
+ */
+#define crc64_mod_init_arch crc64_mod_init_arch
+static inline void crc64_mod_init_arch(void)
+{
+	/* No carryless multiply at all: leave everything at its default. */
+	if (!boot_cpu_has(X86_FEATURE_PCLMULQDQ))
+		return;
+	static_branch_enable(&have_pclmulqdq);
+
+	/* No usable VPCLMULQDQ: keep the static calls' initial targets. */
+	if (!have_vpclmul())
+		return;
+
+	if (have_avx512()) {
+		static_call_update(crc64_msb_pclmul, crc64_msb_vpclmul_avx512);
+		static_call_update(crc64_lsb_pclmul, crc64_lsb_vpclmul_avx512);
+	} else {
+		static_call_update(crc64_msb_pclmul, crc64_msb_vpclmul_avx2);
+		static_call_update(crc64_lsb_pclmul, crc64_lsb_vpclmul_avx2);
+	}
+}
|