10 files changed, 1486 insertions, 0 deletions
diff --git a/lib/crc/x86/crc-pclmul-consts.h b/lib/crc/x86/crc-pclmul-consts.h
new file mode 100644
index 000000000000..6ae94158fca2
--- /dev/null
+++ b/lib/crc/x86/crc-pclmul-consts.h
@@ -0,0 +1,240 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * CRC constants generated by:
+ *
+ *	./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc32_lsb_0x82f63b78,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
+ *
+ * Do not edit manually.
+ */
+
+/*
+ * CRC folding constants generated for most-significant-bit-first CRC-16 using
+ * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+ */
+static const struct {
+	u8 bswap_mask[16];
+	u64 fold_across_2048_bits_consts[2];
+	u64 fold_across_1024_bits_consts[2];
+	u64 fold_across_512_bits_consts[2];
+	u64 fold_across_256_bits_consts[2];
+	u64 fold_across_128_bits_consts[2];
+	u8 shuf_table[48];
+	u64 barrett_reduction_consts[2];
+} crc16_msb_0x8bb7_consts ____cacheline_aligned __maybe_unused = {
+	.bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+	.fold_across_2048_bits_consts = {
+		0xdccf000000000000,	/* LO64_TERMS: (x^2000 mod G) * x^48 */
+		0x4b0b000000000000,	/* HI64_TERMS: (x^2064 mod G) * x^48 */
+	},
+	.fold_across_1024_bits_consts = {
+		0x9d9d000000000000,	/* LO64_TERMS: (x^976 mod G) * x^48 */
+		0x7cf5000000000000,	/* HI64_TERMS: (x^1040 mod G) * x^48 */
+	},
+	.fold_across_512_bits_consts = {
+		0x044c000000000000,	/* LO64_TERMS: (x^464 mod G) * x^48 */
+		0xe658000000000000,	/* HI64_TERMS: (x^528 mod G) * x^48 */
+	},
+	.fold_across_256_bits_consts = {
+		0x6ee3000000000000,	/* LO64_TERMS: (x^208 mod G) * x^48 */
+		0xe7b5000000000000,	/* HI64_TERMS: (x^272 mod G) * x^48 */
+	},
+	.fold_across_128_bits_consts = {
+		0x2d56000000000000,	/* LO64_TERMS: (x^80 mod G) * x^48 */
+		0x06df000000000000,	/* HI64_TERMS: (x^144 mod G) * x^48 */
+	},
+	.shuf_table = {
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	},
+	.barrett_reduction_consts = {
+		0x8bb7000000000000,	/* LO64_TERMS: (G - x^16) * x^48 */
+		0xf65a57f81d33a48a,	/* HI64_TERMS: (floor(x^79 / G) * x) - x^64 */
+	},
+};
+
+/*
+ * CRC folding constants generated for least-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
+ *        x^5 + x^4 + x^2 + x^1 + x^0
+ */
+static const struct {
+	u64 fold_across_2048_bits_consts[2];
+	u64 fold_across_1024_bits_consts[2];
+	u64 fold_across_512_bits_consts[2];
+	u64 fold_across_256_bits_consts[2];
+	u64 fold_across_128_bits_consts[2];
+	u8 shuf_table[48];
+	u64 barrett_reduction_consts[2];
+} crc32_lsb_0xedb88320_consts ____cacheline_aligned __maybe_unused = {
+	.fold_across_2048_bits_consts = {
+		0x00000000ce3371cb,	/* HI64_TERMS: (x^2079 mod G) * x^32 */
+		0x00000000e95c1271,	/* LO64_TERMS: (x^2015 mod G) * x^32 */
+	},
+	.fold_across_1024_bits_consts = {
+		0x0000000033fff533,	/* HI64_TERMS: (x^1055 mod G) * x^32 */
+		0x00000000910eeec1,	/* LO64_TERMS: (x^991 mod G) * x^32 */
+	},
+	.fold_across_512_bits_consts = {
+		0x000000008f352d95,	/* HI64_TERMS: (x^543 mod G) * x^32 */
+		0x000000001d9513d7,	/* LO64_TERMS: (x^479 mod G) * x^32 */
+	},
+	.fold_across_256_bits_consts = {
+		0x00000000f1da05aa,	/* HI64_TERMS: (x^287 mod G) * x^32 */
+		0x0000000081256527,	/* LO64_TERMS: (x^223 mod G) * x^32 */
+	},
+	.fold_across_128_bits_consts = {
+		0x00000000ae689191,	/* HI64_TERMS: (x^159 mod G) * x^32 */
+		0x00000000ccaa009e,	/* LO64_TERMS: (x^95 mod G) * x^32 */
+	},
+	.shuf_table = {
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	},
+	.barrett_reduction_consts = {
+		0xb4e5b025f7011641,	/* HI64_TERMS: floor(x^95 / G) */
+		0x00000001db710640,	/* LO64_TERMS: (G - x^32) * x^31 */
+	},
+};
+
+/*
+ * CRC folding constants generated for least-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^28 + x^27 + x^26 + x^25 + x^23 + x^22 + x^20 + x^19 + x^18 +
+ *        x^14 + x^13 + x^11 + x^10 + x^9 + x^8 + x^6 + x^0
+ */
+static const struct {
+	u64 fold_across_2048_bits_consts[2];
+	u64 fold_across_1024_bits_consts[2];
+	u64 fold_across_512_bits_consts[2];
+	u64 fold_across_256_bits_consts[2];
+	u64 fold_across_128_bits_consts[2];
+	u8 shuf_table[48];
+	u64 barrett_reduction_consts[2];
+} crc32_lsb_0x82f63b78_consts ____cacheline_aligned __maybe_unused = {
+	.fold_across_2048_bits_consts = {
+		0x00000000dcb17aa4,	/* HI64_TERMS: (x^2079 mod G) * x^32 */
+		0x00000000b9e02b86,	/* LO64_TERMS: (x^2015 mod G) * x^32 */
+	},
+	.fold_across_1024_bits_consts = {
+		0x000000006992cea2,	/* HI64_TERMS: (x^1055 mod G) * x^32 */
+		0x000000000d3b6092,	/* LO64_TERMS: (x^991 mod G) * x^32 */
+	},
+	.fold_across_512_bits_consts = {
+		0x00000000740eef02,	/* HI64_TERMS: (x^543 mod G) * x^32 */
+		0x000000009e4addf8,	/* LO64_TERMS: (x^479 mod G) * x^32 */
+	},
+	.fold_across_256_bits_consts = {
+		0x000000003da6d0cb,	/* HI64_TERMS: (x^287 mod G) * x^32 */
+		0x00000000ba4fc28e,	/* LO64_TERMS: (x^223 mod G) * x^32 */
+	},
+	.fold_across_128_bits_consts = {
+		0x00000000f20c0dfe,	/* HI64_TERMS: (x^159 mod G) * x^32 */
+		0x00000000493c7d27,	/* LO64_TERMS: (x^95 mod G) * x^32 */
+	},
+	.shuf_table = {
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	},
+	.barrett_reduction_consts = {
+		0x4869ec38dea713f1,	/* HI64_TERMS: floor(x^95 / G) */
+		0x0000000105ec76f0,	/* LO64_TERMS: (G - x^32) * x^31 */
+	},
+};
+
+/*
+ * CRC folding constants generated for most-significant-bit-first CRC-64 using
+ * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
+ *        x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
+ *        x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
+ *        x^7 + x^4 + x^1 + x^0
+ */
+static const struct {
+	u8 bswap_mask[16];
+	u64 fold_across_2048_bits_consts[2];
+	u64 fold_across_1024_bits_consts[2];
+	u64 fold_across_512_bits_consts[2];
+	u64 fold_across_256_bits_consts[2];
+	u64 fold_across_128_bits_consts[2];
+	u8 shuf_table[48];
+	u64 barrett_reduction_consts[2];
+} crc64_msb_0x42f0e1eba9ea3693_consts ____cacheline_aligned __maybe_unused = {
+	.bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+	.fold_across_2048_bits_consts = {
+		0x7f52691a60ddc70d,	/* LO64_TERMS: (x^2048 mod G) * x^0 */
+		0x7036b0389f6a0c82,	/* HI64_TERMS: (x^2112 mod G) * x^0 */
+	},
+	.fold_across_1024_bits_consts = {
+		0x05cf79dea9ac37d6,	/* LO64_TERMS: (x^1024 mod G) * x^0 */
+		0x001067e571d7d5c2,	/* HI64_TERMS: (x^1088 mod G) * x^0 */
+	},
+	.fold_across_512_bits_consts = {
+		0x5f6843ca540df020,	/* LO64_TERMS: (x^512 mod G) * x^0 */
+		0xddf4b6981205b83f,	/* HI64_TERMS: (x^576 mod G) * x^0 */
+	},
+	.fold_across_256_bits_consts = {
+		0x571bee0a227ef92b,	/* LO64_TERMS: (x^256 mod G) * x^0 */
+		0x44bef2a201b5200c,	/* HI64_TERMS: (x^320 mod G) * x^0 */
+	},
+	.fold_across_128_bits_consts = {
+		0x05f5c3c7eb52fab6,	/* LO64_TERMS: (x^128 mod G) * x^0 */
+		0x4eb938a7d257740e,	/* HI64_TERMS: (x^192 mod G) * x^0 */
+	},
+	.shuf_table = {
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	},
+	.barrett_reduction_consts = {
+		0x42f0e1eba9ea3693,	/* LO64_TERMS: (G - x^64) * x^0 */
+		0x578d29d06cc4f872,	/* HI64_TERMS: (floor(x^127 / G) * x) - x^64 */
+	},
+};
+
+/*
+ * CRC folding constants generated for least-significant-bit-first CRC-64 using
+ * G(x) = x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 +
+ *        x^47 + x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 +
+ *        x^26 + x^23 + x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 +
+ *        x^4 + x^3 + x^0
+ */
+static const struct {
+	u64 fold_across_2048_bits_consts[2];
+	u64 fold_across_1024_bits_consts[2];
+	u64 fold_across_512_bits_consts[2];
+	u64 fold_across_256_bits_consts[2];
+	u64 fold_across_128_bits_consts[2];
+	u8 shuf_table[48];
+	u64 barrett_reduction_consts[2];
+} crc64_lsb_0x9a6c9329ac4bc9b5_consts ____cacheline_aligned __maybe_unused = {
+	.fold_across_2048_bits_consts = {
+		0x37ccd3e14069cabc,	/* HI64_TERMS: (x^2111 mod G) * x^0 */
+		0xa043808c0f782663,	/* LO64_TERMS: (x^2047 mod G) * x^0 */
+	},
+	.fold_across_1024_bits_consts = {
+		0xa1ca681e733f9c40,	/* HI64_TERMS: (x^1087 mod G) * x^0 */
+		0x5f852fb61e8d92dc,	/* LO64_TERMS: (x^1023 mod G) * x^0 */
+	},
+	.fold_across_512_bits_consts = {
+		0x0c32cdb31e18a84a,	/* HI64_TERMS: (x^575 mod G) * x^0 */
+		0x62242240ace5045a,	/* LO64_TERMS: (x^511 mod G) * x^0 */
+	},
+	.fold_across_256_bits_consts = {
+		0xb0bc2e589204f500,	/* HI64_TERMS: (x^319 mod G) * x^0 */
+		0xe1e0bb9d45d7a44c,	/* LO64_TERMS: (x^255 mod G) * x^0 */
+	},
+	.fold_across_128_bits_consts = {
+		0xeadc41fd2ba3d420,	/* HI64_TERMS: (x^191 mod G) * x^0 */
+		0x21e9761e252621ac,	/* LO64_TERMS: (x^127 mod G) * x^0 */
+	},
+	.shuf_table = {
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	},
+	.barrett_reduction_consts = {
+		0x27ecfa329aef9f77,	/* HI64_TERMS: floor(x^127 / G) */
+		0x34d926535897936a,	/* LO64_TERMS: (G - x^64 - x^0) / x */
+	},
+};
diff --git a/lib/crc/x86/crc-pclmul-template.S b/lib/crc/x86/crc-pclmul-template.S
new file mode 100644
index 000000000000..a02f7dc8053e
--- /dev/null
+++ b/lib/crc/x86/crc-pclmul-template.S
@@ -0,0 +1,575 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+//
+// Template to generate [V]PCLMULQDQ-based CRC functions for x86
+//
+// Copyright 2025 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+
+#include <linux/linkage.h>
+#include <linux/objtool.h>
+
+// Offsets within the generated constants table
+.set OFFSETOF_BSWAP_MASK,			-5*16	// msb-first CRCs only
+.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS,	-4*16	// must precede next
+.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS,	-3*16	// must precede next
+.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS,	-2*16	// must precede next
+.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS,	-1*16	// must precede next
+.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS,	0*16	// must be 0
+.set OFFSETOF_SHUF_TABLE,			1*16
+.set OFFSETOF_BARRETT_REDUCTION_CONSTS,		4*16
+
+// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the
+// corresponding non-VEX instruction plus any needed moves.  The supported
+// instruction formats are:
+//
+//     - Two-arg [src, dst], where the non-VEX format is the same.
+//     - Three-arg [src1, src2, dst] where the non-VEX format is
+//	 [src1, src2_and_dst].  If src2 != dst, then src1 must != dst too.
+//
+// \insn gives the instruction without a "v" prefix and including any immediate
+// argument if needed to make the instruction follow one of the above formats.
+// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to
+// it first; this is needed when \arg1 is an unaligned mem operand.
+.macro	_cond_vex	insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp
+.if AVX_LEVEL == 0
+  // VEX not allowed.  Emulate it.
+  .ifnb \arg3 // Three-arg [src1, src2, dst]
+    .ifc "\arg2", "\arg3" // src2 == dst?
+      .ifnb \unaligned_mem_tmp
+	movdqu		\arg1, \unaligned_mem_tmp
+	\insn		\unaligned_mem_tmp, \arg3
+      .else
+	\insn		\arg1, \arg3
+      .endif
+    .else // src2 != dst
+      .ifc "\arg1", "\arg3"
+	.error "Can't have src1 == dst when src2 != dst"
+      .endif
+      .ifnb \unaligned_mem_tmp
+	movdqu		\arg1, \unaligned_mem_tmp
+	movdqa		\arg2, \arg3
+	\insn		\unaligned_mem_tmp, \arg3
+      .else
+	movdqa		\arg2, \arg3
+	\insn		\arg1, \arg3
+      .endif
+    .endif
+  .else // Two-arg [src, dst]
+    .ifnb \unaligned_mem_tmp
+	movdqu		\arg1, \unaligned_mem_tmp
+	\insn		\unaligned_mem_tmp, \arg2
+    .else
+	\insn		\arg1, \arg2
+    .endif
+  .endif
+.else
+  // VEX is allowed.  Emit the desired instruction directly.
+  .ifnb \arg3
+	v\insn		\arg1, \arg2, \arg3
+  .else
+	v\insn		\arg1, \arg2
+  .endif
+.endif
+.endm
+
+// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
+// register of length VL.
+.macro	_vbroadcast	src, dst
+.if VL == 16
+	_cond_vex movdqa,	\src, \dst
+.elseif VL == 32
+	vbroadcasti128		\src, \dst
+.else
+	vbroadcasti32x4		\src, \dst
+.endif
+.endm
+
+// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC
+// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane.
+.macro	_load_data	vl, src, bswap_mask, dst
+.if \vl < 64
+	_cond_vex movdqu,	"\src", \dst
+.else
+	vmovdqu8		\src, \dst
+.endif
+.if !LSB_CRC
+	_cond_vex pshufb,	\bswap_mask, \dst, \dst
+.endif
+.endm
+
+.macro	_prepare_v0	vl, v0, v1, bswap_mask
+.if LSB_CRC
+  .if \vl < 64
+	_cond_vex pxor,		(BUF), \v0, \v0, unaligned_mem_tmp=\v1
+  .else
+	vpxorq			(BUF), \v0, \v0
+  .endif
+.else
+	_load_data		\vl, (BUF), \bswap_mask, \v1
+  .if \vl < 64
+	_cond_vex pxor,		\v1, \v0, \v0
+  .else
+	vpxorq			\v1, \v0, \v0
+  .endif
+.endif
+.endm
+
+// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
+// msb-first order or the physically high qword for lsb-first order
+#define LO64_TERMS 0
+
+// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
+// qword for msb-first order or the physically low qword for lsb-first order
+#define HI64_TERMS 1
+
+// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
+// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
+.macro	_pclmulqdq	src1, src1_terms, src2, src2_terms, dst
+	_cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \
+		  \src1, \src2, \dst
+.endm
+
+// Fold \acc into \data and store the result back into \acc.  \data can be an
+// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no
+// byte-reflection is needed; otherwise it must be a vector register.  \consts
+// is a vector register containing the needed fold constants, and \tmp is a
+// temporary vector register.  All arguments must be the same length.
+.macro	_fold_vec	acc, data, consts, tmp
+	_pclmulqdq	\consts, HI64_TERMS, \acc, HI64_TERMS, \tmp
+	_pclmulqdq	\consts, LO64_TERMS, \acc, LO64_TERMS, \acc
+.if AVX_LEVEL <= 2
+	_cond_vex pxor,	\data, \tmp, \tmp
+	_cond_vex pxor,	\tmp, \acc, \acc
+.else
+	vpternlogq	$0x96, \data, \tmp, \acc
+.endif
+.endm
+
+// Fold \acc into \data and store the result back into \acc.  \data is an
+// unaligned mem operand, \consts is a vector register containing the needed
+// fold constants, \bswap_mask is a vector register containing the
+// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are
+// temporary vector registers.  All arguments must have length \vl.
+.macro	_fold_vec_mem	vl, acc, data, consts, bswap_mask, tmp1, tmp2
+.if AVX_LEVEL == 0 || !LSB_CRC
+	_load_data	\vl, \data, \bswap_mask, \tmp1
+	_fold_vec	\acc, \tmp1, \consts, \tmp2
+.else
+	_fold_vec	\acc, \data, \consts, \tmp1
+.endif
+.endm
+
+// Load the constants for folding across 2**i vectors of length VL at a time
+// into all 128-bit lanes of the vector register CONSTS.
+.macro	_load_vec_folding_consts	i
+	_vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \
+		    CONSTS
+.endm
+
+// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store
+// the result back into \v0.  If the remaining length mod \vl is nonzero, also
+// fold \vl data bytes from BUF.  For both operations the fold distance is \vl.
+// \consts must be a register of length \vl containing the fold constants.
+.macro	_fold_vec_final	vl, v0, v1, consts, bswap_mask, tmp1, tmp2
+	_fold_vec	\v0, \v1, \consts, \tmp1
+	test		$\vl, LEN8
+	jz		.Lfold_vec_final_done\@
+	_fold_vec_mem	\vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2
+	add		$\vl, BUF
+.Lfold_vec_final_done\@:
+.endm
+
+// This macro generates the body of a CRC function with the following prototype:
+//
+// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts);
+//
+// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it.
+// |buf| is the data to checksum.  |len| is the data length in bytes, which must
+// be at least 16.  |consts| is a pointer to the fold_across_128_bits_consts
+// field of the constants struct that was generated for the chosen CRC variant.
+//
+// Moving onto the macro parameters, \n is the number of bits in the CRC, e.g.
+// 32 for a CRC-32.  Currently the supported values are 8, 16, 32, and 64.  If
+// the file is compiled in i386 mode, then the maximum supported value is 32.
+//
+// \lsb_crc is 1 if the CRC processes the least significant bit of each byte
+// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0.  \lsb_crc is 0
+// if the CRC processes the most significant bit of each byte first, i.e. maps
+// bit0 to x^0, bit1 to x^1, bit7 to x^7.
+//
+// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
+//
+// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or
+// 512 for AVX512.
+//
+// If \vl == 16 && \avx_level == 0, the generated code requires:
+// PCLMULQDQ && SSE4.1.  (Note: all known CPUs with PCLMULQDQ also have SSE4.1.)
+//
+// If \vl == 32 && \avx_level == 2, the generated code requires:
+// VPCLMULQDQ && AVX2.
+//
+// If \vl == 64 && \avx_level == 512, the generated code requires:
+// VPCLMULQDQ && AVX512BW && AVX512VL.
+//
+// Other \vl and \avx_level combinations are either not supported or not useful.
+.macro	_crc_pclmul	n, lsb_crc, vl, avx_level
+	.set	LSB_CRC,	\lsb_crc
+	.set	VL,		\vl
+	.set	AVX_LEVEL,	\avx_level
+
+	// Define aliases for the xmm, ymm, or zmm registers according to VL.
+.irp i, 0,1,2,3,4,5,6,7
+  .if VL == 16
+	.set	V\i,		%xmm\i
+	.set	LOG2_VL,	4
+  .elseif VL == 32
+	.set	V\i,		%ymm\i
+	.set	LOG2_VL,	5
+  .elseif VL == 64
+	.set	V\i,		%zmm\i
+	.set	LOG2_VL,	6
+  .else
+	.error "Unsupported vector length"
+  .endif
+.endr
+	// Define aliases for the function parameters.
+	// Note: when crc_t is shorter than u32, zero-extension to 32 bits is
+	// guaranteed by the ABI.  Zero-extension to 64 bits is *not* guaranteed
+	// when crc_t is shorter than u64.
+#ifdef __x86_64__
+.if \n <= 32
+	.set	CRC,		%edi
+.else
+	.set	CRC,		%rdi
+.endif
+	.set	BUF,		%rsi
+	.set	LEN,		%rdx
+	.set	LEN32,		%edx
+	.set	LEN8,		%dl
+	.set	CONSTS_PTR,	%rcx
+#else
+	// 32-bit support, assuming -mregparm=3 and not including support for
+	// CRC-64 (which would use both eax and edx to pass the crc parameter).
+	.set	CRC,		%eax
+	.set	BUF,		%edx
+	.set	LEN,		%ecx
+	.set	LEN32,		%ecx
+	.set	LEN8,		%cl
+	.set	CONSTS_PTR,	%ebx	// Passed on stack
+#endif
+
+	// Define aliases for some local variables.  V0-V5 are used without
+	// aliases (for accumulators, data, temporary values, etc).  Staying
+	// within the first 8 vector registers keeps the code 32-bit SSE
+	// compatible and reduces the size of 64-bit SSE code slightly.
+	.set	BSWAP_MASK,	V6
+	.set	BSWAP_MASK_YMM,	%ymm6
+	.set	BSWAP_MASK_XMM,	%xmm6
+	.set	CONSTS,		V7
+	.set	CONSTS_YMM,	%ymm7
+	.set	CONSTS_XMM,	%xmm7
+
+	// Use ANNOTATE_NOENDBR to suppress an objtool warning, since the
+	// functions generated by this macro are called only by static_call.
+	ANNOTATE_NOENDBR
+
+#ifdef __i386__
+	push		CONSTS_PTR
+	mov		8(%esp), CONSTS_PTR
+#endif
+
+	// Create a 128-bit vector that contains the initial CRC in the end
+	// representing the high-order polynomial coefficients, and the rest 0.
+	// If the CRC is msb-first, also load the byte-reflection table.
+.if \n <= 32
+	_cond_vex movd,	CRC, %xmm0
+.else
+	_cond_vex movq,	CRC, %xmm0
+.endif
+.if !LSB_CRC
+	_cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0
+	_vbroadcast	OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK
+.endif
+
+	// Load the first vector of data and XOR the initial CRC into the
+	// appropriate end of the first 128-bit lane of data.  If LEN < VL, then
+	// use a short vector and jump ahead to the final reduction.  (LEN >= 16
+	// is guaranteed here but not necessarily LEN >= VL.)
+.if VL >= 32
+	cmp		$VL, LEN
+	jae		.Lat_least_1vec\@
+  .if VL == 64
+	cmp		$32, LEN32
+	jb		.Lless_than_32bytes\@
+	_prepare_v0	32, %ymm0, %ymm1, BSWAP_MASK_YMM
+	add		$32, BUF
+	jmp		.Lreduce_256bits_to_128bits\@
+.Lless_than_32bytes\@:
+  .endif
+	_prepare_v0	16, %xmm0, %xmm1, BSWAP_MASK_XMM
+	add		$16, BUF
+	vmovdqa		OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
+	jmp		.Lcheck_for_partial_block\@
+.Lat_least_1vec\@:
+.endif
+	_prepare_v0	VL, V0, V1, BSWAP_MASK
+
+	// Handle VL <= LEN < 4*VL.
+	cmp		$4*VL-1, LEN
+	ja		.Lat_least_4vecs\@
+	add		$VL, BUF
+	// If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector.
+	// If VL==16 then load fold_across_128_bits_consts first, as the final
+	// reduction depends on it and it won't be loaded anywhere else.
+	cmp		$2*VL-1, LEN32
+.if VL == 16
+	_cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
+.endif
+	jbe		.Lreduce_1vec_to_128bits\@
+	// Otherwise 2*VL <= LEN < 4*VL.  Load one more vector and jump ahead to
+	// the reduction from 2 vectors.
+	_load_data	VL, (BUF), BSWAP_MASK, V1
+	add		$VL, BUF
+	jmp		.Lreduce_2vecs_to_1\@
+
+.Lat_least_4vecs\@:
+	// Load 3 more vectors of data.
+	_load_data	VL, 1*VL(BUF), BSWAP_MASK, V1
+	_load_data	VL, 2*VL(BUF), BSWAP_MASK, V2
+	_load_data	VL, 3*VL(BUF), BSWAP_MASK, V3
+	sub		$-4*VL, BUF	// Shorter than 'add 4*VL' when VL=32
+	add		$-4*VL, LEN	// Shorter than 'sub 4*VL' when VL=32
+
+	// Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next
+	// 4 vectors of data and write the result back to V0-V3.
+	cmp		$4*VL-1, LEN	// Shorter than 'cmp 4*VL' when VL=32
+	jbe		.Lreduce_4vecs_to_2\@
+	_load_vec_folding_consts	2
+.Lfold_4vecs_loop\@:
+	_fold_vec_mem	VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+	_fold_vec_mem	VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+	_fold_vec_mem	VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+	_fold_vec_mem	VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+	sub		$-4*VL, BUF
+	add		$-4*VL, LEN
+	cmp		$4*VL-1, LEN
+	ja		.Lfold_4vecs_loop\@
+
+	// Fold V0,V1 into V2,V3 and write the result back to V0,V1.  Then fold
+	// two more vectors of data from BUF, if at least that much remains.
+.Lreduce_4vecs_to_2\@:
+	_load_vec_folding_consts	1
+	_fold_vec	V0, V2, CONSTS, V4
+	_fold_vec	V1, V3, CONSTS, V4
+	test		$2*VL, LEN8
+	jz		.Lreduce_2vecs_to_1\@
+	_fold_vec_mem	VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+	_fold_vec_mem	VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
+	sub		$-2*VL, BUF
+
+	// Fold V0 into V1 and write the result back to V0.  Then fold one more
+	// vector of data from BUF, if at least that much remains.
+.Lreduce_2vecs_to_1\@:
+	_load_vec_folding_consts	0
+	_fold_vec_final	VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5
+
+.Lreduce_1vec_to_128bits\@:
+.if VL == 64
+	// Reduce 512-bit %zmm0 to 256-bit %ymm0.  Then fold 256 more bits of
+	// data from BUF, if at least that much remains.
+	vbroadcasti128	OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM
+	vextracti64x4	$1, %zmm0, %ymm1
+	_fold_vec_final	32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5
+.Lreduce_256bits_to_128bits\@:
+.endif
+.if VL >= 32
+	// Reduce 256-bit %ymm0 to 128-bit %xmm0.  Then fold 128 more bits of
+	// data from BUF, if at least that much remains.
+	vmovdqa		OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
+	vextracti128	$1, %ymm0, %xmm1
+	_fold_vec_final	16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5
+.Lcheck_for_partial_block\@:
+.endif
+	and		$15, LEN32
+	jz		.Lreduce_128bits_to_crc\@
+
+	// 1 <= LEN <= 15 data bytes remain in BUF.  The polynomial is now
+	// A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
+	// and B is the polynomial of the remaining LEN data bytes.  To reduce
+	// this to 128 bits without needing fold constants for each possible
+	// LEN, rearrange this expression into C1*(x^128) + C2, where
+	// C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128.
+	// Then fold C1 into C2, which is just another fold across 128 bits.
+
+.if !LSB_CRC || AVX_LEVEL == 0
+	// Load the last 16 data bytes.  Note that originally LEN was >= 16.
+	_load_data	16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2
+.endif // Else will use vpblendvb mem operand later.
+.if !LSB_CRC
+	neg		LEN	// Needed for indexing shuf_table
+.endif
+
+	// tmp = A*x^(8*LEN) mod x^128
+	// lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1]
+	//	i.e. right-shift by LEN bytes.
+	// msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN]
+	//	i.e. left-shift by LEN bytes.
+	_cond_vex movdqu,	"OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3
+	_cond_vex pshufb,	%xmm3, %xmm0, %xmm1
+
+	// C1 = floor(A / x^(128 - 8*LEN))
+	// lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1]
+	//	i.e. left-shift by 16-LEN bytes.
+	// msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1]
+	//	i.e. right-shift by 16-LEN bytes.
+	_cond_vex pshufb,	"OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \
+				%xmm0, %xmm0, unaligned_mem_tmp=%xmm4
+
+	// C2 = tmp + B.  This is just a blend of tmp with the last 16 data
+	// bytes (reflected if msb-first).  The blend mask is the shuffle table
+	// that was used to create tmp.  0 selects tmp, and 1 last16databytes.
+.if AVX_LEVEL == 0
+	movdqa		%xmm0, %xmm4
+	movdqa		%xmm3, %xmm0
+	pblendvb	%xmm2, %xmm1	// uses %xmm0 as implicit operand
+	movdqa		%xmm4, %xmm0
+.elseif LSB_CRC
+	vpblendvb	%xmm3, -16(BUF,LEN), %xmm1, %xmm1
+.else
+	vpblendvb	%xmm3, %xmm2, %xmm1, %xmm1
+.endif
+
+	// Fold C1 into C2 and store the 128-bit result in %xmm0.
+	_fold_vec	%xmm0, %xmm1, CONSTS_XMM, %xmm4
+
+.Lreduce_128bits_to_crc\@:
+	// Compute the CRC as %xmm0 * x^n mod G.  Here %xmm0 means the 128-bit
+	// polynomial stored in %xmm0 (using either lsb-first or msb-first bit
+	// order according to LSB_CRC), and G is the CRC's generator polynomial.
+
+	// First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
+	//
+	//	t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
+	//	      x^n * (%xmm0 mod x^64)
+	//
+	// Store t0 * x^(64-n) in %xmm0.  I.e., actually do:
+	//
+	//	%xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
+	//		 x^64 * (%xmm0 mod x^64)
+	//
+	// The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
+	// to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily
+	// select it.  The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
+	// msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
+	// (considering the extra factor of x that gets implicitly introduced by
+	// each pclmulqdq when using lsb-first order), is identical to the
+	// constant that was used earlier for folding the LO64_TERMS across 128
+	// bits.  Thus it's already available in LO64_TERMS of CONSTS_XMM.
+	_pclmulqdq		CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1
+.if LSB_CRC
+	_cond_vex psrldq,	$8, %xmm0, %xmm0  // x^64 * (%xmm0 mod x^64)
+.else
+	_cond_vex pslldq,	$8, %xmm0, %xmm0  // x^64 * (%xmm0 mod x^64)
+.endif
+	_cond_vex pxor,		%xmm1, %xmm0, %xmm0
+	// The HI64_TERMS of %xmm0 now contain floor(t0 / x^n).
+	// The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).
+
+	// First step of Barrett reduction: Compute floor(t0 / G).  This is the
+	// polynomial by which G needs to be multiplied to cancel out the x^n
+	// and higher terms of t0, i.e. to reduce t0 mod G.  First do:
+	//
+	//	t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n)
+	//
+	// Then the desired value floor(t0 / G) is floor(t1 / x^64).  The 63 in
+	// x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest
+	// value that makes enough precision be carried through the calculation.
+	//
+	// The '* x' makes it so the result is floor(t1 / x^64) rather than
+	// floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it
+	// can be extracted much more easily in the next step.  In the lsb-first
+	// case the '* x' happens implicitly.  In the msb-first case it must be
+	// done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
+	// constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
+	// the multiplication by the x^64 term is handled using a pxor.  The
+	// pxor causes the low 64 terms of t1 to be wrong, but they are unused.
+	_cond_vex movdqa,	OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM
+	_pclmulqdq		CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1
+.if !LSB_CRC
+	_cond_vex pxor,		%xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n)
+.endif
+	// The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).
+
+	// Second step of Barrett reduction: Cancel out the x^n and higher terms
+	// of t0 by subtracting the needed multiple of G.  This gives the CRC:
+	//
+	//	crc := t0 - (G * floor(t0 / G))
+	//
+	// But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
+	//
+	//	crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
+	//
+	// Furthermore, since the resulting CRC is n-bit, if mod x^n is
+	// explicitly applied to it then the x^n term of G makes no difference
+	// in the result and can be omitted.  This helps keep the constant
+	// multiplier in 64 bits in most cases.  This gives the following:
+	//
+	//	%xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
+	//	crc := (%xmm0 / x^(64-n)) mod x^n
+	//
+	// In the lsb-first case, each pclmulqdq implicitly introduces
+	// an extra factor of x, so in that case the constant that needs to be
+	// passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63.
+	// For lsb-first CRCs where n=64, the extra factor of x cannot be as
+	// easily avoided.  In that case, instead pass '(G - x^n - x^0) / x' to
+	// pclmulqdq and handle the x^0 term (i.e. 1) separately.  (All CRC
+	// polynomials have nonzero x^n and x^0 terms.)  It works out as: the
+	// CRC has be XORed with the physically low qword of %xmm1, representing
+	// floor(t0 / G).  The most efficient way to do that is to move it to
+	// the physically high qword and use a ternlog to combine the two XORs.
+.if LSB_CRC && \n == 64
+	_cond_vex punpcklqdq,	%xmm1, %xmm2, %xmm2
+	_pclmulqdq		CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
+    .if AVX_LEVEL <= 2
+	_cond_vex pxor,		%xmm2, %xmm0, %xmm0
+	_cond_vex pxor,		%xmm1, %xmm0, %xmm0
+    .else
+	vpternlogq		$0x96, %xmm2, %xmm1, %xmm0
+    .endif
+	_cond_vex "pextrq $1,",	%xmm0, %rax  // (%xmm0 / x^0) mod x^64
+.else
+	_pclmulqdq		CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
+	_cond_vex pxor,		%xmm1, %xmm0, %xmm0
+  .if \n == 8
+	_cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8
+  .elseif \n == 16
+	_cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16
+  .elseif \n == 32
+	_cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32
+  .else // \n == 64 && !LSB_CRC
+	_cond_vex movq,		%xmm0, %rax  // (%xmm0 / x^0) mod x^64
+  .endif
+.endif
+
+.if VL > 16
+	vzeroupper	// Needed when ymm or zmm registers may have been used.
+.endif
+#ifdef __i386__
+	pop		CONSTS_PTR
+#endif
+	RET
+.endm
+
+#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb)			\
+SYM_FUNC_START(prefix##_pclmul_sse);					\
+	_crc_pclmul	n=bits, lsb_crc=lsb, vl=16, avx_level=0;	\
+SYM_FUNC_END(prefix##_pclmul_sse);					\
+									\
+SYM_FUNC_START(prefix##_vpclmul_avx2);					\
+	_crc_pclmul	n=bits, lsb_crc=lsb, vl=32, avx_level=2;	\
+SYM_FUNC_END(prefix##_vpclmul_avx2);					\
+									\
+SYM_FUNC_START(prefix##_vpclmul_avx512);				\
+	_crc_pclmul	n=bits, lsb_crc=lsb, vl=64, avx_level=512;	\
+SYM_FUNC_END(prefix##_vpclmul_avx512);
diff --git a/lib/crc/x86/crc-pclmul-template.h b/lib/crc/x86/crc-pclmul-template.h
new file mode 100644
index 000000000000..35c950d7010c
--- /dev/null
+++ b/lib/crc/x86/crc-pclmul-template.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are
+ * instantiated by crc-pclmul-template.S
+ *
+ * Copyright 2025 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+#ifndef _CRC_PCLMUL_TEMPLATE_H
+#define _CRC_PCLMUL_TEMPLATE_H
+
+#include <asm/cpufeatures.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <linux/static_call.h>
+#include "crc-pclmul-consts.h"
+
+#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t)				\
+crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len,		\
+			  const void *consts_ptr);			\
+crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len,		\
+			    const void *consts_ptr);			\
+crc_t prefix##_vpclmul_avx512(crc_t crc, const u8 *p, size_t len,	\
+			      const void *consts_ptr);			\
+DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse)
+
+static inline bool have_vpclmul(void)
+{
+	return boot_cpu_has(X86_FEATURE_VPCLMULQDQ) &&
+	       boot_cpu_has(X86_FEATURE_AVX2) &&
+	       cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL);
+}
+
+static inline bool have_avx512(void)
+{
+	return boot_cpu_has(X86_FEATURE_AVX512BW) &&
+	       boot_cpu_has(X86_FEATURE_AVX512VL) &&
+	       !boot_cpu_has(X86_FEATURE_PREFER_YMM) &&
+	       cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL);
+}
+
+/*
+ * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16
+ * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD.
+ *
+ * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions.
+ * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(),
+ * varying by CPU and factors such as which parts of the "FPU" state userspace
+ * has touched, which could result in a larger cutoff being better.  Indeed, a
+ * larger cutoff is usually better for a *single* message.  However, the
+ * overhead of the FPU section gets amortized if multiple FPU sections get
+ * executed before returning to userspace, since the XSAVE and XRSTOR occur only
+ * once.  Considering that and the fact that the [V]PCLMULQDQ code is lighter on
+ * the dcache than the table-based code is, a 16-byte cutoff seems to work well.
+ */
+#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq)		\
+do {									\
+	if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) &&	\
+	    crypto_simd_usable()) {					\
+		const void *consts_ptr;					\
+									\
+		consts_ptr = (consts).fold_across_128_bits_consts;	\
+		kernel_fpu_begin();					\
+		crc = static_call(prefix##_pclmul)((crc), (p), (len),	\
+						   consts_ptr);		\
+		kernel_fpu_end();					\
+		return crc;						\
+	}								\
+} while (0)
+
+#endif /* _CRC_PCLMUL_TEMPLATE_H */
diff --git a/lib/crc/x86/crc-t10dif.h b/lib/crc/x86/crc-t10dif.h
new file mode 100644
index 000000000000..2a02a3026f3f
--- /dev/null
+++ b/lib/crc/x86/crc-t10dif.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * CRC-T10DIF using [V]PCLMULQDQ instructions
+ *
+ * Copyright 2024 Google LLC
+ */
+
+#include "crc-pclmul-template.h"
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+
+DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16);
+
+static inline u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len)
+{
+	CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts,
+		   have_pclmulqdq);
+	return crc_t10dif_generic(crc, p, len);
+}
+
+#define crc_t10dif_mod_init_arch crc_t10dif_mod_init_arch
+static inline void crc_t10dif_mod_init_arch(void)
+{
+	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+		static_branch_enable(&have_pclmulqdq);
+		if (have_vpclmul()) {
+			if (have_avx512())
+				static_call_update(crc16_msb_pclmul,
+						   crc16_msb_vpclmul_avx512);
+			else
+				static_call_update(crc16_msb_pclmul,
+						   crc16_msb_vpclmul_avx2);
+		}
+	}
+}
diff --git a/lib/crc/x86/crc16-msb-pclmul.S b/lib/crc/x86/crc16-msb-pclmul.S
new file mode 100644
index 000000000000..e9fe248093a8
--- /dev/null
+++ b/lib/crc/x86/crc16-msb-pclmul.S
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+// Copyright 2025 Google LLC
+
+#include "crc-pclmul-template.S"
+
+DEFINE_CRC_PCLMUL_FUNCS(crc16_msb, /* bits= */ 16, /* lsb= */ 0)
diff --git a/lib/crc/x86/crc32-pclmul.S b/lib/crc/x86/crc32-pclmul.S
new file mode 100644
index 000000000000..f20f40fb0172
--- /dev/null
+++ b/lib/crc/x86/crc32-pclmul.S
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+// Copyright 2025 Google LLC
+
+#include "crc-pclmul-template.S"
+
+DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)
diff --git a/lib/crc/x86/crc32.h b/lib/crc/x86/crc32.h
new file mode 100644
index 000000000000..cea2c96d08d0
--- /dev/null
+++ b/lib/crc/x86/crc32.h
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * x86-optimized CRC32 functions
+ *
+ * Copyright (C) 2008 Intel Corporation
+ * Copyright 2012 Xyratex Technology Limited
+ * Copyright 2024 Google LLC
+ */
+
+#include "crc-pclmul-template.h"
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);
+
+DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
+
+static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+	CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
+		   have_pclmulqdq);
+	return crc32_le_base(crc, p, len);
+}
+
+#ifdef CONFIG_X86_64
+#define CRC32_INST "crc32q %1, %q0"
+#else
+#define CRC32_INST "crc32l %1, %0"
+#endif
+
+/*
+ * Use carryless multiply version of crc32c when buffer size is >= 512 to
+ * account for FPU state save/restore overhead.
+ */
+#define CRC32C_PCLMUL_BREAKEVEN	512
+
+asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
+
+static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
+{
+	size_t num_longs;
+
+	if (!static_branch_likely(&have_crc32))
+		return crc32c_base(crc, p, len);
+
+	if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
+	    static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
+		/*
+		 * Long length, the vector registers are usable, and the CPU is
+		 * 64-bit and supports both CRC32 and PCLMULQDQ instructions.
+		 * It is worthwhile to divide the data into multiple streams,
+		 * CRC them independently, and combine them using PCLMULQDQ.
+		 * crc32c_x86_3way() does this using 3 streams, which is the
+		 * most that x86_64 CPUs have traditionally been capable of.
+		 *
+		 * However, due to improved VPCLMULQDQ performance on newer
+		 * CPUs, use crc32_lsb_vpclmul_avx512() instead of
+		 * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
+		 * "good" implementation of AVX-512.
+		 *
+		 * Future work: the optimal strategy on Zen 3--5 is actually to
+		 * use both crc32q and VPCLMULQDQ in parallel.  Unfortunately,
+		 * different numbers of streams and vector lengths are optimal
+		 * on each CPU microarchitecture, making it challenging to take
+		 * advantage of this.  (Zen 5 even supports 7 parallel crc32q, a
+		 * major upgrade.)  For now, just choose between
+		 * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512().  The latter
+		 * is needed anyway for crc32_le(), so we just reuse it here.
+		 */
+		kernel_fpu_begin();
+		if (static_branch_likely(&have_vpclmul_avx512))
+			crc = crc32_lsb_vpclmul_avx512(crc, p, len,
+				       crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
+		else
+			crc = crc32c_x86_3way(crc, p, len);
+		kernel_fpu_end();
+		return crc;
+	}
+
+	/*
+	 * Short length, XMM registers unusable, or the CPU is 32-bit; but the
+	 * CPU supports CRC32 instructions.  Just issue a single stream of CRC32
+	 * instructions inline.  While this doesn't use the CPU's CRC32
+	 * throughput very well, it avoids the need to combine streams.  Stream
+	 * combination would be inefficient here.
+	 */
+
+	for (num_longs = len / sizeof(unsigned long);
+	     num_longs != 0; num_longs--, p += sizeof(unsigned long))
+		asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));
+
+	if (sizeof(unsigned long) > 4 && (len & 4)) {
+		asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
+		p += 4;
+	}
+	if (len & 2) {
+		asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
+		p += 2;
+	}
+	if (len & 1)
+		asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));
+
+	return crc;
+}
+
+#define crc32_be_arch crc32_be_base /* not implemented on this arch */
+
+#define crc32_mod_init_arch crc32_mod_init_arch
+static inline void crc32_mod_init_arch(void)
+{
+	if (boot_cpu_has(X86_FEATURE_XMM4_2))
+		static_branch_enable(&have_crc32);
+	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+		static_branch_enable(&have_pclmulqdq);
+		if (have_vpclmul()) {
+			if (have_avx512()) {
+				static_call_update(crc32_lsb_pclmul,
+						   crc32_lsb_vpclmul_avx512);
+				static_branch_enable(&have_vpclmul_avx512);
+			} else {
+				static_call_update(crc32_lsb_pclmul,
+						   crc32_lsb_vpclmul_avx2);
+			}
+		}
+	}
+}
+
+static inline u32 crc32_optimizations_arch(void)
+{
+	u32 optimizations = 0;
+
+	if (static_key_enabled(&have_crc32))
+		optimizations |= CRC32C_OPTIMIZATION;
+	if (static_key_enabled(&have_pclmulqdq))
+		optimizations |= CRC32_LE_OPTIMIZATION;
+	return optimizations;
+}
diff --git a/lib/crc/x86/crc32c-3way.S b/lib/crc/x86/crc32c-3way.S
new file mode 100644
index 000000000000..9b8770503bbc
--- /dev/null
+++ b/lib/crc/x86/crc32c-3way.S
@@ -0,0 +1,360 @@
+/*
+ * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
+ *
+ * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
+ * downloaded from:
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
+ *
+ * Copyright (C) 2012 Intel Corporation.
+ * Copyright 2024 Google LLC
+ *
+ * Authors:
+ *	Wajdi Feghali <wajdi.k.feghali@intel.com>
+ *	James Guilford <james.guilford@intel.com>
+ *	David Cote <david.m.cote@intel.com>
+ *	Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/linkage.h>
+
+## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
+
+# Define threshold below which buffers are considered "small" and routed to
+# regular CRC code that does not interleave the CRC instructions.
+#define SMALL_SIZE 200
+
+# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
+
+.text
+SYM_FUNC_START(crc32c_x86_3way)
+#define    crc0		  %edi
+#define    crc0_q	  %rdi
+#define    bufp		  %rsi
+#define    bufp_d	  %esi
+#define    len		  %rdx
+#define    len_dw	  %edx
+#define    n_misaligned	  %ecx /* overlaps chunk_bytes! */
+#define    n_misaligned_q %rcx
+#define    chunk_bytes	  %ecx /* overlaps n_misaligned! */
+#define    chunk_bytes_q  %rcx
+#define    crc1		  %r8
+#define    crc2		  %r9
+
+	cmp	$SMALL_SIZE, len
+	jb	.Lsmall
+
+	################################################################
+	## 1) ALIGN:
+	################################################################
+	mov	bufp_d, n_misaligned
+	neg	n_misaligned
+	and	$7, n_misaligned	# calculate the misalignment amount of
+					# the address
+	je	.Laligned		# Skip if aligned
+
+	# Process 1 <= n_misaligned <= 7 bytes individually in order to align
+	# the remaining data to an 8-byte boundary.
+.Ldo_align:
+	movq	(bufp), %rax
+	add	n_misaligned_q, bufp
+	sub	n_misaligned_q, len
+.Lalign_loop:
+	crc32b	%al, crc0		# compute crc32 of 1-byte
+	shr	$8, %rax		# get next byte
+	dec	n_misaligned
+	jne     .Lalign_loop
+.Laligned:
+
+	################################################################
+	## 2) PROCESS BLOCK:
+	################################################################
+
+	cmp	$128*24, len
+	jae     .Lfull_block
+
+.Lpartial_block:
+	# Compute floor(len / 24) to get num qwords to process from each lane.
+	imul	$2731, len_dw, %eax	# 2731 = ceil(2^16 / 24)
+	shr	$16, %eax
+	jmp	.Lcrc_3lanes
+
+.Lfull_block:
+	# Processing 128 qwords from each lane.
+	mov	$128, %eax
+
+	################################################################
+	## 3) CRC each of three lanes:
+	################################################################
+
+.Lcrc_3lanes:
+	xor	crc1,crc1
+	xor     crc2,crc2
+	mov	%eax, chunk_bytes
+	shl	$3, chunk_bytes		# num bytes to process from each lane
+	sub	$5, %eax		# 4 for 4x_loop, 1 for special last iter
+	jl	.Lcrc_3lanes_4x_done
+
+	# Unroll the loop by a factor of 4 to reduce the overhead of the loop
+	# bookkeeping instructions, which can compete with crc32q for the ALUs.
+.Lcrc_3lanes_4x_loop:
+	crc32q	(bufp), crc0_q
+	crc32q	(bufp,chunk_bytes_q), crc1
+	crc32q	(bufp,chunk_bytes_q,2), crc2
+	crc32q	8(bufp), crc0_q
+	crc32q	8(bufp,chunk_bytes_q), crc1
+	crc32q	8(bufp,chunk_bytes_q,2), crc2
+	crc32q	16(bufp), crc0_q
+	crc32q	16(bufp,chunk_bytes_q), crc1
+	crc32q	16(bufp,chunk_bytes_q,2), crc2
+	crc32q	24(bufp), crc0_q
+	crc32q	24(bufp,chunk_bytes_q), crc1
+	crc32q	24(bufp,chunk_bytes_q,2), crc2
+	add	$32, bufp
+	sub	$4, %eax
+	jge	.Lcrc_3lanes_4x_loop
+
+.Lcrc_3lanes_4x_done:
+	add	$4, %eax
+	jz	.Lcrc_3lanes_last_qword
+
+.Lcrc_3lanes_1x_loop:
+	crc32q	(bufp), crc0_q
+	crc32q	(bufp,chunk_bytes_q), crc1
+	crc32q	(bufp,chunk_bytes_q,2), crc2
+	add	$8, bufp
+	dec	%eax
+	jnz	.Lcrc_3lanes_1x_loop
+
+.Lcrc_3lanes_last_qword:
+	crc32q	(bufp), crc0_q
+	crc32q	(bufp,chunk_bytes_q), crc1
+# SKIP  crc32q	(bufp,chunk_bytes_q,2), crc2	; Don't do this one yet
+
+	################################################################
+	## 4) Combine three results:
+	################################################################
+
+	lea	(K_table-8)(%rip), %rax		# first entry is for idx 1
+	pmovzxdq (%rax,chunk_bytes_q), %xmm0	# 2 consts: K1:K2
+	lea	(chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
+	sub	%rax, len			# len -= chunk_bytes * 3
+
+	movq	crc0_q, %xmm1			# CRC for block 1
+	pclmulqdq $0x00, %xmm0, %xmm1		# Multiply by K2
+
+	movq    crc1, %xmm2			# CRC for block 2
+	pclmulqdq $0x10, %xmm0, %xmm2		# Multiply by K1
+
+	pxor    %xmm2,%xmm1
+	movq    %xmm1, %rax
+	xor	(bufp,chunk_bytes_q,2), %rax
+	mov	crc2, crc0_q
+	crc32	%rax, crc0_q
+	lea	8(bufp,chunk_bytes_q,2), bufp
+
+	################################################################
+	## 5) If more blocks remain, goto (2):
+	################################################################
+
+	cmp	$128*24, len
+	jae	.Lfull_block
+	cmp	$SMALL_SIZE, len
+	jae	.Lpartial_block
+
+	#######################################################################
+	## 6) Process any remainder without interleaving:
+	#######################################################################
+.Lsmall:
+	test	len_dw, len_dw
+	jz	.Ldone
+	mov	len_dw, %eax
+	shr	$3, %eax
+	jz	.Ldo_dword
+.Ldo_qwords:
+	crc32q	(bufp), crc0_q
+	add	$8, bufp
+	dec	%eax
+	jnz	.Ldo_qwords
+.Ldo_dword:
+	test	$4, len_dw
+	jz	.Ldo_word
+	crc32l	(bufp), crc0
+	add	$4, bufp
+.Ldo_word:
+	test	$2, len_dw
+	jz	.Ldo_byte
+	crc32w	(bufp), crc0
+	add	$2, bufp
+.Ldo_byte:
+	test	$1, len_dw
+	jz	.Ldone
+	crc32b	(bufp), crc0
+.Ldone:
+	mov	crc0, %eax
+        RET
+SYM_FUNC_END(crc32c_x86_3way)
+
+.section	.rodata, "a", @progbits
+	################################################################
+	## PCLMULQDQ tables
+	## Table is 128 entries x 2 words (8 bytes) each
+	################################################################
+.align 8
+K_table:
+	.long 0x493c7d27, 0x00000001
+	.long 0xba4fc28e, 0x493c7d27
+	.long 0xddc0152b, 0xf20c0dfe
+	.long 0x9e4addf8, 0xba4fc28e
+	.long 0x39d3b296, 0x3da6d0cb
+	.long 0x0715ce53, 0xddc0152b
+	.long 0x47db8317, 0x1c291d04
+	.long 0x0d3b6092, 0x9e4addf8
+	.long 0xc96cfdc0, 0x740eef02
+	.long 0x878a92a7, 0x39d3b296
+	.long 0xdaece73e, 0x083a6eec
+	.long 0xab7aff2a, 0x0715ce53
+	.long 0x2162d385, 0xc49f4f67
+	.long 0x83348832, 0x47db8317
+	.long 0x299847d5, 0x2ad91c30
+	.long 0xb9e02b86, 0x0d3b6092
+	.long 0x18b33a4e, 0x6992cea2
+	.long 0xb6dd949b, 0xc96cfdc0
+	.long 0x78d9ccb7, 0x7e908048
+	.long 0xbac2fd7b, 0x878a92a7
+	.long 0xa60ce07b, 0x1b3d8f29
+	.long 0xce7f39f4, 0xdaece73e
+	.long 0x61d82e56, 0xf1d0f55e
+	.long 0xd270f1a2, 0xab7aff2a
+	.long 0xc619809d, 0xa87ab8a8
+	.long 0x2b3cac5d, 0x2162d385
+	.long 0x65863b64, 0x8462d800
+	.long 0x1b03397f, 0x83348832
+	.long 0xebb883bd, 0x71d111a8
+	.long 0xb3e32c28, 0x299847d5
+	.long 0x064f7f26, 0xffd852c6
+	.long 0xdd7e3b0c, 0xb9e02b86
+	.long 0xf285651c, 0xdcb17aa4
+	.long 0x10746f3c, 0x18b33a4e
+	.long 0xc7a68855, 0xf37c5aee
+	.long 0x271d9844, 0xb6dd949b
+	.long 0x8e766a0c, 0x6051d5a2
+	.long 0x93a5f730, 0x78d9ccb7
+	.long 0x6cb08e5c, 0x18b0d4ff
+	.long 0x6b749fb2, 0xbac2fd7b
+	.long 0x1393e203, 0x21f3d99c
+	.long 0xcec3662e, 0xa60ce07b
+	.long 0x96c515bb, 0x8f158014
+	.long 0xe6fc4e6a, 0xce7f39f4
+	.long 0x8227bb8a, 0xa00457f7
+	.long 0xb0cd4768, 0x61d82e56
+	.long 0x39c7ff35, 0x8d6d2c43
+	.long 0xd7a4825c, 0xd270f1a2
+	.long 0x0ab3844b, 0x00ac29cf
+	.long 0x0167d312, 0xc619809d
+	.long 0xf6076544, 0xe9adf796
+	.long 0x26f6a60a, 0x2b3cac5d
+	.long 0xa741c1bf, 0x96638b34
+	.long 0x98d8d9cb, 0x65863b64
+	.long 0x49c3cc9c, 0xe0e9f351
+	.long 0x68bce87a, 0x1b03397f
+	.long 0x57a3d037, 0x9af01f2d
+	.long 0x6956fc3b, 0xebb883bd
+	.long 0x42d98888, 0x2cff42cf
+	.long 0x3771e98f, 0xb3e32c28
+	.long 0xb42ae3d9, 0x88f25a3a
+	.long 0x2178513a, 0x064f7f26
+	.long 0xe0ac139e, 0x4e36f0b0
+	.long 0x170076fa, 0xdd7e3b0c
+	.long 0x444dd413, 0xbd6f81f8
+	.long 0x6f345e45, 0xf285651c
+	.long 0x41d17b64, 0x91c9bd4b
+	.long 0xff0dba97, 0x10746f3c
+	.long 0xa2b73df1, 0x885f087b
+	.long 0xf872e54c, 0xc7a68855
+	.long 0x1e41e9fc, 0x4c144932
+	.long 0x86d8e4d2, 0x271d9844
+	.long 0x651bd98b, 0x52148f02
+	.long 0x5bb8f1bc, 0x8e766a0c
+	.long 0xa90fd27a, 0xa3c6f37a
+	.long 0xb3af077a, 0x93a5f730
+	.long 0x4984d782, 0xd7c0557f
+	.long 0xca6ef3ac, 0x6cb08e5c
+	.long 0x234e0b26, 0x63ded06a
+	.long 0xdd66cbbb, 0x6b749fb2
+	.long 0x4597456a, 0x4d56973c
+	.long 0xe9e28eb4, 0x1393e203
+	.long 0x7b3ff57a, 0x9669c9df
+	.long 0xc9c8b782, 0xcec3662e
+	.long 0x3f70cc6f, 0xe417f38a
+	.long 0x93e106a4, 0x96c515bb
+	.long 0x62ec6c6d, 0x4b9e0f71
+	.long 0xd813b325, 0xe6fc4e6a
+	.long 0x0df04680, 0xd104b8fc
+	.long 0x2342001e, 0x8227bb8a
+	.long 0x0a2a8d7e, 0x5b397730
+	.long 0x6d9a4957, 0xb0cd4768
+	.long 0xe8b6368b, 0xe78eb416
+	.long 0xd2c3ed1a, 0x39c7ff35
+	.long 0x995a5724, 0x61ff0e01
+	.long 0x9ef68d35, 0xd7a4825c
+	.long 0x0c139b31, 0x8d96551c
+	.long 0xf2271e60, 0x0ab3844b
+	.long 0x0b0bf8ca, 0x0bf80dd2
+	.long 0x2664fd8b, 0x0167d312
+	.long 0xed64812d, 0x8821abed
+	.long 0x02ee03b2, 0xf6076544
+	.long 0x8604ae0f, 0x6a45d2b2
+	.long 0x363bd6b3, 0x26f6a60a
+	.long 0x135c83fd, 0xd8d26619
+	.long 0x5fabe670, 0xa741c1bf
+	.long 0x35ec3279, 0xde87806c
+	.long 0x00bcf5f6, 0x98d8d9cb
+	.long 0x8ae00689, 0x14338754
+	.long 0x17f27698, 0x49c3cc9c
+	.long 0x58ca5f00, 0x5bd2011f
+	.long 0xaa7c7ad5, 0x68bce87a
+	.long 0xb5cfca28, 0xdd07448e
+	.long 0xded288f8, 0x57a3d037
+	.long 0x59f229bc, 0xdde8f5b9
+	.long 0x6d390dec, 0x6956fc3b
+	.long 0x37170390, 0xa3e3e02c
+	.long 0x6353c1cc, 0x42d98888
+	.long 0xc4584f5c, 0xd73c7bea
+	.long 0xf48642e9, 0x3771e98f
+	.long 0x531377e2, 0x80ff0093
+	.long 0xdd35bc8d, 0xb42ae3d9
+	.long 0xb25b29f2, 0x8fe4c34d
+	.long 0x9a5ede41, 0x2178513a
+	.long 0xa563905d, 0xdf99fc11
+	.long 0x45cddf4e, 0xe0ac139e
+	.long 0xacfa3103, 0x6c23e841
+	.long 0xa51b6135, 0x170076fa
diff --git a/lib/crc/x86/crc64-pclmul.S b/lib/crc/x86/crc64-pclmul.S
new file mode 100644
index 000000000000..4173051b5197
--- /dev/null
+++ b/lib/crc/x86/crc64-pclmul.S
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+// Copyright 2025 Google LLC
+
+#include "crc-pclmul-template.S"
+
+DEFINE_CRC_PCLMUL_FUNCS(crc64_msb, /* bits= */ 64, /* lsb= */ 0)
+DEFINE_CRC_PCLMUL_FUNCS(crc64_lsb, /* bits= */ 64, /* lsb= */ 1)
diff --git a/lib/crc/x86/crc64.h b/lib/crc/x86/crc64.h
new file mode 100644
index 000000000000..fde1222c4c58
--- /dev/null
+++ b/lib/crc/x86/crc64.h
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * CRC64 using [V]PCLMULQDQ instructions
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include "crc-pclmul-template.h"
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+
+DECLARE_CRC_PCLMUL_FUNCS(crc64_msb, u64);
+DECLARE_CRC_PCLMUL_FUNCS(crc64_lsb, u64);
+
+static inline u64 crc64_be_arch(u64 crc, const u8 *p, size_t len)
+{
+	CRC_PCLMUL(crc, p, len, crc64_msb, crc64_msb_0x42f0e1eba9ea3693_consts,
+		   have_pclmulqdq);
+	return crc64_be_generic(crc, p, len);
+}
+
+static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
+{
+	CRC_PCLMUL(crc, p, len, crc64_lsb, crc64_lsb_0x9a6c9329ac4bc9b5_consts,
+		   have_pclmulqdq);
+	return crc64_nvme_generic(crc, p, len);
+}
+
+#define crc64_mod_init_arch crc64_mod_init_arch
+static inline void crc64_mod_init_arch(void)
+{
+	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+		static_branch_enable(&have_pclmulqdq);
+		if (have_vpclmul()) {
+			if (have_avx512()) {
+				static_call_update(crc64_msb_pclmul,
+						   crc64_msb_vpclmul_avx512);
+				static_call_update(crc64_lsb_pclmul,
+						   crc64_lsb_vpclmul_avx512);
+			} else {
+				static_call_update(crc64_msb_pclmul,
+						   crc64_msb_vpclmul_avx2);
+				static_call_update(crc64_lsb_pclmul,
+						   crc64_lsb_vpclmul_avx2);
+			}
+		}
+	}
+}