-rw-r--r--  lib/crc/x86/crc-pclmul-consts.h  47
-rw-r--r--  lib/crc/x86/crc32.h              38
2 files changed, 83 insertions, 2 deletions
diff --git a/lib/crc/x86/crc-pclmul-consts.h b/lib/crc/x86/crc-pclmul-consts.h
index fcc63c064333..6ae94158fca2 100644
--- a/lib/crc/x86/crc-pclmul-consts.h
+++ b/lib/crc/x86/crc-pclmul-consts.h
@@ -2,7 +2,7 @@
 /*
  * CRC constants generated by:
  *
- *	./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
+ *	./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc32_lsb_0x82f63b78,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
  *
  * Do not edit manually.
  */
@@ -99,6 +99,51 @@ static const struct {
 };
 
 /*
+ * CRC folding constants generated for least-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^28 + x^27 + x^26 + x^25 + x^23 + x^22 + x^20 + x^19 + x^18 +
+ *	  x^14 + x^13 + x^11 + x^10 + x^9 + x^8 + x^6 + x^0
+ */
+static const struct {
+	u64 fold_across_2048_bits_consts[2];
+	u64 fold_across_1024_bits_consts[2];
+	u64 fold_across_512_bits_consts[2];
+	u64 fold_across_256_bits_consts[2];
+	u64 fold_across_128_bits_consts[2];
+	u8 shuf_table[48];
+	u64 barrett_reduction_consts[2];
+} crc32_lsb_0x82f63b78_consts ____cacheline_aligned __maybe_unused = {
+	.fold_across_2048_bits_consts = {
+		0x00000000dcb17aa4,	/* HI64_TERMS: (x^2079 mod G) * x^32 */
+		0x00000000b9e02b86,	/* LO64_TERMS: (x^2015 mod G) * x^32 */
+	},
+	.fold_across_1024_bits_consts = {
+		0x000000006992cea2,	/* HI64_TERMS: (x^1055 mod G) * x^32 */
+		0x000000000d3b6092,	/* LO64_TERMS: (x^991 mod G) * x^32 */
+	},
+	.fold_across_512_bits_consts = {
+		0x00000000740eef02,	/* HI64_TERMS: (x^543 mod G) * x^32 */
+		0x000000009e4addf8,	/* LO64_TERMS: (x^479 mod G) * x^32 */
+	},
+	.fold_across_256_bits_consts = {
+		0x000000003da6d0cb,	/* HI64_TERMS: (x^287 mod G) * x^32 */
+		0x00000000ba4fc28e,	/* LO64_TERMS: (x^223 mod G) * x^32 */
+	},
+	.fold_across_128_bits_consts = {
+		0x00000000f20c0dfe,	/* HI64_TERMS: (x^159 mod G) * x^32 */
+		0x00000000493c7d27,	/* LO64_TERMS: (x^95 mod G) * x^32 */
+	},
+	.shuf_table = {
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	},
+	.barrett_reduction_consts = {
+		0x4869ec38dea713f1,	/* HI64_TERMS: floor(x^95 / G) */
+		0x0000000105ec76f0,	/* LO64_TERMS: (G - x^32) * x^31 */
+	},
+};
+
+/*
  * CRC folding constants generated for most-significant-bit-first CRC-64 using
  * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
  *	  x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
diff --git a/lib/crc/x86/crc32.h b/lib/crc/x86/crc32.h
index ba4dacf23340..cea2c96d08d0 100644
--- a/lib/crc/x86/crc32.h
+++ b/lib/crc/x86/crc32.h
@@ -11,6 +11,7 @@
 
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);
 
 DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
 
@@ -44,12 +45,46 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
 	if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
 	    static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
+		/*
+		 * Long length, the vector registers are usable, and the CPU is
+		 * 64-bit and supports both CRC32 and PCLMULQDQ instructions.
+		 * It is worthwhile to divide the data into multiple streams,
+		 * CRC them independently, and combine them using PCLMULQDQ.
+		 *
+		 * crc32c_x86_3way() does this using 3 streams, which is the
+		 * most that x86_64 CPUs have traditionally been capable of.
+		 *
+		 * However, due to improved VPCLMULQDQ performance on newer
+		 * CPUs, use crc32_lsb_vpclmul_avx512() instead of
+		 * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
+		 * "good" implementation of AVX-512.
+		 *
+		 * Future work: the optimal strategy on Zen 3--5 is actually to
+		 * use both crc32q and VPCLMULQDQ in parallel. Unfortunately,
+		 * different numbers of streams and vector lengths are optimal
+		 * on each CPU microarchitecture, making it challenging to take
+		 * advantage of this. (Zen 5 even supports 7 parallel crc32q, a
+		 * major upgrade.) For now, just choose between
+		 * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512(). The latter
+		 * is needed anyway for crc32_le(), so we just reuse it here.
+		 */
 		kernel_fpu_begin();
-		crc = crc32c_x86_3way(crc, p, len);
+		if (static_branch_likely(&have_vpclmul_avx512))
+			crc = crc32_lsb_vpclmul_avx512(crc, p, len,
+						       crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
+		else
+			crc = crc32c_x86_3way(crc, p, len);
 		kernel_fpu_end();
 		return crc;
 	}
 
+	/*
+	 * Short length, XMM registers unusable, or the CPU is 32-bit; but the
+	 * CPU supports CRC32 instructions. Just issue a single stream of CRC32
+	 * instructions inline. While this doesn't use the CPU's CRC32
+	 * throughput very well, it avoids the need to combine streams. Stream
+	 * combination would be inefficient here.
+	 */
 	for (num_longs = len / sizeof(unsigned long);
 	     num_longs != 0; num_longs--, p += sizeof(unsigned long))
 		asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));
@@ -81,6 +116,7 @@ static inline void crc32_mod_init_arch(void)
 		if (have_avx512()) {
 			static_call_update(crc32_lsb_pclmul,
 					   crc32_lsb_vpclmul_avx512);
+			static_branch_enable(&have_vpclmul_avx512);
 		} else {
 			static_call_update(crc32_lsb_pclmul,
 					   crc32_lsb_vpclmul_avx2);
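
Background on the "(x^N mod G) * x^32" annotations in the new constants table: a CRC is linear over GF(2), and shifting a chunk of the message k bit positions toward the end of the message multiplies its contribution to the CRC by x^k modulo the generator polynomial G(x). Each fold constant is therefore some power of x reduced mod G. The sketch below is an illustration only, not the logic of scripts/gen-crc-consts.py, and the helper name xn_mod_g() is made up for this example; it computes x^n mod G for the CRC-32C generator. The table additionally stores each remainder bit-reflected for the lsb-first convention and pre-multiplied by x^32, so the raw value returned here is not expected to match the hex literals byte-for-byte.

#include <stdint.h>

/*
 * CRC-32C generator in non-reflected form: G(x) = x^32 plus the terms encoded
 * by 0x1EDC6F41 (the 32-bit bit-reversal of 0x82f63b78), where bit i is the
 * coefficient of x^i.
 */
#define CRC32C_POLY_NONREFLECTED	0x1EDC6F41u

/* Return x^n mod G(x) over GF(2), with bit i holding the coefficient of x^i. */
static uint32_t xn_mod_g(unsigned int n)
{
	uint32_t rem = 1;	/* start from x^0 */

	while (n--) {
		uint32_t x31 = rem >> 31;	/* coefficient of x^31 before the shift */

		rem <<= 1;			/* multiply by x */
		if (x31)			/* reduce: x^32 == G(x) - x^32 (mod G) */
			rem ^= CRC32C_POLY_NONREFLECTED;
	}
	return rem;
}

Under these assumptions, xn_mod_g(95) and xn_mod_g(159) are the remainders behind the fold_across_128_bits pair, xn_mod_g(479) and xn_mod_g(543) the fold_across_512_bits pair, and so on; the exponents grow with the folding distance because the folded chunk sits that many bits further from the end of the message.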