-rw-r--r--  lib/crc/x86/crc-pclmul-consts.h  47
-rw-r--r--  lib/crc/x86/crc32.h              38
2 files changed, 83 insertions, 2 deletions
diff --git a/lib/crc/x86/crc-pclmul-consts.h b/lib/crc/x86/crc-pclmul-consts.h
index fcc63c064333..6ae94158fca2 100644
--- a/lib/crc/x86/crc-pclmul-consts.h
+++ b/lib/crc/x86/crc-pclmul-consts.h
@@ -2,7 +2,7 @@
 /*
  * CRC constants generated by:
  *
- *	./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
+ *	./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc32_lsb_0x82f63b78,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
  *
  * Do not edit manually.
  */
@@ -99,6 +99,51 @@ static const struct {
 };
 
 /*
+ * CRC folding constants generated for least-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^28 + x^27 + x^26 + x^25 + x^23 + x^22 + x^20 + x^19 + x^18 +
+ *	  x^14 + x^13 + x^11 + x^10 + x^9 + x^8 + x^6 + x^0
+ */
+static const struct {
+	u64 fold_across_2048_bits_consts[2];
+	u64 fold_across_1024_bits_consts[2];
+	u64 fold_across_512_bits_consts[2];
+	u64 fold_across_256_bits_consts[2];
+	u64 fold_across_128_bits_consts[2];
+	u8 shuf_table[48];
+	u64 barrett_reduction_consts[2];
+} crc32_lsb_0x82f63b78_consts ____cacheline_aligned __maybe_unused = {
+	.fold_across_2048_bits_consts = {
+		0x00000000dcb17aa4,	/* HI64_TERMS: (x^2079 mod G) * x^32 */
+		0x00000000b9e02b86,	/* LO64_TERMS: (x^2015 mod G) * x^32 */
+	},
+	.fold_across_1024_bits_consts = {
+		0x000000006992cea2,	/* HI64_TERMS: (x^1055 mod G) * x^32 */
+		0x000000000d3b6092,	/* LO64_TERMS: (x^991 mod G) * x^32 */
+	},
+	.fold_across_512_bits_consts = {
+		0x00000000740eef02,	/* HI64_TERMS: (x^543 mod G) * x^32 */
+		0x000000009e4addf8,	/* LO64_TERMS: (x^479 mod G) * x^32 */
+	},
+	.fold_across_256_bits_consts = {
+		0x000000003da6d0cb,	/* HI64_TERMS: (x^287 mod G) * x^32 */
+		0x00000000ba4fc28e,	/* LO64_TERMS: (x^223 mod G) * x^32 */
+	},
+	.fold_across_128_bits_consts = {
+		0x00000000f20c0dfe,	/* HI64_TERMS: (x^159 mod G) * x^32 */
+		0x00000000493c7d27,	/* LO64_TERMS: (x^95 mod G) * x^32 */
+	},
+	.shuf_table = {
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	},
+	.barrett_reduction_consts = {
+		0x4869ec38dea713f1,	/* HI64_TERMS: floor(x^95 / G) */
+		0x0000000105ec76f0,	/* LO64_TERMS: (G - x^32) * x^31 */
+	},
+};
+
+/*
  * CRC folding constants generated for most-significant-bit-first CRC-64 using
  * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
  *	  x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
diff --git a/lib/crc/x86/crc32.h b/lib/crc/x86/crc32.h
index ba4dacf23340..cea2c96d08d0 100644
--- a/lib/crc/x86/crc32.h
+++ b/lib/crc/x86/crc32.h
@@ -11,6 +11,7 @@
 
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);
 
 DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
 
@@ -44,12 +45,46 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
 	if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
 	    static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
+		/*
+		 * Long length, the vector registers are usable, and the CPU is
+		 * 64-bit and supports both CRC32 and PCLMULQDQ instructions.
+		 * It is worthwhile to divide the data into multiple streams,
+		 * CRC them independently, and combine them using PCLMULQDQ.
+		 *
+		 * crc32c_x86_3way() does this using 3 streams, which is the
+		 * most that x86_64 CPUs have traditionally been capable of.
+		 *
+		 * However, due to improved VPCLMULQDQ performance on newer
+		 * CPUs, use crc32_lsb_vpclmul_avx512() instead of
+		 * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
+		 * "good" implementation of AVX-512.
+		 *
+		 * Future work: the optimal strategy on Zen 3--5 is actually to
+		 * use both crc32q and VPCLMULQDQ in parallel. Unfortunately,
+		 * different numbers of streams and vector lengths are optimal
+		 * on each CPU microarchitecture, making it challenging to take
+		 * advantage of this. (Zen 5 even supports 7 parallel crc32q, a
+		 * major upgrade.) For now, just choose between
+		 * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512(). The latter
+		 * is needed anyway for crc32_le(), so we just reuse it here.
+		 */
 		kernel_fpu_begin();
-		crc = crc32c_x86_3way(crc, p, len);
+		if (static_branch_likely(&have_vpclmul_avx512))
+			crc = crc32_lsb_vpclmul_avx512(crc, p, len,
+						       crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
+		else
+			crc = crc32c_x86_3way(crc, p, len);
 		kernel_fpu_end();
 		return crc;
 	}
 
+	/*
+	 * Short length, XMM registers unusable, or the CPU is 32-bit; but the
+	 * CPU supports CRC32 instructions. Just issue a single stream of CRC32
+	 * instructions inline. While this doesn't use the CPU's CRC32
+	 * throughput very well, it avoids the need to combine streams. Stream
+	 * combination would be inefficient here.
+	 */
 	for (num_longs = len / sizeof(unsigned long);
 	     num_longs != 0; num_longs--, p += sizeof(unsigned long))
 		asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));
@@ -81,6 +116,7 @@ static inline void crc32_mod_init_arch(void)
 		if (have_avx512()) {
 			static_call_update(crc32_lsb_pclmul,
 					   crc32_lsb_vpclmul_avx512);
+			static_branch_enable(&have_vpclmul_avx512);
 		} else {
 			static_call_update(crc32_lsb_pclmul,
 					   crc32_lsb_vpclmul_avx2);
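
Background on the "(x^N mod G) * x^32" annotations in the new constants table: a CRC is linear over GF(2), and shifting a chunk of the message k bit positions toward the end of the message multiplies its contribution to the CRC by x^k modulo the generator polynomial G(x). Each fold constant is therefore some power of x reduced mod G. The sketch below is an illustration only, not the logic of scripts/gen-crc-consts.py, and the helper name xn_mod_g() is made up for this example; it computes x^n mod G for the CRC-32C generator. The table additionally stores each remainder bit-reflected for the lsb-first convention and pre-multiplied by x^32, so the raw value returned here is not expected to match the hex literals byte-for-byte.

#include <stdint.h>

/*
 * CRC-32C generator in non-reflected form: G(x) = x^32 plus the terms encoded
 * by 0x1EDC6F41 (the 32-bit bit-reversal of 0x82f63b78), where bit i is the
 * coefficient of x^i.
 */
#define CRC32C_POLY_NONREFLECTED	0x1EDC6F41u

/* Return x^n mod G(x) over GF(2), with bit i holding the coefficient of x^i. */
static uint32_t xn_mod_g(unsigned int n)
{
	uint32_t rem = 1;	/* start from x^0 */

	while (n--) {
		uint32_t x31 = rem >> 31;	/* coefficient of x^31 before the shift */

		rem <<= 1;			/* multiply by x */
		if (x31)			/* reduce: x^32 == G(x) - x^32 (mod G) */
			rem ^= CRC32C_POLY_NONREFLECTED;
	}
	return rem;
}

Under these assumptions, xn_mod_g(95) and xn_mod_g(159) are the remainders behind the fold_across_128_bits pair, xn_mod_g(479) and xn_mod_g(543) the fold_across_512_bits pair, and so on; the exponents grow with the folding distance because the folded chunk sits that many bits further from the end of the message.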