diff options
Diffstat (limited to 'lib/crc/x86/crc32.h')
-rw-r--r-- | lib/crc/x86/crc32.h | 137 |
1 files changed, 137 insertions, 0 deletions
// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86-optimized CRC32 functions
 *
 * Copyright (C) 2008 Intel Corporation
 * Copyright 2012 Xyratex Technology Limited
 * Copyright 2024 Google LLC
 */

#include "crc-pclmul-template.h"

/* CPU feature keys, set once at boot in crc32_mod_init_arch(). */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);

DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);

/*
 * crc32_le_arch - little-endian CRC32 (polynomial 0xedb88320).
 *
 * CRC_PCLMUL() handles the data with the PCLMULQDQ-based implementation
 * when @have_pclmulqdq is enabled (and returns from this function from
 * within the macro); otherwise execution falls through to the generic
 * table-based crc32_le_base().
 */
static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
{
	CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
		   have_pclmulqdq);
	return crc32_le_base(crc, p, len);
}

/*
 * Per-word CRC32 instruction for the scalar loop below: 8 bytes at a time
 * on x86_64 (crc32q), 4 bytes on 32-bit (crc32l).
 */
#ifdef CONFIG_X86_64
#define CRC32_INST "crc32q %1, %q0"
#else
#define CRC32_INST "crc32l %1, %0"
#endif

/*
 * Use carryless multiply version of crc32c when buffer size is >= 512 to
 * account for FPU state save/restore overhead.
 */
#define CRC32C_PCLMUL_BREAKEVEN 512

/* 3-way-interleaved crc32q implementation, defined in assembly. */
asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);

/*
 * crc32c_arch - CRC32C (Castagnoli, polynomial 0x82f63b78).
 *
 * Strategy: fall back to the generic implementation if the CPU lacks the
 * SSE4.2 CRC32 instruction.  For long buffers where SIMD is currently
 * usable, use a vectorized/multi-stream implementation under
 * kernel_fpu_begin()/kernel_fpu_end().  Otherwise process the buffer
 * inline with a single stream of CRC32 instructions.
 */
static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
{
	size_t num_longs;

	if (!static_branch_likely(&have_crc32))
		return crc32c_base(crc, p, len);

	if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
	    static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
		/*
		 * Long length, the vector registers are usable, and the CPU is
		 * 64-bit and supports both CRC32 and PCLMULQDQ instructions.
		 * It is worthwhile to divide the data into multiple streams,
		 * CRC them independently, and combine them using PCLMULQDQ.
		 * crc32c_x86_3way() does this using 3 streams, which is the
		 * most that x86_64 CPUs have traditionally been capable of.
		 *
		 * However, due to improved VPCLMULQDQ performance on newer
		 * CPUs, use crc32_lsb_vpclmul_avx512() instead of
		 * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
		 * "good" implementation of AVX-512.
		 *
		 * Future work: the optimal strategy on Zen 3--5 is actually to
		 * use both crc32q and VPCLMULQDQ in parallel.  Unfortunately,
		 * different numbers of streams and vector lengths are optimal
		 * on each CPU microarchitecture, making it challenging to take
		 * advantage of this.  (Zen 5 even supports 7 parallel crc32q, a
		 * major upgrade.)  For now, just choose between
		 * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512().  The latter
		 * is needed anyway for crc32_le(), so we just reuse it here.
		 */
		kernel_fpu_begin();
		if (static_branch_likely(&have_vpclmul_avx512))
			crc = crc32_lsb_vpclmul_avx512(crc, p, len,
					crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
		else
			crc = crc32c_x86_3way(crc, p, len);
		kernel_fpu_end();
		return crc;
	}

	/*
	 * Short length, XMM registers unusable, or the CPU is 32-bit; but the
	 * CPU supports CRC32 instructions.  Just issue a single stream of CRC32
	 * instructions inline.  While this doesn't use the CPU's CRC32
	 * throughput very well, it avoids the need to combine streams.  Stream
	 * combination would be inefficient here.
	 */

	/* Main loop: one unsigned long (8 or 4 bytes) per CRC32 instruction. */
	for (num_longs = len / sizeof(unsigned long);
	     num_longs != 0; num_longs--, p += sizeof(unsigned long))
		asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));

	/* Tail: fold in the remaining 4-, 2-, and 1-byte pieces, in order. */
	if (sizeof(unsigned long) > 4 && (len & 4)) {
		asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
		p += 4;
	}
	if (len & 2) {
		asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
		p += 2;
	}
	if (len & 1)
		asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));

	return crc;
}

#define crc32_be_arch crc32_be_base /* not implemented on this arch */

/*
 * One-time boot initialization: enable the static keys for the CPU
 * features that are present, and retarget the crc32_lsb static call to
 * the best available PCLMUL variant (AVX-512 > AVX2 > default).
 */
#define crc32_mod_init_arch crc32_mod_init_arch
static inline void crc32_mod_init_arch(void)
{
	if (boot_cpu_has(X86_FEATURE_XMM4_2))
		static_branch_enable(&have_crc32);
	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
		static_branch_enable(&have_pclmulqdq);
		if (have_vpclmul()) {
			if (have_avx512()) {
				static_call_update(crc32_lsb_pclmul,
						   crc32_lsb_vpclmul_avx512);
				static_branch_enable(&have_vpclmul_avx512);
			} else {
				static_call_update(crc32_lsb_pclmul,
						   crc32_lsb_vpclmul_avx2);
			}
		}
	}
}

/*
 * Report which CRC32 variants are hardware-accelerated on this CPU,
 * based on the static keys set in crc32_mod_init_arch().
 */
static inline u32 crc32_optimizations_arch(void)
{
	u32 optimizations = 0;

	if (static_key_enabled(&have_crc32))
		optimizations |= CRC32C_OPTIMIZATION;
	if (static_key_enabled(&have_pclmulqdq))
		optimizations |= CRC32_LE_OPTIMIZATION;
	return optimizations;
}