Diffstat (limited to 'arch/x86/lib')
28 files changed, 182 insertions, 1523 deletions
diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore index 8ae0f93ecbfd..ec2131c9fd20 100644 --- a/arch/x86/lib/.gitignore +++ b/arch/x86/lib/.gitignore @@ -1,2 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only + +# This now-removed directory used to contain generated files. +/crypto/ + inat-tables.c diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 1c50352eb49f..2dba7f83ef97 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -38,16 +38,6 @@ lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o lib-$(CONFIG_MITIGATION_RETPOLINE) += retpoline.o -obj-$(CONFIG_CRC32_ARCH) += crc32-x86.o -crc32-x86-y := crc32-glue.o crc32-pclmul.o -crc32-x86-$(CONFIG_64BIT) += crc32c-3way.o - -obj-$(CONFIG_CRC64_ARCH) += crc64-x86.o -crc64-x86-y := crc64-glue.o crc64-pclmul.o - -obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-x86.o -crc-t10dif-x86-y := crc-t10dif-glue.o crc16-msb-pclmul.o - obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o obj-y += iomem.o diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c index 7af743bd3b13..c5c60d07308c 100644 --- a/arch/x86/lib/cache-smp.c +++ b/arch/x86/lib/cache-smp.c @@ -14,9 +14,31 @@ void wbinvd_on_cpu(int cpu) } EXPORT_SYMBOL(wbinvd_on_cpu); -int wbinvd_on_all_cpus(void) +void wbinvd_on_all_cpus(void) { on_each_cpu(__wbinvd, NULL, 1); - return 0; } EXPORT_SYMBOL(wbinvd_on_all_cpus); + +void wbinvd_on_cpus_mask(struct cpumask *cpus) +{ + on_each_cpu_mask(cpus, __wbinvd, NULL, 1); +} +EXPORT_SYMBOL_GPL(wbinvd_on_cpus_mask); + +static void __wbnoinvd(void *dummy) +{ + wbnoinvd(); +} + +void wbnoinvd_on_all_cpus(void) +{ + on_each_cpu(__wbnoinvd, NULL, 1); +} +EXPORT_SYMBOL_GPL(wbnoinvd_on_all_cpus); + +void wbnoinvd_on_cpus_mask(struct cpumask *cpus) +{ + on_each_cpu_mask(cpus, __wbnoinvd, NULL, 1); +} +EXPORT_SYMBOL_GPL(wbnoinvd_on_cpus_mask); diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index aa8c341b2441..06296eb69fd4 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -77,6 +77,24 @@ SYM_FUNC_START(rep_movs_alternative) _ASM_EXTABLE_UA( 0b, 1b) .Llarge_movsq: + /* Do the first possibly unaligned word */ +0: movq (%rsi),%rax +1: movq %rax,(%rdi) + + _ASM_EXTABLE_UA( 0b, .Lcopy_user_tail) + _ASM_EXTABLE_UA( 1b, .Lcopy_user_tail) + + /* What would be the offset to the aligned destination? */ + leaq 8(%rdi),%rax + andq $-8,%rax + subq %rdi,%rax + + /* .. and update pointers and count to match */ + addq %rax,%rdi + addq %rax,%rsi + subq %rax,%rcx + + /* make %rcx contain the number of words, %rax the remainder */ movq %rcx,%rax shrq $3,%rcx andl $7,%eax diff --git a/arch/x86/lib/crc-pclmul-consts.h b/arch/x86/lib/crc-pclmul-consts.h deleted file mode 100644 index fcc63c064333..000000000000 --- a/arch/x86/lib/crc-pclmul-consts.h +++ /dev/null @@ -1,195 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * CRC constants generated by: - * - * ./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5 - * - * Do not edit manually. 
- */ - -/* - * CRC folding constants generated for most-significant-bit-first CRC-16 using - * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 - */ -static const struct { - u8 bswap_mask[16]; - u64 fold_across_2048_bits_consts[2]; - u64 fold_across_1024_bits_consts[2]; - u64 fold_across_512_bits_consts[2]; - u64 fold_across_256_bits_consts[2]; - u64 fold_across_128_bits_consts[2]; - u8 shuf_table[48]; - u64 barrett_reduction_consts[2]; -} crc16_msb_0x8bb7_consts ____cacheline_aligned __maybe_unused = { - .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, - .fold_across_2048_bits_consts = { - 0xdccf000000000000, /* LO64_TERMS: (x^2000 mod G) * x^48 */ - 0x4b0b000000000000, /* HI64_TERMS: (x^2064 mod G) * x^48 */ - }, - .fold_across_1024_bits_consts = { - 0x9d9d000000000000, /* LO64_TERMS: (x^976 mod G) * x^48 */ - 0x7cf5000000000000, /* HI64_TERMS: (x^1040 mod G) * x^48 */ - }, - .fold_across_512_bits_consts = { - 0x044c000000000000, /* LO64_TERMS: (x^464 mod G) * x^48 */ - 0xe658000000000000, /* HI64_TERMS: (x^528 mod G) * x^48 */ - }, - .fold_across_256_bits_consts = { - 0x6ee3000000000000, /* LO64_TERMS: (x^208 mod G) * x^48 */ - 0xe7b5000000000000, /* HI64_TERMS: (x^272 mod G) * x^48 */ - }, - .fold_across_128_bits_consts = { - 0x2d56000000000000, /* LO64_TERMS: (x^80 mod G) * x^48 */ - 0x06df000000000000, /* HI64_TERMS: (x^144 mod G) * x^48 */ - }, - .shuf_table = { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - }, - .barrett_reduction_consts = { - 0x8bb7000000000000, /* LO64_TERMS: (G - x^16) * x^48 */ - 0xf65a57f81d33a48a, /* HI64_TERMS: (floor(x^79 / G) * x) - x^64 */ - }, -}; - -/* - * CRC folding constants generated for least-significant-bit-first CRC-32 using - * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 + - * x^5 + x^4 + x^2 + x^1 + x^0 - */ -static const struct { - u64 fold_across_2048_bits_consts[2]; - u64 fold_across_1024_bits_consts[2]; - u64 fold_across_512_bits_consts[2]; - u64 fold_across_256_bits_consts[2]; - u64 fold_across_128_bits_consts[2]; - u8 shuf_table[48]; - u64 barrett_reduction_consts[2]; -} crc32_lsb_0xedb88320_consts ____cacheline_aligned __maybe_unused = { - .fold_across_2048_bits_consts = { - 0x00000000ce3371cb, /* HI64_TERMS: (x^2079 mod G) * x^32 */ - 0x00000000e95c1271, /* LO64_TERMS: (x^2015 mod G) * x^32 */ - }, - .fold_across_1024_bits_consts = { - 0x0000000033fff533, /* HI64_TERMS: (x^1055 mod G) * x^32 */ - 0x00000000910eeec1, /* LO64_TERMS: (x^991 mod G) * x^32 */ - }, - .fold_across_512_bits_consts = { - 0x000000008f352d95, /* HI64_TERMS: (x^543 mod G) * x^32 */ - 0x000000001d9513d7, /* LO64_TERMS: (x^479 mod G) * x^32 */ - }, - .fold_across_256_bits_consts = { - 0x00000000f1da05aa, /* HI64_TERMS: (x^287 mod G) * x^32 */ - 0x0000000081256527, /* LO64_TERMS: (x^223 mod G) * x^32 */ - }, - .fold_across_128_bits_consts = { - 0x00000000ae689191, /* HI64_TERMS: (x^159 mod G) * x^32 */ - 0x00000000ccaa009e, /* LO64_TERMS: (x^95 mod G) * x^32 */ - }, - .shuf_table = { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - }, - .barrett_reduction_consts = { - 0xb4e5b025f7011641, /* HI64_TERMS: floor(x^95 / G) */ - 0x00000001db710640, /* LO64_TERMS: (G - x^32) * x^31 */ - }, -}; - -/* - * CRC folding 
constants generated for most-significant-bit-first CRC-64 using - * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + - * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + - * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + - * x^7 + x^4 + x^1 + x^0 - */ -static const struct { - u8 bswap_mask[16]; - u64 fold_across_2048_bits_consts[2]; - u64 fold_across_1024_bits_consts[2]; - u64 fold_across_512_bits_consts[2]; - u64 fold_across_256_bits_consts[2]; - u64 fold_across_128_bits_consts[2]; - u8 shuf_table[48]; - u64 barrett_reduction_consts[2]; -} crc64_msb_0x42f0e1eba9ea3693_consts ____cacheline_aligned __maybe_unused = { - .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, - .fold_across_2048_bits_consts = { - 0x7f52691a60ddc70d, /* LO64_TERMS: (x^2048 mod G) * x^0 */ - 0x7036b0389f6a0c82, /* HI64_TERMS: (x^2112 mod G) * x^0 */ - }, - .fold_across_1024_bits_consts = { - 0x05cf79dea9ac37d6, /* LO64_TERMS: (x^1024 mod G) * x^0 */ - 0x001067e571d7d5c2, /* HI64_TERMS: (x^1088 mod G) * x^0 */ - }, - .fold_across_512_bits_consts = { - 0x5f6843ca540df020, /* LO64_TERMS: (x^512 mod G) * x^0 */ - 0xddf4b6981205b83f, /* HI64_TERMS: (x^576 mod G) * x^0 */ - }, - .fold_across_256_bits_consts = { - 0x571bee0a227ef92b, /* LO64_TERMS: (x^256 mod G) * x^0 */ - 0x44bef2a201b5200c, /* HI64_TERMS: (x^320 mod G) * x^0 */ - }, - .fold_across_128_bits_consts = { - 0x05f5c3c7eb52fab6, /* LO64_TERMS: (x^128 mod G) * x^0 */ - 0x4eb938a7d257740e, /* HI64_TERMS: (x^192 mod G) * x^0 */ - }, - .shuf_table = { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - }, - .barrett_reduction_consts = { - 0x42f0e1eba9ea3693, /* LO64_TERMS: (G - x^64) * x^0 */ - 0x578d29d06cc4f872, /* HI64_TERMS: (floor(x^127 / G) * x) - x^64 */ - }, -}; - -/* - * CRC folding constants generated for least-significant-bit-first CRC-64 using - * G(x) = x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 + - * x^47 + x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 + - * x^26 + x^23 + x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 + - * x^4 + x^3 + x^0 - */ -static const struct { - u64 fold_across_2048_bits_consts[2]; - u64 fold_across_1024_bits_consts[2]; - u64 fold_across_512_bits_consts[2]; - u64 fold_across_256_bits_consts[2]; - u64 fold_across_128_bits_consts[2]; - u8 shuf_table[48]; - u64 barrett_reduction_consts[2]; -} crc64_lsb_0x9a6c9329ac4bc9b5_consts ____cacheline_aligned __maybe_unused = { - .fold_across_2048_bits_consts = { - 0x37ccd3e14069cabc, /* HI64_TERMS: (x^2111 mod G) * x^0 */ - 0xa043808c0f782663, /* LO64_TERMS: (x^2047 mod G) * x^0 */ - }, - .fold_across_1024_bits_consts = { - 0xa1ca681e733f9c40, /* HI64_TERMS: (x^1087 mod G) * x^0 */ - 0x5f852fb61e8d92dc, /* LO64_TERMS: (x^1023 mod G) * x^0 */ - }, - .fold_across_512_bits_consts = { - 0x0c32cdb31e18a84a, /* HI64_TERMS: (x^575 mod G) * x^0 */ - 0x62242240ace5045a, /* LO64_TERMS: (x^511 mod G) * x^0 */ - }, - .fold_across_256_bits_consts = { - 0xb0bc2e589204f500, /* HI64_TERMS: (x^319 mod G) * x^0 */ - 0xe1e0bb9d45d7a44c, /* LO64_TERMS: (x^255 mod G) * x^0 */ - }, - .fold_across_128_bits_consts = { - 0xeadc41fd2ba3d420, /* HI64_TERMS: (x^191 mod G) * x^0 */ - 0x21e9761e252621ac, /* LO64_TERMS: (x^127 mod G) * x^0 */ - }, - .shuf_table = { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 13, 14, 15, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - }, - .barrett_reduction_consts = { - 0x27ecfa329aef9f77, /* HI64_TERMS: floor(x^127 / G) */ - 0x34d926535897936a, /* LO64_TERMS: (G - x^64 - x^0) / x */ - }, -}; diff --git a/arch/x86/lib/crc-pclmul-template.S b/arch/x86/lib/crc-pclmul-template.S deleted file mode 100644 index ae0b6144c503..000000000000 --- a/arch/x86/lib/crc-pclmul-template.S +++ /dev/null @@ -1,582 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -// -// Template to generate [V]PCLMULQDQ-based CRC functions for x86 -// -// Copyright 2025 Google LLC -// -// Author: Eric Biggers <ebiggers@google.com> - -#include <linux/linkage.h> -#include <linux/objtool.h> - -// Offsets within the generated constants table -.set OFFSETOF_BSWAP_MASK, -5*16 // msb-first CRCs only -.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS, -4*16 // must precede next -.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, -3*16 // must precede next -.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS, -2*16 // must precede next -.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS, -1*16 // must precede next -.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS, 0*16 // must be 0 -.set OFFSETOF_SHUF_TABLE, 1*16 -.set OFFSETOF_BARRETT_REDUCTION_CONSTS, 4*16 - -// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the -// corresponding non-VEX instruction plus any needed moves. The supported -// instruction formats are: -// -// - Two-arg [src, dst], where the non-VEX format is the same. -// - Three-arg [src1, src2, dst] where the non-VEX format is -// [src1, src2_and_dst]. If src2 != dst, then src1 must != dst too. -// -// \insn gives the instruction without a "v" prefix and including any immediate -// argument if needed to make the instruction follow one of the above formats. -// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to -// it first; this is needed when \arg1 is an unaligned mem operand. -.macro _cond_vex insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp -.if AVX_LEVEL == 0 - // VEX not allowed. Emulate it. - .ifnb \arg3 // Three-arg [src1, src2, dst] - .ifc "\arg2", "\arg3" // src2 == dst? - .ifnb \unaligned_mem_tmp - movdqu \arg1, \unaligned_mem_tmp - \insn \unaligned_mem_tmp, \arg3 - .else - \insn \arg1, \arg3 - .endif - .else // src2 != dst - .ifc "\arg1", "\arg3" - .error "Can't have src1 == dst when src2 != dst" - .endif - .ifnb \unaligned_mem_tmp - movdqu \arg1, \unaligned_mem_tmp - movdqa \arg2, \arg3 - \insn \unaligned_mem_tmp, \arg3 - .else - movdqa \arg2, \arg3 - \insn \arg1, \arg3 - .endif - .endif - .else // Two-arg [src, dst] - .ifnb \unaligned_mem_tmp - movdqu \arg1, \unaligned_mem_tmp - \insn \unaligned_mem_tmp, \arg2 - .else - \insn \arg1, \arg2 - .endif - .endif -.else - // VEX is allowed. Emit the desired instruction directly. - .ifnb \arg3 - v\insn \arg1, \arg2, \arg3 - .else - v\insn \arg1, \arg2 - .endif -.endif -.endm - -// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector -// register of length VL. -.macro _vbroadcast src, dst -.if VL == 16 - _cond_vex movdqa, \src, \dst -.elseif VL == 32 - vbroadcasti128 \src, \dst -.else - vbroadcasti32x4 \src, \dst -.endif -.endm - -// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC -// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane. 
-.macro _load_data vl, src, bswap_mask, dst -.if \vl < 64 - _cond_vex movdqu, "\src", \dst -.else - vmovdqu8 \src, \dst -.endif -.if !LSB_CRC - _cond_vex pshufb, \bswap_mask, \dst, \dst -.endif -.endm - -.macro _prepare_v0 vl, v0, v1, bswap_mask -.if LSB_CRC - .if \vl < 64 - _cond_vex pxor, (BUF), \v0, \v0, unaligned_mem_tmp=\v1 - .else - vpxorq (BUF), \v0, \v0 - .endif -.else - _load_data \vl, (BUF), \bswap_mask, \v1 - .if \vl < 64 - _cond_vex pxor, \v1, \v0, \v0 - .else - vpxorq \v1, \v0, \v0 - .endif -.endif -.endm - -// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for -// msb-first order or the physically high qword for lsb-first order -#define LO64_TERMS 0 - -// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high -// qword for msb-first order or the physically low qword for lsb-first order -#define HI64_TERMS 1 - -// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given -// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst. -.macro _pclmulqdq src1, src1_terms, src2, src2_terms, dst - _cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \ - \src1, \src2, \dst -.endm - -// Fold \acc into \data and store the result back into \acc. \data can be an -// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no -// byte-reflection is needed; otherwise it must be a vector register. \consts -// is a vector register containing the needed fold constants, and \tmp is a -// temporary vector register. All arguments must be the same length. -.macro _fold_vec acc, data, consts, tmp - _pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp - _pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc -.if AVX_LEVEL <= 2 - _cond_vex pxor, \data, \tmp, \tmp - _cond_vex pxor, \tmp, \acc, \acc -.else - vpternlogq $0x96, \data, \tmp, \acc -.endif -.endm - -// Fold \acc into \data and store the result back into \acc. \data is an -// unaligned mem operand, \consts is a vector register containing the needed -// fold constants, \bswap_mask is a vector register containing the -// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are -// temporary vector registers. All arguments must have length \vl. -.macro _fold_vec_mem vl, acc, data, consts, bswap_mask, tmp1, tmp2 -.if AVX_LEVEL == 0 || !LSB_CRC - _load_data \vl, \data, \bswap_mask, \tmp1 - _fold_vec \acc, \tmp1, \consts, \tmp2 -.else - _fold_vec \acc, \data, \consts, \tmp1 -.endif -.endm - -// Load the constants for folding across 2**i vectors of length VL at a time -// into all 128-bit lanes of the vector register CONSTS. -.macro _load_vec_folding_consts i - _vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \ - CONSTS -.endm - -// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store -// the result back into \v0. If the remaining length mod \vl is nonzero, also -// fold \vl data bytes from BUF. For both operations the fold distance is \vl. -// \consts must be a register of length \vl containing the fold constants. 
-.macro _fold_vec_final vl, v0, v1, consts, bswap_mask, tmp1, tmp2 - _fold_vec \v0, \v1, \consts, \tmp1 - test $\vl, LEN8 - jz .Lfold_vec_final_done\@ - _fold_vec_mem \vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2 - add $\vl, BUF -.Lfold_vec_final_done\@: -.endm - -// This macro generates the body of a CRC function with the following prototype: -// -// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts); -// -// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it. -// |buf| is the data to checksum. |len| is the data length in bytes, which must -// be at least 16. |consts| is a pointer to the fold_across_128_bits_consts -// field of the constants struct that was generated for the chosen CRC variant. -// -// Moving onto the macro parameters, \n is the number of bits in the CRC, e.g. -// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If -// the file is compiled in i386 mode, then the maximum supported value is 32. -// -// \lsb_crc is 1 if the CRC processes the least significant bit of each byte -// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0. \lsb_crc is 0 -// if the CRC processes the most significant bit of each byte first, i.e. maps -// bit0 to x^0, bit1 to x^1, bit7 to x^7. -// -// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64. -// -// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or -// 512 for AVX512. -// -// If \vl == 16 && \avx_level == 0, the generated code requires: -// PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.) -// -// If \vl == 32 && \avx_level == 2, the generated code requires: -// VPCLMULQDQ && AVX2. -// -// If \vl == 64 && \avx_level == 512, the generated code requires: -// VPCLMULQDQ && AVX512BW && AVX512VL. -// -// Other \vl and \avx_level combinations are either not supported or not useful. -.macro _crc_pclmul n, lsb_crc, vl, avx_level - .set LSB_CRC, \lsb_crc - .set VL, \vl - .set AVX_LEVEL, \avx_level - - // Define aliases for the xmm, ymm, or zmm registers according to VL. -.irp i, 0,1,2,3,4,5,6,7 - .if VL == 16 - .set V\i, %xmm\i - .set LOG2_VL, 4 - .elseif VL == 32 - .set V\i, %ymm\i - .set LOG2_VL, 5 - .elseif VL == 64 - .set V\i, %zmm\i - .set LOG2_VL, 6 - .else - .error "Unsupported vector length" - .endif -.endr - // Define aliases for the function parameters. - // Note: when crc_t is shorter than u32, zero-extension to 32 bits is - // guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed - // when crc_t is shorter than u64. -#ifdef __x86_64__ -.if \n <= 32 - .set CRC, %edi -.else - .set CRC, %rdi -.endif - .set BUF, %rsi - .set LEN, %rdx - .set LEN32, %edx - .set LEN8, %dl - .set CONSTS_PTR, %rcx -#else - // 32-bit support, assuming -mregparm=3 and not including support for - // CRC-64 (which would use both eax and edx to pass the crc parameter). - .set CRC, %eax - .set BUF, %edx - .set LEN, %ecx - .set LEN32, %ecx - .set LEN8, %cl - .set CONSTS_PTR, %ebx // Passed on stack -#endif - - // Define aliases for some local variables. V0-V5 are used without - // aliases (for accumulators, data, temporary values, etc). Staying - // within the first 8 vector registers keeps the code 32-bit SSE - // compatible and reduces the size of 64-bit SSE code slightly. 
- .set BSWAP_MASK, V6 - .set BSWAP_MASK_YMM, %ymm6 - .set BSWAP_MASK_XMM, %xmm6 - .set CONSTS, V7 - .set CONSTS_YMM, %ymm7 - .set CONSTS_XMM, %xmm7 - - // Use ANNOTATE_NOENDBR to suppress an objtool warning, since the - // functions generated by this macro are called only by static_call. - ANNOTATE_NOENDBR - -#ifdef __i386__ - push CONSTS_PTR - mov 8(%esp), CONSTS_PTR -#endif - - // Create a 128-bit vector that contains the initial CRC in the end - // representing the high-order polynomial coefficients, and the rest 0. - // If the CRC is msb-first, also load the byte-reflection table. -.if \n <= 32 - _cond_vex movd, CRC, %xmm0 -.else - _cond_vex movq, CRC, %xmm0 -.endif -.if !LSB_CRC - _cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0 - _vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK -.endif - - // Load the first vector of data and XOR the initial CRC into the - // appropriate end of the first 128-bit lane of data. If LEN < VL, then - // use a short vector and jump ahead to the final reduction. (LEN >= 16 - // is guaranteed here but not necessarily LEN >= VL.) -.if VL >= 32 - cmp $VL, LEN - jae .Lat_least_1vec\@ - .if VL == 64 - cmp $32, LEN32 - jb .Lless_than_32bytes\@ - _prepare_v0 32, %ymm0, %ymm1, BSWAP_MASK_YMM - add $32, BUF - jmp .Lreduce_256bits_to_128bits\@ -.Lless_than_32bytes\@: - .endif - _prepare_v0 16, %xmm0, %xmm1, BSWAP_MASK_XMM - add $16, BUF - vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM - jmp .Lcheck_for_partial_block\@ -.Lat_least_1vec\@: -.endif - _prepare_v0 VL, V0, V1, BSWAP_MASK - - // Handle VL <= LEN < 4*VL. - cmp $4*VL-1, LEN - ja .Lat_least_4vecs\@ - add $VL, BUF - // If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector. - // If VL==16 then load fold_across_128_bits_consts first, as the final - // reduction depends on it and it won't be loaded anywhere else. - cmp $2*VL-1, LEN32 -.if VL == 16 - _cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM -.endif - jbe .Lreduce_1vec_to_128bits\@ - // Otherwise 2*VL <= LEN < 4*VL. Load one more vector and jump ahead to - // the reduction from 2 vectors. - _load_data VL, (BUF), BSWAP_MASK, V1 - add $VL, BUF - jmp .Lreduce_2vecs_to_1\@ - -.Lat_least_4vecs\@: - // Load 3 more vectors of data. - _load_data VL, 1*VL(BUF), BSWAP_MASK, V1 - _load_data VL, 2*VL(BUF), BSWAP_MASK, V2 - _load_data VL, 3*VL(BUF), BSWAP_MASK, V3 - sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32 - add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32 - - // Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next - // 4 vectors of data and write the result back to V0-V3. - cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32 - jbe .Lreduce_4vecs_to_2\@ - _load_vec_folding_consts 2 -.Lfold_4vecs_loop\@: - _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 - _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 - _fold_vec_mem VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 - _fold_vec_mem VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 - sub $-4*VL, BUF - add $-4*VL, LEN - cmp $4*VL-1, LEN - ja .Lfold_4vecs_loop\@ - - // Fold V0,V1 into V2,V3 and write the result back to V0,V1. Then fold - // two more vectors of data from BUF, if at least that much remains. 
-.Lreduce_4vecs_to_2\@: - _load_vec_folding_consts 1 - _fold_vec V0, V2, CONSTS, V4 - _fold_vec V1, V3, CONSTS, V4 - test $2*VL, LEN8 - jz .Lreduce_2vecs_to_1\@ - _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 - _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 - sub $-2*VL, BUF - - // Fold V0 into V1 and write the result back to V0. Then fold one more - // vector of data from BUF, if at least that much remains. -.Lreduce_2vecs_to_1\@: - _load_vec_folding_consts 0 - _fold_vec_final VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5 - -.Lreduce_1vec_to_128bits\@: -.if VL == 64 - // Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of - // data from BUF, if at least that much remains. - vbroadcasti128 OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM - vextracti64x4 $1, %zmm0, %ymm1 - _fold_vec_final 32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5 -.Lreduce_256bits_to_128bits\@: -.endif -.if VL >= 32 - // Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of - // data from BUF, if at least that much remains. - vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM - vextracti128 $1, %ymm0, %xmm1 - _fold_vec_final 16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5 -.Lcheck_for_partial_block\@: -.endif - and $15, LEN32 - jz .Lreduce_128bits_to_crc\@ - - // 1 <= LEN <= 15 data bytes remain in BUF. The polynomial is now - // A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0 - // and B is the polynomial of the remaining LEN data bytes. To reduce - // this to 128 bits without needing fold constants for each possible - // LEN, rearrange this expression into C1*(x^128) + C2, where - // C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128. - // Then fold C1 into C2, which is just another fold across 128 bits. - -.if !LSB_CRC || AVX_LEVEL == 0 - // Load the last 16 data bytes. Note that originally LEN was >= 16. - _load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2 -.endif // Else will use vpblendvb mem operand later. -.if !LSB_CRC - neg LEN // Needed for indexing shuf_table -.endif - - // tmp = A*x^(8*LEN) mod x^128 - // lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1] - // i.e. right-shift by LEN bytes. - // msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN] - // i.e. left-shift by LEN bytes. - _cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3 - _cond_vex pshufb, %xmm3, %xmm0, %xmm1 - - // C1 = floor(A / x^(128 - 8*LEN)) - // lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1] - // i.e. left-shift by 16-LEN bytes. - // msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1] - // i.e. right-shift by 16-LEN bytes. - _cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \ - %xmm0, %xmm0, unaligned_mem_tmp=%xmm4 - - // C2 = tmp + B. This is just a blend of tmp with the last 16 data - // bytes (reflected if msb-first). The blend mask is the shuffle table - // that was used to create tmp. 0 selects tmp, and 1 last16databytes. -.if AVX_LEVEL == 0 - movdqa %xmm0, %xmm4 - movdqa %xmm3, %xmm0 - pblendvb %xmm2, %xmm1 // uses %xmm0 as implicit operand - movdqa %xmm4, %xmm0 -.elseif LSB_CRC - vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1 -.else - vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 -.endif - - // Fold C1 into C2 and store the 128-bit result in %xmm0. - _fold_vec %xmm0, %xmm1, CONSTS_XMM, %xmm4 - -.Lreduce_128bits_to_crc\@: - // Compute the CRC as %xmm0 * x^n mod G. 
Here %xmm0 means the 128-bit - // polynomial stored in %xmm0 (using either lsb-first or msb-first bit - // order according to LSB_CRC), and G is the CRC's generator polynomial. - - // First, multiply %xmm0 by x^n and reduce the result to 64+n bits: - // - // t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) + - // x^n * (%xmm0 mod x^64) - // - // Store t0 * x^(64-n) in %xmm0. I.e., actually do: - // - // %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) + - // x^64 * (%xmm0 mod x^64) - // - // The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned - // to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily - // select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the - // msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case - // (considering the extra factor of x that gets implicitly introduced by - // each pclmulqdq when using lsb-first order), is identical to the - // constant that was used earlier for folding the LO64_TERMS across 128 - // bits. Thus it's already available in LO64_TERMS of CONSTS_XMM. - _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1 -.if LSB_CRC - _cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64) -.else - _cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64) -.endif - _cond_vex pxor, %xmm1, %xmm0, %xmm0 - // The HI64_TERMS of %xmm0 now contain floor(t0 / x^n). - // The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n). - - // First step of Barrett reduction: Compute floor(t0 / G). This is the - // polynomial by which G needs to be multiplied to cancel out the x^n - // and higher terms of t0, i.e. to reduce t0 mod G. First do: - // - // t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n) - // - // Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in - // x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest - // value that makes enough precision be carried through the calculation. - // - // The '* x' makes it so the result is floor(t1 / x^64) rather than - // floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it - // can be extracted much more easily in the next step. In the lsb-first - // case the '* x' happens implicitly. In the msb-first case it must be - // done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the - // constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and - // the multiplication by the x^64 term is handled using a pxor. The - // pxor causes the low 64 terms of t1 to be wrong, but they are unused. - _cond_vex movdqa, OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM - _pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1 -.if !LSB_CRC - _cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n) -.endif - // The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G). - - // Second step of Barrett reduction: Cancel out the x^n and higher terms - // of t0 by subtracting the needed multiple of G. This gives the CRC: - // - // crc := t0 - (G * floor(t0 / G)) - // - // But %xmm0 contains t0 * x^(64-n), so it's more convenient to do: - // - // crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n) - // - // Furthermore, since the resulting CRC is n-bit, if mod x^n is - // explicitly applied to it then the x^n term of G makes no difference - // in the result and can be omitted. This helps keep the constant - // multiplier in 64 bits in most cases. 
This gives the following: - // - // %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G)) - // crc := (%xmm0 / x^(64-n)) mod x^n - // - // In the lsb-first case, each pclmulqdq implicitly introduces - // an extra factor of x, so in that case the constant that needs to be - // passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63. - // For lsb-first CRCs where n=64, the extra factor of x cannot be as - // easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to - // pclmulqdq and handle the x^0 term (i.e. 1) separately. (All CRC - // polynomials have nonzero x^n and x^0 terms.) It works out as: the - // CRC has be XORed with the physically low qword of %xmm1, representing - // floor(t0 / G). The most efficient way to do that is to move it to - // the physically high qword and use a ternlog to combine the two XORs. -.if LSB_CRC && \n == 64 - _cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2 - _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1 - .if AVX_LEVEL <= 2 - _cond_vex pxor, %xmm2, %xmm0, %xmm0 - _cond_vex pxor, %xmm1, %xmm0, %xmm0 - .else - vpternlogq $0x96, %xmm2, %xmm1, %xmm0 - .endif - _cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64 -.else - _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1 - _cond_vex pxor, %xmm1, %xmm0, %xmm0 - .if \n == 8 - _cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8 - .elseif \n == 16 - _cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16 - .elseif \n == 32 - _cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32 - .else // \n == 64 && !LSB_CRC - _cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64 - .endif -.endif - -.if VL > 16 - vzeroupper // Needed when ymm or zmm registers may have been used. -.endif -#ifdef __i386__ - pop CONSTS_PTR -#endif - RET -.endm - -#ifdef CONFIG_AS_VPCLMULQDQ -#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \ -SYM_FUNC_START(prefix##_pclmul_sse); \ - _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \ -SYM_FUNC_END(prefix##_pclmul_sse); \ - \ -SYM_FUNC_START(prefix##_vpclmul_avx2); \ - _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=2; \ -SYM_FUNC_END(prefix##_vpclmul_avx2); \ - \ -SYM_FUNC_START(prefix##_vpclmul_avx512); \ - _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \ -SYM_FUNC_END(prefix##_vpclmul_avx512); -#else -#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \ -SYM_FUNC_START(prefix##_pclmul_sse); \ - _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \ -SYM_FUNC_END(prefix##_pclmul_sse); -#endif // !CONFIG_AS_VPCLMULQDQ diff --git a/arch/x86/lib/crc-pclmul-template.h b/arch/x86/lib/crc-pclmul-template.h deleted file mode 100644 index c5b3bfe11d8d..000000000000 --- a/arch/x86/lib/crc-pclmul-template.h +++ /dev/null @@ -1,76 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are - * instantiated by crc-pclmul-template.S - * - * Copyright 2025 Google LLC - * - * Author: Eric Biggers <ebiggers@google.com> - */ -#ifndef _CRC_PCLMUL_TEMPLATE_H -#define _CRC_PCLMUL_TEMPLATE_H - -#include <asm/cpufeatures.h> -#include <asm/simd.h> -#include <crypto/internal/simd.h> -#include <linux/static_call.h> -#include "crc-pclmul-consts.h" - -#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t) \ -crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len, \ - const void *consts_ptr); \ -crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len, \ - const void *consts_ptr); \ -crc_t prefix##_vpclmul_avx512(crc_t crc, const 
u8 *p, size_t len, \ - const void *consts_ptr); \ -DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse) - -#define INIT_CRC_PCLMUL(prefix) \ -do { \ - if (IS_ENABLED(CONFIG_AS_VPCLMULQDQ) && \ - boot_cpu_has(X86_FEATURE_VPCLMULQDQ) && \ - boot_cpu_has(X86_FEATURE_AVX2) && \ - cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL)) { \ - if (boot_cpu_has(X86_FEATURE_AVX512BW) && \ - boot_cpu_has(X86_FEATURE_AVX512VL) && \ - !boot_cpu_has(X86_FEATURE_PREFER_YMM) && \ - cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL)) { \ - static_call_update(prefix##_pclmul, \ - prefix##_vpclmul_avx512); \ - } else { \ - static_call_update(prefix##_pclmul, \ - prefix##_vpclmul_avx2); \ - } \ - } \ -} while (0) - -/* - * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16 - * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD. - * - * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions. - * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(), - * varying by CPU and factors such as which parts of the "FPU" state userspace - * has touched, which could result in a larger cutoff being better. Indeed, a - * larger cutoff is usually better for a *single* message. However, the - * overhead of the FPU section gets amortized if multiple FPU sections get - * executed before returning to userspace, since the XSAVE and XRSTOR occur only - * once. Considering that and the fact that the [V]PCLMULQDQ code is lighter on - * the dcache than the table-based code is, a 16-byte cutoff seems to work well. - */ -#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq) \ -do { \ - if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) && \ - crypto_simd_usable()) { \ - const void *consts_ptr; \ - \ - consts_ptr = (consts).fold_across_128_bits_consts; \ - kernel_fpu_begin(); \ - crc = static_call(prefix##_pclmul)((crc), (p), (len), \ - consts_ptr); \ - kernel_fpu_end(); \ - return crc; \ - } \ -} while (0) - -#endif /* _CRC_PCLMUL_TEMPLATE_H */ diff --git a/arch/x86/lib/crc-t10dif-glue.c b/arch/x86/lib/crc-t10dif-glue.c deleted file mode 100644 index f89c335cde3c..000000000000 --- a/arch/x86/lib/crc-t10dif-glue.c +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * CRC-T10DIF using [V]PCLMULQDQ instructions - * - * Copyright 2024 Google LLC - */ - -#include <linux/crc-t10dif.h> -#include <linux/module.h> -#include "crc-pclmul-template.h" - -static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); - -DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16); - -u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len) -{ - CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts, - have_pclmulqdq); - return crc_t10dif_generic(crc, p, len); -} -EXPORT_SYMBOL(crc_t10dif_arch); - -static int __init crc_t10dif_x86_init(void) -{ - if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { - static_branch_enable(&have_pclmulqdq); - INIT_CRC_PCLMUL(crc16_msb); - } - return 0; -} -arch_initcall(crc_t10dif_x86_init); - -static void __exit crc_t10dif_x86_exit(void) -{ -} -module_exit(crc_t10dif_x86_exit); - -MODULE_DESCRIPTION("CRC-T10DIF using [V]PCLMULQDQ instructions"); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/lib/crc16-msb-pclmul.S b/arch/x86/lib/crc16-msb-pclmul.S deleted file mode 100644 index e9fe248093a8..000000000000 --- a/arch/x86/lib/crc16-msb-pclmul.S +++ /dev/null @@ -1,6 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -// Copyright 2025 Google LLC - -#include "crc-pclmul-template.S" - -DEFINE_CRC_PCLMUL_FUNCS(crc16_msb, /* bits= */ 16, /* 
lsb= */ 0) diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c deleted file mode 100644 index e3f93b17ac3f..000000000000 --- a/arch/x86/lib/crc32-glue.c +++ /dev/null @@ -1,111 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * x86-optimized CRC32 functions - * - * Copyright (C) 2008 Intel Corporation - * Copyright 2012 Xyratex Technology Limited - * Copyright 2024 Google LLC - */ - -#include <linux/crc32.h> -#include <linux/module.h> -#include "crc-pclmul-template.h" - -static DEFINE_STATIC_KEY_FALSE(have_crc32); -static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); - -DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32); - -u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) -{ - CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts, - have_pclmulqdq); - return crc32_le_base(crc, p, len); -} -EXPORT_SYMBOL(crc32_le_arch); - -#ifdef CONFIG_X86_64 -#define CRC32_INST "crc32q %1, %q0" -#else -#define CRC32_INST "crc32l %1, %0" -#endif - -/* - * Use carryless multiply version of crc32c when buffer size is >= 512 to - * account for FPU state save/restore overhead. - */ -#define CRC32C_PCLMUL_BREAKEVEN 512 - -asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len); - -u32 crc32c_arch(u32 crc, const u8 *p, size_t len) -{ - size_t num_longs; - - if (!static_branch_likely(&have_crc32)) - return crc32c_base(crc, p, len); - - if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN && - static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) { - kernel_fpu_begin(); - crc = crc32c_x86_3way(crc, p, len); - kernel_fpu_end(); - return crc; - } - - for (num_longs = len / sizeof(unsigned long); - num_longs != 0; num_longs--, p += sizeof(unsigned long)) - asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p)); - - if (sizeof(unsigned long) > 4 && (len & 4)) { - asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p)); - p += 4; - } - if (len & 2) { - asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p)); - p += 2; - } - if (len & 1) - asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p)); - - return crc; -} -EXPORT_SYMBOL(crc32c_arch); - -u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) -{ - return crc32_be_base(crc, p, len); -} -EXPORT_SYMBOL(crc32_be_arch); - -static int __init crc32_x86_init(void) -{ - if (boot_cpu_has(X86_FEATURE_XMM4_2)) - static_branch_enable(&have_crc32); - if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { - static_branch_enable(&have_pclmulqdq); - INIT_CRC_PCLMUL(crc32_lsb); - } - return 0; -} -arch_initcall(crc32_x86_init); - -static void __exit crc32_x86_exit(void) -{ -} -module_exit(crc32_x86_exit); - -u32 crc32_optimizations(void) -{ - u32 optimizations = 0; - - if (static_key_enabled(&have_crc32)) - optimizations |= CRC32C_OPTIMIZATION; - if (static_key_enabled(&have_pclmulqdq)) - optimizations |= CRC32_LE_OPTIMIZATION; - return optimizations; -} -EXPORT_SYMBOL(crc32_optimizations); - -MODULE_DESCRIPTION("x86-optimized CRC32 functions"); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/lib/crc32-pclmul.S b/arch/x86/lib/crc32-pclmul.S deleted file mode 100644 index f20f40fb0172..000000000000 --- a/arch/x86/lib/crc32-pclmul.S +++ /dev/null @@ -1,6 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -// Copyright 2025 Google LLC - -#include "crc-pclmul-template.S" - -DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1) diff --git a/arch/x86/lib/crc32c-3way.S b/arch/x86/lib/crc32c-3way.S deleted file mode 100644 index 9b8770503bbc..000000000000 --- a/arch/x86/lib/crc32c-3way.S +++ /dev/null @@ -1,360 +0,0 
@@ -/* - * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) - * - * The white papers on CRC32C calculations with PCLMULQDQ instruction can be - * downloaded from: - * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf - * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf - * - * Copyright (C) 2012 Intel Corporation. - * Copyright 2024 Google LLC - * - * Authors: - * Wajdi Feghali <wajdi.k.feghali@intel.com> - * James Guilford <james.guilford@intel.com> - * David Cote <david.m.cote@intel.com> - * Tim Chen <tim.c.chen@linux.intel.com> - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include <linux/linkage.h> - -## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction - -# Define threshold below which buffers are considered "small" and routed to -# regular CRC code that does not interleave the CRC instructions. -#define SMALL_SIZE 200 - -# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len); - -.text -SYM_FUNC_START(crc32c_x86_3way) -#define crc0 %edi -#define crc0_q %rdi -#define bufp %rsi -#define bufp_d %esi -#define len %rdx -#define len_dw %edx -#define n_misaligned %ecx /* overlaps chunk_bytes! */ -#define n_misaligned_q %rcx -#define chunk_bytes %ecx /* overlaps n_misaligned! */ -#define chunk_bytes_q %rcx -#define crc1 %r8 -#define crc2 %r9 - - cmp $SMALL_SIZE, len - jb .Lsmall - - ################################################################ - ## 1) ALIGN: - ################################################################ - mov bufp_d, n_misaligned - neg n_misaligned - and $7, n_misaligned # calculate the misalignment amount of - # the address - je .Laligned # Skip if aligned - - # Process 1 <= n_misaligned <= 7 bytes individually in order to align - # the remaining data to an 8-byte boundary. 
-.Ldo_align: - movq (bufp), %rax - add n_misaligned_q, bufp - sub n_misaligned_q, len -.Lalign_loop: - crc32b %al, crc0 # compute crc32 of 1-byte - shr $8, %rax # get next byte - dec n_misaligned - jne .Lalign_loop -.Laligned: - - ################################################################ - ## 2) PROCESS BLOCK: - ################################################################ - - cmp $128*24, len - jae .Lfull_block - -.Lpartial_block: - # Compute floor(len / 24) to get num qwords to process from each lane. - imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24) - shr $16, %eax - jmp .Lcrc_3lanes - -.Lfull_block: - # Processing 128 qwords from each lane. - mov $128, %eax - - ################################################################ - ## 3) CRC each of three lanes: - ################################################################ - -.Lcrc_3lanes: - xor crc1,crc1 - xor crc2,crc2 - mov %eax, chunk_bytes - shl $3, chunk_bytes # num bytes to process from each lane - sub $5, %eax # 4 for 4x_loop, 1 for special last iter - jl .Lcrc_3lanes_4x_done - - # Unroll the loop by a factor of 4 to reduce the overhead of the loop - # bookkeeping instructions, which can compete with crc32q for the ALUs. -.Lcrc_3lanes_4x_loop: - crc32q (bufp), crc0_q - crc32q (bufp,chunk_bytes_q), crc1 - crc32q (bufp,chunk_bytes_q,2), crc2 - crc32q 8(bufp), crc0_q - crc32q 8(bufp,chunk_bytes_q), crc1 - crc32q 8(bufp,chunk_bytes_q,2), crc2 - crc32q 16(bufp), crc0_q - crc32q 16(bufp,chunk_bytes_q), crc1 - crc32q 16(bufp,chunk_bytes_q,2), crc2 - crc32q 24(bufp), crc0_q - crc32q 24(bufp,chunk_bytes_q), crc1 - crc32q 24(bufp,chunk_bytes_q,2), crc2 - add $32, bufp - sub $4, %eax - jge .Lcrc_3lanes_4x_loop - -.Lcrc_3lanes_4x_done: - add $4, %eax - jz .Lcrc_3lanes_last_qword - -.Lcrc_3lanes_1x_loop: - crc32q (bufp), crc0_q - crc32q (bufp,chunk_bytes_q), crc1 - crc32q (bufp,chunk_bytes_q,2), crc2 - add $8, bufp - dec %eax - jnz .Lcrc_3lanes_1x_loop - -.Lcrc_3lanes_last_qword: - crc32q (bufp), crc0_q - crc32q (bufp,chunk_bytes_q), crc1 -# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet - - ################################################################ - ## 4) Combine three results: - ################################################################ - - lea (K_table-8)(%rip), %rax # first entry is for idx 1 - pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2 - lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3 - sub %rax, len # len -= chunk_bytes * 3 - - movq crc0_q, %xmm1 # CRC for block 1 - pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 - - movq crc1, %xmm2 # CRC for block 2 - pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 - - pxor %xmm2,%xmm1 - movq %xmm1, %rax - xor (bufp,chunk_bytes_q,2), %rax - mov crc2, crc0_q - crc32 %rax, crc0_q - lea 8(bufp,chunk_bytes_q,2), bufp - - ################################################################ - ## 5) If more blocks remain, goto (2): - ################################################################ - - cmp $128*24, len - jae .Lfull_block - cmp $SMALL_SIZE, len - jae .Lpartial_block - - ####################################################################### - ## 6) Process any remainder without interleaving: - ####################################################################### -.Lsmall: - test len_dw, len_dw - jz .Ldone - mov len_dw, %eax - shr $3, %eax - jz .Ldo_dword -.Ldo_qwords: - crc32q (bufp), crc0_q - add $8, bufp - dec %eax - jnz .Ldo_qwords -.Ldo_dword: - test $4, len_dw - jz .Ldo_word - crc32l (bufp), crc0 - add $4, bufp -.Ldo_word: - 
test $2, len_dw - jz .Ldo_byte - crc32w (bufp), crc0 - add $2, bufp -.Ldo_byte: - test $1, len_dw - jz .Ldone - crc32b (bufp), crc0 -.Ldone: - mov crc0, %eax - RET -SYM_FUNC_END(crc32c_x86_3way) - -.section .rodata, "a", @progbits - ################################################################ - ## PCLMULQDQ tables - ## Table is 128 entries x 2 words (8 bytes) each - ################################################################ -.align 8 -K_table: - .long 0x493c7d27, 0x00000001 - .long 0xba4fc28e, 0x493c7d27 - .long 0xddc0152b, 0xf20c0dfe - .long 0x9e4addf8, 0xba4fc28e - .long 0x39d3b296, 0x3da6d0cb - .long 0x0715ce53, 0xddc0152b - .long 0x47db8317, 0x1c291d04 - .long 0x0d3b6092, 0x9e4addf8 - .long 0xc96cfdc0, 0x740eef02 - .long 0x878a92a7, 0x39d3b296 - .long 0xdaece73e, 0x083a6eec - .long 0xab7aff2a, 0x0715ce53 - .long 0x2162d385, 0xc49f4f67 - .long 0x83348832, 0x47db8317 - .long 0x299847d5, 0x2ad91c30 - .long 0xb9e02b86, 0x0d3b6092 - .long 0x18b33a4e, 0x6992cea2 - .long 0xb6dd949b, 0xc96cfdc0 - .long 0x78d9ccb7, 0x7e908048 - .long 0xbac2fd7b, 0x878a92a7 - .long 0xa60ce07b, 0x1b3d8f29 - .long 0xce7f39f4, 0xdaece73e - .long 0x61d82e56, 0xf1d0f55e - .long 0xd270f1a2, 0xab7aff2a - .long 0xc619809d, 0xa87ab8a8 - .long 0x2b3cac5d, 0x2162d385 - .long 0x65863b64, 0x8462d800 - .long 0x1b03397f, 0x83348832 - .long 0xebb883bd, 0x71d111a8 - .long 0xb3e32c28, 0x299847d5 - .long 0x064f7f26, 0xffd852c6 - .long 0xdd7e3b0c, 0xb9e02b86 - .long 0xf285651c, 0xdcb17aa4 - .long 0x10746f3c, 0x18b33a4e - .long 0xc7a68855, 0xf37c5aee - .long 0x271d9844, 0xb6dd949b - .long 0x8e766a0c, 0x6051d5a2 - .long 0x93a5f730, 0x78d9ccb7 - .long 0x6cb08e5c, 0x18b0d4ff - .long 0x6b749fb2, 0xbac2fd7b - .long 0x1393e203, 0x21f3d99c - .long 0xcec3662e, 0xa60ce07b - .long 0x96c515bb, 0x8f158014 - .long 0xe6fc4e6a, 0xce7f39f4 - .long 0x8227bb8a, 0xa00457f7 - .long 0xb0cd4768, 0x61d82e56 - .long 0x39c7ff35, 0x8d6d2c43 - .long 0xd7a4825c, 0xd270f1a2 - .long 0x0ab3844b, 0x00ac29cf - .long 0x0167d312, 0xc619809d - .long 0xf6076544, 0xe9adf796 - .long 0x26f6a60a, 0x2b3cac5d - .long 0xa741c1bf, 0x96638b34 - .long 0x98d8d9cb, 0x65863b64 - .long 0x49c3cc9c, 0xe0e9f351 - .long 0x68bce87a, 0x1b03397f - .long 0x57a3d037, 0x9af01f2d - .long 0x6956fc3b, 0xebb883bd - .long 0x42d98888, 0x2cff42cf - .long 0x3771e98f, 0xb3e32c28 - .long 0xb42ae3d9, 0x88f25a3a - .long 0x2178513a, 0x064f7f26 - .long 0xe0ac139e, 0x4e36f0b0 - .long 0x170076fa, 0xdd7e3b0c - .long 0x444dd413, 0xbd6f81f8 - .long 0x6f345e45, 0xf285651c - .long 0x41d17b64, 0x91c9bd4b - .long 0xff0dba97, 0x10746f3c - .long 0xa2b73df1, 0x885f087b - .long 0xf872e54c, 0xc7a68855 - .long 0x1e41e9fc, 0x4c144932 - .long 0x86d8e4d2, 0x271d9844 - .long 0x651bd98b, 0x52148f02 - .long 0x5bb8f1bc, 0x8e766a0c - .long 0xa90fd27a, 0xa3c6f37a - .long 0xb3af077a, 0x93a5f730 - .long 0x4984d782, 0xd7c0557f - .long 0xca6ef3ac, 0x6cb08e5c - .long 0x234e0b26, 0x63ded06a - .long 0xdd66cbbb, 0x6b749fb2 - .long 0x4597456a, 0x4d56973c - .long 0xe9e28eb4, 0x1393e203 - .long 0x7b3ff57a, 0x9669c9df - .long 0xc9c8b782, 0xcec3662e - .long 0x3f70cc6f, 0xe417f38a - .long 0x93e106a4, 0x96c515bb - .long 0x62ec6c6d, 0x4b9e0f71 - .long 0xd813b325, 0xe6fc4e6a - .long 0x0df04680, 0xd104b8fc - .long 0x2342001e, 0x8227bb8a - .long 0x0a2a8d7e, 0x5b397730 - .long 0x6d9a4957, 0xb0cd4768 - .long 0xe8b6368b, 0xe78eb416 - .long 0xd2c3ed1a, 0x39c7ff35 - .long 0x995a5724, 0x61ff0e01 - .long 0x9ef68d35, 0xd7a4825c - .long 0x0c139b31, 0x8d96551c - .long 0xf2271e60, 0x0ab3844b - .long 0x0b0bf8ca, 0x0bf80dd2 - .long 0x2664fd8b, 
0x0167d312 - .long 0xed64812d, 0x8821abed - .long 0x02ee03b2, 0xf6076544 - .long 0x8604ae0f, 0x6a45d2b2 - .long 0x363bd6b3, 0x26f6a60a - .long 0x135c83fd, 0xd8d26619 - .long 0x5fabe670, 0xa741c1bf - .long 0x35ec3279, 0xde87806c - .long 0x00bcf5f6, 0x98d8d9cb - .long 0x8ae00689, 0x14338754 - .long 0x17f27698, 0x49c3cc9c - .long 0x58ca5f00, 0x5bd2011f - .long 0xaa7c7ad5, 0x68bce87a - .long 0xb5cfca28, 0xdd07448e - .long 0xded288f8, 0x57a3d037 - .long 0x59f229bc, 0xdde8f5b9 - .long 0x6d390dec, 0x6956fc3b - .long 0x37170390, 0xa3e3e02c - .long 0x6353c1cc, 0x42d98888 - .long 0xc4584f5c, 0xd73c7bea - .long 0xf48642e9, 0x3771e98f - .long 0x531377e2, 0x80ff0093 - .long 0xdd35bc8d, 0xb42ae3d9 - .long 0xb25b29f2, 0x8fe4c34d - .long 0x9a5ede41, 0x2178513a - .long 0xa563905d, 0xdf99fc11 - .long 0x45cddf4e, 0xe0ac139e - .long 0xacfa3103, 0x6c23e841 - .long 0xa51b6135, 0x170076fa diff --git a/arch/x86/lib/crc64-glue.c b/arch/x86/lib/crc64-glue.c deleted file mode 100644 index b0e1b719ecbf..000000000000 --- a/arch/x86/lib/crc64-glue.c +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * CRC64 using [V]PCLMULQDQ instructions - * - * Copyright 2025 Google LLC - */ - -#include <linux/crc64.h> -#include <linux/module.h> -#include "crc-pclmul-template.h" - -static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); - -DECLARE_CRC_PCLMUL_FUNCS(crc64_msb, u64); -DECLARE_CRC_PCLMUL_FUNCS(crc64_lsb, u64); - -u64 crc64_be_arch(u64 crc, const u8 *p, size_t len) -{ - CRC_PCLMUL(crc, p, len, crc64_msb, crc64_msb_0x42f0e1eba9ea3693_consts, - have_pclmulqdq); - return crc64_be_generic(crc, p, len); -} -EXPORT_SYMBOL_GPL(crc64_be_arch); - -u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len) -{ - CRC_PCLMUL(crc, p, len, crc64_lsb, crc64_lsb_0x9a6c9329ac4bc9b5_consts, - have_pclmulqdq); - return crc64_nvme_generic(crc, p, len); -} -EXPORT_SYMBOL_GPL(crc64_nvme_arch); - -static int __init crc64_x86_init(void) -{ - if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { - static_branch_enable(&have_pclmulqdq); - INIT_CRC_PCLMUL(crc64_msb); - INIT_CRC_PCLMUL(crc64_lsb); - } - return 0; -} -arch_initcall(crc64_x86_init); - -static void __exit crc64_x86_exit(void) -{ -} -module_exit(crc64_x86_exit); - -MODULE_DESCRIPTION("CRC64 using [V]PCLMULQDQ instructions"); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/lib/crc64-pclmul.S b/arch/x86/lib/crc64-pclmul.S deleted file mode 100644 index 4173051b5197..000000000000 --- a/arch/x86/lib/crc64-pclmul.S +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -// Copyright 2025 Google LLC - -#include "crc-pclmul-template.S" - -DEFINE_CRC_PCLMUL_FUNCS(crc64_msb, /* bits= */ 64, /* lsb= */ 0) -DEFINE_CRC_PCLMUL_FUNCS(crc64_lsb, /* bits= */ 64, /* lsb= */ 1) diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index e86eda2c0b04..eb2d2e1cbddd 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -75,7 +75,7 @@ static void delay_tsc(u64 cycles) /* Allow RT tasks to run */ preempt_enable(); - rep_nop(); + native_pause(); preempt_disable(); /* diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 98631c0e7a11..4e385cbfd444 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -13,6 +13,7 @@ #include <asm/insn.h> #include <asm/insn-eval.h> #include <asm/ldt.h> +#include <asm/msr.h> #include <asm/vm86.h> #undef pr_fmt @@ -631,14 +632,21 @@ static bool get_desc(struct desc_struct *out, unsigned short sel) /* Bits [15:3] contain the index of the desired entry. 
*/ sel >>= 3; - mutex_lock(¤t->active_mm->context.lock); - ldt = current->active_mm->context.ldt; + /* + * If we're not in a valid context with a real (not just lazy) + * user mm, then don't even try. + */ + if (!nmi_uaccess_okay()) + return false; + + mutex_lock(¤t->mm->context.lock); + ldt = current->mm->context.ldt; if (ldt && sel < ldt->nr_entries) { *out = ldt->entries[sel]; success = true; } - mutex_unlock(¤t->active_mm->context.lock); + mutex_unlock(¤t->mm->context.lock); return success; } @@ -702,16 +710,16 @@ unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) unsigned long base; if (seg_reg_idx == INAT_SEG_REG_FS) { - rdmsrl(MSR_FS_BASE, base); + rdmsrq(MSR_FS_BASE, base); } else if (seg_reg_idx == INAT_SEG_REG_GS) { /* * swapgs was called at the kernel entry point. Thus, * MSR_KERNEL_GS_BASE will have the user-space GS base. */ if (user_mode(regs)) - rdmsrl(MSR_KERNEL_GS_BASE, base); + rdmsrq(MSR_KERNEL_GS_BASE, base); else - rdmsrl(MSR_GS_BASE, base); + rdmsrq(MSR_GS_BASE, base); } else { base = 0; } diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 6ffb931b9fb1..149a57e334ab 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -324,6 +324,11 @@ int insn_get_opcode(struct insn *insn) } insn->attr = inat_get_opcode_attribute(op); + if (insn->x86_64 && inat_is_invalid64(insn->attr)) { + /* This instruction is invalid, like UD2. Stop decoding. */ + insn->attr &= INAT_INV64; + } + while (inat_is_escape(insn->attr)) { /* Get escaped opcode */ op = get_next(insn_byte_t, insn); @@ -337,6 +342,7 @@ int insn_get_opcode(struct insn *insn) insn->attr = 0; return -EINVAL; } + end: opcode->got = 1; return 0; @@ -658,7 +664,6 @@ int insn_get_immediate(struct insn *insn) } if (!inat_has_immediate(insn->attr)) - /* no immediates */ goto done; switch (inat_immediate_size(insn->attr)) { diff --git a/arch/x86/lib/iomem.c b/arch/x86/lib/iomem.c index 5eecb45d05d5..c20e04764edc 100644 --- a/arch/x86/lib/iomem.c +++ b/arch/x86/lib/iomem.c @@ -10,7 +10,7 @@ static __always_inline void rep_movs(void *to, const void *from, size_t n) { unsigned long d0, d1, d2; - asm volatile("rep ; movsl\n\t" + asm volatile("rep movsl\n\t" "testb $2,%b4\n\t" "je 1f\n\t" "movsw\n" diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c index a58f451a7dd3..b5893928d55c 100644 --- a/arch/x86/lib/kaslr.c +++ b/arch/x86/lib/kaslr.c @@ -8,7 +8,7 @@ */ #include <asm/asm.h> #include <asm/kaslr.h> -#include <asm/msr.h> +#include <asm/tsc.h> #include <asm/archrandom.h> #include <asm/e820/api.h> #include <asm/shared/io.h> diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 0ae2e1712e2e..12a23fa7c44c 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -41,6 +41,7 @@ SYM_FUNC_END(__memcpy) EXPORT_SYMBOL(__memcpy) SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy) +SYM_PIC_ALIAS(memcpy) EXPORT_SYMBOL(memcpy) SYM_FUNC_START_LOCAL(memcpy_orig) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index d66b710d628f..fb5a03cf5ab7 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -42,6 +42,7 @@ SYM_FUNC_END(__memset) EXPORT_SYMBOL(__memset) SYM_FUNC_ALIAS_MEMFUNC(memset, __memset) +SYM_PIC_ALIAS(memset) EXPORT_SYMBOL(memset) SYM_FUNC_START_LOCAL(memset_orig) diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c index acd463d887e1..b8f63419e6ae 100644 --- a/arch/x86/lib/msr-smp.c +++ b/arch/x86/lib/msr-smp.c @@ -47,7 +47,7 @@ int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) } EXPORT_SYMBOL(rdmsr_on_cpu); -int 
rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) +int rdmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { int err; struct msr_info rv; @@ -60,7 +60,7 @@ int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) return err; } -EXPORT_SYMBOL(rdmsrl_on_cpu); +EXPORT_SYMBOL(rdmsrq_on_cpu); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { @@ -78,7 +78,7 @@ int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) } EXPORT_SYMBOL(wrmsr_on_cpu); -int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) +int wrmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { int err; struct msr_info rv; @@ -92,7 +92,7 @@ int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) return err; } -EXPORT_SYMBOL(wrmsrl_on_cpu); +EXPORT_SYMBOL(wrmsrq_on_cpu); static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs, @@ -204,7 +204,7 @@ int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) } EXPORT_SYMBOL(wrmsr_safe_on_cpu); -int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) +int wrmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { int err; struct msr_info rv; @@ -218,9 +218,9 @@ int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) return err ? err : rv.err; } -EXPORT_SYMBOL(wrmsrl_safe_on_cpu); +EXPORT_SYMBOL(wrmsrq_safe_on_cpu); -int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) +int rdmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { u32 low, high; int err; @@ -230,7 +230,7 @@ int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) return err; } -EXPORT_SYMBOL(rdmsrl_safe_on_cpu); +EXPORT_SYMBOL(rdmsrq_safe_on_cpu); /* * These variants are significantly slower, but allows control over diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 5a18ecc04a6c..4ef7c6dcbea6 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -41,7 +41,7 @@ static int msr_read(u32 msr, struct msr *m) int err; u64 val; - err = rdmsrl_safe(msr, &val); + err = rdmsrq_safe(msr, &val); if (!err) m->q = val; @@ -58,7 +58,7 @@ static int msr_read(u32 msr, struct msr *m) */ static int msr_write(u32 msr, struct msr *m) { - return wrmsrl_safe(msr, m->q); + return wrmsrq_safe(msr, m->q); } static inline int __flip_bit(u32 msr, u8 bit, bool set) @@ -122,23 +122,23 @@ int msr_clear_bit(u32 msr, u8 bit) EXPORT_SYMBOL_GPL(msr_clear_bit); #ifdef CONFIG_TRACEPOINTS -void do_trace_write_msr(unsigned int msr, u64 val, int failed) +void do_trace_write_msr(u32 msr, u64 val, int failed) { trace_write_msr(msr, val, failed); } EXPORT_SYMBOL(do_trace_write_msr); EXPORT_TRACEPOINT_SYMBOL(write_msr); -void do_trace_read_msr(unsigned int msr, u64 val, int failed) +void do_trace_read_msr(u32 msr, u64 val, int failed) { trace_read_msr(msr, val, failed); } EXPORT_SYMBOL(do_trace_read_msr); EXPORT_TRACEPOINT_SYMBOL(read_msr); -void do_trace_rdpmc(unsigned counter, u64 val, int failed) +void do_trace_rdpmc(u32 msr, u64 val, int failed) { - trace_rdpmc(counter, val, failed); + trace_rdpmc(msr, val, failed); } EXPORT_SYMBOL(do_trace_rdpmc); EXPORT_TRACEPOINT_SYMBOL(rdpmc); diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index a26c43abd47d..d78d769a02bd 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -40,6 +40,7 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL) ALTERNATIVE_2 __stringify(RETPOLINE \reg), \ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_LFENCE, \ __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), ALT_NOT(X86_FEATURE_RETPOLINE) 
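The msr-smp.c and msr.c hunks above are a mechanical rename of the 64-bit MSR helpers from the legacy "l" (long) suffix to the "q" (quadword) suffix: rdmsrl_on_cpu() becomes rdmsrq_on_cpu(), wrmsrl_safe_on_cpu() becomes wrmsrq_safe_on_cpu(), and so on, with signatures and semantics unchanged. A minimal caller-side sketch of the rename; the wrapper function and the use of MSR_IA32_TSC are illustrative only and not part of this patch:

#include <linux/types.h>
#include <asm/msr.h>

/* Hypothetical helper: read the TSC MSR of a remote CPU with the new name. */
static int example_read_tsc_on_cpu(unsigned int cpu, u64 *val)
{
        /* Formerly rdmsrl_safe_on_cpu(); only the suffix changed. */
        return rdmsrq_safe_on_cpu(cpu, MSR_IA32_TSC, val);
}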
+SYM_PIC_ALIAS(__x86_indirect_thunk_\reg) .endm @@ -367,6 +368,54 @@ SYM_FUNC_END(call_depth_return_thunk) #endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ +#ifdef CONFIG_MITIGATION_ITS + +.macro ITS_THUNK reg + +/* + * If CFI paranoid is used then the ITS thunk starts with opcodes (0xea; jne 1b) + * that complete the fineibt_paranoid caller sequence. + */ +1: .byte 0xea +SYM_INNER_LABEL(__x86_indirect_paranoid_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + jne 1b +SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + ANNOTATE_RETPOLINE_SAFE + jmp *%\reg + int3 + .align 32, 0xcc /* fill to the end of the line */ + .skip 32 - (__x86_indirect_its_thunk_\reg - 1b), 0xcc /* skip to the next upper half */ +.endm + +/* ITS mitigation requires thunks be aligned to upper half of cacheline */ +.align 64, 0xcc +.skip 29, 0xcc + +#define GEN(reg) ITS_THUNK reg +#include <asm/GEN-for-each-reg.h> +#undef GEN + + .align 64, 0xcc +SYM_FUNC_ALIAS(__x86_indirect_its_thunk_array, __x86_indirect_its_thunk_rax) +SYM_CODE_END(__x86_indirect_its_thunk_array) + +.align 64, 0xcc +.skip 32, 0xcc +SYM_CODE_START(its_return_thunk) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(its_return_thunk) +EXPORT_SYMBOL(its_return_thunk) + +#endif /* CONFIG_MITIGATION_ITS */ + /* * This function name is magical and is used by -mfunction-return=thunk-extern * for the compiler to generate JMPs to it. @@ -394,6 +443,7 @@ SYM_CODE_START(__x86_return_thunk) #endif int3 SYM_CODE_END(__x86_return_thunk) +SYM_PIC_ALIAS(__x86_return_thunk) EXPORT_SYMBOL(__x86_return_thunk) #endif /* CONFIG_MITIGATION_RETHUNK */ diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c index 53b3f202267c..f87ec24fa579 100644 --- a/arch/x86/lib/string_32.c +++ b/arch/x86/lib/string_32.c @@ -40,8 +40,7 @@ char *strncpy(char *dest, const char *src, size_t count) "stosb\n\t" "testb %%al,%%al\n\t" "jne 1b\n\t" - "rep\n\t" - "stosb\n" + "rep stosb\n" "2:" : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3) : "0" (src), "1" (dest), "2" (count) : "memory"); @@ -54,8 +53,7 @@ EXPORT_SYMBOL(strncpy); char *strcat(char *dest, const char *src) { int d0, d1, d2, d3; - asm volatile("repne\n\t" - "scasb\n\t" + asm volatile("repne scasb\n\t" "decl %1\n" "1:\tlodsb\n\t" "stosb\n\t" @@ -72,8 +70,7 @@ EXPORT_SYMBOL(strcat); char *strncat(char *dest, const char *src, size_t count) { int d0, d1, d2, d3; - asm volatile("repne\n\t" - "scasb\n\t" + asm volatile("repne scasb\n\t" "decl %1\n\t" "movl %8,%3\n" "1:\tdecl %3\n\t" @@ -167,8 +164,7 @@ size_t strlen(const char *s) { int d0; size_t res; - asm volatile("repne\n\t" - "scasb" + asm volatile("repne scasb" : "=c" (res), "=&D" (d0) : "1" (s), "a" (0), "0" (0xffffffffu) : "memory"); @@ -184,8 +180,7 @@ void *memchr(const void *cs, int c, size_t count) void *res; if (!count) return NULL; - asm volatile("repne\n\t" - "scasb\n\t" + asm volatile("repne scasb\n\t" "je 1f\n\t" "movl $1,%0\n" "1:\tdecl %0" @@ -202,7 +197,7 @@ void *memscan(void *addr, int c, size_t size) { if (!size) return addr; - asm volatile("repnz; scasb\n\t" + asm volatile("repnz scasb\n\t" "jnz 1f\n\t" "dec %%edi\n" "1:" diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c index 38f37df056f7..28267985e85f 100644 --- a/arch/x86/lib/strstr_32.c +++ b/arch/x86/lib/strstr_32.c @@ -8,16 +8,14 @@ int d0, d1; register char *__res; __asm__ __volatile__( "movl %6,%%edi\n\t" - "repne\n\t" - "scasb\n\t" + "repne scasb\n\t" "notl %%ecx\n\t" 
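The string_32.c and strstr_32.c hunks around this point, like the usercopy_32.c hunk further below, are purely a spelling change: the REP/REPNE prefix is written on the same line as the string instruction ("repne scasb", "rep movsl") instead of as a separate line or with a semicolon. A self-contained sketch of that idiom in plain GCC inline asm; the helper name is made up for illustration:

#include <linux/types.h>

/* Hypothetical byte-copy helper using the single-mnemonic REP form. */
static inline void example_rep_movsb(void *to, const void *from, size_t n)
{
        asm volatile("rep movsb"
                     : "+D" (to), "+S" (from), "+c" (n)
                     : : "memory");
}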
"decl %%ecx\n\t" /* NOTE! This also sets Z if searchstring='' */ "movl %%ecx,%%edx\n" "1:\tmovl %6,%%edi\n\t" "movl %%esi,%%eax\n\t" "movl %%edx,%%ecx\n\t" - "repe\n\t" - "cmpsb\n\t" + "repe cmpsb\n\t" "je 2f\n\t" /* also works for empty string, see above */ "xchgl %%eax,%%esi\n\t" "incl %%esi\n\t" diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 422257c350c6..f6f436f1d573 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -38,9 +38,9 @@ do { \ might_fault(); \ __asm__ __volatile__( \ ASM_STAC "\n" \ - "0: rep; stosl\n" \ + "0: rep stosl\n" \ " movl %2,%0\n" \ - "1: rep; stosb\n" \ + "1: rep stosb\n" \ "2: " ASM_CLAC "\n" \ _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN4, %2) \ _ASM_EXTABLE_UA(1b, 2b) \ @@ -140,9 +140,9 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size) " shrl $2, %0\n" " andl $3, %%eax\n" " cld\n" - "99: rep; movsl\n" + "99: rep movsl\n" "36: movl %%eax, %0\n" - "37: rep; movsb\n" + "37: rep movsb\n" "100:\n" _ASM_EXTABLE_UA(1b, 100b) _ASM_EXTABLE_UA(2b, 100b) @@ -242,9 +242,9 @@ static unsigned long __copy_user_intel_nocache(void *to, " shrl $2, %0\n" " andl $3, %%eax\n" " cld\n" - "6: rep; movsl\n" + "6: rep movsl\n" " movl %%eax,%0\n" - "7: rep; movsb\n" + "7: rep movsb\n" "8:\n" _ASM_EXTABLE_UA(0b, 8b) _ASM_EXTABLE_UA(1b, 8b) @@ -293,14 +293,14 @@ do { \ " negl %0\n" \ " andl $7,%0\n" \ " subl %0,%3\n" \ - "4: rep; movsb\n" \ + "4: rep movsb\n" \ " movl %3,%0\n" \ " shrl $2,%0\n" \ " andl $3,%3\n" \ " .align 2,0x90\n" \ - "0: rep; movsl\n" \ + "0: rep movsl\n" \ " movl %3,%0\n" \ - "1: rep; movsb\n" \ + "1: rep movsb\n" \ "2:\n" \ _ASM_EXTABLE_TYPE_REG(4b, 2b, EX_TYPE_UCOPY_LEN1, %3) \ _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN4, %3) \ diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index caedb3ef6688..262f7ca1fb95 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -35,7 +35,7 @@ # - (!F3) : the last prefix is not 0xF3 (including non-last prefix case) # - (66&F2): Both 0x66 and 0xF2 prefixes are specified. # -# REX2 Prefix +# REX2 Prefix Superscripts # - (!REX2): REX2 is not allowed # - (REX2): REX2 variant e.g. JMPABS @@ -147,7 +147,7 @@ AVXcode: # 0x60 - 0x6f 60: PUSHA/PUSHAD (i64) 61: POPA/POPAD (i64) -62: BOUND Gv,Ma (i64) | EVEX (Prefix) +62: BOUND Gv,Ma (i64) | EVEX (Prefix),(o64) 63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64) 64: SEG=FS (Prefix) 65: SEG=GS (Prefix) @@ -253,8 +253,8 @@ c0: Grp2 Eb,Ib (1A) c1: Grp2 Ev,Ib (1A) c2: RETN Iw (f64) c3: RETN -c4: LES Gz,Mp (i64) | VEX+2byte (Prefix) -c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix) +c4: LES Gz,Mp (i64) | VEX+2byte (Prefix),(o64) +c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix),(o64) c6: Grp11A Eb,Ib (1A) c7: Grp11B Ev,Iz (1A) c8: ENTER Iw,Ib @@ -286,10 +286,10 @@ df: ESC # Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix # in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation # to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD. -e0: LOOPNE/LOOPNZ Jb (f64) (!REX2) -e1: LOOPE/LOOPZ Jb (f64) (!REX2) -e2: LOOP Jb (f64) (!REX2) -e3: JrCXZ Jb (f64) (!REX2) +e0: LOOPNE/LOOPNZ Jb (f64),(!REX2) +e1: LOOPE/LOOPZ Jb (f64),(!REX2) +e2: LOOP Jb (f64),(!REX2) +e3: JrCXZ Jb (f64),(!REX2) e4: IN AL,Ib (!REX2) e5: IN eAX,Ib (!REX2) e6: OUT Ib,AL (!REX2) @@ -298,10 +298,10 @@ e7: OUT Ib,eAX (!REX2) # in "near" jumps and calls is 16-bit. 
For CALL, # push of return address is 16-bit wide, RSP is decremented by 2 # but is not truncated to 16 bits, unlike RIP. -e8: CALL Jz (f64) (!REX2) -e9: JMP-near Jz (f64) (!REX2) -ea: JMP-far Ap (i64) (!REX2) -eb: JMP-short Jb (f64) (!REX2) +e8: CALL Jz (f64),(!REX2) +e9: JMP-near Jz (f64),(!REX2) +ea: JMP-far Ap (i64),(!REX2) +eb: JMP-short Jb (f64),(!REX2) ec: IN AL,DX (!REX2) ed: IN eAX,DX (!REX2) ee: OUT DX,AL (!REX2) @@ -478,22 +478,22 @@ AVXcode: 1 7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqa32/64 Wx,Vx (66),(evo) | vmovdqu Wx,Vx (F3) | vmovdqu32/64 Wx,Vx (F3),(evo) | vmovdqu8/16 Wx,Vx (F2),(ev) # 0x0f 0x80-0x8f # Note: "forced64" is Intel CPU behavior (see comment about CALL insn). -80: JO Jz (f64) (!REX2) -81: JNO Jz (f64) (!REX2) -82: JB/JC/JNAE Jz (f64) (!REX2) -83: JAE/JNB/JNC Jz (f64) (!REX2) -84: JE/JZ Jz (f64) (!REX2) -85: JNE/JNZ Jz (f64) (!REX2) -86: JBE/JNA Jz (f64) (!REX2) -87: JA/JNBE Jz (f64) (!REX2) -88: JS Jz (f64) (!REX2) -89: JNS Jz (f64) (!REX2) -8a: JP/JPE Jz (f64) (!REX2) -8b: JNP/JPO Jz (f64) (!REX2) -8c: JL/JNGE Jz (f64) (!REX2) -8d: JNL/JGE Jz (f64) (!REX2) -8e: JLE/JNG Jz (f64) (!REX2) -8f: JNLE/JG Jz (f64) (!REX2) +80: JO Jz (f64),(!REX2) +81: JNO Jz (f64),(!REX2) +82: JB/JC/JNAE Jz (f64),(!REX2) +83: JAE/JNB/JNC Jz (f64),(!REX2) +84: JE/JZ Jz (f64),(!REX2) +85: JNE/JNZ Jz (f64),(!REX2) +86: JBE/JNA Jz (f64),(!REX2) +87: JA/JNBE Jz (f64),(!REX2) +88: JS Jz (f64),(!REX2) +89: JNS Jz (f64),(!REX2) +8a: JP/JPE Jz (f64),(!REX2) +8b: JNP/JPO Jz (f64),(!REX2) +8c: JL/JNGE Jz (f64),(!REX2) +8d: JNL/JGE Jz (f64),(!REX2) +8e: JLE/JNG Jz (f64),(!REX2) +8f: JNLE/JG Jz (f64),(!REX2) # 0x0f 0x90-0x9f 90: SETO Eb | kmovw/q Vk,Wk | kmovb/d Vk,Wk (66) 91: SETNO Eb | kmovw/q Mv,Vk | kmovb/d Mv,Vk (66) @@ -996,8 +996,8 @@ AVXcode: 4 83: Grp1 Ev,Ib (1A),(es) # CTESTSCC instructions are: CTESTB, CTESTBE, CTESTF, CTESTL, CTESTLE, CTESTNB, CTESTNBE, CTESTNL, # CTESTNLE, CTESTNO, CTESTNS, CTESTNZ, CTESTO, CTESTS, CTESTT, CTESTZ -84: CTESTSCC (ev) -85: CTESTSCC (es) | CTESTSCC (66),(es) +84: CTESTSCC Eb,Gb (ev) +85: CTESTSCC Ev,Gv (es) | CTESTSCC Ev,Gv (66),(es) 88: POPCNT Gv,Ev (es) | POPCNT Gv,Ev (66),(es) 8f: POP2 Bq,Rq (000),(11B),(ev) a5: SHLD Ev,Gv,CL (es) | SHLD Ev,Gv,CL (66),(es) |
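The x86-opcode-map.txt changes above tighten the superscript syntax (comma-separated attributes such as "(f64),(!REX2)", an explicit "(o64)" on the prefix forms of 0x62/0xc4/0xc5, and operand fields for CTESTSCC). The map is compiled into the inat attribute tables consumed by the in-kernel instruction decoder; as a reference point, a minimal sketch of a hypothetical decoder caller follows, where the function itself is illustrative and not part of this patch:

#include <linux/types.h>
#include <asm/insn.h>

/* Decode one instruction from a kernel buffer and return its length. */
static int example_decode_len(const u8 *buf, int buf_len)
{
        struct insn insn;
        int ret;

        ret = insn_decode(&insn, buf, buf_len, INSN_MODE_64);
        if (ret < 0)
                return ret;     /* not a decodable instruction */

        /*
         * insn.length and the opcode/attribute fields are filled in from
         * the inat tables generated from x86-opcode-map.txt.
         */
        return insn.length;
}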