Diffstat (limited to 'arch/x86/lib')
-rw-r--r-- arch/x86/lib/.gitignore | 4
-rw-r--r-- arch/x86/lib/Makefile | 10
-rw-r--r-- arch/x86/lib/cache-smp.c | 26
-rw-r--r-- arch/x86/lib/copy_user_64.S | 18
-rw-r--r-- arch/x86/lib/crc-pclmul-consts.h | 195
-rw-r--r-- arch/x86/lib/crc-pclmul-template.S | 582
-rw-r--r-- arch/x86/lib/crc-pclmul-template.h | 76
-rw-r--r-- arch/x86/lib/crc-t10dif-glue.c | 40
-rw-r--r-- arch/x86/lib/crc16-msb-pclmul.S | 6
-rw-r--r-- arch/x86/lib/crc32-glue.c | 111
-rw-r--r-- arch/x86/lib/crc32-pclmul.S | 6
-rw-r--r-- arch/x86/lib/crc32c-3way.S | 360
-rw-r--r-- arch/x86/lib/crc64-glue.c | 50
-rw-r--r-- arch/x86/lib/crc64-pclmul.S | 7
-rw-r--r-- arch/x86/lib/delay.c | 2
-rw-r--r-- arch/x86/lib/insn-eval.c | 20
-rw-r--r-- arch/x86/lib/insn.c | 7
-rw-r--r-- arch/x86/lib/iomem.c | 2
-rw-r--r-- arch/x86/lib/kaslr.c | 2
-rw-r--r-- arch/x86/lib/memcpy_64.S | 1
-rw-r--r-- arch/x86/lib/memset_64.S | 1
-rw-r--r-- arch/x86/lib/msr-smp.c | 16
-rw-r--r-- arch/x86/lib/msr.c | 12
-rw-r--r-- arch/x86/lib/retpoline.S | 50
-rw-r--r-- arch/x86/lib/string_32.c | 17
-rw-r--r-- arch/x86/lib/strstr_32.c | 6
-rw-r--r-- arch/x86/lib/usercopy_32.c | 18
-rw-r--r-- arch/x86/lib/x86-opcode-map.txt | 60
28 files changed, 182 insertions, 1523 deletions
diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore
index 8ae0f93ecbfd..ec2131c9fd20 100644
--- a/arch/x86/lib/.gitignore
+++ b/arch/x86/lib/.gitignore
@@ -1,2 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
+
+# This now-removed directory used to contain generated files.
+/crypto/
+
inat-tables.c
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 1c50352eb49f..2dba7f83ef97 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -38,16 +38,6 @@ lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
lib-$(CONFIG_MITIGATION_RETPOLINE) += retpoline.o
-obj-$(CONFIG_CRC32_ARCH) += crc32-x86.o
-crc32-x86-y := crc32-glue.o crc32-pclmul.o
-crc32-x86-$(CONFIG_64BIT) += crc32c-3way.o
-
-obj-$(CONFIG_CRC64_ARCH) += crc64-x86.o
-crc64-x86-y := crc64-glue.o crc64-pclmul.o
-
-obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-x86.o
-crc-t10dif-x86-y := crc-t10dif-glue.o crc16-msb-pclmul.o
-
obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
obj-y += iomem.o
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
index 7af743bd3b13..c5c60d07308c 100644
--- a/arch/x86/lib/cache-smp.c
+++ b/arch/x86/lib/cache-smp.c
@@ -14,9 +14,31 @@ void wbinvd_on_cpu(int cpu)
}
EXPORT_SYMBOL(wbinvd_on_cpu);
-int wbinvd_on_all_cpus(void)
+void wbinvd_on_all_cpus(void)
{
on_each_cpu(__wbinvd, NULL, 1);
- return 0;
}
EXPORT_SYMBOL(wbinvd_on_all_cpus);
+
+void wbinvd_on_cpus_mask(struct cpumask *cpus)
+{
+ on_each_cpu_mask(cpus, __wbinvd, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(wbinvd_on_cpus_mask);
+
+static void __wbnoinvd(void *dummy)
+{
+ wbnoinvd();
+}
+
+void wbnoinvd_on_all_cpus(void)
+{
+ on_each_cpu(__wbnoinvd, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(wbnoinvd_on_all_cpus);
+
+void wbnoinvd_on_cpus_mask(struct cpumask *cpus)
+{
+ on_each_cpu_mask(cpus, __wbnoinvd, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(wbnoinvd_on_cpus_mask);
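[Editor's note: a hypothetical caller sketch, not part of this patch. Only wbnoinvd_on_cpus_mask() comes from the change above; the function name and "dev_cpus" are made up for illustration.]

#include <linux/cpumask.h>

/* Write back (without invalidating) the caches of an arbitrary set of
 * CPUs, e.g. those that may hold dirty lines for some shared buffer. */
static void example_writeback_caches(struct cpumask *dev_cpus)
{
	if (!cpumask_empty(dev_cpus))
		wbnoinvd_on_cpus_mask(dev_cpus);
}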
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index aa8c341b2441..06296eb69fd4 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -77,6 +77,24 @@ SYM_FUNC_START(rep_movs_alternative)
_ASM_EXTABLE_UA( 0b, 1b)
.Llarge_movsq:
+ /* Do the first possibly unaligned word */
+0: movq (%rsi),%rax
+1: movq %rax,(%rdi)
+
+ _ASM_EXTABLE_UA( 0b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA( 1b, .Lcopy_user_tail)
+
+ /* What would be the offset to the aligned destination? */
+ leaq 8(%rdi),%rax
+ andq $-8,%rax
+ subq %rdi,%rax
+
+ /* .. and update pointers and count to match */
+ addq %rax,%rdi
+ addq %rax,%rsi
+ subq %rax,%rcx
+
+ /* make %rcx contain the number of words, %rax the remainder */
movq %rcx,%rax
shrq $3,%rcx
andl $7,%eax
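[Editor's note: the leaq/andq/subq sequence added above computes how far to advance to reach an 8-byte-aligned destination. A minimal C model of that arithmetic, for checking the math only (not kernel code):]

#include <assert.h>

/* delta = ((dst + 8) & ~7UL) - dst, i.e. 1..8 bytes; since the first
 * (possibly unaligned) 8-byte word has already been copied, skipping
 * delta bytes never skips anything that wasn't written. */
static unsigned long align_delta(unsigned long dst)
{
	return ((dst + 8) & ~7UL) - dst;
}

int main(void)
{
	for (unsigned long dst = 0; dst < 64; dst++) {
		unsigned long d = align_delta(dst);

		assert(d >= 1 && d <= 8);
		assert(((dst + d) & 7) == 0);
	}
	return 0;
}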
diff --git a/arch/x86/lib/crc-pclmul-consts.h b/arch/x86/lib/crc-pclmul-consts.h
deleted file mode 100644
index fcc63c064333..000000000000
--- a/arch/x86/lib/crc-pclmul-consts.h
+++ /dev/null
@@ -1,195 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * CRC constants generated by:
- *
- * ./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
- *
- * Do not edit manually.
- */
-
-/*
- * CRC folding constants generated for most-significant-bit-first CRC-16 using
- * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
- */
-static const struct {
- u8 bswap_mask[16];
- u64 fold_across_2048_bits_consts[2];
- u64 fold_across_1024_bits_consts[2];
- u64 fold_across_512_bits_consts[2];
- u64 fold_across_256_bits_consts[2];
- u64 fold_across_128_bits_consts[2];
- u8 shuf_table[48];
- u64 barrett_reduction_consts[2];
-} crc16_msb_0x8bb7_consts ____cacheline_aligned __maybe_unused = {
- .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
- .fold_across_2048_bits_consts = {
- 0xdccf000000000000, /* LO64_TERMS: (x^2000 mod G) * x^48 */
- 0x4b0b000000000000, /* HI64_TERMS: (x^2064 mod G) * x^48 */
- },
- .fold_across_1024_bits_consts = {
- 0x9d9d000000000000, /* LO64_TERMS: (x^976 mod G) * x^48 */
- 0x7cf5000000000000, /* HI64_TERMS: (x^1040 mod G) * x^48 */
- },
- .fold_across_512_bits_consts = {
- 0x044c000000000000, /* LO64_TERMS: (x^464 mod G) * x^48 */
- 0xe658000000000000, /* HI64_TERMS: (x^528 mod G) * x^48 */
- },
- .fold_across_256_bits_consts = {
- 0x6ee3000000000000, /* LO64_TERMS: (x^208 mod G) * x^48 */
- 0xe7b5000000000000, /* HI64_TERMS: (x^272 mod G) * x^48 */
- },
- .fold_across_128_bits_consts = {
- 0x2d56000000000000, /* LO64_TERMS: (x^80 mod G) * x^48 */
- 0x06df000000000000, /* HI64_TERMS: (x^144 mod G) * x^48 */
- },
- .shuf_table = {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- },
- .barrett_reduction_consts = {
- 0x8bb7000000000000, /* LO64_TERMS: (G - x^16) * x^48 */
- 0xf65a57f81d33a48a, /* HI64_TERMS: (floor(x^79 / G) * x) - x^64 */
- },
-};
-
-/*
- * CRC folding constants generated for least-significant-bit-first CRC-32 using
- * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
- * x^5 + x^4 + x^2 + x^1 + x^0
- */
-static const struct {
- u64 fold_across_2048_bits_consts[2];
- u64 fold_across_1024_bits_consts[2];
- u64 fold_across_512_bits_consts[2];
- u64 fold_across_256_bits_consts[2];
- u64 fold_across_128_bits_consts[2];
- u8 shuf_table[48];
- u64 barrett_reduction_consts[2];
-} crc32_lsb_0xedb88320_consts ____cacheline_aligned __maybe_unused = {
- .fold_across_2048_bits_consts = {
- 0x00000000ce3371cb, /* HI64_TERMS: (x^2079 mod G) * x^32 */
- 0x00000000e95c1271, /* LO64_TERMS: (x^2015 mod G) * x^32 */
- },
- .fold_across_1024_bits_consts = {
- 0x0000000033fff533, /* HI64_TERMS: (x^1055 mod G) * x^32 */
- 0x00000000910eeec1, /* LO64_TERMS: (x^991 mod G) * x^32 */
- },
- .fold_across_512_bits_consts = {
- 0x000000008f352d95, /* HI64_TERMS: (x^543 mod G) * x^32 */
- 0x000000001d9513d7, /* LO64_TERMS: (x^479 mod G) * x^32 */
- },
- .fold_across_256_bits_consts = {
- 0x00000000f1da05aa, /* HI64_TERMS: (x^287 mod G) * x^32 */
- 0x0000000081256527, /* LO64_TERMS: (x^223 mod G) * x^32 */
- },
- .fold_across_128_bits_consts = {
- 0x00000000ae689191, /* HI64_TERMS: (x^159 mod G) * x^32 */
- 0x00000000ccaa009e, /* LO64_TERMS: (x^95 mod G) * x^32 */
- },
- .shuf_table = {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- },
- .barrett_reduction_consts = {
- 0xb4e5b025f7011641, /* HI64_TERMS: floor(x^95 / G) */
- 0x00000001db710640, /* LO64_TERMS: (G - x^32) * x^31 */
- },
-};
-
-/*
- * CRC folding constants generated for most-significant-bit-first CRC-64 using
- * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
- * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
- * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
- * x^7 + x^4 + x^1 + x^0
- */
-static const struct {
- u8 bswap_mask[16];
- u64 fold_across_2048_bits_consts[2];
- u64 fold_across_1024_bits_consts[2];
- u64 fold_across_512_bits_consts[2];
- u64 fold_across_256_bits_consts[2];
- u64 fold_across_128_bits_consts[2];
- u8 shuf_table[48];
- u64 barrett_reduction_consts[2];
-} crc64_msb_0x42f0e1eba9ea3693_consts ____cacheline_aligned __maybe_unused = {
- .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
- .fold_across_2048_bits_consts = {
- 0x7f52691a60ddc70d, /* LO64_TERMS: (x^2048 mod G) * x^0 */
- 0x7036b0389f6a0c82, /* HI64_TERMS: (x^2112 mod G) * x^0 */
- },
- .fold_across_1024_bits_consts = {
- 0x05cf79dea9ac37d6, /* LO64_TERMS: (x^1024 mod G) * x^0 */
- 0x001067e571d7d5c2, /* HI64_TERMS: (x^1088 mod G) * x^0 */
- },
- .fold_across_512_bits_consts = {
- 0x5f6843ca540df020, /* LO64_TERMS: (x^512 mod G) * x^0 */
- 0xddf4b6981205b83f, /* HI64_TERMS: (x^576 mod G) * x^0 */
- },
- .fold_across_256_bits_consts = {
- 0x571bee0a227ef92b, /* LO64_TERMS: (x^256 mod G) * x^0 */
- 0x44bef2a201b5200c, /* HI64_TERMS: (x^320 mod G) * x^0 */
- },
- .fold_across_128_bits_consts = {
- 0x05f5c3c7eb52fab6, /* LO64_TERMS: (x^128 mod G) * x^0 */
- 0x4eb938a7d257740e, /* HI64_TERMS: (x^192 mod G) * x^0 */
- },
- .shuf_table = {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- },
- .barrett_reduction_consts = {
- 0x42f0e1eba9ea3693, /* LO64_TERMS: (G - x^64) * x^0 */
- 0x578d29d06cc4f872, /* HI64_TERMS: (floor(x^127 / G) * x) - x^64 */
- },
-};
-
-/*
- * CRC folding constants generated for least-significant-bit-first CRC-64 using
- * G(x) = x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 +
- * x^47 + x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 +
- * x^26 + x^23 + x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 +
- * x^4 + x^3 + x^0
- */
-static const struct {
- u64 fold_across_2048_bits_consts[2];
- u64 fold_across_1024_bits_consts[2];
- u64 fold_across_512_bits_consts[2];
- u64 fold_across_256_bits_consts[2];
- u64 fold_across_128_bits_consts[2];
- u8 shuf_table[48];
- u64 barrett_reduction_consts[2];
-} crc64_lsb_0x9a6c9329ac4bc9b5_consts ____cacheline_aligned __maybe_unused = {
- .fold_across_2048_bits_consts = {
- 0x37ccd3e14069cabc, /* HI64_TERMS: (x^2111 mod G) * x^0 */
- 0xa043808c0f782663, /* LO64_TERMS: (x^2047 mod G) * x^0 */
- },
- .fold_across_1024_bits_consts = {
- 0xa1ca681e733f9c40, /* HI64_TERMS: (x^1087 mod G) * x^0 */
- 0x5f852fb61e8d92dc, /* LO64_TERMS: (x^1023 mod G) * x^0 */
- },
- .fold_across_512_bits_consts = {
- 0x0c32cdb31e18a84a, /* HI64_TERMS: (x^575 mod G) * x^0 */
- 0x62242240ace5045a, /* LO64_TERMS: (x^511 mod G) * x^0 */
- },
- .fold_across_256_bits_consts = {
- 0xb0bc2e589204f500, /* HI64_TERMS: (x^319 mod G) * x^0 */
- 0xe1e0bb9d45d7a44c, /* LO64_TERMS: (x^255 mod G) * x^0 */
- },
- .fold_across_128_bits_consts = {
- 0xeadc41fd2ba3d420, /* HI64_TERMS: (x^191 mod G) * x^0 */
- 0x21e9761e252621ac, /* LO64_TERMS: (x^127 mod G) * x^0 */
- },
- .shuf_table = {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- },
- .barrett_reduction_consts = {
- 0x27ecfa329aef9f77, /* HI64_TERMS: floor(x^127 / G) */
- 0x34d926535897936a, /* LO64_TERMS: (G - x^64 - x^0) / x */
- },
-};
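[Editor's note: the fold constants in the deleted tables above all have the form "(x^N mod G) * x^48" (or "* x^32" / "* x^0" for the other widths). The kernel generates them with scripts/gen-crc-consts.py; the following is only a sketch of the "x^N mod G" part for the msb-first CRC-16 generator listed above, not kernel code.]

#include <stdint.h>

/* Compute x^n mod G in GF(2)[x] for G(x) of degree 16.  Bit i of the
 * return value is the coefficient of x^i. */
static uint16_t xpow_mod_g16(unsigned int n)
{
	const uint16_t g_low = 0x8bb7;	/* G - x^16, from the polynomial above */
	uint16_t r = 1;			/* start from x^0 */

	while (n--) {
		int carry = r >> 15;	/* coefficient of x^15 before the shift */

		r <<= 1;		/* multiply by x */
		if (carry)
			r ^= g_low;	/* reduce the x^16 term modulo G */
	}
	return r;
}

[Multiplying the 16-bit remainder by x^48 merely left-aligns it in a 64-bit pclmulqdq operand, which is why the fold constants above all end in twelve zero hex digits.]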
diff --git a/arch/x86/lib/crc-pclmul-template.S b/arch/x86/lib/crc-pclmul-template.S
deleted file mode 100644
index ae0b6144c503..000000000000
--- a/arch/x86/lib/crc-pclmul-template.S
+++ /dev/null
@@ -1,582 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-//
-// Template to generate [V]PCLMULQDQ-based CRC functions for x86
-//
-// Copyright 2025 Google LLC
-//
-// Author: Eric Biggers <ebiggers@google.com>
-
-#include <linux/linkage.h>
-#include <linux/objtool.h>
-
-// Offsets within the generated constants table
-.set OFFSETOF_BSWAP_MASK, -5*16 // msb-first CRCs only
-.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS, -4*16 // must precede next
-.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, -3*16 // must precede next
-.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS, -2*16 // must precede next
-.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS, -1*16 // must precede next
-.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS, 0*16 // must be 0
-.set OFFSETOF_SHUF_TABLE, 1*16
-.set OFFSETOF_BARRETT_REDUCTION_CONSTS, 4*16
-
-// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the
-// corresponding non-VEX instruction plus any needed moves. The supported
-// instruction formats are:
-//
-// - Two-arg [src, dst], where the non-VEX format is the same.
-// - Three-arg [src1, src2, dst] where the non-VEX format is
-// [src1, src2_and_dst]. If src2 != dst, then src1 must != dst too.
-//
-// \insn gives the instruction without a "v" prefix and including any immediate
-// argument if needed to make the instruction follow one of the above formats.
-// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to
-// it first; this is needed when \arg1 is an unaligned mem operand.
-.macro _cond_vex insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp
-.if AVX_LEVEL == 0
- // VEX not allowed. Emulate it.
- .ifnb \arg3 // Three-arg [src1, src2, dst]
- .ifc "\arg2", "\arg3" // src2 == dst?
- .ifnb \unaligned_mem_tmp
- movdqu \arg1, \unaligned_mem_tmp
- \insn \unaligned_mem_tmp, \arg3
- .else
- \insn \arg1, \arg3
- .endif
- .else // src2 != dst
- .ifc "\arg1", "\arg3"
- .error "Can't have src1 == dst when src2 != dst"
- .endif
- .ifnb \unaligned_mem_tmp
- movdqu \arg1, \unaligned_mem_tmp
- movdqa \arg2, \arg3
- \insn \unaligned_mem_tmp, \arg3
- .else
- movdqa \arg2, \arg3
- \insn \arg1, \arg3
- .endif
- .endif
- .else // Two-arg [src, dst]
- .ifnb \unaligned_mem_tmp
- movdqu \arg1, \unaligned_mem_tmp
- \insn \unaligned_mem_tmp, \arg2
- .else
- \insn \arg1, \arg2
- .endif
- .endif
-.else
- // VEX is allowed. Emit the desired instruction directly.
- .ifnb \arg3
- v\insn \arg1, \arg2, \arg3
- .else
- v\insn \arg1, \arg2
- .endif
-.endif
-.endm
-
-// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
-// register of length VL.
-.macro _vbroadcast src, dst
-.if VL == 16
- _cond_vex movdqa, \src, \dst
-.elseif VL == 32
- vbroadcasti128 \src, \dst
-.else
- vbroadcasti32x4 \src, \dst
-.endif
-.endm
-
-// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC
-// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane.
-.macro _load_data vl, src, bswap_mask, dst
-.if \vl < 64
- _cond_vex movdqu, "\src", \dst
-.else
- vmovdqu8 \src, \dst
-.endif
-.if !LSB_CRC
- _cond_vex pshufb, \bswap_mask, \dst, \dst
-.endif
-.endm
-
-.macro _prepare_v0 vl, v0, v1, bswap_mask
-.if LSB_CRC
- .if \vl < 64
- _cond_vex pxor, (BUF), \v0, \v0, unaligned_mem_tmp=\v1
- .else
- vpxorq (BUF), \v0, \v0
- .endif
-.else
- _load_data \vl, (BUF), \bswap_mask, \v1
- .if \vl < 64
- _cond_vex pxor, \v1, \v0, \v0
- .else
- vpxorq \v1, \v0, \v0
- .endif
-.endif
-.endm
-
-// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
-// msb-first order or the physically high qword for lsb-first order
-#define LO64_TERMS 0
-
-// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
-// qword for msb-first order or the physically low qword for lsb-first order
-#define HI64_TERMS 1
-
-// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
-// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
-.macro _pclmulqdq src1, src1_terms, src2, src2_terms, dst
- _cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \
- \src1, \src2, \dst
-.endm
-
-// Fold \acc into \data and store the result back into \acc. \data can be an
-// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no
-// byte-reflection is needed; otherwise it must be a vector register. \consts
-// is a vector register containing the needed fold constants, and \tmp is a
-// temporary vector register. All arguments must be the same length.
-.macro _fold_vec acc, data, consts, tmp
- _pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp
- _pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc
-.if AVX_LEVEL <= 2
- _cond_vex pxor, \data, \tmp, \tmp
- _cond_vex pxor, \tmp, \acc, \acc
-.else
- vpternlogq $0x96, \data, \tmp, \acc
-.endif
-.endm
-
-// Fold \acc into \data and store the result back into \acc. \data is an
-// unaligned mem operand, \consts is a vector register containing the needed
-// fold constants, \bswap_mask is a vector register containing the
-// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are
-// temporary vector registers. All arguments must have length \vl.
-.macro _fold_vec_mem vl, acc, data, consts, bswap_mask, tmp1, tmp2
-.if AVX_LEVEL == 0 || !LSB_CRC
- _load_data \vl, \data, \bswap_mask, \tmp1
- _fold_vec \acc, \tmp1, \consts, \tmp2
-.else
- _fold_vec \acc, \data, \consts, \tmp1
-.endif
-.endm
-
-// Load the constants for folding across 2**i vectors of length VL at a time
-// into all 128-bit lanes of the vector register CONSTS.
-.macro _load_vec_folding_consts i
- _vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \
- CONSTS
-.endm
-
-// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store
-// the result back into \v0. If the remaining length mod \vl is nonzero, also
-// fold \vl data bytes from BUF. For both operations the fold distance is \vl.
-// \consts must be a register of length \vl containing the fold constants.
-.macro _fold_vec_final vl, v0, v1, consts, bswap_mask, tmp1, tmp2
- _fold_vec \v0, \v1, \consts, \tmp1
- test $\vl, LEN8
- jz .Lfold_vec_final_done\@
- _fold_vec_mem \vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2
- add $\vl, BUF
-.Lfold_vec_final_done\@:
-.endm
-
-// This macro generates the body of a CRC function with the following prototype:
-//
-// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts);
-//
-// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it.
-// |buf| is the data to checksum. |len| is the data length in bytes, which must
-// be at least 16. |consts| is a pointer to the fold_across_128_bits_consts
-// field of the constants struct that was generated for the chosen CRC variant.
-//
-// Moving onto the macro parameters, \n is the number of bits in the CRC, e.g.
-// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If
-// the file is compiled in i386 mode, then the maximum supported value is 32.
-//
-// \lsb_crc is 1 if the CRC processes the least significant bit of each byte
-// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0. \lsb_crc is 0
-// if the CRC processes the most significant bit of each byte first, i.e. maps
-// bit0 to x^0, bit1 to x^1, bit7 to x^7.
-//
-// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
-//
-// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or
-// 512 for AVX512.
-//
-// If \vl == 16 && \avx_level == 0, the generated code requires:
-// PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.)
-//
-// If \vl == 32 && \avx_level == 2, the generated code requires:
-// VPCLMULQDQ && AVX2.
-//
-// If \vl == 64 && \avx_level == 512, the generated code requires:
-// VPCLMULQDQ && AVX512BW && AVX512VL.
-//
-// Other \vl and \avx_level combinations are either not supported or not useful.
-.macro _crc_pclmul n, lsb_crc, vl, avx_level
- .set LSB_CRC, \lsb_crc
- .set VL, \vl
- .set AVX_LEVEL, \avx_level
-
- // Define aliases for the xmm, ymm, or zmm registers according to VL.
-.irp i, 0,1,2,3,4,5,6,7
- .if VL == 16
- .set V\i, %xmm\i
- .set LOG2_VL, 4
- .elseif VL == 32
- .set V\i, %ymm\i
- .set LOG2_VL, 5
- .elseif VL == 64
- .set V\i, %zmm\i
- .set LOG2_VL, 6
- .else
- .error "Unsupported vector length"
- .endif
-.endr
- // Define aliases for the function parameters.
- // Note: when crc_t is shorter than u32, zero-extension to 32 bits is
- // guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed
- // when crc_t is shorter than u64.
-#ifdef __x86_64__
-.if \n <= 32
- .set CRC, %edi
-.else
- .set CRC, %rdi
-.endif
- .set BUF, %rsi
- .set LEN, %rdx
- .set LEN32, %edx
- .set LEN8, %dl
- .set CONSTS_PTR, %rcx
-#else
- // 32-bit support, assuming -mregparm=3 and not including support for
- // CRC-64 (which would use both eax and edx to pass the crc parameter).
- .set CRC, %eax
- .set BUF, %edx
- .set LEN, %ecx
- .set LEN32, %ecx
- .set LEN8, %cl
- .set CONSTS_PTR, %ebx // Passed on stack
-#endif
-
- // Define aliases for some local variables. V0-V5 are used without
- // aliases (for accumulators, data, temporary values, etc). Staying
- // within the first 8 vector registers keeps the code 32-bit SSE
- // compatible and reduces the size of 64-bit SSE code slightly.
- .set BSWAP_MASK, V6
- .set BSWAP_MASK_YMM, %ymm6
- .set BSWAP_MASK_XMM, %xmm6
- .set CONSTS, V7
- .set CONSTS_YMM, %ymm7
- .set CONSTS_XMM, %xmm7
-
- // Use ANNOTATE_NOENDBR to suppress an objtool warning, since the
- // functions generated by this macro are called only by static_call.
- ANNOTATE_NOENDBR
-
-#ifdef __i386__
- push CONSTS_PTR
- mov 8(%esp), CONSTS_PTR
-#endif
-
- // Create a 128-bit vector that contains the initial CRC in the end
- // representing the high-order polynomial coefficients, and the rest 0.
- // If the CRC is msb-first, also load the byte-reflection table.
-.if \n <= 32
- _cond_vex movd, CRC, %xmm0
-.else
- _cond_vex movq, CRC, %xmm0
-.endif
-.if !LSB_CRC
- _cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0
- _vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK
-.endif
-
- // Load the first vector of data and XOR the initial CRC into the
- // appropriate end of the first 128-bit lane of data. If LEN < VL, then
- // use a short vector and jump ahead to the final reduction. (LEN >= 16
- // is guaranteed here but not necessarily LEN >= VL.)
-.if VL >= 32
- cmp $VL, LEN
- jae .Lat_least_1vec\@
- .if VL == 64
- cmp $32, LEN32
- jb .Lless_than_32bytes\@
- _prepare_v0 32, %ymm0, %ymm1, BSWAP_MASK_YMM
- add $32, BUF
- jmp .Lreduce_256bits_to_128bits\@
-.Lless_than_32bytes\@:
- .endif
- _prepare_v0 16, %xmm0, %xmm1, BSWAP_MASK_XMM
- add $16, BUF
- vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
- jmp .Lcheck_for_partial_block\@
-.Lat_least_1vec\@:
-.endif
- _prepare_v0 VL, V0, V1, BSWAP_MASK
-
- // Handle VL <= LEN < 4*VL.
- cmp $4*VL-1, LEN
- ja .Lat_least_4vecs\@
- add $VL, BUF
- // If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector.
- // If VL==16 then load fold_across_128_bits_consts first, as the final
- // reduction depends on it and it won't be loaded anywhere else.
- cmp $2*VL-1, LEN32
-.if VL == 16
- _cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
-.endif
- jbe .Lreduce_1vec_to_128bits\@
- // Otherwise 2*VL <= LEN < 4*VL. Load one more vector and jump ahead to
- // the reduction from 2 vectors.
- _load_data VL, (BUF), BSWAP_MASK, V1
- add $VL, BUF
- jmp .Lreduce_2vecs_to_1\@
-
-.Lat_least_4vecs\@:
- // Load 3 more vectors of data.
- _load_data VL, 1*VL(BUF), BSWAP_MASK, V1
- _load_data VL, 2*VL(BUF), BSWAP_MASK, V2
- _load_data VL, 3*VL(BUF), BSWAP_MASK, V3
- sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32
- add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32
-
- // Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next
- // 4 vectors of data and write the result back to V0-V3.
- cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32
- jbe .Lreduce_4vecs_to_2\@
- _load_vec_folding_consts 2
-.Lfold_4vecs_loop\@:
- _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- _fold_vec_mem VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- _fold_vec_mem VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- sub $-4*VL, BUF
- add $-4*VL, LEN
- cmp $4*VL-1, LEN
- ja .Lfold_4vecs_loop\@
-
- // Fold V0,V1 into V2,V3 and write the result back to V0,V1. Then fold
- // two more vectors of data from BUF, if at least that much remains.
-.Lreduce_4vecs_to_2\@:
- _load_vec_folding_consts 1
- _fold_vec V0, V2, CONSTS, V4
- _fold_vec V1, V3, CONSTS, V4
- test $2*VL, LEN8
- jz .Lreduce_2vecs_to_1\@
- _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
- sub $-2*VL, BUF
-
- // Fold V0 into V1 and write the result back to V0. Then fold one more
- // vector of data from BUF, if at least that much remains.
-.Lreduce_2vecs_to_1\@:
- _load_vec_folding_consts 0
- _fold_vec_final VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5
-
-.Lreduce_1vec_to_128bits\@:
-.if VL == 64
- // Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of
- // data from BUF, if at least that much remains.
- vbroadcasti128 OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM
- vextracti64x4 $1, %zmm0, %ymm1
- _fold_vec_final 32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5
-.Lreduce_256bits_to_128bits\@:
-.endif
-.if VL >= 32
- // Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of
- // data from BUF, if at least that much remains.
- vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
- vextracti128 $1, %ymm0, %xmm1
- _fold_vec_final 16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5
-.Lcheck_for_partial_block\@:
-.endif
- and $15, LEN32
- jz .Lreduce_128bits_to_crc\@
-
- // 1 <= LEN <= 15 data bytes remain in BUF. The polynomial is now
- // A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
- // and B is the polynomial of the remaining LEN data bytes. To reduce
- // this to 128 bits without needing fold constants for each possible
- // LEN, rearrange this expression into C1*(x^128) + C2, where
- // C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128.
- // Then fold C1 into C2, which is just another fold across 128 bits.
-
-.if !LSB_CRC || AVX_LEVEL == 0
- // Load the last 16 data bytes. Note that originally LEN was >= 16.
- _load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2
-.endif // Else will use vpblendvb mem operand later.
-.if !LSB_CRC
- neg LEN // Needed for indexing shuf_table
-.endif
-
- // tmp = A*x^(8*LEN) mod x^128
- // lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1]
- // i.e. right-shift by LEN bytes.
- // msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN]
- // i.e. left-shift by LEN bytes.
- _cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3
- _cond_vex pshufb, %xmm3, %xmm0, %xmm1
-
- // C1 = floor(A / x^(128 - 8*LEN))
- // lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1]
- // i.e. left-shift by 16-LEN bytes.
- // msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1]
- // i.e. right-shift by 16-LEN bytes.
- _cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \
- %xmm0, %xmm0, unaligned_mem_tmp=%xmm4
-
- // C2 = tmp + B. This is just a blend of tmp with the last 16 data
- // bytes (reflected if msb-first). The blend mask is the shuffle table
- // that was used to create tmp. 0 selects tmp, and 1 last16databytes.
-.if AVX_LEVEL == 0
- movdqa %xmm0, %xmm4
- movdqa %xmm3, %xmm0
- pblendvb %xmm2, %xmm1 // uses %xmm0 as implicit operand
- movdqa %xmm4, %xmm0
-.elseif LSB_CRC
- vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1
-.else
- vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
-.endif
-
- // Fold C1 into C2 and store the 128-bit result in %xmm0.
- _fold_vec %xmm0, %xmm1, CONSTS_XMM, %xmm4
-
-.Lreduce_128bits_to_crc\@:
- // Compute the CRC as %xmm0 * x^n mod G. Here %xmm0 means the 128-bit
- // polynomial stored in %xmm0 (using either lsb-first or msb-first bit
- // order according to LSB_CRC), and G is the CRC's generator polynomial.
-
- // First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
- //
- // t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
- // x^n * (%xmm0 mod x^64)
- //
- // Store t0 * x^(64-n) in %xmm0. I.e., actually do:
- //
- // %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
- // x^64 * (%xmm0 mod x^64)
- //
- // The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
- // to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily
- // select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
- // msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
- // (considering the extra factor of x that gets implicitly introduced by
- // each pclmulqdq when using lsb-first order), is identical to the
- // constant that was used earlier for folding the LO64_TERMS across 128
- // bits. Thus it's already available in LO64_TERMS of CONSTS_XMM.
- _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1
-.if LSB_CRC
- _cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
-.else
- _cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
-.endif
- _cond_vex pxor, %xmm1, %xmm0, %xmm0
- // The HI64_TERMS of %xmm0 now contain floor(t0 / x^n).
- // The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).
-
- // First step of Barrett reduction: Compute floor(t0 / G). This is the
- // polynomial by which G needs to be multiplied to cancel out the x^n
- // and higher terms of t0, i.e. to reduce t0 mod G. First do:
- //
- // t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n)
- //
- // Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in
- // x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest
- // value that makes enough precision be carried through the calculation.
- //
- // The '* x' makes it so the result is floor(t1 / x^64) rather than
- // floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it
- // can be extracted much more easily in the next step. In the lsb-first
- // case the '* x' happens implicitly. In the msb-first case it must be
- // done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
- // constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
- // the multiplication by the x^64 term is handled using a pxor. The
- // pxor causes the low 64 terms of t1 to be wrong, but they are unused.
- _cond_vex movdqa, OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM
- _pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1
-.if !LSB_CRC
- _cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n)
-.endif
- // The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).
-
- // Second step of Barrett reduction: Cancel out the x^n and higher terms
- // of t0 by subtracting the needed multiple of G. This gives the CRC:
- //
- // crc := t0 - (G * floor(t0 / G))
- //
- // But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
- //
- // crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
- //
- // Furthermore, since the resulting CRC is n-bit, if mod x^n is
- // explicitly applied to it then the x^n term of G makes no difference
- // in the result and can be omitted. This helps keep the constant
- // multiplier in 64 bits in most cases. This gives the following:
- //
- // %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
- // crc := (%xmm0 / x^(64-n)) mod x^n
- //
- // In the lsb-first case, each pclmulqdq implicitly introduces
- // an extra factor of x, so in that case the constant that needs to be
- // passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63.
- // For lsb-first CRCs where n=64, the extra factor of x cannot be as
- // easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to
- // pclmulqdq and handle the x^0 term (i.e. 1) separately. (All CRC
- // polynomials have nonzero x^n and x^0 terms.) It works out as: the
-	// CRC has to be XORed with the physically low qword of %xmm1, representing
- // floor(t0 / G). The most efficient way to do that is to move it to
- // the physically high qword and use a ternlog to combine the two XORs.
-.if LSB_CRC && \n == 64
- _cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2
- _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
- .if AVX_LEVEL <= 2
- _cond_vex pxor, %xmm2, %xmm0, %xmm0
- _cond_vex pxor, %xmm1, %xmm0, %xmm0
- .else
- vpternlogq $0x96, %xmm2, %xmm1, %xmm0
- .endif
- _cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64
-.else
- _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
- _cond_vex pxor, %xmm1, %xmm0, %xmm0
- .if \n == 8
- _cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8
- .elseif \n == 16
- _cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16
- .elseif \n == 32
- _cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32
- .else // \n == 64 && !LSB_CRC
- _cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64
- .endif
-.endif
-
-.if VL > 16
- vzeroupper // Needed when ymm or zmm registers may have been used.
-.endif
-#ifdef __i386__
- pop CONSTS_PTR
-#endif
- RET
-.endm
-
-#ifdef CONFIG_AS_VPCLMULQDQ
-#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
-SYM_FUNC_START(prefix##_pclmul_sse); \
- _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
-SYM_FUNC_END(prefix##_pclmul_sse); \
- \
-SYM_FUNC_START(prefix##_vpclmul_avx2); \
- _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=2; \
-SYM_FUNC_END(prefix##_vpclmul_avx2); \
- \
-SYM_FUNC_START(prefix##_vpclmul_avx512); \
- _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \
-SYM_FUNC_END(prefix##_vpclmul_avx512);
-#else
-#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \
-SYM_FUNC_START(prefix##_pclmul_sse); \
- _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \
-SYM_FUNC_END(prefix##_pclmul_sse);
-#endif // !CONFIG_AS_VPCLMULQDQ
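[Editor's note: the deleted template is built around pclmulqdq. As a reference point, here is a bit-level C model of what one pclmulqdq lane computes — a carry-less (GF(2)) multiply of two 64-bit polynomials into a 128-bit product. This is a sketch for understanding the comments above, not kernel code.]

#include <stdint.h>

struct clmul128 {
	uint64_t lo;	/* x^0..x^63 terms of the product */
	uint64_t hi;	/* x^64..x^126 terms of the product */
};

static struct clmul128 clmul64(uint64_t a, uint64_t b)
{
	struct clmul128 r = { 0, 0 };

	for (int i = 0; i < 64; i++) {
		if (b & (1ULL << i)) {
			r.lo ^= a << i;		/* XOR instead of add: no carries */
			if (i)
				r.hi ^= a >> (64 - i);
		}
	}
	return r;
}

[A fold step (_fold_vec above) computes two such products — the HI64 and LO64 terms of the accumulator against the fold constants — and XORs them together with the next block of data, using either two pxor instructions or a single vpternlogq.]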
diff --git a/arch/x86/lib/crc-pclmul-template.h b/arch/x86/lib/crc-pclmul-template.h
deleted file mode 100644
index c5b3bfe11d8d..000000000000
--- a/arch/x86/lib/crc-pclmul-template.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are
- * instantiated by crc-pclmul-template.S
- *
- * Copyright 2025 Google LLC
- *
- * Author: Eric Biggers <ebiggers@google.com>
- */
-#ifndef _CRC_PCLMUL_TEMPLATE_H
-#define _CRC_PCLMUL_TEMPLATE_H
-
-#include <asm/cpufeatures.h>
-#include <asm/simd.h>
-#include <crypto/internal/simd.h>
-#include <linux/static_call.h>
-#include "crc-pclmul-consts.h"
-
-#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t) \
-crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len, \
- const void *consts_ptr); \
-crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len, \
- const void *consts_ptr); \
-crc_t prefix##_vpclmul_avx512(crc_t crc, const u8 *p, size_t len, \
- const void *consts_ptr); \
-DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse)
-
-#define INIT_CRC_PCLMUL(prefix) \
-do { \
- if (IS_ENABLED(CONFIG_AS_VPCLMULQDQ) && \
- boot_cpu_has(X86_FEATURE_VPCLMULQDQ) && \
- boot_cpu_has(X86_FEATURE_AVX2) && \
- cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL)) { \
- if (boot_cpu_has(X86_FEATURE_AVX512BW) && \
- boot_cpu_has(X86_FEATURE_AVX512VL) && \
- !boot_cpu_has(X86_FEATURE_PREFER_YMM) && \
- cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL)) { \
- static_call_update(prefix##_pclmul, \
- prefix##_vpclmul_avx512); \
- } else { \
- static_call_update(prefix##_pclmul, \
- prefix##_vpclmul_avx2); \
- } \
- } \
-} while (0)
-
-/*
- * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16
- * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD.
- *
- * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions.
- * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(),
- * varying by CPU and factors such as which parts of the "FPU" state userspace
- * has touched, which could result in a larger cutoff being better. Indeed, a
- * larger cutoff is usually better for a *single* message. However, the
- * overhead of the FPU section gets amortized if multiple FPU sections get
- * executed before returning to userspace, since the XSAVE and XRSTOR occur only
- * once. Considering that and the fact that the [V]PCLMULQDQ code is lighter on
- * the dcache than the table-based code is, a 16-byte cutoff seems to work well.
- */
-#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq) \
-do { \
- if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) && \
- crypto_simd_usable()) { \
- const void *consts_ptr; \
- \
- consts_ptr = (consts).fold_across_128_bits_consts; \
- kernel_fpu_begin(); \
- crc = static_call(prefix##_pclmul)((crc), (p), (len), \
- consts_ptr); \
- kernel_fpu_end(); \
- return crc; \
- } \
-} while (0)
-
-#endif /* _CRC_PCLMUL_TEMPLATE_H */
diff --git a/arch/x86/lib/crc-t10dif-glue.c b/arch/x86/lib/crc-t10dif-glue.c
deleted file mode 100644
index f89c335cde3c..000000000000
--- a/arch/x86/lib/crc-t10dif-glue.c
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * CRC-T10DIF using [V]PCLMULQDQ instructions
- *
- * Copyright 2024 Google LLC
- */
-
-#include <linux/crc-t10dif.h>
-#include <linux/module.h>
-#include "crc-pclmul-template.h"
-
-static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
-
-DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16);
-
-u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len)
-{
- CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts,
- have_pclmulqdq);
- return crc_t10dif_generic(crc, p, len);
-}
-EXPORT_SYMBOL(crc_t10dif_arch);
-
-static int __init crc_t10dif_x86_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
- static_branch_enable(&have_pclmulqdq);
- INIT_CRC_PCLMUL(crc16_msb);
- }
- return 0;
-}
-arch_initcall(crc_t10dif_x86_init);
-
-static void __exit crc_t10dif_x86_exit(void)
-{
-}
-module_exit(crc_t10dif_x86_exit);
-
-MODULE_DESCRIPTION("CRC-T10DIF using [V]PCLMULQDQ instructions");
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/lib/crc16-msb-pclmul.S b/arch/x86/lib/crc16-msb-pclmul.S
deleted file mode 100644
index e9fe248093a8..000000000000
--- a/arch/x86/lib/crc16-msb-pclmul.S
+++ /dev/null
@@ -1,6 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-// Copyright 2025 Google LLC
-
-#include "crc-pclmul-template.S"
-
-DEFINE_CRC_PCLMUL_FUNCS(crc16_msb, /* bits= */ 16, /* lsb= */ 0)
diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c
deleted file mode 100644
index e3f93b17ac3f..000000000000
--- a/arch/x86/lib/crc32-glue.c
+++ /dev/null
@@ -1,111 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * x86-optimized CRC32 functions
- *
- * Copyright (C) 2008 Intel Corporation
- * Copyright 2012 Xyratex Technology Limited
- * Copyright 2024 Google LLC
- */
-
-#include <linux/crc32.h>
-#include <linux/module.h>
-#include "crc-pclmul-template.h"
-
-static DEFINE_STATIC_KEY_FALSE(have_crc32);
-static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
-
-DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
-
-u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
-{
- CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
- have_pclmulqdq);
- return crc32_le_base(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_le_arch);
-
-#ifdef CONFIG_X86_64
-#define CRC32_INST "crc32q %1, %q0"
-#else
-#define CRC32_INST "crc32l %1, %0"
-#endif
-
-/*
- * Use carryless multiply version of crc32c when buffer size is >= 512 to
- * account for FPU state save/restore overhead.
- */
-#define CRC32C_PCLMUL_BREAKEVEN 512
-
-asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
-
-u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
-{
- size_t num_longs;
-
- if (!static_branch_likely(&have_crc32))
- return crc32c_base(crc, p, len);
-
- if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
- static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
- kernel_fpu_begin();
- crc = crc32c_x86_3way(crc, p, len);
- kernel_fpu_end();
- return crc;
- }
-
- for (num_longs = len / sizeof(unsigned long);
- num_longs != 0; num_longs--, p += sizeof(unsigned long))
- asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));
-
- if (sizeof(unsigned long) > 4 && (len & 4)) {
- asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
- p += 4;
- }
- if (len & 2) {
- asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
- p += 2;
- }
- if (len & 1)
- asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));
-
- return crc;
-}
-EXPORT_SYMBOL(crc32c_arch);
-
-u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
-{
- return crc32_be_base(crc, p, len);
-}
-EXPORT_SYMBOL(crc32_be_arch);
-
-static int __init crc32_x86_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_XMM4_2))
- static_branch_enable(&have_crc32);
- if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
- static_branch_enable(&have_pclmulqdq);
- INIT_CRC_PCLMUL(crc32_lsb);
- }
- return 0;
-}
-arch_initcall(crc32_x86_init);
-
-static void __exit crc32_x86_exit(void)
-{
-}
-module_exit(crc32_x86_exit);
-
-u32 crc32_optimizations(void)
-{
- u32 optimizations = 0;
-
- if (static_key_enabled(&have_crc32))
- optimizations |= CRC32C_OPTIMIZATION;
- if (static_key_enabled(&have_pclmulqdq))
- optimizations |= CRC32_LE_OPTIMIZATION;
- return optimizations;
-}
-EXPORT_SYMBOL(crc32_optimizations);
-
-MODULE_DESCRIPTION("x86-optimized CRC32 functions");
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/lib/crc32-pclmul.S b/arch/x86/lib/crc32-pclmul.S
deleted file mode 100644
index f20f40fb0172..000000000000
--- a/arch/x86/lib/crc32-pclmul.S
+++ /dev/null
@@ -1,6 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-// Copyright 2025 Google LLC
-
-#include "crc-pclmul-template.S"
-
-DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)
diff --git a/arch/x86/lib/crc32c-3way.S b/arch/x86/lib/crc32c-3way.S
deleted file mode 100644
index 9b8770503bbc..000000000000
--- a/arch/x86/lib/crc32c-3way.S
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
- *
- * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
- * downloaded from:
- * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
- * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
- *
- * Copyright (C) 2012 Intel Corporation.
- * Copyright 2024 Google LLC
- *
- * Authors:
- * Wajdi Feghali <wajdi.k.feghali@intel.com>
- * James Guilford <james.guilford@intel.com>
- * David Cote <david.m.cote@intel.com>
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/linkage.h>
-
-## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
-
-# Define threshold below which buffers are considered "small" and routed to
-# regular CRC code that does not interleave the CRC instructions.
-#define SMALL_SIZE 200
-
-# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
-
-.text
-SYM_FUNC_START(crc32c_x86_3way)
-#define crc0 %edi
-#define crc0_q %rdi
-#define bufp %rsi
-#define bufp_d %esi
-#define len %rdx
-#define len_dw %edx
-#define n_misaligned %ecx /* overlaps chunk_bytes! */
-#define n_misaligned_q %rcx
-#define chunk_bytes %ecx /* overlaps n_misaligned! */
-#define chunk_bytes_q %rcx
-#define crc1 %r8
-#define crc2 %r9
-
- cmp $SMALL_SIZE, len
- jb .Lsmall
-
- ################################################################
- ## 1) ALIGN:
- ################################################################
- mov bufp_d, n_misaligned
- neg n_misaligned
- and $7, n_misaligned # calculate the misalignment amount of
- # the address
- je .Laligned # Skip if aligned
-
- # Process 1 <= n_misaligned <= 7 bytes individually in order to align
- # the remaining data to an 8-byte boundary.
-.Ldo_align:
- movq (bufp), %rax
- add n_misaligned_q, bufp
- sub n_misaligned_q, len
-.Lalign_loop:
- crc32b %al, crc0 # compute crc32 of 1-byte
- shr $8, %rax # get next byte
- dec n_misaligned
- jne .Lalign_loop
-.Laligned:
-
- ################################################################
- ## 2) PROCESS BLOCK:
- ################################################################
-
- cmp $128*24, len
- jae .Lfull_block
-
-.Lpartial_block:
- # Compute floor(len / 24) to get num qwords to process from each lane.
- imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24)
- shr $16, %eax
- jmp .Lcrc_3lanes
-
-.Lfull_block:
- # Processing 128 qwords from each lane.
- mov $128, %eax
-
- ################################################################
- ## 3) CRC each of three lanes:
- ################################################################
-
-.Lcrc_3lanes:
- xor crc1,crc1
- xor crc2,crc2
- mov %eax, chunk_bytes
- shl $3, chunk_bytes # num bytes to process from each lane
- sub $5, %eax # 4 for 4x_loop, 1 for special last iter
- jl .Lcrc_3lanes_4x_done
-
- # Unroll the loop by a factor of 4 to reduce the overhead of the loop
- # bookkeeping instructions, which can compete with crc32q for the ALUs.
-.Lcrc_3lanes_4x_loop:
- crc32q (bufp), crc0_q
- crc32q (bufp,chunk_bytes_q), crc1
- crc32q (bufp,chunk_bytes_q,2), crc2
- crc32q 8(bufp), crc0_q
- crc32q 8(bufp,chunk_bytes_q), crc1
- crc32q 8(bufp,chunk_bytes_q,2), crc2
- crc32q 16(bufp), crc0_q
- crc32q 16(bufp,chunk_bytes_q), crc1
- crc32q 16(bufp,chunk_bytes_q,2), crc2
- crc32q 24(bufp), crc0_q
- crc32q 24(bufp,chunk_bytes_q), crc1
- crc32q 24(bufp,chunk_bytes_q,2), crc2
- add $32, bufp
- sub $4, %eax
- jge .Lcrc_3lanes_4x_loop
-
-.Lcrc_3lanes_4x_done:
- add $4, %eax
- jz .Lcrc_3lanes_last_qword
-
-.Lcrc_3lanes_1x_loop:
- crc32q (bufp), crc0_q
- crc32q (bufp,chunk_bytes_q), crc1
- crc32q (bufp,chunk_bytes_q,2), crc2
- add $8, bufp
- dec %eax
- jnz .Lcrc_3lanes_1x_loop
-
-.Lcrc_3lanes_last_qword:
- crc32q (bufp), crc0_q
- crc32q (bufp,chunk_bytes_q), crc1
-# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
-
- ################################################################
- ## 4) Combine three results:
- ################################################################
-
- lea (K_table-8)(%rip), %rax # first entry is for idx 1
- pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
- lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
- sub %rax, len # len -= chunk_bytes * 3
-
- movq crc0_q, %xmm1 # CRC for block 1
- pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
-
- movq crc1, %xmm2 # CRC for block 2
- pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
-
- pxor %xmm2,%xmm1
- movq %xmm1, %rax
- xor (bufp,chunk_bytes_q,2), %rax
- mov crc2, crc0_q
- crc32 %rax, crc0_q
- lea 8(bufp,chunk_bytes_q,2), bufp
-
- ################################################################
- ## 5) If more blocks remain, goto (2):
- ################################################################
-
- cmp $128*24, len
- jae .Lfull_block
- cmp $SMALL_SIZE, len
- jae .Lpartial_block
-
- #######################################################################
- ## 6) Process any remainder without interleaving:
- #######################################################################
-.Lsmall:
- test len_dw, len_dw
- jz .Ldone
- mov len_dw, %eax
- shr $3, %eax
- jz .Ldo_dword
-.Ldo_qwords:
- crc32q (bufp), crc0_q
- add $8, bufp
- dec %eax
- jnz .Ldo_qwords
-.Ldo_dword:
- test $4, len_dw
- jz .Ldo_word
- crc32l (bufp), crc0
- add $4, bufp
-.Ldo_word:
- test $2, len_dw
- jz .Ldo_byte
- crc32w (bufp), crc0
- add $2, bufp
-.Ldo_byte:
- test $1, len_dw
- jz .Ldone
- crc32b (bufp), crc0
-.Ldone:
- mov crc0, %eax
- RET
-SYM_FUNC_END(crc32c_x86_3way)
-
-.section .rodata, "a", @progbits
- ################################################################
- ## PCLMULQDQ tables
- ## Table is 128 entries x 2 words (8 bytes) each
- ################################################################
-.align 8
-K_table:
- .long 0x493c7d27, 0x00000001
- .long 0xba4fc28e, 0x493c7d27
- .long 0xddc0152b, 0xf20c0dfe
- .long 0x9e4addf8, 0xba4fc28e
- .long 0x39d3b296, 0x3da6d0cb
- .long 0x0715ce53, 0xddc0152b
- .long 0x47db8317, 0x1c291d04
- .long 0x0d3b6092, 0x9e4addf8
- .long 0xc96cfdc0, 0x740eef02
- .long 0x878a92a7, 0x39d3b296
- .long 0xdaece73e, 0x083a6eec
- .long 0xab7aff2a, 0x0715ce53
- .long 0x2162d385, 0xc49f4f67
- .long 0x83348832, 0x47db8317
- .long 0x299847d5, 0x2ad91c30
- .long 0xb9e02b86, 0x0d3b6092
- .long 0x18b33a4e, 0x6992cea2
- .long 0xb6dd949b, 0xc96cfdc0
- .long 0x78d9ccb7, 0x7e908048
- .long 0xbac2fd7b, 0x878a92a7
- .long 0xa60ce07b, 0x1b3d8f29
- .long 0xce7f39f4, 0xdaece73e
- .long 0x61d82e56, 0xf1d0f55e
- .long 0xd270f1a2, 0xab7aff2a
- .long 0xc619809d, 0xa87ab8a8
- .long 0x2b3cac5d, 0x2162d385
- .long 0x65863b64, 0x8462d800
- .long 0x1b03397f, 0x83348832
- .long 0xebb883bd, 0x71d111a8
- .long 0xb3e32c28, 0x299847d5
- .long 0x064f7f26, 0xffd852c6
- .long 0xdd7e3b0c, 0xb9e02b86
- .long 0xf285651c, 0xdcb17aa4
- .long 0x10746f3c, 0x18b33a4e
- .long 0xc7a68855, 0xf37c5aee
- .long 0x271d9844, 0xb6dd949b
- .long 0x8e766a0c, 0x6051d5a2
- .long 0x93a5f730, 0x78d9ccb7
- .long 0x6cb08e5c, 0x18b0d4ff
- .long 0x6b749fb2, 0xbac2fd7b
- .long 0x1393e203, 0x21f3d99c
- .long 0xcec3662e, 0xa60ce07b
- .long 0x96c515bb, 0x8f158014
- .long 0xe6fc4e6a, 0xce7f39f4
- .long 0x8227bb8a, 0xa00457f7
- .long 0xb0cd4768, 0x61d82e56
- .long 0x39c7ff35, 0x8d6d2c43
- .long 0xd7a4825c, 0xd270f1a2
- .long 0x0ab3844b, 0x00ac29cf
- .long 0x0167d312, 0xc619809d
- .long 0xf6076544, 0xe9adf796
- .long 0x26f6a60a, 0x2b3cac5d
- .long 0xa741c1bf, 0x96638b34
- .long 0x98d8d9cb, 0x65863b64
- .long 0x49c3cc9c, 0xe0e9f351
- .long 0x68bce87a, 0x1b03397f
- .long 0x57a3d037, 0x9af01f2d
- .long 0x6956fc3b, 0xebb883bd
- .long 0x42d98888, 0x2cff42cf
- .long 0x3771e98f, 0xb3e32c28
- .long 0xb42ae3d9, 0x88f25a3a
- .long 0x2178513a, 0x064f7f26
- .long 0xe0ac139e, 0x4e36f0b0
- .long 0x170076fa, 0xdd7e3b0c
- .long 0x444dd413, 0xbd6f81f8
- .long 0x6f345e45, 0xf285651c
- .long 0x41d17b64, 0x91c9bd4b
- .long 0xff0dba97, 0x10746f3c
- .long 0xa2b73df1, 0x885f087b
- .long 0xf872e54c, 0xc7a68855
- .long 0x1e41e9fc, 0x4c144932
- .long 0x86d8e4d2, 0x271d9844
- .long 0x651bd98b, 0x52148f02
- .long 0x5bb8f1bc, 0x8e766a0c
- .long 0xa90fd27a, 0xa3c6f37a
- .long 0xb3af077a, 0x93a5f730
- .long 0x4984d782, 0xd7c0557f
- .long 0xca6ef3ac, 0x6cb08e5c
- .long 0x234e0b26, 0x63ded06a
- .long 0xdd66cbbb, 0x6b749fb2
- .long 0x4597456a, 0x4d56973c
- .long 0xe9e28eb4, 0x1393e203
- .long 0x7b3ff57a, 0x9669c9df
- .long 0xc9c8b782, 0xcec3662e
- .long 0x3f70cc6f, 0xe417f38a
- .long 0x93e106a4, 0x96c515bb
- .long 0x62ec6c6d, 0x4b9e0f71
- .long 0xd813b325, 0xe6fc4e6a
- .long 0x0df04680, 0xd104b8fc
- .long 0x2342001e, 0x8227bb8a
- .long 0x0a2a8d7e, 0x5b397730
- .long 0x6d9a4957, 0xb0cd4768
- .long 0xe8b6368b, 0xe78eb416
- .long 0xd2c3ed1a, 0x39c7ff35
- .long 0x995a5724, 0x61ff0e01
- .long 0x9ef68d35, 0xd7a4825c
- .long 0x0c139b31, 0x8d96551c
- .long 0xf2271e60, 0x0ab3844b
- .long 0x0b0bf8ca, 0x0bf80dd2
- .long 0x2664fd8b, 0x0167d312
- .long 0xed64812d, 0x8821abed
- .long 0x02ee03b2, 0xf6076544
- .long 0x8604ae0f, 0x6a45d2b2
- .long 0x363bd6b3, 0x26f6a60a
- .long 0x135c83fd, 0xd8d26619
- .long 0x5fabe670, 0xa741c1bf
- .long 0x35ec3279, 0xde87806c
- .long 0x00bcf5f6, 0x98d8d9cb
- .long 0x8ae00689, 0x14338754
- .long 0x17f27698, 0x49c3cc9c
- .long 0x58ca5f00, 0x5bd2011f
- .long 0xaa7c7ad5, 0x68bce87a
- .long 0xb5cfca28, 0xdd07448e
- .long 0xded288f8, 0x57a3d037
- .long 0x59f229bc, 0xdde8f5b9
- .long 0x6d390dec, 0x6956fc3b
- .long 0x37170390, 0xa3e3e02c
- .long 0x6353c1cc, 0x42d98888
- .long 0xc4584f5c, 0xd73c7bea
- .long 0xf48642e9, 0x3771e98f
- .long 0x531377e2, 0x80ff0093
- .long 0xdd35bc8d, 0xb42ae3d9
- .long 0xb25b29f2, 0x8fe4c34d
- .long 0x9a5ede41, 0x2178513a
- .long 0xa563905d, 0xdf99fc11
- .long 0x45cddf4e, 0xe0ac139e
- .long 0xacfa3103, 0x6c23e841
- .long 0xa51b6135, 0x170076fa
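[Editor's note: the deleted code above computes floor(len / 24) with "imul $2731, len_dw, %eax" followed by "shr $16, %eax", where 2731 = ceil(2^16 / 24). A quick C sketch (not kernel code) that checks the reciprocal-multiply trick over the full range the partial-block path can see, i.e. len < 128*24:]

#include <assert.h>

int main(void)
{
	for (unsigned int len = 0; len < 128 * 24; len++)
		assert((len * 2731) >> 16 == len / 24);
	return 0;
}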
diff --git a/arch/x86/lib/crc64-glue.c b/arch/x86/lib/crc64-glue.c
deleted file mode 100644
index b0e1b719ecbf..000000000000
--- a/arch/x86/lib/crc64-glue.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * CRC64 using [V]PCLMULQDQ instructions
- *
- * Copyright 2025 Google LLC
- */
-
-#include <linux/crc64.h>
-#include <linux/module.h>
-#include "crc-pclmul-template.h"
-
-static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
-
-DECLARE_CRC_PCLMUL_FUNCS(crc64_msb, u64);
-DECLARE_CRC_PCLMUL_FUNCS(crc64_lsb, u64);
-
-u64 crc64_be_arch(u64 crc, const u8 *p, size_t len)
-{
- CRC_PCLMUL(crc, p, len, crc64_msb, crc64_msb_0x42f0e1eba9ea3693_consts,
- have_pclmulqdq);
- return crc64_be_generic(crc, p, len);
-}
-EXPORT_SYMBOL_GPL(crc64_be_arch);
-
-u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
-{
- CRC_PCLMUL(crc, p, len, crc64_lsb, crc64_lsb_0x9a6c9329ac4bc9b5_consts,
- have_pclmulqdq);
- return crc64_nvme_generic(crc, p, len);
-}
-EXPORT_SYMBOL_GPL(crc64_nvme_arch);
-
-static int __init crc64_x86_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
- static_branch_enable(&have_pclmulqdq);
- INIT_CRC_PCLMUL(crc64_msb);
- INIT_CRC_PCLMUL(crc64_lsb);
- }
- return 0;
-}
-arch_initcall(crc64_x86_init);
-
-static void __exit crc64_x86_exit(void)
-{
-}
-module_exit(crc64_x86_exit);
-
-MODULE_DESCRIPTION("CRC64 using [V]PCLMULQDQ instructions");
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/lib/crc64-pclmul.S b/arch/x86/lib/crc64-pclmul.S
deleted file mode 100644
index 4173051b5197..000000000000
--- a/arch/x86/lib/crc64-pclmul.S
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-// Copyright 2025 Google LLC
-
-#include "crc-pclmul-template.S"
-
-DEFINE_CRC_PCLMUL_FUNCS(crc64_msb, /* bits= */ 64, /* lsb= */ 0)
-DEFINE_CRC_PCLMUL_FUNCS(crc64_lsb, /* bits= */ 64, /* lsb= */ 1)
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index e86eda2c0b04..eb2d2e1cbddd 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -75,7 +75,7 @@ static void delay_tsc(u64 cycles)
/* Allow RT tasks to run */
preempt_enable();
- rep_nop();
+ native_pause();
preempt_disable();
/*
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index 98631c0e7a11..4e385cbfd444 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -13,6 +13,7 @@
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/ldt.h>
+#include <asm/msr.h>
#include <asm/vm86.h>
#undef pr_fmt
@@ -631,14 +632,21 @@ static bool get_desc(struct desc_struct *out, unsigned short sel)
/* Bits [15:3] contain the index of the desired entry. */
sel >>= 3;
- mutex_lock(&current->active_mm->context.lock);
- ldt = current->active_mm->context.ldt;
+ /*
+ * If we're not in a valid context with a real (not just lazy)
+ * user mm, then don't even try.
+ */
+ if (!nmi_uaccess_okay())
+ return false;
+
+ mutex_lock(&current->mm->context.lock);
+ ldt = current->mm->context.ldt;
if (ldt && sel < ldt->nr_entries) {
*out = ldt->entries[sel];
success = true;
}
- mutex_unlock(&current->active_mm->context.lock);
+ mutex_unlock(&current->mm->context.lock);
return success;
}
@@ -702,16 +710,16 @@ unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx)
unsigned long base;
if (seg_reg_idx == INAT_SEG_REG_FS) {
- rdmsrl(MSR_FS_BASE, base);
+ rdmsrq(MSR_FS_BASE, base);
} else if (seg_reg_idx == INAT_SEG_REG_GS) {
/*
* swapgs was called at the kernel entry point. Thus,
* MSR_KERNEL_GS_BASE will have the user-space GS base.
*/
if (user_mode(regs))
- rdmsrl(MSR_KERNEL_GS_BASE, base);
+ rdmsrq(MSR_KERNEL_GS_BASE, base);
else
- rdmsrl(MSR_GS_BASE, base);
+ rdmsrq(MSR_GS_BASE, base);
} else {
base = 0;
}
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 6ffb931b9fb1..149a57e334ab 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -324,6 +324,11 @@ int insn_get_opcode(struct insn *insn)
}
insn->attr = inat_get_opcode_attribute(op);
+ if (insn->x86_64 && inat_is_invalid64(insn->attr)) {
+ /* This instruction is invalid, like UD2. Stop decoding. */
+ insn->attr &= INAT_INV64;
+ }
+
while (inat_is_escape(insn->attr)) {
/* Get escaped opcode */
op = get_next(insn_byte_t, insn);
@@ -337,6 +342,7 @@ int insn_get_opcode(struct insn *insn)
insn->attr = 0;
return -EINVAL;
}
+
end:
opcode->got = 1;
return 0;
@@ -658,7 +664,6 @@ int insn_get_immediate(struct insn *insn)
}
if (!inat_has_immediate(insn->attr))
- /* no immediates */
goto done;
switch (inat_immediate_size(insn->attr)) {
diff --git a/arch/x86/lib/iomem.c b/arch/x86/lib/iomem.c
index 5eecb45d05d5..c20e04764edc 100644
--- a/arch/x86/lib/iomem.c
+++ b/arch/x86/lib/iomem.c
@@ -10,7 +10,7 @@
static __always_inline void rep_movs(void *to, const void *from, size_t n)
{
unsigned long d0, d1, d2;
- asm volatile("rep ; movsl\n\t"
+ asm volatile("rep movsl\n\t"
"testb $2,%b4\n\t"
"je 1f\n\t"
"movsw\n"
diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c
index a58f451a7dd3..b5893928d55c 100644
--- a/arch/x86/lib/kaslr.c
+++ b/arch/x86/lib/kaslr.c
@@ -8,7 +8,7 @@
*/
#include <asm/asm.h>
#include <asm/kaslr.h>
-#include <asm/msr.h>
+#include <asm/tsc.h>
#include <asm/archrandom.h>
#include <asm/e820/api.h>
#include <asm/shared/io.h>
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 0ae2e1712e2e..12a23fa7c44c 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -41,6 +41,7 @@ SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)
SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy)
+SYM_PIC_ALIAS(memcpy)
EXPORT_SYMBOL(memcpy)
SYM_FUNC_START_LOCAL(memcpy_orig)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index d66b710d628f..fb5a03cf5ab7 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -42,6 +42,7 @@ SYM_FUNC_END(__memset)
EXPORT_SYMBOL(__memset)
SYM_FUNC_ALIAS_MEMFUNC(memset, __memset)
+SYM_PIC_ALIAS(memset)
EXPORT_SYMBOL(memset)
SYM_FUNC_START_LOCAL(memset_orig)
diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c
index acd463d887e1..b8f63419e6ae 100644
--- a/arch/x86/lib/msr-smp.c
+++ b/arch/x86/lib/msr-smp.c
@@ -47,7 +47,7 @@ int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
}
EXPORT_SYMBOL(rdmsr_on_cpu);
-int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
+int rdmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
{
int err;
struct msr_info rv;
@@ -60,7 +60,7 @@ int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
return err;
}
-EXPORT_SYMBOL(rdmsrl_on_cpu);
+EXPORT_SYMBOL(rdmsrq_on_cpu);
int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
{
@@ -78,7 +78,7 @@ int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
}
EXPORT_SYMBOL(wrmsr_on_cpu);
-int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
+int wrmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
{
int err;
struct msr_info rv;
@@ -92,7 +92,7 @@ int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
return err;
}
-EXPORT_SYMBOL(wrmsrl_on_cpu);
+EXPORT_SYMBOL(wrmsrq_on_cpu);
static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no,
struct msr __percpu *msrs,
@@ -204,7 +204,7 @@ int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
}
EXPORT_SYMBOL(wrmsr_safe_on_cpu);
-int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
+int wrmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
{
int err;
struct msr_info rv;
@@ -218,9 +218,9 @@ int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
return err ? err : rv.err;
}
-EXPORT_SYMBOL(wrmsrl_safe_on_cpu);
+EXPORT_SYMBOL(wrmsrq_safe_on_cpu);
-int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
+int rdmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
{
u32 low, high;
int err;
@@ -230,7 +230,7 @@ int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
return err;
}
-EXPORT_SYMBOL(rdmsrl_safe_on_cpu);
+EXPORT_SYMBOL(rdmsrq_safe_on_cpu);
/*
* These variants are significantly slower, but allows control over
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 5a18ecc04a6c..4ef7c6dcbea6 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -41,7 +41,7 @@ static int msr_read(u32 msr, struct msr *m)
int err;
u64 val;
- err = rdmsrl_safe(msr, &val);
+ err = rdmsrq_safe(msr, &val);
if (!err)
m->q = val;
@@ -58,7 +58,7 @@ static int msr_read(u32 msr, struct msr *m)
*/
static int msr_write(u32 msr, struct msr *m)
{
- return wrmsrl_safe(msr, m->q);
+ return wrmsrq_safe(msr, m->q);
}
static inline int __flip_bit(u32 msr, u8 bit, bool set)
@@ -122,23 +122,23 @@ int msr_clear_bit(u32 msr, u8 bit)
EXPORT_SYMBOL_GPL(msr_clear_bit);
#ifdef CONFIG_TRACEPOINTS
-void do_trace_write_msr(unsigned int msr, u64 val, int failed)
+void do_trace_write_msr(u32 msr, u64 val, int failed)
{
trace_write_msr(msr, val, failed);
}
EXPORT_SYMBOL(do_trace_write_msr);
EXPORT_TRACEPOINT_SYMBOL(write_msr);
-void do_trace_read_msr(unsigned int msr, u64 val, int failed)
+void do_trace_read_msr(u32 msr, u64 val, int failed)
{
trace_read_msr(msr, val, failed);
}
EXPORT_SYMBOL(do_trace_read_msr);
EXPORT_TRACEPOINT_SYMBOL(read_msr);
-void do_trace_rdpmc(unsigned counter, u64 val, int failed)
+void do_trace_rdpmc(u32 msr, u64 val, int failed)
{
- trace_rdpmc(counter, val, failed);
+ trace_rdpmc(msr, val, failed);
}
EXPORT_SYMBOL(do_trace_rdpmc);
EXPORT_TRACEPOINT_SYMBOL(rdpmc);
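The msr-smp.c and msr.c hunks above only rename the 64-bit accessors from the historical "l" (long) suffix to "q" (quad word); arguments and return values are unchanged. A hedged caller-side sketch of the new spelling, assuming a hypothetical demo module and using MSR_IA32_TSC as the index:

// SPDX-License-Identifier: GPL-2.0-only
/* Hypothetical demo module; not part of this patch. */
#include <linux/cpumask.h>
#include <linux/module.h>
#include <asm/msr.h>

static int __init msrq_demo_init(void)
{
	unsigned int cpu;
	u64 tsc;

	for_each_online_cpu(cpu) {
		/* Formerly rdmsrl_safe_on_cpu(); only the name changed. */
		if (!rdmsrq_safe_on_cpu(cpu, MSR_IA32_TSC, &tsc))
			pr_info("cpu%u: IA32_TSC = %llu\n", cpu, tsc);
	}
	return 0;
}

static void __exit msrq_demo_exit(void)
{
}

module_init(msrq_demo_init);
module_exit(msrq_demo_exit);

MODULE_DESCRIPTION("Sketch of the rdmsrq_*_on_cpu() naming");
MODULE_LICENSE("GPL");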
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index a26c43abd47d..d78d769a02bd 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -40,6 +40,7 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL)
ALTERNATIVE_2 __stringify(RETPOLINE \reg), \
__stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_LFENCE, \
__stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), ALT_NOT(X86_FEATURE_RETPOLINE)
+SYM_PIC_ALIAS(__x86_indirect_thunk_\reg)
.endm
@@ -367,6 +368,54 @@ SYM_FUNC_END(call_depth_return_thunk)
#endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */
+#ifdef CONFIG_MITIGATION_ITS
+
+.macro ITS_THUNK reg
+
+/*
+ * If CFI paranoid is used then the ITS thunk starts with opcodes (0xea; jne 1b)
+ * that complete the fineibt_paranoid caller sequence.
+ */
+1: .byte 0xea
+SYM_INNER_LABEL(__x86_indirect_paranoid_thunk_\reg, SYM_L_GLOBAL)
+ UNWIND_HINT_UNDEFINED
+ ANNOTATE_NOENDBR
+ jne 1b
+SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL)
+ UNWIND_HINT_UNDEFINED
+ ANNOTATE_NOENDBR
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%\reg
+ int3
+ .align 32, 0xcc /* fill to the end of the line */
+ .skip 32 - (__x86_indirect_its_thunk_\reg - 1b), 0xcc /* skip to the next upper half */
+.endm
+
+/* ITS mitigation requires thunks be aligned to upper half of cacheline */
+.align 64, 0xcc
+.skip 29, 0xcc
+
+#define GEN(reg) ITS_THUNK reg
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+ .align 64, 0xcc
+SYM_FUNC_ALIAS(__x86_indirect_its_thunk_array, __x86_indirect_its_thunk_rax)
+SYM_CODE_END(__x86_indirect_its_thunk_array)
+
+.align 64, 0xcc
+.skip 32, 0xcc
+SYM_CODE_START(its_return_thunk)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+SYM_CODE_END(its_return_thunk)
+EXPORT_SYMBOL(its_return_thunk)
+
+#endif /* CONFIG_MITIGATION_ITS */
+
/*
* This function name is magical and is used by -mfunction-return=thunk-extern
* for the compiler to generate JMPs to it.
@@ -394,6 +443,7 @@ SYM_CODE_START(__x86_return_thunk)
#endif
int3
SYM_CODE_END(__x86_return_thunk)
+SYM_PIC_ALIAS(__x86_return_thunk)
EXPORT_SYMBOL(__x86_return_thunk)
#endif /* CONFIG_MITIGATION_RETHUNK */
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
index 53b3f202267c..f87ec24fa579 100644
--- a/arch/x86/lib/string_32.c
+++ b/arch/x86/lib/string_32.c
@@ -40,8 +40,7 @@ char *strncpy(char *dest, const char *src, size_t count)
"stosb\n\t"
"testb %%al,%%al\n\t"
"jne 1b\n\t"
- "rep\n\t"
- "stosb\n"
+ "rep stosb\n"
"2:"
: "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
: "0" (src), "1" (dest), "2" (count) : "memory");
@@ -54,8 +53,7 @@ EXPORT_SYMBOL(strncpy);
char *strcat(char *dest, const char *src)
{
int d0, d1, d2, d3;
- asm volatile("repne\n\t"
- "scasb\n\t"
+ asm volatile("repne scasb\n\t"
"decl %1\n"
"1:\tlodsb\n\t"
"stosb\n\t"
@@ -72,8 +70,7 @@ EXPORT_SYMBOL(strcat);
char *strncat(char *dest, const char *src, size_t count)
{
int d0, d1, d2, d3;
- asm volatile("repne\n\t"
- "scasb\n\t"
+ asm volatile("repne scasb\n\t"
"decl %1\n\t"
"movl %8,%3\n"
"1:\tdecl %3\n\t"
@@ -167,8 +164,7 @@ size_t strlen(const char *s)
{
int d0;
size_t res;
- asm volatile("repne\n\t"
- "scasb"
+ asm volatile("repne scasb"
: "=c" (res), "=&D" (d0)
: "1" (s), "a" (0), "0" (0xffffffffu)
: "memory");
@@ -184,8 +180,7 @@ void *memchr(const void *cs, int c, size_t count)
void *res;
if (!count)
return NULL;
- asm volatile("repne\n\t"
- "scasb\n\t"
+ asm volatile("repne scasb\n\t"
"je 1f\n\t"
"movl $1,%0\n"
"1:\tdecl %0"
@@ -202,7 +197,7 @@ void *memscan(void *addr, int c, size_t size)
{
if (!size)
return addr;
- asm volatile("repnz; scasb\n\t"
+ asm volatile("repnz scasb\n\t"
"jnz 1f\n\t"
"dec %%edi\n"
"1:"
diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c
index 38f37df056f7..28267985e85f 100644
--- a/arch/x86/lib/strstr_32.c
+++ b/arch/x86/lib/strstr_32.c
@@ -8,16 +8,14 @@ int d0, d1;
register char *__res;
__asm__ __volatile__(
"movl %6,%%edi\n\t"
- "repne\n\t"
- "scasb\n\t"
+ "repne scasb\n\t"
"notl %%ecx\n\t"
"decl %%ecx\n\t" /* NOTE! This also sets Z if searchstring='' */
"movl %%ecx,%%edx\n"
"1:\tmovl %6,%%edi\n\t"
"movl %%esi,%%eax\n\t"
"movl %%edx,%%ecx\n\t"
- "repe\n\t"
- "cmpsb\n\t"
+ "repe cmpsb\n\t"
"je 2f\n\t" /* also works for empty string, see above */
"xchgl %%eax,%%esi\n\t"
"incl %%esi\n\t"
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 422257c350c6..f6f436f1d573 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -38,9 +38,9 @@ do { \
might_fault(); \
__asm__ __volatile__( \
ASM_STAC "\n" \
- "0: rep; stosl\n" \
+ "0: rep stosl\n" \
" movl %2,%0\n" \
- "1: rep; stosb\n" \
+ "1: rep stosb\n" \
"2: " ASM_CLAC "\n" \
_ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN4, %2) \
_ASM_EXTABLE_UA(1b, 2b) \
@@ -140,9 +140,9 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size)
" shrl $2, %0\n"
" andl $3, %%eax\n"
" cld\n"
- "99: rep; movsl\n"
+ "99: rep movsl\n"
"36: movl %%eax, %0\n"
- "37: rep; movsb\n"
+ "37: rep movsb\n"
"100:\n"
_ASM_EXTABLE_UA(1b, 100b)
_ASM_EXTABLE_UA(2b, 100b)
@@ -242,9 +242,9 @@ static unsigned long __copy_user_intel_nocache(void *to,
" shrl $2, %0\n"
" andl $3, %%eax\n"
" cld\n"
- "6: rep; movsl\n"
+ "6: rep movsl\n"
" movl %%eax,%0\n"
- "7: rep; movsb\n"
+ "7: rep movsb\n"
"8:\n"
_ASM_EXTABLE_UA(0b, 8b)
_ASM_EXTABLE_UA(1b, 8b)
@@ -293,14 +293,14 @@ do { \
" negl %0\n" \
" andl $7,%0\n" \
" subl %0,%3\n" \
- "4: rep; movsb\n" \
+ "4: rep movsb\n" \
" movl %3,%0\n" \
" shrl $2,%0\n" \
" andl $3,%3\n" \
" .align 2,0x90\n" \
- "0: rep; movsl\n" \
+ "0: rep movsl\n" \
" movl %3,%0\n" \
- "1: rep; movsb\n" \
+ "1: rep movsb\n" \
"2:\n" \
_ASM_EXTABLE_TYPE_REG(4b, 2b, EX_TYPE_UCOPY_LEN1, %3) \
_ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN4, %3) \
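The string_32.c, strstr_32.c and usercopy_32.c hunks are purely a spelling cleanup: the prefixed string instructions are written as one mnemonic ("rep movsl", "repne scasb") instead of a separate "rep;"/"repne" line; the encoding is unchanged. A self-contained user-space sketch of the strlen() pattern shown above, in the new spelling (the all-ones initial count is widened to size_t here, an adjustment for this example so it also works when built as 64-bit):

#include <assert.h>
#include <stddef.h>
#include <string.h>

/*
 * repne scasb compares AL (0 here) against the byte at the destination
 * pointer, decrementing the count register once per byte examined,
 * including the terminating NUL.
 */
static size_t scasb_strlen(const char *s)
{
	size_t count;
	const char *end;

	asm volatile("repne scasb"
		     : "=c" (count), "=&D" (end)
		     : "1" (s), "a" (0), "0" ((size_t)-1)
		     : "memory");
	(void)end;
	return ~count - 1;	/* bytes scanned, minus the NUL */
}

int main(void)
{
	assert(scasb_strlen("repne scasb") == strlen("repne scasb"));
	return 0;
}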
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index caedb3ef6688..262f7ca1fb95 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -35,7 +35,7 @@
# - (!F3) : the last prefix is not 0xF3 (including non-last prefix case)
# - (66&F2): Both 0x66 and 0xF2 prefixes are specified.
#
-# REX2 Prefix
+# REX2 Prefix Superscripts
# - (!REX2): REX2 is not allowed
# - (REX2): REX2 variant e.g. JMPABS
@@ -147,7 +147,7 @@ AVXcode:
# 0x60 - 0x6f
60: PUSHA/PUSHAD (i64)
61: POPA/POPAD (i64)
-62: BOUND Gv,Ma (i64) | EVEX (Prefix)
+62: BOUND Gv,Ma (i64) | EVEX (Prefix),(o64)
63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64)
64: SEG=FS (Prefix)
65: SEG=GS (Prefix)
@@ -253,8 +253,8 @@ c0: Grp2 Eb,Ib (1A)
c1: Grp2 Ev,Ib (1A)
c2: RETN Iw (f64)
c3: RETN
-c4: LES Gz,Mp (i64) | VEX+2byte (Prefix)
-c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix)
+c4: LES Gz,Mp (i64) | VEX+2byte (Prefix),(o64)
+c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix),(o64)
c6: Grp11A Eb,Ib (1A)
c7: Grp11B Ev,Iz (1A)
c8: ENTER Iw,Ib
@@ -286,10 +286,10 @@ df: ESC
# Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix
# in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation
# to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD.
-e0: LOOPNE/LOOPNZ Jb (f64) (!REX2)
-e1: LOOPE/LOOPZ Jb (f64) (!REX2)
-e2: LOOP Jb (f64) (!REX2)
-e3: JrCXZ Jb (f64) (!REX2)
+e0: LOOPNE/LOOPNZ Jb (f64),(!REX2)
+e1: LOOPE/LOOPZ Jb (f64),(!REX2)
+e2: LOOP Jb (f64),(!REX2)
+e3: JrCXZ Jb (f64),(!REX2)
e4: IN AL,Ib (!REX2)
e5: IN eAX,Ib (!REX2)
e6: OUT Ib,AL (!REX2)
@@ -298,10 +298,10 @@ e7: OUT Ib,eAX (!REX2)
# in "near" jumps and calls is 16-bit. For CALL,
# push of return address is 16-bit wide, RSP is decremented by 2
# but is not truncated to 16 bits, unlike RIP.
-e8: CALL Jz (f64) (!REX2)
-e9: JMP-near Jz (f64) (!REX2)
-ea: JMP-far Ap (i64) (!REX2)
-eb: JMP-short Jb (f64) (!REX2)
+e8: CALL Jz (f64),(!REX2)
+e9: JMP-near Jz (f64),(!REX2)
+ea: JMP-far Ap (i64),(!REX2)
+eb: JMP-short Jb (f64),(!REX2)
ec: IN AL,DX (!REX2)
ed: IN eAX,DX (!REX2)
ee: OUT DX,AL (!REX2)
@@ -478,22 +478,22 @@ AVXcode: 1
7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqa32/64 Wx,Vx (66),(evo) | vmovdqu Wx,Vx (F3) | vmovdqu32/64 Wx,Vx (F3),(evo) | vmovdqu8/16 Wx,Vx (F2),(ev)
# 0x0f 0x80-0x8f
# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
-80: JO Jz (f64) (!REX2)
-81: JNO Jz (f64) (!REX2)
-82: JB/JC/JNAE Jz (f64) (!REX2)
-83: JAE/JNB/JNC Jz (f64) (!REX2)
-84: JE/JZ Jz (f64) (!REX2)
-85: JNE/JNZ Jz (f64) (!REX2)
-86: JBE/JNA Jz (f64) (!REX2)
-87: JA/JNBE Jz (f64) (!REX2)
-88: JS Jz (f64) (!REX2)
-89: JNS Jz (f64) (!REX2)
-8a: JP/JPE Jz (f64) (!REX2)
-8b: JNP/JPO Jz (f64) (!REX2)
-8c: JL/JNGE Jz (f64) (!REX2)
-8d: JNL/JGE Jz (f64) (!REX2)
-8e: JLE/JNG Jz (f64) (!REX2)
-8f: JNLE/JG Jz (f64) (!REX2)
+80: JO Jz (f64),(!REX2)
+81: JNO Jz (f64),(!REX2)
+82: JB/JC/JNAE Jz (f64),(!REX2)
+83: JAE/JNB/JNC Jz (f64),(!REX2)
+84: JE/JZ Jz (f64),(!REX2)
+85: JNE/JNZ Jz (f64),(!REX2)
+86: JBE/JNA Jz (f64),(!REX2)
+87: JA/JNBE Jz (f64),(!REX2)
+88: JS Jz (f64),(!REX2)
+89: JNS Jz (f64),(!REX2)
+8a: JP/JPE Jz (f64),(!REX2)
+8b: JNP/JPO Jz (f64),(!REX2)
+8c: JL/JNGE Jz (f64),(!REX2)
+8d: JNL/JGE Jz (f64),(!REX2)
+8e: JLE/JNG Jz (f64),(!REX2)
+8f: JNLE/JG Jz (f64),(!REX2)
# 0x0f 0x90-0x9f
90: SETO Eb | kmovw/q Vk,Wk | kmovb/d Vk,Wk (66)
91: SETNO Eb | kmovw/q Mv,Vk | kmovb/d Mv,Vk (66)
@@ -996,8 +996,8 @@ AVXcode: 4
83: Grp1 Ev,Ib (1A),(es)
# CTESTSCC instructions are: CTESTB, CTESTBE, CTESTF, CTESTL, CTESTLE, CTESTNB, CTESTNBE, CTESTNL,
# CTESTNLE, CTESTNO, CTESTNS, CTESTNZ, CTESTO, CTESTS, CTESTT, CTESTZ
-84: CTESTSCC (ev)
-85: CTESTSCC (es) | CTESTSCC (66),(es)
+84: CTESTSCC Eb,Gb (ev)
+85: CTESTSCC Ev,Gv (es) | CTESTSCC Ev,Gv (66),(es)
88: POPCNT Gv,Ev (es) | POPCNT Gv,Ev (66),(es)
8f: POP2 Bq,Rq (000),(11B),(ev)
a5: SHLD Ev,Gv,CL (es) | SHLD Ev,Gv,CL (66),(es)