Diffstat (limited to 'arch/x86/lib')
-rw-r--r-- | arch/x86/lib/Makefile | 23
-rw-r--r-- | arch/x86/lib/atomic64_cx8_32.S | 9
-rw-r--r-- | arch/x86/lib/cache-smp.c | 1
-rw-r--r-- | arch/x86/lib/cmdline.c | 25
-rw-r--r-- | arch/x86/lib/cmpxchg16b_emu.S | 12
-rw-r--r-- | arch/x86/lib/cmpxchg8b_emu.S | 30
-rw-r--r-- | arch/x86/lib/copy_mc.c | 21
-rw-r--r-- | arch/x86/lib/crc-t10dif-glue.c | 51
-rw-r--r-- | arch/x86/lib/crc32-glue.c | 124
-rw-r--r-- | arch/x86/lib/crc32-pclmul.S | 217
-rw-r--r-- | arch/x86/lib/crc32c-3way.S | 360
-rw-r--r-- | arch/x86/lib/crct10dif-pcl-asm_64.S | 332
-rw-r--r-- | arch/x86/lib/delay.c | 2
-rw-r--r-- | arch/x86/lib/getuser.S | 77
-rw-r--r-- | arch/x86/lib/insn-eval.c | 6
-rw-r--r-- | arch/x86/lib/insn.c | 89
-rw-r--r-- | arch/x86/lib/iomap_copy_64.S | 15
-rw-r--r-- | arch/x86/lib/iomem.c | 5
-rw-r--r-- | arch/x86/lib/misc.c | 2
-rw-r--r-- | arch/x86/lib/msr-smp.c | 12
-rw-r--r-- | arch/x86/lib/msr.c | 6
-rw-r--r-- | arch/x86/lib/putuser.S | 20
-rw-r--r-- | arch/x86/lib/retpoline.S | 59
-rw-r--r-- | arch/x86/lib/x86-opcode-map.txt | 317
24 files changed, 1533 insertions, 282 deletions
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index ea3a28e7b613..8a59c61624c2 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -14,19 +14,6 @@ ifdef CONFIG_KCSAN CFLAGS_REMOVE_delay.o = $(CC_FLAGS_FTRACE) endif -# Early boot use of cmdline; don't instrument it -ifdef CONFIG_AMD_MEM_ENCRYPT -KCOV_INSTRUMENT_cmdline.o := n -KASAN_SANITIZE_cmdline.o := n -KCSAN_SANITIZE_cmdline.o := n - -ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_cmdline.o = -pg -endif - -CFLAGS_cmdline.o := -fno-stack-protector -fno-jump-tables -endif - inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt quiet_cmd_inat_tables = GEN $@ @@ -49,7 +36,14 @@ lib-$(CONFIG_ARCH_HAS_COPY_MC) += copy_mc.o copy_mc_64.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o -lib-$(CONFIG_RETPOLINE) += retpoline.o +lib-$(CONFIG_MITIGATION_RETPOLINE) += retpoline.o + +obj-$(CONFIG_CRC32_ARCH) += crc32-x86.o +crc32-x86-y := crc32-glue.o crc32-pclmul.o +crc32-x86-$(CONFIG_64BIT) += crc32c-3way.o + +obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-x86.o +crc-t10dif-x86-y := crc-t10dif-glue.o crct10dif-pcl-asm_64.o obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o obj-y += iomem.o @@ -66,7 +60,6 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y) lib-y += atomic64_386_32.o endif else - obj-y += iomap_copy_64.o ifneq ($(CONFIG_GENERIC_CSUM),y) lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o endif diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index 90afb488b396..b2eff07d65e4 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -16,6 +16,11 @@ cmpxchg8b (\reg) .endm +.macro read64_nonatomic reg + movl (\reg), %eax + movl 4(\reg), %edx +.endm + SYM_FUNC_START(atomic64_read_cx8) read64 %ecx RET @@ -51,7 +56,7 @@ SYM_FUNC_START(atomic64_\func\()_return_cx8) movl %edx, %edi movl %ecx, %ebp - read64 %ecx + read64_nonatomic %ecx 1: movl %eax, %ebx movl %edx, %ecx @@ -79,7 +84,7 @@ addsub_return sub sub sbb SYM_FUNC_START(atomic64_\func\()_return_cx8) pushl %ebx - read64 %esi + read64_nonatomic %esi 1: movl %eax, %ebx movl %edx, %ecx diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c index 7c48ff4ae8d1..7af743bd3b13 100644 --- a/arch/x86/lib/cache-smp.c +++ b/arch/x86/lib/cache-smp.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include <asm/paravirt.h> #include <linux/smp.h> #include <linux/export.h> diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 80570eb3c89b..c65cd5550454 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -6,8 +6,10 @@ #include <linux/kernel.h> #include <linux/string.h> #include <linux/ctype.h> + #include <asm/setup.h> #include <asm/cmdline.h> +#include <asm/bug.h> static inline int myisspace(u8 c) { @@ -205,12 +207,29 @@ __cmdline_find_option(const char *cmdline, int max_cmdline_size, int cmdline_find_option_bool(const char *cmdline, const char *option) { - return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); + int ret; + + ret = __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); + if (ret > 0) + return ret; + + if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && !builtin_cmdline_added) + return __cmdline_find_option_bool(builtin_cmdline, COMMAND_LINE_SIZE, option); + + return ret; } int cmdline_find_option(const char *cmdline, const char *option, char *buffer, int bufsize) { - return 
__cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, - buffer, bufsize); + int ret; + + ret = __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, buffer, bufsize); + if (ret > 0) + return ret; + + if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && !builtin_cmdline_added) + return __cmdline_find_option(builtin_cmdline, COMMAND_LINE_SIZE, option, buffer, bufsize); + + return ret; } diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S index 6962df315793..4fb44894ad87 100644 --- a/arch/x86/lib/cmpxchg16b_emu.S +++ b/arch/x86/lib/cmpxchg16b_emu.S @@ -23,14 +23,14 @@ SYM_FUNC_START(this_cpu_cmpxchg16b_emu) cli /* if (*ptr == old) */ - cmpq PER_CPU_VAR(0(%rsi)), %rax + cmpq __percpu (%rsi), %rax jne .Lnot_same - cmpq PER_CPU_VAR(8(%rsi)), %rdx + cmpq __percpu 8(%rsi), %rdx jne .Lnot_same /* *ptr = new */ - movq %rbx, PER_CPU_VAR(0(%rsi)) - movq %rcx, PER_CPU_VAR(8(%rsi)) + movq %rbx, __percpu (%rsi) + movq %rcx, __percpu 8(%rsi) /* set ZF in EFLAGS to indicate success */ orl $X86_EFLAGS_ZF, (%rsp) @@ -42,8 +42,8 @@ SYM_FUNC_START(this_cpu_cmpxchg16b_emu) /* *ptr != old */ /* old = *ptr */ - movq PER_CPU_VAR(0(%rsi)), %rax - movq PER_CPU_VAR(8(%rsi)), %rdx + movq __percpu (%rsi), %rax + movq __percpu 8(%rsi), %rdx /* clear ZF in EFLAGS to indicate failure */ andl $(~X86_EFLAGS_ZF), (%rsp) diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S index 873e4ef23e49..1c96be769adc 100644 --- a/arch/x86/lib/cmpxchg8b_emu.S +++ b/arch/x86/lib/cmpxchg8b_emu.S @@ -24,12 +24,12 @@ SYM_FUNC_START(cmpxchg8b_emu) pushfl cli - cmpl 0(%esi), %eax + cmpl (%esi), %eax jne .Lnot_same cmpl 4(%esi), %edx jne .Lnot_same - movl %ebx, 0(%esi) + movl %ebx, (%esi) movl %ecx, 4(%esi) orl $X86_EFLAGS_ZF, (%esp) @@ -38,7 +38,7 @@ SYM_FUNC_START(cmpxchg8b_emu) RET .Lnot_same: - movl 0(%esi), %eax + movl (%esi), %eax movl 4(%esi), %edx andl $(~X86_EFLAGS_ZF), (%esp) @@ -53,18 +53,30 @@ EXPORT_SYMBOL(cmpxchg8b_emu) #ifndef CONFIG_UML +/* + * Emulate 'cmpxchg8b %fs:(%rsi)' + * + * Inputs: + * %esi : memory location to compare + * %eax : low 32 bits of old value + * %edx : high 32 bits of old value + * %ebx : low 32 bits of new value + * %ecx : high 32 bits of new value + * + * Notably this is not LOCK prefixed and is not safe against NMIs + */ SYM_FUNC_START(this_cpu_cmpxchg8b_emu) pushfl cli - cmpl PER_CPU_VAR(0(%esi)), %eax + cmpl __percpu (%esi), %eax jne .Lnot_same2 - cmpl PER_CPU_VAR(4(%esi)), %edx + cmpl __percpu 4(%esi), %edx jne .Lnot_same2 - movl %ebx, PER_CPU_VAR(0(%esi)) - movl %ecx, PER_CPU_VAR(4(%esi)) + movl %ebx, __percpu (%esi) + movl %ecx, __percpu 4(%esi) orl $X86_EFLAGS_ZF, (%esp) @@ -72,8 +84,8 @@ SYM_FUNC_START(this_cpu_cmpxchg8b_emu) RET .Lnot_same2: - movl PER_CPU_VAR(0(%esi)), %eax - movl PER_CPU_VAR(4(%esi)), %edx + movl __percpu (%esi), %eax + movl __percpu 4(%esi), %edx andl $(~X86_EFLAGS_ZF), (%esp) diff --git a/arch/x86/lib/copy_mc.c b/arch/x86/lib/copy_mc.c index 6e8b7e600def..97e88e58567b 100644 --- a/arch/x86/lib/copy_mc.c +++ b/arch/x86/lib/copy_mc.c @@ -4,6 +4,7 @@ #include <linux/jump_label.h> #include <linux/uaccess.h> #include <linux/export.h> +#include <linux/instrumented.h> #include <linux/string.h> #include <linux/types.h> @@ -61,10 +62,20 @@ unsigned long copy_mc_enhanced_fast_string(void *dst, const void *src, unsigned */ unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigned len) { - if (copy_mc_fragile_enabled) - return copy_mc_fragile(dst, src, len); - if (static_cpu_has(X86_FEATURE_ERMS)) - return 
copy_mc_enhanced_fast_string(dst, src, len); + unsigned long ret; + + if (copy_mc_fragile_enabled) { + instrument_memcpy_before(dst, src, len); + ret = copy_mc_fragile(dst, src, len); + instrument_memcpy_after(dst, src, len, ret); + return ret; + } + if (static_cpu_has(X86_FEATURE_ERMS)) { + instrument_memcpy_before(dst, src, len); + ret = copy_mc_enhanced_fast_string(dst, src, len); + instrument_memcpy_after(dst, src, len, ret); + return ret; + } memcpy(dst, src, len); return 0; } @@ -75,6 +86,7 @@ unsigned long __must_check copy_mc_to_user(void __user *dst, const void *src, un unsigned long ret; if (copy_mc_fragile_enabled) { + instrument_copy_to_user(dst, src, len); __uaccess_begin(); ret = copy_mc_fragile((__force void *)dst, src, len); __uaccess_end(); @@ -82,6 +94,7 @@ unsigned long __must_check copy_mc_to_user(void __user *dst, const void *src, un } if (static_cpu_has(X86_FEATURE_ERMS)) { + instrument_copy_to_user(dst, src, len); __uaccess_begin(); ret = copy_mc_enhanced_fast_string((__force void *)dst, src, len); __uaccess_end(); diff --git a/arch/x86/lib/crc-t10dif-glue.c b/arch/x86/lib/crc-t10dif-glue.c new file mode 100644 index 000000000000..13f07ddc9122 --- /dev/null +++ b/arch/x86/lib/crc-t10dif-glue.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * CRC-T10DIF using PCLMULQDQ instructions + * + * Copyright 2024 Google LLC + */ + +#include <asm/cpufeatures.h> +#include <asm/simd.h> +#include <crypto/internal/simd.h> +#include <linux/crc-t10dif.h> +#include <linux/module.h> + +static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); + +asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len); + +u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len) +{ + if (len >= 16 && + static_key_enabled(&have_pclmulqdq) && crypto_simd_usable()) { + kernel_fpu_begin(); + crc = crc_t10dif_pcl(crc, p, len); + kernel_fpu_end(); + return crc; + } + return crc_t10dif_generic(crc, p, len); +} +EXPORT_SYMBOL(crc_t10dif_arch); + +static int __init crc_t10dif_x86_init(void) +{ + if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) + static_branch_enable(&have_pclmulqdq); + return 0; +} +arch_initcall(crc_t10dif_x86_init); + +static void __exit crc_t10dif_x86_exit(void) +{ +} +module_exit(crc_t10dif_x86_exit); + +bool crc_t10dif_is_optimized(void) +{ + return static_key_enabled(&have_pclmulqdq); +} +EXPORT_SYMBOL(crc_t10dif_is_optimized); + +MODULE_DESCRIPTION("CRC-T10DIF using PCLMULQDQ instructions"); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c new file mode 100644 index 000000000000..2dd18a886ded --- /dev/null +++ b/arch/x86/lib/crc32-glue.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * x86-optimized CRC32 functions + * + * Copyright (C) 2008 Intel Corporation + * Copyright 2012 Xyratex Technology Limited + * Copyright 2024 Google LLC + */ + +#include <asm/cpufeatures.h> +#include <asm/simd.h> +#include <crypto/internal/simd.h> +#include <linux/crc32.h> +#include <linux/linkage.h> +#include <linux/module.h> + +/* minimum size of buffer for crc32_pclmul_le_16 */ +#define CRC32_PCLMUL_MIN_LEN 64 + +static DEFINE_STATIC_KEY_FALSE(have_crc32); +static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); + +u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len); + +u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) +{ + if (len >= CRC32_PCLMUL_MIN_LEN + 15 && + static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) { + size_t n = -(uintptr_t)p & 15; + + /* align p to 16-byte boundary */ + if (n) { + crc = 
crc32_le_base(crc, p, n); + p += n; + len -= n; + } + n = round_down(len, 16); + kernel_fpu_begin(); + crc = crc32_pclmul_le_16(crc, p, n); + kernel_fpu_end(); + p += n; + len -= n; + } + if (len) + crc = crc32_le_base(crc, p, len); + return crc; +} +EXPORT_SYMBOL(crc32_le_arch); + +#ifdef CONFIG_X86_64 +#define CRC32_INST "crc32q %1, %q0" +#else +#define CRC32_INST "crc32l %1, %0" +#endif + +/* + * Use carryless multiply version of crc32c when buffer size is >= 512 to + * account for FPU state save/restore overhead. + */ +#define CRC32C_PCLMUL_BREAKEVEN 512 + +asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len); + +u32 crc32c_le_arch(u32 crc, const u8 *p, size_t len) +{ + size_t num_longs; + + if (!static_branch_likely(&have_crc32)) + return crc32c_le_base(crc, p, len); + + if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN && + static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) { + kernel_fpu_begin(); + crc = crc32c_x86_3way(crc, p, len); + kernel_fpu_end(); + return crc; + } + + for (num_longs = len / sizeof(unsigned long); + num_longs != 0; num_longs--, p += sizeof(unsigned long)) + asm(CRC32_INST : "+r" (crc) : "rm" (*(unsigned long *)p)); + + for (len %= sizeof(unsigned long); len; len--, p++) + asm("crc32b %1, %0" : "+r" (crc) : "rm" (*p)); + + return crc; +} +EXPORT_SYMBOL(crc32c_le_arch); + +u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) +{ + return crc32_be_base(crc, p, len); +} +EXPORT_SYMBOL(crc32_be_arch); + +static int __init crc32_x86_init(void) +{ + if (boot_cpu_has(X86_FEATURE_XMM4_2)) + static_branch_enable(&have_crc32); + if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) + static_branch_enable(&have_pclmulqdq); + return 0; +} +arch_initcall(crc32_x86_init); + +static void __exit crc32_x86_exit(void) +{ +} +module_exit(crc32_x86_exit); + +u32 crc32_optimizations(void) +{ + u32 optimizations = 0; + + if (static_key_enabled(&have_crc32)) + optimizations |= CRC32C_OPTIMIZATION; + if (static_key_enabled(&have_pclmulqdq)) + optimizations |= CRC32_LE_OPTIMIZATION; + return optimizations; +} +EXPORT_SYMBOL(crc32_optimizations); + +MODULE_DESCRIPTION("x86-optimized CRC32 functions"); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/lib/crc32-pclmul.S b/arch/x86/lib/crc32-pclmul.S new file mode 100644 index 000000000000..f9637789cac1 --- /dev/null +++ b/arch/x86/lib/crc32-pclmul.S @@ -0,0 +1,217 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2012 Xyratex Technology Limited + * + * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32 + * calculation. 
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) + * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found + * at: + * http://www.intel.com/products/processor/manuals/ + * Intel(R) 64 and IA-32 Architectures Software Developer's Manual + * Volume 2B: Instruction Set Reference, N-Z + * + * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com> + * Alexander Boyko <Alexander_Boyko@xyratex.com> + */ + +#include <linux/linkage.h> + + +.section .rodata +.align 16 +/* + * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 + * #define CONSTANT_R1 0x154442bd4LL + * + * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596 + * #define CONSTANT_R2 0x1c6e41596LL + */ +.Lconstant_R2R1: + .octa 0x00000001c6e415960000000154442bd4 +/* + * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0 + * #define CONSTANT_R3 0x1751997d0LL + * + * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e + * #define CONSTANT_R4 0x0ccaa009eLL + */ +.Lconstant_R4R3: + .octa 0x00000000ccaa009e00000001751997d0 +/* + * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124 + * #define CONSTANT_R5 0x163cd6124LL + */ +.Lconstant_R5: + .octa 0x00000000000000000000000163cd6124 +.Lconstant_mask32: + .octa 0x000000000000000000000000FFFFFFFF +/* + * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL + * + * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL + * #define CONSTANT_RU 0x1F7011641LL + */ +.Lconstant_RUpoly: + .octa 0x00000001F701164100000001DB710641 + +#define CONSTANT %xmm0 + +#ifdef __x86_64__ +#define CRC %edi +#define BUF %rsi +#define LEN %rdx +#else +#define CRC %eax +#define BUF %edx +#define LEN %ecx +#endif + + + +.text +/** + * Calculate crc32 + * CRC - initial crc32 + * BUF - buffer (16 bytes aligned) + * LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63 + * return %eax crc32 + * u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len); + */ + +SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */ + movdqa (BUF), %xmm1 + movdqa 0x10(BUF), %xmm2 + movdqa 0x20(BUF), %xmm3 + movdqa 0x30(BUF), %xmm4 + movd CRC, CONSTANT + pxor CONSTANT, %xmm1 + sub $0x40, LEN + add $0x40, BUF + cmp $0x40, LEN + jb .Lless_64 + +#ifdef __x86_64__ + movdqa .Lconstant_R2R1(%rip), CONSTANT +#else + movdqa .Lconstant_R2R1, CONSTANT +#endif + +.Lloop_64:/* 64 bytes Full cache line folding */ + prefetchnta 0x40(BUF) + movdqa %xmm1, %xmm5 + movdqa %xmm2, %xmm6 + movdqa %xmm3, %xmm7 +#ifdef __x86_64__ + movdqa %xmm4, %xmm8 +#endif + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x00, CONSTANT, %xmm2 + pclmulqdq $0x00, CONSTANT, %xmm3 +#ifdef __x86_64__ + pclmulqdq $0x00, CONSTANT, %xmm4 +#endif + pclmulqdq $0x11, CONSTANT, %xmm5 + pclmulqdq $0x11, CONSTANT, %xmm6 + pclmulqdq $0x11, CONSTANT, %xmm7 +#ifdef __x86_64__ + pclmulqdq $0x11, CONSTANT, %xmm8 +#endif + pxor %xmm5, %xmm1 + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 +#ifdef __x86_64__ + pxor %xmm8, %xmm4 +#else + /* xmm8 unsupported for x32 */ + movdqa %xmm4, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm4 + pclmulqdq $0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm4 +#endif + + pxor (BUF), %xmm1 + pxor 0x10(BUF), %xmm2 + pxor 0x20(BUF), %xmm3 + pxor 0x30(BUF), %xmm4 + + sub $0x40, LEN + add $0x40, BUF + cmp $0x40, LEN + jge .Lloop_64 +.Lless_64:/* Folding cache line into 128bit */ +#ifdef __x86_64__ + movdqa .Lconstant_R4R3(%rip), CONSTANT +#else + movdqa .Lconstant_R4R3, CONSTANT +#endif + prefetchnta (BUF) + + movdqa %xmm1, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm2, %xmm1 
+ + movdqa %xmm1, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm3, %xmm1 + + movdqa %xmm1, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm4, %xmm1 + + cmp $0x10, LEN + jb .Lfold_64 +.Lloop_16:/* Folding rest buffer into 128bit */ + movdqa %xmm1, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor (BUF), %xmm1 + sub $0x10, LEN + add $0x10, BUF + cmp $0x10, LEN + jge .Lloop_16 + +.Lfold_64: + /* perform the last 64 bit fold, also adds 32 zeroes + * to the input stream */ + pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ + psrldq $0x08, %xmm1 + pxor CONSTANT, %xmm1 + + /* final 32-bit fold */ + movdqa %xmm1, %xmm2 +#ifdef __x86_64__ + movdqa .Lconstant_R5(%rip), CONSTANT + movdqa .Lconstant_mask32(%rip), %xmm3 +#else + movdqa .Lconstant_R5, CONSTANT + movdqa .Lconstant_mask32, %xmm3 +#endif + psrldq $0x04, %xmm2 + pand %xmm3, %xmm1 + pclmulqdq $0x00, CONSTANT, %xmm1 + pxor %xmm2, %xmm1 + + /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ +#ifdef __x86_64__ + movdqa .Lconstant_RUpoly(%rip), CONSTANT +#else + movdqa .Lconstant_RUpoly, CONSTANT +#endif + movdqa %xmm1, %xmm2 + pand %xmm3, %xmm1 + pclmulqdq $0x10, CONSTANT, %xmm1 + pand %xmm3, %xmm1 + pclmulqdq $0x00, CONSTANT, %xmm1 + pxor %xmm2, %xmm1 + pextrd $0x01, %xmm1, %eax + + RET +SYM_FUNC_END(crc32_pclmul_le_16) diff --git a/arch/x86/lib/crc32c-3way.S b/arch/x86/lib/crc32c-3way.S new file mode 100644 index 000000000000..9b8770503bbc --- /dev/null +++ b/arch/x86/lib/crc32c-3way.S @@ -0,0 +1,360 @@ +/* + * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) + * + * The white papers on CRC32C calculations with PCLMULQDQ instruction can be + * downloaded from: + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf + * + * Copyright (C) 2012 Intel Corporation. + * Copyright 2024 Google LLC + * + * Authors: + * Wajdi Feghali <wajdi.k.feghali@intel.com> + * James Guilford <james.guilford@intel.com> + * David Cote <david.m.cote@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/linkage.h> + +## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction + +# Define threshold below which buffers are considered "small" and routed to +# regular CRC code that does not interleave the CRC instructions. +#define SMALL_SIZE 200 + +# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len); + +.text +SYM_FUNC_START(crc32c_x86_3way) +#define crc0 %edi +#define crc0_q %rdi +#define bufp %rsi +#define bufp_d %esi +#define len %rdx +#define len_dw %edx +#define n_misaligned %ecx /* overlaps chunk_bytes! */ +#define n_misaligned_q %rcx +#define chunk_bytes %ecx /* overlaps n_misaligned! */ +#define chunk_bytes_q %rcx +#define crc1 %r8 +#define crc2 %r9 + + cmp $SMALL_SIZE, len + jb .Lsmall + + ################################################################ + ## 1) ALIGN: + ################################################################ + mov bufp_d, n_misaligned + neg n_misaligned + and $7, n_misaligned # calculate the misalignment amount of + # the address + je .Laligned # Skip if aligned + + # Process 1 <= n_misaligned <= 7 bytes individually in order to align + # the remaining data to an 8-byte boundary. +.Ldo_align: + movq (bufp), %rax + add n_misaligned_q, bufp + sub n_misaligned_q, len +.Lalign_loop: + crc32b %al, crc0 # compute crc32 of 1-byte + shr $8, %rax # get next byte + dec n_misaligned + jne .Lalign_loop +.Laligned: + + ################################################################ + ## 2) PROCESS BLOCK: + ################################################################ + + cmp $128*24, len + jae .Lfull_block + +.Lpartial_block: + # Compute floor(len / 24) to get num qwords to process from each lane. + imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24) + shr $16, %eax + jmp .Lcrc_3lanes + +.Lfull_block: + # Processing 128 qwords from each lane. + mov $128, %eax + + ################################################################ + ## 3) CRC each of three lanes: + ################################################################ + +.Lcrc_3lanes: + xor crc1,crc1 + xor crc2,crc2 + mov %eax, chunk_bytes + shl $3, chunk_bytes # num bytes to process from each lane + sub $5, %eax # 4 for 4x_loop, 1 for special last iter + jl .Lcrc_3lanes_4x_done + + # Unroll the loop by a factor of 4 to reduce the overhead of the loop + # bookkeeping instructions, which can compete with crc32q for the ALUs. 
+.Lcrc_3lanes_4x_loop: + crc32q (bufp), crc0_q + crc32q (bufp,chunk_bytes_q), crc1 + crc32q (bufp,chunk_bytes_q,2), crc2 + crc32q 8(bufp), crc0_q + crc32q 8(bufp,chunk_bytes_q), crc1 + crc32q 8(bufp,chunk_bytes_q,2), crc2 + crc32q 16(bufp), crc0_q + crc32q 16(bufp,chunk_bytes_q), crc1 + crc32q 16(bufp,chunk_bytes_q,2), crc2 + crc32q 24(bufp), crc0_q + crc32q 24(bufp,chunk_bytes_q), crc1 + crc32q 24(bufp,chunk_bytes_q,2), crc2 + add $32, bufp + sub $4, %eax + jge .Lcrc_3lanes_4x_loop + +.Lcrc_3lanes_4x_done: + add $4, %eax + jz .Lcrc_3lanes_last_qword + +.Lcrc_3lanes_1x_loop: + crc32q (bufp), crc0_q + crc32q (bufp,chunk_bytes_q), crc1 + crc32q (bufp,chunk_bytes_q,2), crc2 + add $8, bufp + dec %eax + jnz .Lcrc_3lanes_1x_loop + +.Lcrc_3lanes_last_qword: + crc32q (bufp), crc0_q + crc32q (bufp,chunk_bytes_q), crc1 +# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet + + ################################################################ + ## 4) Combine three results: + ################################################################ + + lea (K_table-8)(%rip), %rax # first entry is for idx 1 + pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2 + lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3 + sub %rax, len # len -= chunk_bytes * 3 + + movq crc0_q, %xmm1 # CRC for block 1 + pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 + + movq crc1, %xmm2 # CRC for block 2 + pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 + + pxor %xmm2,%xmm1 + movq %xmm1, %rax + xor (bufp,chunk_bytes_q,2), %rax + mov crc2, crc0_q + crc32 %rax, crc0_q + lea 8(bufp,chunk_bytes_q,2), bufp + + ################################################################ + ## 5) If more blocks remain, goto (2): + ################################################################ + + cmp $128*24, len + jae .Lfull_block + cmp $SMALL_SIZE, len + jae .Lpartial_block + + ####################################################################### + ## 6) Process any remainder without interleaving: + ####################################################################### +.Lsmall: + test len_dw, len_dw + jz .Ldone + mov len_dw, %eax + shr $3, %eax + jz .Ldo_dword +.Ldo_qwords: + crc32q (bufp), crc0_q + add $8, bufp + dec %eax + jnz .Ldo_qwords +.Ldo_dword: + test $4, len_dw + jz .Ldo_word + crc32l (bufp), crc0 + add $4, bufp +.Ldo_word: + test $2, len_dw + jz .Ldo_byte + crc32w (bufp), crc0 + add $2, bufp +.Ldo_byte: + test $1, len_dw + jz .Ldone + crc32b (bufp), crc0 +.Ldone: + mov crc0, %eax + RET +SYM_FUNC_END(crc32c_x86_3way) + +.section .rodata, "a", @progbits + ################################################################ + ## PCLMULQDQ tables + ## Table is 128 entries x 2 words (8 bytes) each + ################################################################ +.align 8 +K_table: + .long 0x493c7d27, 0x00000001 + .long 0xba4fc28e, 0x493c7d27 + .long 0xddc0152b, 0xf20c0dfe + .long 0x9e4addf8, 0xba4fc28e + .long 0x39d3b296, 0x3da6d0cb + .long 0x0715ce53, 0xddc0152b + .long 0x47db8317, 0x1c291d04 + .long 0x0d3b6092, 0x9e4addf8 + .long 0xc96cfdc0, 0x740eef02 + .long 0x878a92a7, 0x39d3b296 + .long 0xdaece73e, 0x083a6eec + .long 0xab7aff2a, 0x0715ce53 + .long 0x2162d385, 0xc49f4f67 + .long 0x83348832, 0x47db8317 + .long 0x299847d5, 0x2ad91c30 + .long 0xb9e02b86, 0x0d3b6092 + .long 0x18b33a4e, 0x6992cea2 + .long 0xb6dd949b, 0xc96cfdc0 + .long 0x78d9ccb7, 0x7e908048 + .long 0xbac2fd7b, 0x878a92a7 + .long 0xa60ce07b, 0x1b3d8f29 + .long 0xce7f39f4, 0xdaece73e + .long 0x61d82e56, 0xf1d0f55e + .long 0xd270f1a2, 0xab7aff2a + .long 
0xc619809d, 0xa87ab8a8 + .long 0x2b3cac5d, 0x2162d385 + .long 0x65863b64, 0x8462d800 + .long 0x1b03397f, 0x83348832 + .long 0xebb883bd, 0x71d111a8 + .long 0xb3e32c28, 0x299847d5 + .long 0x064f7f26, 0xffd852c6 + .long 0xdd7e3b0c, 0xb9e02b86 + .long 0xf285651c, 0xdcb17aa4 + .long 0x10746f3c, 0x18b33a4e + .long 0xc7a68855, 0xf37c5aee + .long 0x271d9844, 0xb6dd949b + .long 0x8e766a0c, 0x6051d5a2 + .long 0x93a5f730, 0x78d9ccb7 + .long 0x6cb08e5c, 0x18b0d4ff + .long 0x6b749fb2, 0xbac2fd7b + .long 0x1393e203, 0x21f3d99c + .long 0xcec3662e, 0xa60ce07b + .long 0x96c515bb, 0x8f158014 + .long 0xe6fc4e6a, 0xce7f39f4 + .long 0x8227bb8a, 0xa00457f7 + .long 0xb0cd4768, 0x61d82e56 + .long 0x39c7ff35, 0x8d6d2c43 + .long 0xd7a4825c, 0xd270f1a2 + .long 0x0ab3844b, 0x00ac29cf + .long 0x0167d312, 0xc619809d + .long 0xf6076544, 0xe9adf796 + .long 0x26f6a60a, 0x2b3cac5d + .long 0xa741c1bf, 0x96638b34 + .long 0x98d8d9cb, 0x65863b64 + .long 0x49c3cc9c, 0xe0e9f351 + .long 0x68bce87a, 0x1b03397f + .long 0x57a3d037, 0x9af01f2d + .long 0x6956fc3b, 0xebb883bd + .long 0x42d98888, 0x2cff42cf + .long 0x3771e98f, 0xb3e32c28 + .long 0xb42ae3d9, 0x88f25a3a + .long 0x2178513a, 0x064f7f26 + .long 0xe0ac139e, 0x4e36f0b0 + .long 0x170076fa, 0xdd7e3b0c + .long 0x444dd413, 0xbd6f81f8 + .long 0x6f345e45, 0xf285651c + .long 0x41d17b64, 0x91c9bd4b + .long 0xff0dba97, 0x10746f3c + .long 0xa2b73df1, 0x885f087b + .long 0xf872e54c, 0xc7a68855 + .long 0x1e41e9fc, 0x4c144932 + .long 0x86d8e4d2, 0x271d9844 + .long 0x651bd98b, 0x52148f02 + .long 0x5bb8f1bc, 0x8e766a0c + .long 0xa90fd27a, 0xa3c6f37a + .long 0xb3af077a, 0x93a5f730 + .long 0x4984d782, 0xd7c0557f + .long 0xca6ef3ac, 0x6cb08e5c + .long 0x234e0b26, 0x63ded06a + .long 0xdd66cbbb, 0x6b749fb2 + .long 0x4597456a, 0x4d56973c + .long 0xe9e28eb4, 0x1393e203 + .long 0x7b3ff57a, 0x9669c9df + .long 0xc9c8b782, 0xcec3662e + .long 0x3f70cc6f, 0xe417f38a + .long 0x93e106a4, 0x96c515bb + .long 0x62ec6c6d, 0x4b9e0f71 + .long 0xd813b325, 0xe6fc4e6a + .long 0x0df04680, 0xd104b8fc + .long 0x2342001e, 0x8227bb8a + .long 0x0a2a8d7e, 0x5b397730 + .long 0x6d9a4957, 0xb0cd4768 + .long 0xe8b6368b, 0xe78eb416 + .long 0xd2c3ed1a, 0x39c7ff35 + .long 0x995a5724, 0x61ff0e01 + .long 0x9ef68d35, 0xd7a4825c + .long 0x0c139b31, 0x8d96551c + .long 0xf2271e60, 0x0ab3844b + .long 0x0b0bf8ca, 0x0bf80dd2 + .long 0x2664fd8b, 0x0167d312 + .long 0xed64812d, 0x8821abed + .long 0x02ee03b2, 0xf6076544 + .long 0x8604ae0f, 0x6a45d2b2 + .long 0x363bd6b3, 0x26f6a60a + .long 0x135c83fd, 0xd8d26619 + .long 0x5fabe670, 0xa741c1bf + .long 0x35ec3279, 0xde87806c + .long 0x00bcf5f6, 0x98d8d9cb + .long 0x8ae00689, 0x14338754 + .long 0x17f27698, 0x49c3cc9c + .long 0x58ca5f00, 0x5bd2011f + .long 0xaa7c7ad5, 0x68bce87a + .long 0xb5cfca28, 0xdd07448e + .long 0xded288f8, 0x57a3d037 + .long 0x59f229bc, 0xdde8f5b9 + .long 0x6d390dec, 0x6956fc3b + .long 0x37170390, 0xa3e3e02c + .long 0x6353c1cc, 0x42d98888 + .long 0xc4584f5c, 0xd73c7bea + .long 0xf48642e9, 0x3771e98f + .long 0x531377e2, 0x80ff0093 + .long 0xdd35bc8d, 0xb42ae3d9 + .long 0xb25b29f2, 0x8fe4c34d + .long 0x9a5ede41, 0x2178513a + .long 0xa563905d, 0xdf99fc11 + .long 0x45cddf4e, 0xe0ac139e + .long 0xacfa3103, 0x6c23e841 + .long 0xa51b6135, 0x170076fa diff --git a/arch/x86/lib/crct10dif-pcl-asm_64.S b/arch/x86/lib/crct10dif-pcl-asm_64.S new file mode 100644 index 000000000000..5286db5b8165 --- /dev/null +++ b/arch/x86/lib/crct10dif-pcl-asm_64.S @@ -0,0 +1,332 @@ +######################################################################## +# Implement fast CRC-T10DIF computation with SSE 
and PCLMULQDQ instructions +# +# Copyright (c) 2013, Intel Corporation +# +# Authors: +# Erdinc Ozturk <erdinc.ozturk@intel.com> +# Vinodh Gopal <vinodh.gopal@intel.com> +# James Guilford <james.guilford@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Reference paper titled "Fast CRC Computation for Generic +# Polynomials Using PCLMULQDQ Instruction" +# URL: http://www.intel.com/content/dam/www/public/us/en/documents +# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +# + +#include <linux/linkage.h> + +.text + +#define init_crc %edi +#define buf %rsi +#define len %rdx + +#define FOLD_CONSTS %xmm10 +#define BSWAP_MASK %xmm11 + +# Fold reg1, reg2 into the next 32 data bytes, storing the result back into +# reg1, reg2. +.macro fold_32_bytes offset, reg1, reg2 + movdqu \offset(buf), %xmm9 + movdqu \offset+16(buf), %xmm12 + pshufb BSWAP_MASK, %xmm9 + pshufb BSWAP_MASK, %xmm12 + movdqa \reg1, %xmm8 + movdqa \reg2, %xmm13 + pclmulqdq $0x00, FOLD_CONSTS, \reg1 + pclmulqdq $0x11, FOLD_CONSTS, %xmm8 + pclmulqdq $0x00, FOLD_CONSTS, \reg2 + pclmulqdq $0x11, FOLD_CONSTS, %xmm13 + pxor %xmm9 , \reg1 + xorps %xmm8 , \reg1 + pxor %xmm12, \reg2 + xorps %xmm13, \reg2 +.endm + +# Fold src_reg into dst_reg. +.macro fold_16_bytes src_reg, dst_reg + movdqa \src_reg, %xmm8 + pclmulqdq $0x11, FOLD_CONSTS, \src_reg + pclmulqdq $0x00, FOLD_CONSTS, %xmm8 + pxor %xmm8, \dst_reg + xorps \src_reg, \dst_reg +.endm + +# +# u16 crc_t10dif_pcl(u16 init_crc, const *u8 buf, size_t len); +# +# Assumes len >= 16. +# +SYM_FUNC_START(crc_t10dif_pcl) + + movdqa .Lbswap_mask(%rip), BSWAP_MASK + + # For sizes less than 256 bytes, we can't fold 128 bytes at a time. + cmp $256, len + jl .Lless_than_256_bytes + + # Load the first 128 data bytes. 
Byte swapping is necessary to make the + # bit order match the polynomial coefficient order. + movdqu 16*0(buf), %xmm0 + movdqu 16*1(buf), %xmm1 + movdqu 16*2(buf), %xmm2 + movdqu 16*3(buf), %xmm3 + movdqu 16*4(buf), %xmm4 + movdqu 16*5(buf), %xmm5 + movdqu 16*6(buf), %xmm6 + movdqu 16*7(buf), %xmm7 + add $128, buf + pshufb BSWAP_MASK, %xmm0 + pshufb BSWAP_MASK, %xmm1 + pshufb BSWAP_MASK, %xmm2 + pshufb BSWAP_MASK, %xmm3 + pshufb BSWAP_MASK, %xmm4 + pshufb BSWAP_MASK, %xmm5 + pshufb BSWAP_MASK, %xmm6 + pshufb BSWAP_MASK, %xmm7 + + # XOR the first 16 data *bits* with the initial CRC value. + pxor %xmm8, %xmm8 + pinsrw $7, init_crc, %xmm8 + pxor %xmm8, %xmm0 + + movdqa .Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS + + # Subtract 128 for the 128 data bytes just consumed. Subtract another + # 128 to simplify the termination condition of the following loop. + sub $256, len + + # While >= 128 data bytes remain (not counting xmm0-7), fold the 128 + # bytes xmm0-7 into them, storing the result back into xmm0-7. +.Lfold_128_bytes_loop: + fold_32_bytes 0, %xmm0, %xmm1 + fold_32_bytes 32, %xmm2, %xmm3 + fold_32_bytes 64, %xmm4, %xmm5 + fold_32_bytes 96, %xmm6, %xmm7 + add $128, buf + sub $128, len + jge .Lfold_128_bytes_loop + + # Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7. + + # Fold across 64 bytes. + movdqa .Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS + fold_16_bytes %xmm0, %xmm4 + fold_16_bytes %xmm1, %xmm5 + fold_16_bytes %xmm2, %xmm6 + fold_16_bytes %xmm3, %xmm7 + # Fold across 32 bytes. + movdqa .Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS + fold_16_bytes %xmm4, %xmm6 + fold_16_bytes %xmm5, %xmm7 + # Fold across 16 bytes. + movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS + fold_16_bytes %xmm6, %xmm7 + + # Add 128 to get the correct number of data bytes remaining in 0...127 + # (not counting xmm7), following the previous extra subtraction by 128. + # Then subtract 16 to simplify the termination condition of the + # following loop. + add $128-16, len + + # While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes + # xmm7 into them, storing the result back into xmm7. + jl .Lfold_16_bytes_loop_done +.Lfold_16_bytes_loop: + movdqa %xmm7, %xmm8 + pclmulqdq $0x11, FOLD_CONSTS, %xmm7 + pclmulqdq $0x00, FOLD_CONSTS, %xmm8 + pxor %xmm8, %xmm7 + movdqu (buf), %xmm0 + pshufb BSWAP_MASK, %xmm0 + pxor %xmm0 , %xmm7 + add $16, buf + sub $16, len + jge .Lfold_16_bytes_loop + +.Lfold_16_bytes_loop_done: + # Add 16 to get the correct number of data bytes remaining in 0...15 + # (not counting xmm7), following the previous extra subtraction by 16. + add $16, len + je .Lreduce_final_16_bytes + +.Lhandle_partial_segment: + # Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16 + # bytes are in xmm7 and the rest are the remaining data in 'buf'. To do + # this without needing a fold constant for each possible 'len', redivide + # the bytes into a first chunk of 'len' bytes and a second chunk of 16 + # bytes, then fold the first chunk into the second. + + movdqa %xmm7, %xmm2 + + # xmm1 = last 16 original data bytes + movdqu -16(buf, len), %xmm1 + pshufb BSWAP_MASK, %xmm1 + + # xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes. + lea .Lbyteshift_table+16(%rip), %rax + sub len, %rax + movdqu (%rax), %xmm0 + pshufb %xmm0, %xmm2 + + # xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes. 
+ pxor .Lmask1(%rip), %xmm0 + pshufb %xmm0, %xmm7 + + # xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes), + # then '16-len' bytes from xmm2 (high-order bytes). + pblendvb %xmm2, %xmm1 #xmm0 is implicit + + # Fold the first chunk into the second chunk, storing the result in xmm7. + movdqa %xmm7, %xmm8 + pclmulqdq $0x11, FOLD_CONSTS, %xmm7 + pclmulqdq $0x00, FOLD_CONSTS, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm1, %xmm7 + +.Lreduce_final_16_bytes: + # Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC + + # Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. + movdqa .Lfinal_fold_consts(%rip), FOLD_CONSTS + + # Fold the high 64 bits into the low 64 bits, while also multiplying by + # x^64. This produces a 128-bit value congruent to x^64 * M(x) and + # whose low 48 bits are 0. + movdqa %xmm7, %xmm0 + pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x)) + pslldq $8, %xmm0 + pxor %xmm0, %xmm7 # + low bits * x^64 + + # Fold the high 32 bits into the low 96 bits. This produces a 96-bit + # value congruent to x^64 * M(x) and whose low 48 bits are 0. + movdqa %xmm7, %xmm0 + pand .Lmask2(%rip), %xmm0 # zero high 32 bits + psrldq $12, %xmm7 # extract high 32 bits + pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x)) + pxor %xmm0, %xmm7 # + low bits + + # Load G(x) and floor(x^48 / G(x)). + movdqa .Lbarrett_reduction_consts(%rip), FOLD_CONSTS + + # Use Barrett reduction to compute the final CRC value. + movdqa %xmm7, %xmm0 + pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x)) + psrlq $32, %xmm7 # /= x^32 + pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # *= G(x) + psrlq $48, %xmm0 + pxor %xmm7, %xmm0 # + low 16 nonzero bits + # Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0. + + pextrw $0, %xmm0, %eax + RET + +.align 16 +.Lless_than_256_bytes: + # Checksumming a buffer of length 16...255 bytes + + # Load the first 16 data bytes. + movdqu (buf), %xmm7 + pshufb BSWAP_MASK, %xmm7 + add $16, buf + + # XOR the first 16 data *bits* with the initial CRC value. 
+ pxor %xmm0, %xmm0 + pinsrw $7, init_crc, %xmm0 + pxor %xmm0, %xmm7 + + movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS + cmp $16, len + je .Lreduce_final_16_bytes # len == 16 + sub $32, len + jge .Lfold_16_bytes_loop # 32 <= len <= 255 + add $16, len + jmp .Lhandle_partial_segment # 17 <= len <= 31 +SYM_FUNC_END(crc_t10dif_pcl) + +.section .rodata, "a", @progbits +.align 16 + +# Fold constants precomputed from the polynomial 0x18bb7 +# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 +.Lfold_across_128_bytes_consts: + .quad 0x0000000000006123 # x^(8*128) mod G(x) + .quad 0x0000000000002295 # x^(8*128+64) mod G(x) +.Lfold_across_64_bytes_consts: + .quad 0x0000000000001069 # x^(4*128) mod G(x) + .quad 0x000000000000dd31 # x^(4*128+64) mod G(x) +.Lfold_across_32_bytes_consts: + .quad 0x000000000000857d # x^(2*128) mod G(x) + .quad 0x0000000000007acc # x^(2*128+64) mod G(x) +.Lfold_across_16_bytes_consts: + .quad 0x000000000000a010 # x^(1*128) mod G(x) + .quad 0x0000000000001faa # x^(1*128+64) mod G(x) +.Lfinal_fold_consts: + .quad 0x1368000000000000 # x^48 * (x^48 mod G(x)) + .quad 0x2d56000000000000 # x^48 * (x^80 mod G(x)) +.Lbarrett_reduction_consts: + .quad 0x0000000000018bb7 # G(x) + .quad 0x00000001f65a57f8 # floor(x^48 / G(x)) + +.section .rodata.cst16.mask1, "aM", @progbits, 16 +.align 16 +.Lmask1: + .octa 0x80808080808080808080808080808080 + +.section .rodata.cst16.mask2, "aM", @progbits, 16 +.align 16 +.Lmask2: + .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF + +.section .rodata.cst16.bswap_mask, "aM", @progbits, 16 +.align 16 +.Lbswap_mask: + .octa 0x000102030405060708090A0B0C0D0E0F + +.section .rodata.cst32.byteshift_table, "aM", @progbits, 32 +.align 16 +# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len] +# is the index vector to shift left by 'len' bytes, and is also {0x80, ..., +# 0x80} XOR the index vector to shift right by '16 - len' bytes. +.Lbyteshift_table: + .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 + .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 0e65d00e2339..23f81ca3f06b 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -128,7 +128,7 @@ static void delay_halt_mwaitx(u64 unused, u64 cycles) delay = min_t(u64, MWAITX_MAX_WAIT_CYCLES, cycles); /* - * Use cpu_tss_rw as a cacheline-aligned, seldomly accessed per-cpu + * Use cpu_tss_rw as a cacheline-aligned, seldom accessed per-cpu * variable as the monitor target. */ __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 20ef350a60fb..89ecd57c9d42 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -39,9 +39,13 @@ .macro check_range size:req .if IS_ENABLED(CONFIG_X86_64) - mov %rax, %rdx - sar $63, %rdx - or %rdx, %rax + movq $0x0123456789abcdef,%rdx + 1: + .pushsection runtime_ptr_USER_PTR_MAX,"a" + .long 1b - 8 - . 
+ .popsection + cmp %rdx, %rax + cmova %rdx, %rax .else cmp $TASK_SIZE_MAX-\size+1, %eax jae .Lbad_get_user @@ -50,11 +54,17 @@ .endif .endm +.macro UACCESS op src dst +1: \op \src,\dst + _ASM_EXTABLE_UA(1b, __get_user_handle_exception) +.endm + + .text SYM_FUNC_START(__get_user_1) check_range size=1 ASM_STAC -1: movzbl (%_ASM_AX),%edx + UACCESS movzbl (%_ASM_AX),%edx xor %eax,%eax ASM_CLAC RET @@ -64,7 +74,7 @@ EXPORT_SYMBOL(__get_user_1) SYM_FUNC_START(__get_user_2) check_range size=2 ASM_STAC -2: movzwl (%_ASM_AX),%edx + UACCESS movzwl (%_ASM_AX),%edx xor %eax,%eax ASM_CLAC RET @@ -74,7 +84,7 @@ EXPORT_SYMBOL(__get_user_2) SYM_FUNC_START(__get_user_4) check_range size=4 ASM_STAC -3: movl (%_ASM_AX),%edx + UACCESS movl (%_ASM_AX),%edx xor %eax,%eax ASM_CLAC RET @@ -82,13 +92,16 @@ SYM_FUNC_END(__get_user_4) EXPORT_SYMBOL(__get_user_4) SYM_FUNC_START(__get_user_8) +#ifndef CONFIG_X86_64 + xor %ecx,%ecx +#endif check_range size=8 ASM_STAC #ifdef CONFIG_X86_64 -4: movq (%_ASM_AX),%rdx + UACCESS movq (%_ASM_AX),%rdx #else -4: movl (%_ASM_AX),%edx -5: movl 4(%_ASM_AX),%ecx + UACCESS movl (%_ASM_AX),%edx + UACCESS movl 4(%_ASM_AX),%ecx #endif xor %eax,%eax ASM_CLAC @@ -100,7 +113,7 @@ EXPORT_SYMBOL(__get_user_8) SYM_FUNC_START(__get_user_nocheck_1) ASM_STAC ASM_BARRIER_NOSPEC -6: movzbl (%_ASM_AX),%edx + UACCESS movzbl (%_ASM_AX),%edx xor %eax,%eax ASM_CLAC RET @@ -110,7 +123,7 @@ EXPORT_SYMBOL(__get_user_nocheck_1) SYM_FUNC_START(__get_user_nocheck_2) ASM_STAC ASM_BARRIER_NOSPEC -7: movzwl (%_ASM_AX),%edx + UACCESS movzwl (%_ASM_AX),%edx xor %eax,%eax ASM_CLAC RET @@ -120,7 +133,7 @@ EXPORT_SYMBOL(__get_user_nocheck_2) SYM_FUNC_START(__get_user_nocheck_4) ASM_STAC ASM_BARRIER_NOSPEC -8: movl (%_ASM_AX),%edx + UACCESS movl (%_ASM_AX),%edx xor %eax,%eax ASM_CLAC RET @@ -131,10 +144,11 @@ SYM_FUNC_START(__get_user_nocheck_8) ASM_STAC ASM_BARRIER_NOSPEC #ifdef CONFIG_X86_64 -9: movq (%_ASM_AX),%rdx + UACCESS movq (%_ASM_AX),%rdx #else -9: movl (%_ASM_AX),%edx -10: movl 4(%_ASM_AX),%ecx + xor %ecx,%ecx + UACCESS movl (%_ASM_AX),%edx + UACCESS movl 4(%_ASM_AX),%ecx #endif xor %eax,%eax ASM_CLAC @@ -150,36 +164,3 @@ SYM_CODE_START_LOCAL(__get_user_handle_exception) mov $(-EFAULT),%_ASM_AX RET SYM_CODE_END(__get_user_handle_exception) - -#ifdef CONFIG_X86_32 -SYM_CODE_START_LOCAL(__get_user_8_handle_exception) - ASM_CLAC -bad_get_user_8: - xor %edx,%edx - xor %ecx,%ecx - mov $(-EFAULT),%_ASM_AX - RET -SYM_CODE_END(__get_user_8_handle_exception) -#endif - -/* get_user */ - _ASM_EXTABLE(1b, __get_user_handle_exception) - _ASM_EXTABLE(2b, __get_user_handle_exception) - _ASM_EXTABLE(3b, __get_user_handle_exception) -#ifdef CONFIG_X86_64 - _ASM_EXTABLE(4b, __get_user_handle_exception) -#else - _ASM_EXTABLE(4b, __get_user_8_handle_exception) - _ASM_EXTABLE(5b, __get_user_8_handle_exception) -#endif - -/* __get_user */ - _ASM_EXTABLE(6b, __get_user_handle_exception) - _ASM_EXTABLE(7b, __get_user_handle_exception) - _ASM_EXTABLE(8b, __get_user_handle_exception) -#ifdef CONFIG_X86_64 - _ASM_EXTABLE(9b, __get_user_handle_exception) -#else - _ASM_EXTABLE(9b, __get_user_8_handle_exception) - _ASM_EXTABLE(10b, __get_user_8_handle_exception) -#endif diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 558a605929db..98631c0e7a11 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -1129,15 +1129,15 @@ static int get_eff_addr_modrm_16(struct insn *insn, struct pt_regs *regs, * get_eff_addr_sib() - Obtain referenced effective address via SIB * @insn: Instruction. Must be valid. 
* @regs: Register values as seen when entering kernel mode - * @regoff: Obtained operand offset, in pt_regs, associated with segment + * @base_offset: Obtained operand offset, in pt_regs, associated with segment * @eff_addr: Obtained effective address * * Obtain the effective address referenced by the SIB byte of @insn. After * identifying the registers involved in the indexed, register-indirect memory * reference, its value is obtained from the operands in @regs. The computed * address is stored @eff_addr. Also, the register operand that indicates the - * associated segment is stored in @regoff, this parameter can later be used to - * determine such segment. + * associated segment is stored in @base_offset; this parameter can later be + * used to determine such segment. * * Returns: * diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 55e371cc69fd..6ffb931b9fb1 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -13,7 +13,7 @@ #endif #include <asm/inat.h> /*__ignore_sync_check__ */ #include <asm/insn.h> /* __ignore_sync_check__ */ -#include <asm/unaligned.h> /* __ignore_sync_check__ */ +#include <linux/unaligned.h> /* __ignore_sync_check__ */ #include <linux/errno.h> #include <linux/kconfig.h> @@ -71,7 +71,7 @@ void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) insn->kaddr = kaddr; insn->end_kaddr = kaddr + buf_len; insn->next_byte = kaddr; - insn->x86_64 = x86_64 ? 1 : 0; + insn->x86_64 = x86_64; insn->opnd_bytes = 4; if (x86_64) insn->addr_bytes = 8; @@ -185,6 +185,17 @@ found: if (X86_REX_W(b)) /* REX.W overrides opnd_size */ insn->opnd_bytes = 8; + } else if (inat_is_rex2_prefix(attr)) { + insn_set_byte(&insn->rex_prefix, 0, b); + b = peek_nbyte_next(insn_byte_t, insn, 1); + insn_set_byte(&insn->rex_prefix, 1, b); + insn->rex_prefix.nbytes = 2; + insn->next_byte += 2; + if (X86_REX_W(b)) + /* REX.W overrides opnd_size */ + insn->opnd_bytes = 8; + insn->rex_prefix.got = 1; + goto vex_end; } } insn->rex_prefix.got = 1; @@ -268,11 +279,9 @@ int insn_get_opcode(struct insn *insn) if (opcode->got) return 0; - if (!insn->prefixes.got) { - ret = insn_get_prefixes(insn); - if (ret) - return ret; - } + ret = insn_get_prefixes(insn); + if (ret) + return ret; /* Get first opcode */ op = get_next(insn_byte_t, insn); @@ -285,6 +294,10 @@ int insn_get_opcode(struct insn *insn) m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); + /* SCALABLE EVEX uses p bits to encode operand size */ + if (inat_evex_scalable(insn->attr) && !insn_vex_w_bit(insn) && + p == INAT_PFX_OPNDSZ) + insn->opnd_bytes = 2; if ((inat_must_evex(insn->attr) && !insn_is_evex(insn)) || (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr))) { @@ -296,6 +309,20 @@ int insn_get_opcode(struct insn *insn) goto end; } + /* Check if there is REX2 prefix or not */ + if (insn_is_rex2(insn)) { + if (insn_rex2_m_bit(insn)) { + /* map 1 is escape 0x0f */ + insn_attr_t esc_attr = inat_get_opcode_attribute(0x0f); + + pfx_id = insn_last_prefix_id(insn); + insn->attr = inat_get_escape_attribute(op, pfx_id, esc_attr); + } else { + insn->attr = inat_get_opcode_attribute(op); + } + goto end; + } + insn->attr = inat_get_opcode_attribute(op); while (inat_is_escape(insn->attr)) { /* Get escaped opcode */ @@ -339,11 +366,9 @@ int insn_get_modrm(struct insn *insn) if (modrm->got) return 0; - if (!insn->opcode.got) { - ret = insn_get_opcode(insn); - if (ret) - return ret; - } + ret = insn_get_opcode(insn); + if (ret) + return ret; if 
(inat_has_modrm(insn->attr)) { mod = get_next(insn_byte_t, insn); @@ -386,11 +411,9 @@ int insn_rip_relative(struct insn *insn) if (!insn->x86_64) return 0; - if (!modrm->got) { - ret = insn_get_modrm(insn); - if (ret) - return 0; - } + ret = insn_get_modrm(insn); + if (ret) + return 0; /* * For rip-relative instructions, the mod field (top 2 bits) * is zero and the r/m field (bottom 3 bits) is 0x5. @@ -417,11 +440,9 @@ int insn_get_sib(struct insn *insn) if (insn->sib.got) return 0; - if (!insn->modrm.got) { - ret = insn_get_modrm(insn); - if (ret) - return ret; - } + ret = insn_get_modrm(insn); + if (ret) + return ret; if (insn->modrm.nbytes) { modrm = insn->modrm.bytes[0]; @@ -460,11 +481,9 @@ int insn_get_displacement(struct insn *insn) if (insn->displacement.got) return 0; - if (!insn->sib.got) { - ret = insn_get_sib(insn); - if (ret) - return ret; - } + ret = insn_get_sib(insn); + if (ret) + return ret; if (insn->modrm.nbytes) { /* @@ -628,11 +647,9 @@ int insn_get_immediate(struct insn *insn) if (insn->immediate.got) return 0; - if (!insn->displacement.got) { - ret = insn_get_displacement(insn); - if (ret) - return ret; - } + ret = insn_get_displacement(insn); + if (ret) + return ret; if (inat_has_moffset(insn->attr)) { if (!__get_moffset(insn)) @@ -703,11 +720,9 @@ int insn_get_length(struct insn *insn) if (insn->length) return 0; - if (!insn->immediate.got) { - ret = insn_get_immediate(insn); - if (ret) - return ret; - } + ret = insn_get_immediate(insn); + if (ret) + return ret; insn->length = (unsigned char)((unsigned long)insn->next_byte - (unsigned long)insn->kaddr); diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S deleted file mode 100644 index 6ff2f56cb0f7..000000000000 --- a/arch/x86/lib/iomap_copy_64.S +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2006 PathScale, Inc. All Rights Reserved. - */ - -#include <linux/linkage.h> - -/* - * override generic version in lib/iomap_copy.c - */ -SYM_FUNC_START(__iowrite32_copy) - movl %edx,%ecx - rep movsl - RET -SYM_FUNC_END(__iowrite32_copy) diff --git a/arch/x86/lib/iomem.c b/arch/x86/lib/iomem.c index e0411a3774d4..5eecb45d05d5 100644 --- a/arch/x86/lib/iomem.c +++ b/arch/x86/lib/iomem.c @@ -25,6 +25,9 @@ static __always_inline void rep_movs(void *to, const void *from, size_t n) static void string_memcpy_fromio(void *to, const volatile void __iomem *from, size_t n) { + const void *orig_to = to; + const size_t orig_n = n; + if (unlikely(!n)) return; @@ -39,7 +42,7 @@ static void string_memcpy_fromio(void *to, const volatile void __iomem *from, si } rep_movs(to, (const void *)from, n); /* KMSAN must treat values read from devices as initialized. 
*/ - kmsan_unpoison_memory(to, n); + kmsan_unpoison_memory(orig_to, orig_n); } static void string_memcpy_toio(volatile void __iomem *to, const void *from, size_t n) diff --git a/arch/x86/lib/misc.c b/arch/x86/lib/misc.c index 92cd8ecc3a2c..40b81c338ae5 100644 --- a/arch/x86/lib/misc.c +++ b/arch/x86/lib/misc.c @@ -8,7 +8,7 @@ */ int num_digits(int val) { - int m = 10; + long long m = 10; int d = 1; if (val < 0) { diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c index 40bbe56bde32..acd463d887e1 100644 --- a/arch/x86/lib/msr-smp.c +++ b/arch/x86/lib/msr-smp.c @@ -9,10 +9,9 @@ static void __rdmsr_on_cpu(void *info) { struct msr_info *rv = info; struct msr *reg; - int this_cpu = raw_smp_processor_id(); if (rv->msrs) - reg = per_cpu_ptr(rv->msrs, this_cpu); + reg = this_cpu_ptr(rv->msrs); else reg = &rv->reg; @@ -23,10 +22,9 @@ static void __wrmsr_on_cpu(void *info) { struct msr_info *rv = info; struct msr *reg; - int this_cpu = raw_smp_processor_id(); if (rv->msrs) - reg = per_cpu_ptr(rv->msrs, this_cpu); + reg = this_cpu_ptr(rv->msrs); else reg = &rv->reg; @@ -97,7 +95,7 @@ int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) EXPORT_SYMBOL(wrmsrl_on_cpu); static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no, - struct msr *msrs, + struct msr __percpu *msrs, void (*msr_func) (void *info)) { struct msr_info rv; @@ -124,7 +122,7 @@ static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no, * @msrs: array of MSR values * */ -void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) +void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs) { __rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu); } @@ -138,7 +136,7 @@ EXPORT_SYMBOL(rdmsr_on_cpus); * @msrs: array of MSR values * */ -void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) +void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs) { __rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu); } diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 47fd9bd6b91d..4bf4fad5b148 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -6,9 +6,9 @@ #define CREATE_TRACE_POINTS #include <asm/msr-trace.h> -struct msr *msrs_alloc(void) +struct msr __percpu *msrs_alloc(void) { - struct msr *msrs = NULL; + struct msr __percpu *msrs = NULL; msrs = alloc_percpu(struct msr); if (!msrs) { @@ -20,7 +20,7 @@ struct msr *msrs_alloc(void) } EXPORT_SYMBOL(msrs_alloc); -void msrs_free(struct msr *msrs) +void msrs_free(struct msr __percpu *msrs) { free_percpu(msrs); } diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 2877f5934177..975c9c18263d 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -133,15 +133,15 @@ SYM_CODE_START_LOCAL(__put_user_handle_exception) RET SYM_CODE_END(__put_user_handle_exception) - _ASM_EXTABLE(1b, __put_user_handle_exception) - _ASM_EXTABLE(2b, __put_user_handle_exception) - _ASM_EXTABLE(3b, __put_user_handle_exception) - _ASM_EXTABLE(4b, __put_user_handle_exception) - _ASM_EXTABLE(5b, __put_user_handle_exception) - _ASM_EXTABLE(6b, __put_user_handle_exception) - _ASM_EXTABLE(7b, __put_user_handle_exception) - _ASM_EXTABLE(9b, __put_user_handle_exception) + _ASM_EXTABLE_UA(1b, __put_user_handle_exception) + _ASM_EXTABLE_UA(2b, __put_user_handle_exception) + _ASM_EXTABLE_UA(3b, __put_user_handle_exception) + _ASM_EXTABLE_UA(4b, __put_user_handle_exception) + _ASM_EXTABLE_UA(5b, __put_user_handle_exception) + _ASM_EXTABLE_UA(6b, __put_user_handle_exception) + 
_ASM_EXTABLE_UA(7b, __put_user_handle_exception) + _ASM_EXTABLE_UA(9b, __put_user_handle_exception) #ifdef CONFIG_X86_32 - _ASM_EXTABLE(8b, __put_user_handle_exception) - _ASM_EXTABLE(10b, __put_user_handle_exception) + _ASM_EXTABLE_UA(8b, __put_user_handle_exception) + _ASM_EXTABLE_UA(10b, __put_user_handle_exception) #endif diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 7b2589877d06..391059b2c6fb 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -71,7 +71,7 @@ SYM_CODE_END(__x86_indirect_thunk_array) #include <asm/GEN-for-each-reg.h> #undef GEN -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING .macro CALL_THUNK reg .align RETPOLINE_THUNK_SIZE @@ -127,7 +127,7 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) #undef GEN #endif -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK /* * Be careful here: that label cannot really be removed because in @@ -138,7 +138,7 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) */ .section .text..__x86.return_thunk -#ifdef CONFIG_CPU_SRSO +#ifdef CONFIG_MITIGATION_SRSO /* * srso_alias_untrain_ret() and srso_alias_safe_ret() are placed at @@ -163,6 +163,7 @@ SYM_CODE_START_NOALIGN(srso_alias_untrain_ret) lfence jmp srso_alias_return_thunk SYM_FUNC_END(srso_alias_untrain_ret) +__EXPORT_THUNK(srso_alias_untrain_ret) .popsection .pushsection .text..__x86.rethunk_safe @@ -224,13 +225,19 @@ SYM_CODE_START(srso_return_thunk) SYM_CODE_END(srso_return_thunk) #define JMP_SRSO_UNTRAIN_RET "jmp srso_untrain_ret" -#define JMP_SRSO_ALIAS_UNTRAIN_RET "jmp srso_alias_untrain_ret" -#else /* !CONFIG_CPU_SRSO */ +#else /* !CONFIG_MITIGATION_SRSO */ +/* Dummy for the alternative in CALL_UNTRAIN_RET. */ +SYM_CODE_START(srso_alias_untrain_ret) + ANNOTATE_UNRET_SAFE + ANNOTATE_NOENDBR + ret + int3 +SYM_FUNC_END(srso_alias_untrain_ret) +__EXPORT_THUNK(srso_alias_untrain_ret) #define JMP_SRSO_UNTRAIN_RET "ud2" -#define JMP_SRSO_ALIAS_UNTRAIN_RET "ud2" -#endif /* CONFIG_CPU_SRSO */ +#endif /* CONFIG_MITIGATION_SRSO */ -#ifdef CONFIG_CPU_UNRET_ENTRY +#ifdef CONFIG_MITIGATION_UNRET_ENTRY /* * Some generic notes on the untraining sequences: @@ -312,22 +319,20 @@ SYM_CODE_END(retbleed_return_thunk) SYM_FUNC_END(retbleed_untrain_ret) #define JMP_RETBLEED_UNTRAIN_RET "jmp retbleed_untrain_ret" -#else /* !CONFIG_CPU_UNRET_ENTRY */ +#else /* !CONFIG_MITIGATION_UNRET_ENTRY */ #define JMP_RETBLEED_UNTRAIN_RET "ud2" -#endif /* CONFIG_CPU_UNRET_ENTRY */ +#endif /* CONFIG_MITIGATION_UNRET_ENTRY */ -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO) +#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO) SYM_FUNC_START(entry_untrain_ret) - ALTERNATIVE_2 JMP_RETBLEED_UNTRAIN_RET, \ - JMP_SRSO_UNTRAIN_RET, X86_FEATURE_SRSO, \ - JMP_SRSO_ALIAS_UNTRAIN_RET, X86_FEATURE_SRSO_ALIAS + ALTERNATIVE JMP_RETBLEED_UNTRAIN_RET, JMP_SRSO_UNTRAIN_RET, X86_FEATURE_SRSO SYM_FUNC_END(entry_untrain_ret) __EXPORT_THUNK(entry_untrain_ret) -#endif /* CONFIG_CPU_UNRET_ENTRY || CONFIG_CPU_SRSO */ +#endif /* CONFIG_MITIGATION_UNRET_ENTRY || CONFIG_MITIGATION_SRSO */ -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING .align 64 SYM_FUNC_START(call_depth_return_thunk) @@ -359,7 +364,7 @@ SYM_FUNC_START(call_depth_return_thunk) int3 SYM_FUNC_END(call_depth_return_thunk) -#endif /* CONFIG_CALL_DEPTH_TRACKING */ +#endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ /* * This function name is magical and is used by -mfunction-return=thunk-extern @@ -369,21 +374,25 @@ 
SYM_FUNC_END(call_depth_return_thunk) * 'JMP __x86_return_thunk' sites are changed to something else by * apply_returns(). * - * This should be converted eventually to call a warning function which - * should scream loudly when the default return thunk is called after - * alternatives have been applied. - * - * That warning function cannot BUG() because the bug splat cannot be - * displayed in all possible configurations, leading to users not really - * knowing why the machine froze. + * The ALTERNATIVE below adds a really loud warning to catch the case + * where the insufficient default return thunk ends up getting used for + * whatever reason like miscompilation or failure of + * objtool/alternatives/etc to patch all the return sites. */ SYM_CODE_START(__x86_return_thunk) UNWIND_HINT_FUNC ANNOTATE_NOENDBR +#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || \ + defined(CONFIG_MITIGATION_SRSO) || \ + defined(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) + ALTERNATIVE __stringify(ANNOTATE_UNRET_SAFE; ret), \ + "jmp warn_thunk_thunk", X86_FEATURE_ALWAYS +#else ANNOTATE_UNRET_SAFE ret +#endif int3 SYM_CODE_END(__x86_return_thunk) EXPORT_SYMBOL(__x86_return_thunk) -#endif /* CONFIG_RETHUNK */ +#endif /* CONFIG_MITIGATION_RETHUNK */ diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 5168ee0360b2..caedb3ef6688 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -23,6 +23,7 @@ # # AVX Superscripts # (ev): this opcode requires EVEX prefix. +# (es): this opcode requires EVEX prefix and is SCALABALE. # (evo): this opcode is changed by EVEX prefix (EVEX opcode) # (v): this opcode requires VEX prefix. # (v1): this opcode only supports 128bit VEX. @@ -33,6 +34,10 @@ # - (F2): the last prefix is 0xF2 # - (!F3) : the last prefix is not 0xF3 (including non-last prefix case) # - (66&F2): Both 0x66 and 0xF2 prefixes are specified. +# +# REX2 Prefix +# - (!REX2): REX2 is not allowed +# - (REX2): REX2 variant e.g. 
JMPABS Table: one byte opcode Referrer: @@ -148,7 +153,7 @@ AVXcode: 65: SEG=GS (Prefix) 66: Operand-Size (Prefix) 67: Address-Size (Prefix) -68: PUSH Iz (d64) +68: PUSH Iz 69: IMUL Gv,Ev,Iz 6a: PUSH Ib (d64) 6b: IMUL Gv,Ev,Ib @@ -157,22 +162,22 @@ AVXcode: 6e: OUTS/OUTSB DX,Xb 6f: OUTS/OUTSW/OUTSD DX,Xz # 0x70 - 0x7f -70: JO Jb -71: JNO Jb -72: JB/JNAE/JC Jb -73: JNB/JAE/JNC Jb -74: JZ/JE Jb -75: JNZ/JNE Jb -76: JBE/JNA Jb -77: JNBE/JA Jb -78: JS Jb -79: JNS Jb -7a: JP/JPE Jb -7b: JNP/JPO Jb -7c: JL/JNGE Jb -7d: JNL/JGE Jb -7e: JLE/JNG Jb -7f: JNLE/JG Jb +70: JO Jb (!REX2) +71: JNO Jb (!REX2) +72: JB/JNAE/JC Jb (!REX2) +73: JNB/JAE/JNC Jb (!REX2) +74: JZ/JE Jb (!REX2) +75: JNZ/JNE Jb (!REX2) +76: JBE/JNA Jb (!REX2) +77: JNBE/JA Jb (!REX2) +78: JS Jb (!REX2) +79: JNS Jb (!REX2) +7a: JP/JPE Jb (!REX2) +7b: JNP/JPO Jb (!REX2) +7c: JL/JNGE Jb (!REX2) +7d: JNL/JGE Jb (!REX2) +7e: JLE/JNG Jb (!REX2) +7f: JNLE/JG Jb (!REX2) # 0x80 - 0x8f 80: Grp1 Eb,Ib (1A) 81: Grp1 Ev,Iz (1A) @@ -208,24 +213,24 @@ AVXcode: 9e: SAHF 9f: LAHF # 0xa0 - 0xaf -a0: MOV AL,Ob -a1: MOV rAX,Ov -a2: MOV Ob,AL -a3: MOV Ov,rAX -a4: MOVS/B Yb,Xb -a5: MOVS/W/D/Q Yv,Xv -a6: CMPS/B Xb,Yb -a7: CMPS/W/D Xv,Yv -a8: TEST AL,Ib -a9: TEST rAX,Iz -aa: STOS/B Yb,AL -ab: STOS/W/D/Q Yv,rAX -ac: LODS/B AL,Xb -ad: LODS/W/D/Q rAX,Xv -ae: SCAS/B AL,Yb +a0: MOV AL,Ob (!REX2) +a1: MOV rAX,Ov (!REX2) | JMPABS O (REX2),(o64) +a2: MOV Ob,AL (!REX2) +a3: MOV Ov,rAX (!REX2) +a4: MOVS/B Yb,Xb (!REX2) +a5: MOVS/W/D/Q Yv,Xv (!REX2) +a6: CMPS/B Xb,Yb (!REX2) +a7: CMPS/W/D Xv,Yv (!REX2) +a8: TEST AL,Ib (!REX2) +a9: TEST rAX,Iz (!REX2) +aa: STOS/B Yb,AL (!REX2) +ab: STOS/W/D/Q Yv,rAX (!REX2) +ac: LODS/B AL,Xb (!REX2) +ad: LODS/W/D/Q rAX,Xv (!REX2) +ae: SCAS/B AL,Yb (!REX2) # Note: The May 2011 Intel manual shows Xv for the second parameter of the # next instruction but Yv is correct -af: SCAS/W/D/Q rAX,Yv +af: SCAS/W/D/Q rAX,Yv (!REX2) # 0xb0 - 0xbf b0: MOV AL/R8L,Ib b1: MOV CL/R9L,Ib @@ -266,7 +271,7 @@ d1: Grp2 Ev,1 (1A) d2: Grp2 Eb,CL (1A) d3: Grp2 Ev,CL (1A) d4: AAM Ib (i64) -d5: AAD Ib (i64) +d5: AAD Ib (i64) | REX2 (Prefix),(o64) d6: d7: XLAT/XLATB d8: ESC @@ -281,26 +286,26 @@ df: ESC # Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix # in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation # to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD. -e0: LOOPNE/LOOPNZ Jb (f64) -e1: LOOPE/LOOPZ Jb (f64) -e2: LOOP Jb (f64) -e3: JrCXZ Jb (f64) -e4: IN AL,Ib -e5: IN eAX,Ib -e6: OUT Ib,AL -e7: OUT Ib,eAX +e0: LOOPNE/LOOPNZ Jb (f64) (!REX2) +e1: LOOPE/LOOPZ Jb (f64) (!REX2) +e2: LOOP Jb (f64) (!REX2) +e3: JrCXZ Jb (f64) (!REX2) +e4: IN AL,Ib (!REX2) +e5: IN eAX,Ib (!REX2) +e6: OUT Ib,AL (!REX2) +e7: OUT Ib,eAX (!REX2) # With 0x66 prefix in 64-bit mode, for AMD CPUs immediate offset # in "near" jumps and calls is 16-bit. For CALL, # push of return address is 16-bit wide, RSP is decremented by 2 # but is not truncated to 16 bits, unlike RIP. 
-e8: CALL Jz (f64) -e9: JMP-near Jz (f64) -ea: JMP-far Ap (i64) -eb: JMP-short Jb (f64) -ec: IN AL,DX -ed: IN eAX,DX -ee: OUT DX,AL -ef: OUT DX,eAX +e8: CALL Jz (f64) (!REX2) +e9: JMP-near Jz (f64) (!REX2) +ea: JMP-far Ap (i64) (!REX2) +eb: JMP-short Jb (f64) (!REX2) +ec: IN AL,DX (!REX2) +ed: IN eAX,DX (!REX2) +ee: OUT DX,AL (!REX2) +ef: OUT DX,eAX (!REX2) # 0xf0 - 0xff f0: LOCK (Prefix) f1: @@ -386,14 +391,14 @@ AVXcode: 1 2e: vucomiss Vss,Wss (v1) | vucomisd Vsd,Wsd (66),(v1) 2f: vcomiss Vss,Wss (v1) | vcomisd Vsd,Wsd (66),(v1) # 0x0f 0x30-0x3f -30: WRMSR -31: RDTSC -32: RDMSR -33: RDPMC -34: SYSENTER -35: SYSEXIT +30: WRMSR (!REX2) +31: RDTSC (!REX2) +32: RDMSR (!REX2) +33: RDPMC (!REX2) +34: SYSENTER (!REX2) +35: SYSEXIT (!REX2) 36: -37: GETSEC +37: GETSEC (!REX2) 38: escape # 3-byte escape 1 39: 3a: escape # 3-byte escape 2 @@ -473,22 +478,22 @@ AVXcode: 1 7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqa32/64 Wx,Vx (66),(evo) | vmovdqu Wx,Vx (F3) | vmovdqu32/64 Wx,Vx (F3),(evo) | vmovdqu8/16 Wx,Vx (F2),(ev) # 0x0f 0x80-0x8f # Note: "forced64" is Intel CPU behavior (see comment about CALL insn). -80: JO Jz (f64) -81: JNO Jz (f64) -82: JB/JC/JNAE Jz (f64) -83: JAE/JNB/JNC Jz (f64) -84: JE/JZ Jz (f64) -85: JNE/JNZ Jz (f64) -86: JBE/JNA Jz (f64) -87: JA/JNBE Jz (f64) -88: JS Jz (f64) -89: JNS Jz (f64) -8a: JP/JPE Jz (f64) -8b: JNP/JPO Jz (f64) -8c: JL/JNGE Jz (f64) -8d: JNL/JGE Jz (f64) -8e: JLE/JNG Jz (f64) -8f: JNLE/JG Jz (f64) +80: JO Jz (f64) (!REX2) +81: JNO Jz (f64) (!REX2) +82: JB/JC/JNAE Jz (f64) (!REX2) +83: JAE/JNB/JNC Jz (f64) (!REX2) +84: JE/JZ Jz (f64) (!REX2) +85: JNE/JNZ Jz (f64) (!REX2) +86: JBE/JNA Jz (f64) (!REX2) +87: JA/JNBE Jz (f64) (!REX2) +88: JS Jz (f64) (!REX2) +89: JNS Jz (f64) (!REX2) +8a: JP/JPE Jz (f64) (!REX2) +8b: JNP/JPO Jz (f64) (!REX2) +8c: JL/JNGE Jz (f64) (!REX2) +8d: JNL/JGE Jz (f64) (!REX2) +8e: JLE/JNG Jz (f64) (!REX2) +8f: JNLE/JG Jz (f64) (!REX2) # 0x0f 0x90-0x9f 90: SETO Eb | kmovw/q Vk,Wk | kmovb/d Vk,Wk (66) 91: SETNO Eb | kmovw/q Mv,Vk | kmovb/d Mv,Vk (66) @@ -698,17 +703,17 @@ AVXcode: 2 4d: vrcp14ss/d Vsd,Hpd,Wsd (66),(ev) 4e: vrsqrt14ps/d Vpd,Wpd (66),(ev) 4f: vrsqrt14ss/d Vsd,Hsd,Wsd (66),(ev) -50: vpdpbusd Vx,Hx,Wx (66),(ev) -51: vpdpbusds Vx,Hx,Wx (66),(ev) -52: vdpbf16ps Vx,Hx,Wx (F3),(ev) | vpdpwssd Vx,Hx,Wx (66),(ev) | vp4dpwssd Vdqq,Hdqq,Wdq (F2),(ev) -53: vpdpwssds Vx,Hx,Wx (66),(ev) | vp4dpwssds Vdqq,Hdqq,Wdq (F2),(ev) +50: vpdpbusd Vx,Hx,Wx (66) | vpdpbssd Vx,Hx,Wx (F2),(v) | vpdpbsud Vx,Hx,Wx (F3),(v) | vpdpbuud Vx,Hx,Wx (v) +51: vpdpbusds Vx,Hx,Wx (66) | vpdpbssds Vx,Hx,Wx (F2),(v) | vpdpbsuds Vx,Hx,Wx (F3),(v) | vpdpbuuds Vx,Hx,Wx (v) +52: vdpbf16ps Vx,Hx,Wx (F3),(ev) | vpdpwssd Vx,Hx,Wx (66) | vp4dpwssd Vdqq,Hdqq,Wdq (F2),(ev) +53: vpdpwssds Vx,Hx,Wx (66) | vp4dpwssds Vdqq,Hdqq,Wdq (F2),(ev) 54: vpopcntb/w Vx,Wx (66),(ev) 55: vpopcntd/q Vx,Wx (66),(ev) 58: vpbroadcastd Vx,Wx (66),(v) 59: vpbroadcastq Vx,Wx (66),(v) | vbroadcasti32x2 Vx,Wx (66),(evo) 5a: vbroadcasti128 Vqq,Mdq (66),(v) | vbroadcasti32x4/64x2 Vx,Wx (66),(evo) 5b: vbroadcasti32x8/64x4 Vqq,Mdq (66),(ev) -5c: TDPBF16PS Vt,Wt,Ht (F3),(v1) +5c: TDPBF16PS Vt,Wt,Ht (F3),(v1) | TDPFP16PS Vt,Wt,Ht (F2),(v1),(o64) # Skip 0x5d 5e: TDPBSSD Vt,Wt,Ht (F2),(v1) | TDPBSUD Vt,Wt,Ht (F3),(v1) | TDPBUSD Vt,Wt,Ht (66),(v1) | TDPBUUD Vt,Wt,Ht (v1) # Skip 0x5f-0x61 @@ -718,10 +723,12 @@ AVXcode: 2 65: vblendmps/d Vx,Hx,Wx (66),(ev) 66: vpblendmb/w Vx,Hx,Wx (66),(ev) 68: vp2intersectd/q Kx,Hx,Wx (F2),(ev) -# Skip 0x69-0x6f +# Skip 0x69-0x6b +6c: TCMMIMFP16PS Vt,Wt,Ht (66),(v1),(o64) | 
TCMMRLFP16PS Vt,Wt,Ht (v1),(o64) +# Skip 0x6d-0x6f 70: vpshldvw Vx,Hx,Wx (66),(ev) 71: vpshldvd/q Vx,Hx,Wx (66),(ev) -72: vcvtne2ps2bf16 Vx,Hx,Wx (F2),(ev) | vcvtneps2bf16 Vx,Wx (F3),(ev) | vpshrdvw Vx,Hx,Wx (66),(ev) +72: vcvtne2ps2bf16 Vx,Hx,Wx (F2),(ev) | vcvtneps2bf16 Vx,Wx (F3) | vpshrdvw Vx,Hx,Wx (66),(ev) 73: vpshrdvd/q Vx,Hx,Wx (66),(ev) 75: vpermi2b/w Vx,Hx,Wx (66),(ev) 76: vpermi2d/q Vx,Hx,Wx (66),(ev) @@ -777,8 +784,10 @@ ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v) ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1) ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v) af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1) -b4: vpmadd52luq Vx,Hx,Wx (66),(ev) -b5: vpmadd52huq Vx,Hx,Wx (66),(ev) +b0: vcvtneebf162ps Vx,Mx (F3),(!11B),(v) | vcvtneeph2ps Vx,Mx (66),(!11B),(v) | vcvtneobf162ps Vx,Mx (F2),(!11B),(v) | vcvtneoph2ps Vx,Mx (!11B),(v) +b1: vbcstnebf162ps Vx,Mw (F3),(!11B),(v) | vbcstnesh2ps Vx,Mw (66),(!11B),(v) +b4: vpmadd52luq Vx,Hx,Wx (66) +b5: vpmadd52huq Vx,Hx,Wx (66) b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v) b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v) b8: vfmadd231ps/d Vx,Hx,Wx (66),(v) @@ -796,15 +805,35 @@ c7: Grp19 (1A) c8: sha1nexte Vdq,Wdq | vexp2ps/d Vx,Wx (66),(ev) c9: sha1msg1 Vdq,Wdq ca: sha1msg2 Vdq,Wdq | vrcp28ps/d Vx,Wx (66),(ev) -cb: sha256rnds2 Vdq,Wdq | vrcp28ss/d Vx,Hx,Wx (66),(ev) -cc: sha256msg1 Vdq,Wdq | vrsqrt28ps/d Vx,Wx (66),(ev) -cd: sha256msg2 Vdq,Wdq | vrsqrt28ss/d Vx,Hx,Wx (66),(ev) +cb: sha256rnds2 Vdq,Wdq | vrcp28ss/d Vx,Hx,Wx (66),(ev) | vsha512rnds2 Vqq,Hqq,Udq (F2),(11B),(v) +cc: sha256msg1 Vdq,Wdq | vrsqrt28ps/d Vx,Wx (66),(ev) | vsha512msg1 Vqq,Udq (F2),(11B),(v) +cd: sha256msg2 Vdq,Wdq | vrsqrt28ss/d Vx,Hx,Wx (66),(ev) | vsha512msg2 Vqq,Uqq (F2),(11B),(v) cf: vgf2p8mulb Vx,Wx (66) +d2: vpdpwsud Vx,Hx,Wx (F3),(v) | vpdpwusd Vx,Hx,Wx (66),(v) | vpdpwuud Vx,Hx,Wx (v) +d3: vpdpwsuds Vx,Hx,Wx (F3),(v) | vpdpwusds Vx,Hx,Wx (66),(v) | vpdpwuuds Vx,Hx,Wx (v) +d8: AESENCWIDE128KL Qpi (F3),(000),(00B) | AESENCWIDE256KL Qpi (F3),(000),(10B) | AESDECWIDE128KL Qpi (F3),(000),(01B) | AESDECWIDE256KL Qpi (F3),(000),(11B) +da: vsm3msg1 Vdq,Hdq,Udq (v1) | vsm3msg2 Vdq,Hdq,Udq (66),(v1) | vsm4key4 Vx,Hx,Wx (F3),(v) | vsm4rnds4 Vx,Hx,Wx (F2),(v) db: VAESIMC Vdq,Wdq (66),(v1) -dc: vaesenc Vx,Hx,Wx (66) -dd: vaesenclast Vx,Hx,Wx (66) -de: vaesdec Vx,Hx,Wx (66) -df: vaesdeclast Vx,Hx,Wx (66) +dc: vaesenc Vx,Hx,Wx (66) | LOADIWKEY Vx,Hx (F3) | AESENC128KL Vpd,Qpi (F3) +dd: vaesenclast Vx,Hx,Wx (66) | AESDEC128KL Vpd,Qpi (F3) +de: vaesdec Vx,Hx,Wx (66) | AESENC256KL Vpd,Qpi (F3) +df: vaesdeclast Vx,Hx,Wx (66) | AESDEC256KL Vpd,Qpi (F3) +e0: CMPOXADD My,Gy,By (66),(v1),(o64) +e1: CMPNOXADD My,Gy,By (66),(v1),(o64) +e2: CMPBXADD My,Gy,By (66),(v1),(o64) +e3: CMPNBXADD My,Gy,By (66),(v1),(o64) +e4: CMPZXADD My,Gy,By (66),(v1),(o64) +e5: CMPNZXADD My,Gy,By (66),(v1),(o64) +e6: CMPBEXADD My,Gy,By (66),(v1),(o64) +e7: CMPNBEXADD My,Gy,By (66),(v1),(o64) +e8: CMPSXADD My,Gy,By (66),(v1),(o64) +e9: CMPNSXADD My,Gy,By (66),(v1),(o64) +ea: CMPPXADD My,Gy,By (66),(v1),(o64) +eb: CMPNPXADD My,Gy,By (66),(v1),(o64) +ec: CMPLXADD My,Gy,By (66),(v1),(o64) +ed: CMPNLXADD My,Gy,By (66),(v1),(o64) +ee: CMPLEXADD My,Gy,By (66),(v1),(o64) +ef: CMPNLEXADD My,Gy,By (66),(v1),(o64) f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2) f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2) f2: ANDN Gy,By,Ey (v) @@ -812,8 +841,11 @@ f3: Grp17 (1A) f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) | WRUSSD/Q My,Gy (66) f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX 
By,Gy,rDX,Ey (F2),(v) | WRSSD/Q My,Gy f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) -f8: MOVDIR64B Gv,Mdqq (66) | ENQCMD Gv,Mdqq (F2) | ENQCMDS Gv,Mdqq (F3) +f8: MOVDIR64B Gv,Mdqq (66) | ENQCMD Gv,Mdqq (F2) | ENQCMDS Gv,Mdqq (F3) | URDMSR Rq,Gq (F2),(11B) | UWRMSR Gq,Rq (F3),(11B) f9: MOVDIRI My,Gy +fa: ENCODEKEY128 Ew,Ew (F3) +fb: ENCODEKEY256 Ew,Ew (F3) +fc: AADD My,Gy | AAND My,Gy (66) | AOR My,Gy (F2) | AXOR My,Gy (F3) EndTable Table: 3-byte opcode 2 (0x0f 0x3a) @@ -893,10 +925,103 @@ c2: vcmpph Vx,Hx,Wx,Ib (ev) | vcmpsh Vx,Hx,Wx,Ib (F3),(ev) cc: sha1rnds4 Vdq,Wdq,Ib ce: vgf2p8affineqb Vx,Wx,Ib (66) cf: vgf2p8affineinvqb Vx,Wx,Ib (66) +de: vsm3rnds2 Vdq,Hdq,Wdq,Ib (66),(v1) df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1) f0: RORX Gy,Ey,Ib (F2),(v) | HRESET Gv,Ib (F3),(000),(11B) EndTable +Table: EVEX map 4 +Referrer: +AVXcode: 4 +00: ADD Eb,Gb (ev) +01: ADD Ev,Gv (es) | ADD Ev,Gv (66),(es) +02: ADD Gb,Eb (ev) +03: ADD Gv,Ev (es) | ADD Gv,Ev (66),(es) +08: OR Eb,Gb (ev) +09: OR Ev,Gv (es) | OR Ev,Gv (66),(es) +0a: OR Gb,Eb (ev) +0b: OR Gv,Ev (es) | OR Gv,Ev (66),(es) +10: ADC Eb,Gb (ev) +11: ADC Ev,Gv (es) | ADC Ev,Gv (66),(es) +12: ADC Gb,Eb (ev) +13: ADC Gv,Ev (es) | ADC Gv,Ev (66),(es) +18: SBB Eb,Gb (ev) +19: SBB Ev,Gv (es) | SBB Ev,Gv (66),(es) +1a: SBB Gb,Eb (ev) +1b: SBB Gv,Ev (es) | SBB Gv,Ev (66),(es) +20: AND Eb,Gb (ev) +21: AND Ev,Gv (es) | AND Ev,Gv (66),(es) +22: AND Gb,Eb (ev) +23: AND Gv,Ev (es) | AND Gv,Ev (66),(es) +24: SHLD Ev,Gv,Ib (es) | SHLD Ev,Gv,Ib (66),(es) +28: SUB Eb,Gb (ev) +29: SUB Ev,Gv (es) | SUB Ev,Gv (66),(es) +2a: SUB Gb,Eb (ev) +2b: SUB Gv,Ev (es) | SUB Gv,Ev (66),(es) +2c: SHRD Ev,Gv,Ib (es) | SHRD Ev,Gv,Ib (66),(es) +30: XOR Eb,Gb (ev) +31: XOR Ev,Gv (es) | XOR Ev,Gv (66),(es) +32: XOR Gb,Eb (ev) +33: XOR Gv,Ev (es) | XOR Gv,Ev (66),(es) +# CCMPSCC instructions are: CCOMB, CCOMBE, CCOMF, CCOML, CCOMLE, CCOMNB, CCOMNBE, CCOMNL, CCOMNLE, +# CCOMNO, CCOMNS, CCOMNZ, CCOMO, CCOMS, CCOMT, CCOMZ +38: CCMPSCC Eb,Gb (ev) +39: CCMPSCC Ev,Gv (es) | CCMPSCC Ev,Gv (66),(es) +3a: CCMPSCC Gv,Ev (ev) +3b: CCMPSCC Gv,Ev (es) | CCMPSCC Gv,Ev (66),(es) +40: CMOVO Gv,Ev (es) | CMOVO Gv,Ev (66),(es) | CFCMOVO Ev,Ev (es) | CFCMOVO Ev,Ev (66),(es) | SETO Eb (F2),(ev) +41: CMOVNO Gv,Ev (es) | CMOVNO Gv,Ev (66),(es) | CFCMOVNO Ev,Ev (es) | CFCMOVNO Ev,Ev (66),(es) | SETNO Eb (F2),(ev) +42: CMOVB Gv,Ev (es) | CMOVB Gv,Ev (66),(es) | CFCMOVB Ev,Ev (es) | CFCMOVB Ev,Ev (66),(es) | SETB Eb (F2),(ev) +43: CMOVNB Gv,Ev (es) | CMOVNB Gv,Ev (66),(es) | CFCMOVNB Ev,Ev (es) | CFCMOVNB Ev,Ev (66),(es) | SETNB Eb (F2),(ev) +44: CMOVZ Gv,Ev (es) | CMOVZ Gv,Ev (66),(es) | CFCMOVZ Ev,Ev (es) | CFCMOVZ Ev,Ev (66),(es) | SETZ Eb (F2),(ev) +45: CMOVNZ Gv,Ev (es) | CMOVNZ Gv,Ev (66),(es) | CFCMOVNZ Ev,Ev (es) | CFCMOVNZ Ev,Ev (66),(es) | SETNZ Eb (F2),(ev) +46: CMOVBE Gv,Ev (es) | CMOVBE Gv,Ev (66),(es) | CFCMOVBE Ev,Ev (es) | CFCMOVBE Ev,Ev (66),(es) | SETBE Eb (F2),(ev) +47: CMOVNBE Gv,Ev (es) | CMOVNBE Gv,Ev (66),(es) | CFCMOVNBE Ev,Ev (es) | CFCMOVNBE Ev,Ev (66),(es) | SETNBE Eb (F2),(ev) +48: CMOVS Gv,Ev (es) | CMOVS Gv,Ev (66),(es) | CFCMOVS Ev,Ev (es) | CFCMOVS Ev,Ev (66),(es) | SETS Eb (F2),(ev) +49: CMOVNS Gv,Ev (es) | CMOVNS Gv,Ev (66),(es) | CFCMOVNS Ev,Ev (es) | CFCMOVNS Ev,Ev (66),(es) | SETNS Eb (F2),(ev) +4a: CMOVP Gv,Ev (es) | CMOVP Gv,Ev (66),(es) | CFCMOVP Ev,Ev (es) | CFCMOVP Ev,Ev (66),(es) | SETP Eb (F2),(ev) +4b: CMOVNP Gv,Ev (es) | CMOVNP Gv,Ev (66),(es) | CFCMOVNP Ev,Ev (es) | CFCMOVNP Ev,Ev (66),(es) | SETNP Eb (F2),(ev) +4c: 
CMOVL Gv,Ev (es) | CMOVL Gv,Ev (66),(es) | CFCMOVL Ev,Ev (es) | CFCMOVL Ev,Ev (66),(es) | SETL Eb (F2),(ev) +4d: CMOVNL Gv,Ev (es) | CMOVNL Gv,Ev (66),(es) | CFCMOVNL Ev,Ev (es) | CFCMOVNL Ev,Ev (66),(es) | SETNL Eb (F2),(ev) +4e: CMOVLE Gv,Ev (es) | CMOVLE Gv,Ev (66),(es) | CFCMOVLE Ev,Ev (es) | CFCMOVLE Ev,Ev (66),(es) | SETLE Eb (F2),(ev) +4f: CMOVNLE Gv,Ev (es) | CMOVNLE Gv,Ev (66),(es) | CFCMOVNLE Ev,Ev (es) | CFCMOVNLE Ev,Ev (66),(es) | SETNLE Eb (F2),(ev) +60: MOVBE Gv,Ev (es) | MOVBE Gv,Ev (66),(es) +61: MOVBE Ev,Gv (es) | MOVBE Ev,Gv (66),(es) +65: WRUSSD Md,Gd (66),(ev) | WRUSSQ Mq,Gq (66),(ev) +66: ADCX Gy,Ey (66),(ev) | ADOX Gy,Ey (F3),(ev) | WRSSD Md,Gd (ev) | WRSSQ Mq,Gq (66),(ev) +69: IMUL Gv,Ev,Iz (es) | IMUL Gv,Ev,Iz (66),(es) +6b: IMUL Gv,Ev,Ib (es) | IMUL Gv,Ev,Ib (66),(es) +80: Grp1 Eb,Ib (1A),(ev) +81: Grp1 Ev,Iz (1A),(es) +83: Grp1 Ev,Ib (1A),(es) +# CTESTSCC instructions are: CTESTB, CTESTBE, CTESTF, CTESTL, CTESTLE, CTESTNB, CTESTNBE, CTESTNL, +# CTESTNLE, CTESTNO, CTESTNS, CTESTNZ, CTESTO, CTESTS, CTESTT, CTESTZ +84: CTESTSCC (ev) +85: CTESTSCC (es) | CTESTSCC (66),(es) +88: POPCNT Gv,Ev (es) | POPCNT Gv,Ev (66),(es) +8f: POP2 Bq,Rq (000),(11B),(ev) +a5: SHLD Ev,Gv,CL (es) | SHLD Ev,Gv,CL (66),(es) +ad: SHRD Ev,Gv,CL (es) | SHRD Ev,Gv,CL (66),(es) +af: IMUL Gv,Ev (es) | IMUL Gv,Ev (66),(es) +c0: Grp2 Eb,Ib (1A),(ev) +c1: Grp2 Ev,Ib (1A),(es) +d0: Grp2 Eb,1 (1A),(ev) +d1: Grp2 Ev,1 (1A),(es) +d2: Grp2 Eb,CL (1A),(ev) +d3: Grp2 Ev,CL (1A),(es) +f0: CRC32 Gy,Eb (es) | INVEPT Gq,Mdq (F3),(ev) +f1: CRC32 Gy,Ey (es) | CRC32 Gy,Ey (66),(es) | INVVPID Gy,Mdq (F3),(ev) +f2: INVPCID Gy,Mdq (F3),(ev) +f4: TZCNT Gv,Ev (es) | TZCNT Gv,Ev (66),(es) +f5: LZCNT Gv,Ev (es) | LZCNT Gv,Ev (66),(es) +f6: Grp3_1 Eb (1A),(ev) +f7: Grp3_2 Ev (1A),(es) +f8: MOVDIR64B Gv,Mdqq (66),(ev) | ENQCMD Gv,Mdqq (F2),(ev) | ENQCMDS Gv,Mdqq (F3),(ev) | URDMSR Rq,Gq (F2),(11B),(ev) | UWRMSR Gq,Rq (F3),(11B),(ev) +f9: MOVDIRI My,Gy (ev) +fe: Grp4 (1A),(ev) +ff: Grp5 (1A),(es) | PUSH2 Bq,Rq (110),(11B),(ev) +EndTable + Table: EVEX map 5 Referrer: AVXcode: 5 @@ -975,6 +1100,12 @@ d6: vfcmulcph Vx,Hx,Wx (F2),(ev) | vfmulcph Vx,Hx,Wx (F3),(ev) d7: vfcmulcsh Vx,Hx,Wx (F2),(ev) | vfmulcsh Vx,Hx,Wx (F3),(ev) EndTable +Table: VEX map 7 +Referrer: +AVXcode: 7 +f8: URDMSR Rq,Id (F2),(v1),(11B) | UWRMSR Id,Rq (F3),(v1),(11B) +EndTable + GrpTable: Grp1 0: ADD 1: OR @@ -1051,8 +1182,8 @@ GrpTable: Grp6 EndTable GrpTable: Grp7 -0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) -1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) +0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B) | RDMSRLIST (F2),(110),(11B) | WRMSRLIST (F3),(110),(11B) | PBNDKB (111),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) | ERETU (F3),(010),(11B) | ERETS (F2),(010),(11B) 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B) 3: LIDT Ms 4: SMSW Mw/Rv @@ -1137,6 +1268,8 @@ GrpTable: Grp16 1: prefetch T0 2: prefetch T1 3: prefetch T2 +6: prefetch IT1 +7: prefetch IT0 EndTable GrpTable: Grp17 |