diff options
Diffstat (limited to 'arch/x86')
145 files changed, 1912 insertions, 1064 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 222855cc0158..58eae28c3dd6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1503,7 +1503,7 @@ config X86_5LEVEL config X86_DIRECT_GBPAGES def_bool y - depends on X86_64 && !DEBUG_PAGEALLOC + depends on X86_64 ---help--- Certain kernel features effectively disable kernel linear 1 GB mappings (even if the CPU otherwise diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 71c92db47c41..bf9cd83de777 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -171,7 +171,7 @@ config HAVE_MMIOTRACE_SUPPORT config X86_DECODER_SELFTEST bool "x86 instruction decoder selftest" - depends on DEBUG_KERNEL && KPROBES + depends on DEBUG_KERNEL && INSTRUCTION_DECODER depends on !COMPILE_TEST ---help--- Perform x86 instruction decoder selftests at build time. diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 56e748a7679f..94df0868804b 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -38,6 +38,7 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -DDISABLE_BRANCH_PROFILING \ REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -Wno-address-of-packed-member) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4)) export REALMODE_CFLAGS diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 19eca14b49a0..ca866f1cca2e 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -28,8 +28,6 @@ #include "cpuflags.h" /* Useful macros */ -#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) - #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) extern struct setup_header hdr; diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 37380c0d5999..5e30eaaf8576 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -140,7 +140,7 @@ ENTRY(startup_32) /* * Jump to the relocated address. */ - leal relocated(%ebx), %eax + leal .Lrelocated(%ebx), %eax jmp *%eax ENDPROC(startup_32) @@ -209,7 +209,7 @@ ENDPROC(efi32_stub_entry) #endif .text -relocated: +.Lrelocated: /* * Clear BSS (stack is currently empty) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 6233ae35d0d9..d98cd483377e 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -87,7 +87,7 @@ ENTRY(startup_32) call verify_cpu testl %eax, %eax - jnz no_longmode + jnz .Lno_longmode /* * Compute the delta between where we were compiled to run at @@ -322,7 +322,7 @@ ENTRY(startup_64) 1: popq %rdi subq $1b, %rdi - call adjust_got + call .Ladjust_got /* * At this point we are in long mode with 4-level paging enabled, @@ -421,7 +421,7 @@ trampoline_return: /* The new adjustment is the relocation address */ movq %rbx, %rdi - call adjust_got + call .Ladjust_got /* * Copy the compressed kernel to the end of our buffer @@ -440,7 +440,7 @@ trampoline_return: /* * Jump to the relocated address. */ - leaq relocated(%rbx), %rax + leaq .Lrelocated(%rbx), %rax jmp *%rax #ifdef CONFIG_EFI_STUB @@ -511,7 +511,7 @@ ENDPROC(efi64_stub_entry) #endif .text -relocated: +.Lrelocated: /* * Clear BSS (stack is currently empty) @@ -548,7 +548,7 @@ relocated: * first time we touch GOT). * RDI is the new adjustment to apply. */ -adjust_got: +.Ladjust_got: /* Walk through the GOT adding the address to the entries */ leaq _got(%rip), %rdx leaq _egot(%rip), %rcx @@ -622,7 +622,7 @@ ENTRY(trampoline_32bit_src) movl %eax, %cr4 /* Calculate address of paging_enabled() once we are executing in the trampoline */ - leal paging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax + leal .Lpaging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax /* Prepare the stack for far return to Long Mode */ pushl $__KERNEL_CS @@ -635,7 +635,7 @@ ENTRY(trampoline_32bit_src) lret .code64 -paging_enabled: +.Lpaging_enabled: /* Return from the trampoline */ jmp *%rdi @@ -647,7 +647,7 @@ paging_enabled: .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE .code32 -no_longmode: +.Lno_longmode: /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */ 1: hlt diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 5f2d03067ae5..c8862696a47b 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -72,6 +72,8 @@ static unsigned long find_trampoline_placement(void) /* Find the first usable memory region under bios_start. */ for (i = boot_params->e820_entries - 1; i >= 0; i--) { + unsigned long new = bios_start; + entry = &boot_params->e820_table[i]; /* Skip all entries above bios_start. */ @@ -84,15 +86,20 @@ static unsigned long find_trampoline_placement(void) /* Adjust bios_start to the end of the entry if needed. */ if (bios_start > entry->addr + entry->size) - bios_start = entry->addr + entry->size; + new = entry->addr + entry->size; /* Keep bios_start page-aligned. */ - bios_start = round_down(bios_start, PAGE_SIZE); + new = round_down(new, PAGE_SIZE); /* Skip the entry if it's too small. */ - if (bios_start - TRAMPOLINE_32BIT_SIZE < entry->addr) + if (new - TRAMPOLINE_32BIT_SIZE < entry->addr) continue; + /* Protect against underflow. */ + if (new - TRAMPOLINE_32BIT_SIZE > bios_start) + break; + + bios_start = new; break; } diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 996df3d586f0..e3add857c2c9 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -10,6 +10,7 @@ /* * Main module for the real-mode kernel code */ +#include <linux/build_bug.h> #include "boot.h" #include "string.h" diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 401e30ca0a75..8272a4492844 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -37,6 +37,14 @@ int memcmp(const void *s1, const void *s2, size_t len) return diff; } +/* + * Clang may lower `memcmp == 0` to `bcmp == 0`. + */ +int bcmp(const void *s1, const void *s2, size_t len) +{ + return memcmp(s1, s2, len); +} + int strcmp(const char *str1, const char *str2) { const unsigned char *s1 = (const unsigned char *)str1; diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 830bd984182b..515c0ceeb4a3 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -314,6 +314,23 @@ For 32-bit we have the following conventions - kernel is built with #endif +/* + * Mitigate Spectre v1 for conditional swapgs code paths. + * + * FENCE_SWAPGS_USER_ENTRY is used in the user entry swapgs code path, to + * prevent a speculative swapgs when coming from kernel space. + * + * FENCE_SWAPGS_KERNEL_ENTRY is used in the kernel entry non-swapgs code path, + * to prevent the swapgs from getting speculatively skipped when coming from + * user space. + */ +.macro FENCE_SWAPGS_USER_ENTRY + ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_USER +.endm +.macro FENCE_SWAPGS_KERNEL_ENTRY + ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_KERNEL +.endm + .macro STACKLEAK_ERASE_NOCLOBBER #ifdef CONFIG_GCC_PLUGIN_STACKLEAK PUSH_AND_CLEAR_REGS diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 536b574b6161..3f8e22615812 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -285,15 +285,16 @@ __visible void do_syscall_64(unsigned long nr, struct pt_regs *regs) if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) nr = syscall_trace_enter(regs); - /* - * NB: Native and x32 syscalls are dispatched from the same - * table. The only functional difference is the x32 bit in - * regs->orig_ax, which changes the behavior of some syscalls. - */ - nr &= __SYSCALL_MASK; if (likely(nr < NR_syscalls)) { nr = array_index_nospec(nr, NR_syscalls); regs->ax = sys_call_table[nr](regs); +#ifdef CONFIG_X86_X32_ABI + } else if (likely((nr & __X32_SYSCALL_BIT) && + (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) { + nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT, + X32_NR_syscalls); + regs->ax = x32_sys_call_table[nr](regs); +#endif } syscall_return_slowpath(regs); diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 2bb986f305ac..f83ca5aa8b77 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -63,7 +63,7 @@ * enough to patch inline, increasing performance. */ -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION # define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else # define preempt_stop(clobbers) @@ -1084,7 +1084,7 @@ restore_all: INTERRUPT_RETURN restore_all_kernel: -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION DISABLE_INTERRUPTS(CLBR_ANY) cmpl $0, PER_CPU_VAR(__preempt_count) jnz .Lno_preempt @@ -1364,7 +1364,7 @@ ENTRY(xen_hypervisor_callback) ENTRY(xen_do_upcall) 1: mov %esp, %eax call xen_evtchn_do_upcall -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION call xen_maybe_preempt_hcall #endif jmp ret_from_intr @@ -1443,8 +1443,12 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR, ENTRY(page_fault) ASM_CLAC - pushl $0; /* %gs's slot on the stack */ + pushl $do_page_fault + jmp common_exception_read_cr2 +END(page_fault) +common_exception_read_cr2: + /* the function address is in %gs's slot on the stack */ SAVE_ALL switch_stacks=1 skip_gs=1 ENCODE_FRAME_POINTER @@ -1452,6 +1456,7 @@ ENTRY(page_fault) /* fixup %gs */ GS_TO_REG %ecx + movl PT_GS(%esp), %edi REG_TO_PTGS %ecx SET_KERNEL_GS %ecx @@ -1463,9 +1468,9 @@ ENTRY(page_fault) TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer - call do_page_fault + CALL_NOSPEC %edi jmp ret_from_exception -END(page_fault) +END(common_exception_read_cr2) common_exception: /* the function address is in %gs's slot on the stack */ @@ -1595,7 +1600,7 @@ END(general_protection) ENTRY(async_page_fault) ASM_CLAC pushl $do_async_page_fault - jmp common_exception + jmp common_exception_read_cr2 END(async_page_fault) #endif diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3f5a978a02a7..b7c3ea4cb19d 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -519,7 +519,7 @@ ENTRY(interrupt_entry) testb $3, CS-ORIG_RAX+8(%rsp) jz 1f SWAPGS - + FENCE_SWAPGS_USER_ENTRY /* * Switch to the thread stack. The IRET frame and orig_ax are * on the stack, as well as the return address. RDI..R12 are @@ -549,8 +549,10 @@ ENTRY(interrupt_entry) UNWIND_HINT_FUNC movq (%rdi), %rdi + jmp 2f 1: - + FENCE_SWAPGS_KERNEL_ENTRY +2: PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 @@ -662,7 +664,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) /* Returning to kernel space */ retint_kernel: -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* Interrupts are off */ /* Check if we need preemption */ btl $9, EFLAGS(%rsp) /* were interrupts off? */ @@ -1056,10 +1058,10 @@ ENTRY(native_load_gs_index) ENDPROC(native_load_gs_index) EXPORT_SYMBOL(native_load_gs_index) - _ASM_EXTABLE(.Lgs_change, bad_gs) + _ASM_EXTABLE(.Lgs_change, .Lbad_gs) .section .fixup, "ax" /* running with kernelgs */ -bad_gs: +.Lbad_gs: SWAPGS /* switch back to user gs */ .macro ZAP_GS /* This can't be a string because the preprocessor needs to see it. */ @@ -1113,7 +1115,7 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ call xen_evtchn_do_upcall LEAVE_IRQ_STACK -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION call xen_maybe_preempt_hcall #endif jmp error_exit @@ -1238,6 +1240,13 @@ ENTRY(paranoid_entry) */ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 + /* + * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an + * unconditional CR3 write, even in the PTI case. So do an lfence + * to prevent GS speculation, regardless of whether PTI is enabled. + */ + FENCE_SWAPGS_KERNEL_ENTRY + ret END(paranoid_entry) @@ -1288,6 +1297,7 @@ ENTRY(error_entry) * from user mode due to an IRET fault. */ SWAPGS + FENCE_SWAPGS_USER_ENTRY /* We have user CR3. Change to kernel CR3. */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax @@ -1301,6 +1311,8 @@ ENTRY(error_entry) pushq %r12 ret +.Lerror_entry_done_lfence: + FENCE_SWAPGS_KERNEL_ENTRY .Lerror_entry_done: ret @@ -1318,7 +1330,7 @@ ENTRY(error_entry) cmpq %rax, RIP+8(%rsp) je .Lbstep_iret cmpq $.Lgs_change, RIP+8(%rsp) - jne .Lerror_entry_done + jne .Lerror_entry_done_lfence /* * hack: .Lgs_change can fail with user gsbase. If this happens, fix up @@ -1326,6 +1338,7 @@ ENTRY(error_entry) * .Lgs_change's error handler with kernel gsbase. */ SWAPGS + FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rax jmp .Lerror_entry_done @@ -1340,6 +1353,7 @@ ENTRY(error_entry) * gsbase and CR3. Switch to kernel gsbase and CR3: */ SWAPGS + FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rax /* @@ -1431,6 +1445,7 @@ ENTRY(nmi) swapgs cld + FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index d5252bc1e380..b1bf31713374 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -10,10 +10,13 @@ /* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */ extern asmlinkage long sys_ni_syscall(const struct pt_regs *); #define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *); +#define __SYSCALL_X32(nr, sym, qual) __SYSCALL_64(nr, sym, qual) #include <asm/syscalls_64.h> #undef __SYSCALL_64 +#undef __SYSCALL_X32 #define __SYSCALL_64(nr, sym, qual) [nr] = sym, +#define __SYSCALL_X32(nr, sym, qual) asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* @@ -23,3 +26,25 @@ asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { [0 ... __NR_syscall_max] = &sys_ni_syscall, #include <asm/syscalls_64.h> }; + +#undef __SYSCALL_64 +#undef __SYSCALL_X32 + +#ifdef CONFIG_X86_X32_ABI + +#define __SYSCALL_64(nr, sym, qual) +#define __SYSCALL_X32(nr, sym, qual) [nr] = sym, + +asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_syscall_x32_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_syscall_x32_max] = &sys_ni_syscall, +#include <asm/syscalls_64.h> +}; + +#undef __SYSCALL_64 +#undef __SYSCALL_X32 + +#endif diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index c00019abd076..3fe02546aed3 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -186,11 +186,11 @@ 172 i386 prctl sys_prctl __ia32_sys_prctl 173 i386 rt_sigreturn sys_rt_sigreturn sys32_rt_sigreturn 174 i386 rt_sigaction sys_rt_sigaction __ia32_compat_sys_rt_sigaction -175 i386 rt_sigprocmask sys_rt_sigprocmask __ia32_sys_rt_sigprocmask +175 i386 rt_sigprocmask sys_rt_sigprocmask __ia32_compat_sys_rt_sigprocmask 176 i386 rt_sigpending sys_rt_sigpending __ia32_compat_sys_rt_sigpending 177 i386 rt_sigtimedwait sys_rt_sigtimedwait_time32 __ia32_compat_sys_rt_sigtimedwait_time32 178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo __ia32_compat_sys_rt_sigqueueinfo -179 i386 rt_sigsuspend sys_rt_sigsuspend __ia32_sys_rt_sigsuspend +179 i386 rt_sigsuspend sys_rt_sigsuspend __ia32_compat_sys_rt_sigsuspend 180 i386 pread64 sys_pread64 __ia32_compat_sys_x86_pread 181 i386 pwrite64 sys_pwrite64 __ia32_compat_sys_x86_pwrite 182 i386 chown sys_chown16 __ia32_sys_chown16 diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 94fcd1951aca..1af2be39e7d9 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -1,13 +1,13 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 in="$1" out="$2" syscall_macro() { - abi="$1" - nr="$2" - entry="$3" + local abi="$1" + local nr="$2" + local entry="$3" # Entry can be either just a function name or "function/qualifier" real_entry="${entry%%/*}" @@ -21,14 +21,14 @@ syscall_macro() { } emit() { - abi="$1" - nr="$2" - entry="$3" - compat="$4" - umlentry="" + local abi="$1" + local nr="$2" + local entry="$3" + local compat="$4" + local umlentry="" - if [ "$abi" = "64" -a -n "$compat" ]; then - echo "a compat entry for a 64-bit syscall makes no sense" >&2 + if [ "$abi" != "I386" -a -n "$compat" ]; then + echo "a compat entry ($abi: $compat) for a 64-bit syscall makes no sense" >&2 exit 1 fi @@ -62,14 +62,17 @@ grep '^[0-9]' "$in" | sort -n | ( while read nr abi name entry compat; do abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` if [ "$abi" = "COMMON" -o "$abi" = "64" ]; then - # COMMON is the same as 64, except that we don't expect X32 - # programs to use it. Our expectation has nothing to do with - # any generated code, so treat them the same. emit 64 "$nr" "$entry" "$compat" + if [ "$abi" = "COMMON" ]; then + # COMMON means that this syscall exists in the same form for + # 64-bit and X32. + echo "#ifdef CONFIG_X86_X32_ABI" + emit X32 "$nr" "$entry" "$compat" + echo "#endif" + fi elif [ "$abi" = "X32" ]; then - # X32 is equivalent to 64 on an X32-compatible kernel. echo "#ifdef CONFIG_X86_X32_ABI" - emit 64 "$nr" "$entry" "$compat" + emit X32 "$nr" "$entry" "$compat" echo "#endif" elif [ "$abi" = "I386" ]; then emit "$abi" "$nr" "$entry" "$compat" diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S index cb3464525b37..2713490611a3 100644 --- a/arch/x86/entry/thunk_32.S +++ b/arch/x86/entry/thunk_32.S @@ -34,7 +34,7 @@ THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 #endif -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION THUNK ___preempt_schedule, preempt_schedule THUNK ___preempt_schedule_notrace, preempt_schedule_notrace EXPORT_SYMBOL(___preempt_schedule) diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index cc20465b2867..ea5c4167086c 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -46,7 +46,7 @@ THUNK lockdep_sys_exit_thunk,lockdep_sys_exit #endif -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION THUNK ___preempt_schedule, preempt_schedule THUNK ___preempt_schedule_notrace, preempt_schedule_notrace EXPORT_SYMBOL(___preempt_schedule) @@ -55,7 +55,7 @@ #if defined(CONFIG_TRACE_IRQFLAGS) \ || defined(CONFIG_DEBUG_LOCK_ALLOC) \ - || defined(CONFIG_PREEMPT) + || defined(CONFIG_PREEMPTION) .L_restore: popq %r11 popq %r10 diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 62f317c9113a..5b35b7ea5d72 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -661,10 +661,17 @@ fail: throttle = perf_event_overflow(event, &data, ®s); out: - if (throttle) + if (throttle) { perf_ibs_stop(event, 0); - else - perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + } else { + period >>= 4; + + if ((ibs_caps & IBS_CAPS_RDWROPCNT) && + (*config & IBS_OP_CNT_CTL)) + period |= *config & IBS_OP_CUR_CNT_RAND; + + perf_ibs_enable_event(perf_ibs, hwc, period); + } perf_event_update_userpage(event); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 81b005e4c7d9..7b21455d7504 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1005,6 +1005,27 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, /* current number of events already accepted */ n = cpuc->n_events; + if (!cpuc->n_events) + cpuc->pebs_output = 0; + + if (!cpuc->is_fake && leader->attr.precise_ip) { + /* + * For PEBS->PT, if !aux_event, the group leader (PT) went + * away, the group was broken down and this singleton event + * can't schedule any more. + */ + if (is_pebs_pt(leader) && !leader->aux_event) + return -EINVAL; + + /* + * pebs_output: 0: no PEBS so far, 1: PT, 2: DS + */ + if (cpuc->pebs_output && + cpuc->pebs_output != is_pebs_pt(leader) + 1) + return -EINVAL; + + cpuc->pebs_output = is_pebs_pt(leader) + 1; + } if (is_x86_event(leader)) { if (n >= max_count) @@ -1236,7 +1257,7 @@ void x86_pmu_enable_event(struct perf_event *event) * Add a single event to the PMU. * * The event is added to the group of enabled events - * but only if it can be scehduled with existing events. + * but only if it can be scheduled with existing events. */ static int x86_pmu_add(struct perf_event *event, int flags) { @@ -2087,7 +2108,7 @@ static int x86_pmu_event_init(struct perf_event *event) static void refresh_pce(void *ignored) { - load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm)); + load_mm_cr4_irqsoff(this_cpu_read(cpu_tlbstate.loaded_mm)); } static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) @@ -2241,6 +2262,17 @@ static int x86_pmu_check_period(struct perf_event *event, u64 value) return 0; } +static int x86_pmu_aux_output_match(struct perf_event *event) +{ + if (!(pmu.capabilities & PERF_PMU_CAP_AUX_OUTPUT)) + return 0; + + if (x86_pmu.aux_output_match) + return x86_pmu.aux_output_match(event); + + return 0; +} + static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, @@ -2266,6 +2298,8 @@ static struct pmu pmu = { .sched_task = x86_pmu_sched_task, .task_ctx_size = sizeof(struct x86_perf_task_context), .check_period = x86_pmu_check_period, + + .aux_output_match = x86_pmu_aux_output_match, }; void arch_perf_update_userpage(struct perf_event *event, diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 9e911a96972b..27ee47a7be66 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -18,9 +18,9 @@ #include <asm/cpufeature.h> #include <asm/hardirq.h> #include <asm/intel-family.h> +#include <asm/intel_pt.h> #include <asm/apic.h> #include <asm/cpu_device_id.h> -#include <asm/hypervisor.h> #include "../perf_event.h" @@ -263,8 +263,8 @@ static struct event_constraint intel_icl_event_constraints[] = { }; static struct extra_reg intel_icl_extra_regs[] __read_mostly = { - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff9fffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff9fffull, RSP_1), + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffffbfffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffffbfffull, RSP_1), INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE), EVENT_EXTRA_END @@ -3299,6 +3299,13 @@ static int intel_pmu_hw_config(struct perf_event *event) } } + if (event->attr.aux_output) { + if (!event->attr.precise_ip) + return -EINVAL; + + event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT; + } + if (event->attr.type != PERF_TYPE_RAW) return 0; @@ -3573,6 +3580,11 @@ static u64 bdw_limit_period(struct perf_event *event, u64 left) return left; } +static u64 nhm_limit_period(struct perf_event *event, u64 left) +{ + return max(left, 32ULL); +} + PMU_FORMAT_ATTR(event, "config:0-7" ); PMU_FORMAT_ATTR(umask, "config:8-15" ); PMU_FORMAT_ATTR(edge, "config:18" ); @@ -3812,6 +3824,14 @@ static int intel_pmu_check_period(struct perf_event *event, u64 value) return intel_pmu_has_bts_period(event, value) ? -EINVAL : 0; } +static int intel_pmu_aux_output_match(struct perf_event *event) +{ + if (!x86_pmu.intel_cap.pebs_output_pt_available) + return 0; + + return is_intel_pt_event(event); +} + PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); PMU_FORMAT_ATTR(ldlat, "config1:0-15"); @@ -3936,6 +3956,8 @@ static __initconst const struct x86_pmu intel_pmu = { .sched_task = intel_pmu_sched_task, .check_period = intel_pmu_check_period, + + .aux_output_match = intel_pmu_aux_output_match, }; static __init void intel_clovertown_quirk(void) @@ -3965,31 +3987,31 @@ static __init void intel_clovertown_quirk(void) } static const struct x86_cpu_desc isolation_ucodes[] = { - INTEL_CPU_DESC(INTEL_FAM6_HASWELL_CORE, 3, 0x0000001f), - INTEL_CPU_DESC(INTEL_FAM6_HASWELL_ULT, 1, 0x0000001e), - INTEL_CPU_DESC(INTEL_FAM6_HASWELL_GT3E, 1, 0x00000015), + INTEL_CPU_DESC(INTEL_FAM6_HASWELL, 3, 0x0000001f), + INTEL_CPU_DESC(INTEL_FAM6_HASWELL_L, 1, 0x0000001e), + INTEL_CPU_DESC(INTEL_FAM6_HASWELL_G, 1, 0x00000015), INTEL_CPU_DESC(INTEL_FAM6_HASWELL_X, 2, 0x00000037), INTEL_CPU_DESC(INTEL_FAM6_HASWELL_X, 4, 0x0000000a), - INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_CORE, 4, 0x00000023), - INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_GT3E, 1, 0x00000014), - INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_XEON_D, 2, 0x00000010), - INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_XEON_D, 3, 0x07000009), - INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_XEON_D, 4, 0x0f000009), - INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_XEON_D, 5, 0x0e000002), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL, 4, 0x00000023), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_G, 1, 0x00000014), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 2, 0x00000010), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 3, 0x07000009), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 4, 0x0f000009), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 5, 0x0e000002), INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000), - INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_MOBILE, 3, 0x0000007c), - INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_DESKTOP, 3, 0x0000007c), - INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 9, 0x0000004e), - INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_MOBILE, 9, 0x0000004e), - INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_MOBILE, 10, 0x0000004e), - INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_MOBILE, 11, 0x0000004e), - INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_MOBILE, 12, 0x0000004e), - INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 10, 0x0000004e), - INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 11, 0x0000004e), - INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 12, 0x0000004e), - INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 13, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_L, 3, 0x0000007c), + INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE, 3, 0x0000007c), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 9, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 9, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 10, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 11, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 12, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 10, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 11, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 12, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 13, 0x0000004e), {} }; @@ -4053,7 +4075,7 @@ static bool check_msr(unsigned long msr, u64 mask) * Disable the check for real HW, so we don't * mess with potentionaly enabled registers: */ - if (hypervisor_is_type(X86_HYPER_NATIVE)) + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) return true; /* @@ -4147,7 +4169,7 @@ static const struct x86_cpu_desc counter_freezing_ucodes[] = { INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 2, 0x0000000e), INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 9, 0x0000002e), INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 10, 0x00000008), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_X, 1, 0x00000028), + INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_D, 1, 0x00000028), INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 1, 0x00000028), INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 8, 0x00000006), {} @@ -4607,6 +4629,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; + x86_pmu.limit_period = nhm_limit_period; mem_attr = nhm_mem_events_attrs; @@ -4644,7 +4667,7 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_ATOM_SILVERMONT: - case INTEL_FAM6_ATOM_SILVERMONT_X: + case INTEL_FAM6_ATOM_SILVERMONT_D: case INTEL_FAM6_ATOM_SILVERMONT_MID: case INTEL_FAM6_ATOM_AIRMONT: case INTEL_FAM6_ATOM_AIRMONT_MID: @@ -4666,7 +4689,7 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_GOLDMONT_X: + case INTEL_FAM6_ATOM_GOLDMONT_D: x86_add_quirk(intel_counter_freezing_quirk); memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -4722,7 +4745,7 @@ __init int intel_pmu_init(void) name = "goldmont_plus"; break; - case INTEL_FAM6_ATOM_TREMONT_X: + case INTEL_FAM6_ATOM_TREMONT_D: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -4858,10 +4881,10 @@ __init int intel_pmu_init(void) break; - case INTEL_FAM6_HASWELL_CORE: + case INTEL_FAM6_HASWELL: case INTEL_FAM6_HASWELL_X: - case INTEL_FAM6_HASWELL_ULT: - case INTEL_FAM6_HASWELL_GT3E: + case INTEL_FAM6_HASWELL_L: + case INTEL_FAM6_HASWELL_G: x86_add_quirk(intel_ht_bug); x86_add_quirk(intel_pebs_isolation_quirk); x86_pmu.late_ack = true; @@ -4891,9 +4914,9 @@ __init int intel_pmu_init(void) name = "haswell"; break; - case INTEL_FAM6_BROADWELL_CORE: - case INTEL_FAM6_BROADWELL_XEON_D: - case INTEL_FAM6_BROADWELL_GT3E: + case INTEL_FAM6_BROADWELL: + case INTEL_FAM6_BROADWELL_D: + case INTEL_FAM6_BROADWELL_G: case INTEL_FAM6_BROADWELL_X: x86_add_quirk(intel_pebs_isolation_quirk); x86_pmu.late_ack = true; @@ -4955,10 +4978,11 @@ __init int intel_pmu_init(void) case INTEL_FAM6_SKYLAKE_X: pmem = true; - case INTEL_FAM6_SKYLAKE_MOBILE: - case INTEL_FAM6_SKYLAKE_DESKTOP: - case INTEL_FAM6_KABYLAKE_MOBILE: - case INTEL_FAM6_KABYLAKE_DESKTOP: + /* fall through */ + case INTEL_FAM6_SKYLAKE_L: + case INTEL_FAM6_SKYLAKE: + case INTEL_FAM6_KABYLAKE_L: + case INTEL_FAM6_KABYLAKE: x86_add_quirk(intel_pebs_isolation_quirk); x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -5002,10 +5026,11 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_ICELAKE_X: - case INTEL_FAM6_ICELAKE_XEON_D: + case INTEL_FAM6_ICELAKE_D: pmem = true; - case INTEL_FAM6_ICELAKE_MOBILE: - case INTEL_FAM6_ICELAKE_DESKTOP: + /* fall through */ + case INTEL_FAM6_ICELAKE_L: + case INTEL_FAM6_ICELAKE: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 688592b34564..9f2f39003d96 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -446,7 +446,7 @@ static int cstate_cpu_init(unsigned int cpu) return 0; } -const struct attribute_group *core_attr_update[] = { +static const struct attribute_group *core_attr_update[] = { &group_cstate_core_c1, &group_cstate_core_c3, &group_cstate_core_c6, @@ -454,7 +454,7 @@ const struct attribute_group *core_attr_update[] = { NULL, }; -const struct attribute_group *pkg_attr_update[] = { +static const struct attribute_group *pkg_attr_update[] = { &group_cstate_pkg_c2, &group_cstate_pkg_c3, &group_cstate_pkg_c6, @@ -593,40 +593,40 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_IVYBRIDGE, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_IVYBRIDGE_X, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_CORE, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_X, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_GT3E, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_HASWELL, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_X, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_G, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_ULT, hswult_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_L, hswult_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT, slm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT_X, slm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT, slm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT_D, slm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_CORE, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_XEON_D, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_GT3E, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_X, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_D, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_G, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_X, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_MOBILE, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_L, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_X, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE, hswult_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, hswult_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_L, hswult_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE, hswult_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_CANNONLAKE_MOBILE, cnl_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_CANNONLAKE_L, cnl_cstates), X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNL, knl_cstates), X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_X, glm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_D, glm_cstates), X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_MOBILE, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_DESKTOP, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_L, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE, snb_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 2c8db2c19328..ce83950036c5 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -851,7 +851,7 @@ struct event_constraint intel_skl_pebs_event_constraints[] = { struct event_constraint intel_icl_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x400000000ULL), /* SLOTS */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), /* SLOTS */ INTEL_PLD_CONSTRAINT(0x1cd, 0xff), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf), /* MEM_INST_RETIRED.LOAD */ @@ -902,6 +902,9 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event) */ static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc) { + if (cpuc->n_pebs == cpuc->n_pebs_via_pt) + return false; + return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs); } @@ -919,6 +922,9 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) u64 threshold; int reserved; + if (cpuc->n_pebs_via_pt) + return; + if (x86_pmu.flags & PMU_FL_PEBS_ALL) reserved = x86_pmu.max_pebs_events + x86_pmu.num_counters_fixed; else @@ -1059,10 +1065,40 @@ void intel_pmu_pebs_add(struct perf_event *event) cpuc->n_pebs++; if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS) cpuc->n_large_pebs++; + if (hwc->flags & PERF_X86_EVENT_PEBS_VIA_PT) + cpuc->n_pebs_via_pt++; pebs_update_state(needed_cb, cpuc, event, true); } +static void intel_pmu_pebs_via_pt_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!is_pebs_pt(event)) + return; + + if (!(cpuc->pebs_enabled & ~PEBS_VIA_PT_MASK)) + cpuc->pebs_enabled &= ~PEBS_VIA_PT_MASK; +} + +static void intel_pmu_pebs_via_pt_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + struct debug_store *ds = cpuc->ds; + + if (!is_pebs_pt(event)) + return; + + if (!(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS)) + cpuc->pebs_enabled |= PEBS_PMI_AFTER_EACH_RECORD; + + cpuc->pebs_enabled |= PEBS_OUTPUT_PT; + + wrmsrl(MSR_RELOAD_PMC0 + hwc->idx, ds->pebs_event_reset[hwc->idx]); +} + void intel_pmu_pebs_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1100,6 +1136,8 @@ void intel_pmu_pebs_enable(struct perf_event *event) } else { ds->pebs_event_reset[hwc->idx] = 0; } + + intel_pmu_pebs_via_pt_enable(event); } void intel_pmu_pebs_del(struct perf_event *event) @@ -1111,6 +1149,8 @@ void intel_pmu_pebs_del(struct perf_event *event) cpuc->n_pebs--; if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS) cpuc->n_large_pebs--; + if (hwc->flags & PERF_X86_EVENT_PEBS_VIA_PT) + cpuc->n_pebs_via_pt--; pebs_update_state(needed_cb, cpuc, event, false); } @@ -1120,7 +1160,8 @@ void intel_pmu_pebs_disable(struct perf_event *event) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - if (cpuc->n_pebs == cpuc->n_large_pebs) + if (cpuc->n_pebs == cpuc->n_large_pebs && + cpuc->n_pebs != cpuc->n_pebs_via_pt) intel_pmu_drain_pebs_buffer(); cpuc->pebs_enabled &= ~(1ULL << hwc->idx); @@ -1131,6 +1172,8 @@ void intel_pmu_pebs_disable(struct perf_event *event) else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled &= ~(1ULL << 63); + intel_pmu_pebs_via_pt_disable(event); + if (cpuc->enabled) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); @@ -2031,6 +2074,12 @@ void __init intel_ds_init(void) PERF_SAMPLE_REGS_INTR); } pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual); + + if (x86_pmu.intel_cap.pebs_output_pt_available) { + pr_cont("PEBS-via-PT, "); + x86_get_pmu()->capabilities |= PERF_PMU_CAP_AUX_OUTPUT; + } + break; default: diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 6f814a27416b..ea54634eabf3 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -273,7 +273,7 @@ static inline bool lbr_from_signext_quirk_needed(void) return !tsx_support && (lbr_desc[lbr_format] & LBR_TSX); } -DEFINE_STATIC_KEY_FALSE(lbr_from_quirk_key); +static DEFINE_STATIC_KEY_FALSE(lbr_from_quirk_key); /* If quirk is enabled, ensure sign extension is 63 bits: */ inline u64 lbr_from_signext_quirk_wr(u64 val) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index d3dc2274ddd4..74e80ed9c6c4 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -204,9 +204,9 @@ static int __init pt_pmu_hw_init(void) /* model-specific quirks */ switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_BROADWELL_CORE: - case INTEL_FAM6_BROADWELL_XEON_D: - case INTEL_FAM6_BROADWELL_GT3E: + case INTEL_FAM6_BROADWELL: + case INTEL_FAM6_BROADWELL_D: + case INTEL_FAM6_BROADWELL_G: case INTEL_FAM6_BROADWELL_X: /* not setting BRANCH_EN will #GP, erratum BDM106 */ pt_pmu.branch_en_always_on = true; @@ -545,33 +545,62 @@ static void pt_config_buffer(void *buf, unsigned int topa_idx, wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg); } -/* - * Keep ToPA table-related metadata on the same page as the actual table, - * taking up a few words from the top - */ - -#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1) - /** - * struct topa - page-sized ToPA table with metadata at the top - * @table: actual ToPA table entries, as understood by PT hardware + * struct topa - ToPA metadata * @list: linkage to struct pt_buffer's list of tables - * @phys: physical address of this page * @offset: offset of the first entry in this table in the buffer * @size: total size of all entries in this table * @last: index of the last initialized entry in this table + * @z_count: how many times the first entry repeats */ struct topa { - struct topa_entry table[TENTS_PER_PAGE]; struct list_head list; - u64 phys; u64 offset; size_t size; int last; + unsigned int z_count; }; +/* + * Keep ToPA table-related metadata on the same page as the actual table, + * taking up a few words from the top + */ + +#define TENTS_PER_PAGE \ + ((PAGE_SIZE - sizeof(struct topa)) / sizeof(struct topa_entry)) + +/** + * struct topa_page - page-sized ToPA table with metadata at the top + * @table: actual ToPA table entries, as understood by PT hardware + * @topa: metadata + */ +struct topa_page { + struct topa_entry table[TENTS_PER_PAGE]; + struct topa topa; +}; + +static inline struct topa_page *topa_to_page(struct topa *topa) +{ + return container_of(topa, struct topa_page, topa); +} + +static inline struct topa_page *topa_entry_to_page(struct topa_entry *te) +{ + return (struct topa_page *)((unsigned long)te & PAGE_MASK); +} + +static inline phys_addr_t topa_pfn(struct topa *topa) +{ + return PFN_DOWN(virt_to_phys(topa_to_page(topa))); +} + /* make -1 stand for the last table entry */ -#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)]) +#define TOPA_ENTRY(t, i) \ + ((i) == -1 \ + ? &topa_to_page(t)->table[(t)->last] \ + : &topa_to_page(t)->table[(i)]) +#define TOPA_ENTRY_SIZE(t, i) (sizes(TOPA_ENTRY((t), (i))->size)) +#define TOPA_ENTRY_PAGES(t, i) (1 << TOPA_ENTRY((t), (i))->size) /** * topa_alloc() - allocate page-sized ToPA table @@ -583,27 +612,26 @@ struct topa { static struct topa *topa_alloc(int cpu, gfp_t gfp) { int node = cpu_to_node(cpu); - struct topa *topa; + struct topa_page *tp; struct page *p; p = alloc_pages_node(node, gfp | __GFP_ZERO, 0); if (!p) return NULL; - topa = page_address(p); - topa->last = 0; - topa->phys = page_to_phys(p); + tp = page_address(p); + tp->topa.last = 0; /* * In case of singe-entry ToPA, always put the self-referencing END * link as the 2nd entry in the table */ if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) { - TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT; - TOPA_ENTRY(topa, 1)->end = 1; + TOPA_ENTRY(&tp->topa, 1)->base = page_to_phys(p); + TOPA_ENTRY(&tp->topa, 1)->end = 1; } - return topa; + return &tp->topa; } /** @@ -643,7 +671,7 @@ static void topa_insert_table(struct pt_buffer *buf, struct topa *topa) BUG_ON(last->last != TENTS_PER_PAGE - 1); - TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT; + TOPA_ENTRY(last, -1)->base = topa_pfn(topa); TOPA_ENTRY(last, -1)->end = 1; } @@ -670,7 +698,7 @@ static bool topa_table_full(struct topa *topa) * * Return: 0 on success or error code. */ -static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp) +static int topa_insert_pages(struct pt_buffer *buf, int cpu, gfp_t gfp) { struct topa *topa = buf->last; int order = 0; @@ -681,13 +709,18 @@ static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp) order = page_private(p); if (topa_table_full(topa)) { - topa = topa_alloc(buf->cpu, gfp); + topa = topa_alloc(cpu, gfp); if (!topa) return -ENOMEM; topa_insert_table(buf, topa); } + if (topa->z_count == topa->last - 1) { + if (order == TOPA_ENTRY(topa, topa->last - 1)->size) + topa->z_count++; + } + TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT; TOPA_ENTRY(topa, -1)->size = order; if (!buf->snapshot && @@ -713,23 +746,26 @@ static void pt_topa_dump(struct pt_buffer *buf) struct topa *topa; list_for_each_entry(topa, &buf->tables, list) { + struct topa_page *tp = topa_to_page(topa); int i; - pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table, - topa->phys, topa->offset, topa->size); + pr_debug("# table @%p, off %llx size %zx\n", tp->table, + topa->offset, topa->size); for (i = 0; i < TENTS_PER_PAGE; i++) { pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n", - &topa->table[i], - (unsigned long)topa->table[i].base << TOPA_SHIFT, - sizes(topa->table[i].size), - topa->table[i].end ? 'E' : ' ', - topa->table[i].intr ? 'I' : ' ', - topa->table[i].stop ? 'S' : ' ', - *(u64 *)&topa->table[i]); + &tp->table[i], + (unsigned long)tp->table[i].base << TOPA_SHIFT, + sizes(tp->table[i].size), + tp->table[i].end ? 'E' : ' ', + tp->table[i].intr ? 'I' : ' ', + tp->table[i].stop ? 'S' : ' ', + *(u64 *)&tp->table[i]); if ((intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) && - topa->table[i].stop) || - topa->table[i].end) + tp->table[i].stop) || + tp->table[i].end) break; + if (!i && topa->z_count) + i += topa->z_count; } } } @@ -771,7 +807,7 @@ static void pt_update_head(struct pt *pt) /* offset of the current output region within this table */ for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++) - base += sizes(buf->cur->table[topa_idx].size); + base += TOPA_ENTRY_SIZE(buf->cur, topa_idx); if (buf->snapshot) { local_set(&buf->data_size, base); @@ -791,7 +827,7 @@ static void pt_update_head(struct pt *pt) */ static void *pt_buffer_region(struct pt_buffer *buf) { - return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT); + return phys_to_virt(TOPA_ENTRY(buf->cur, buf->cur_idx)->base << TOPA_SHIFT); } /** @@ -800,7 +836,7 @@ static void *pt_buffer_region(struct pt_buffer *buf) */ static size_t pt_buffer_region_size(struct pt_buffer *buf) { - return sizes(buf->cur->table[buf->cur_idx].size); + return TOPA_ENTRY_SIZE(buf->cur, buf->cur_idx); } /** @@ -830,7 +866,7 @@ static void pt_handle_status(struct pt *pt) * know. */ if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) || - buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { + buf->output_off == pt_buffer_region_size(buf)) { perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_TRUNCATED); advance++; @@ -868,9 +904,11 @@ static void pt_handle_status(struct pt *pt) static void pt_read_offset(struct pt_buffer *buf) { u64 offset, base_topa; + struct topa_page *tp; rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa); - buf->cur = phys_to_virt(base_topa); + tp = phys_to_virt(base_topa); + buf->cur = &tp->topa; rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset); /* offset within current output region */ @@ -879,29 +917,97 @@ static void pt_read_offset(struct pt_buffer *buf) buf->cur_idx = (offset & 0xffffff80) >> 7; } -/** - * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry - * @buf: PT buffer. - * @pg: Page offset in the buffer. - * - * When advancing to the next output region (ToPA entry), given a page offset - * into the buffer, we need to find the offset of the first page in the next - * region. - */ -static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg) +static struct topa_entry * +pt_topa_entry_for_page(struct pt_buffer *buf, unsigned int pg) { - struct topa_entry *te = buf->topa_index[pg]; + struct topa_page *tp; + struct topa *topa; + unsigned int idx, cur_pg = 0, z_pg = 0, start_idx = 0; - /* one region */ - if (buf->first == buf->last && buf->first->last == 1) - return pg; + /* + * Indicates a bug in the caller. + */ + if (WARN_ON_ONCE(pg >= buf->nr_pages)) + return NULL; + + /* + * First, find the ToPA table where @pg fits. With high + * order allocations, there shouldn't be many of these. + */ + list_for_each_entry(topa, &buf->tables, list) { + if (topa->offset + topa->size > pg << PAGE_SHIFT) + goto found; + } + + /* + * Hitting this means we have a problem in the ToPA + * allocation code. + */ + WARN_ON_ONCE(1); - do { - pg++; - pg &= buf->nr_pages - 1; - } while (buf->topa_index[pg] == te); + return NULL; - return pg; +found: + /* + * Indicates a problem in the ToPA allocation code. + */ + if (WARN_ON_ONCE(topa->last == -1)) + return NULL; + + tp = topa_to_page(topa); + cur_pg = PFN_DOWN(topa->offset); + if (topa->z_count) { + z_pg = TOPA_ENTRY_PAGES(topa, 0) * (topa->z_count + 1); + start_idx = topa->z_count + 1; + } + + /* + * Multiple entries at the beginning of the table have the same size, + * ideally all of them; if @pg falls there, the search is done. + */ + if (pg >= cur_pg && pg < cur_pg + z_pg) { + idx = (pg - cur_pg) / TOPA_ENTRY_PAGES(topa, 0); + return &tp->table[idx]; + } + + /* + * Otherwise, slow path: iterate through the remaining entries. + */ + for (idx = start_idx, cur_pg += z_pg; idx < topa->last; idx++) { + if (cur_pg + TOPA_ENTRY_PAGES(topa, idx) > pg) + return &tp->table[idx]; + + cur_pg += TOPA_ENTRY_PAGES(topa, idx); + } + + /* + * Means we couldn't find a ToPA entry in the table that does match. + */ + WARN_ON_ONCE(1); + + return NULL; +} + +static struct topa_entry * +pt_topa_prev_entry(struct pt_buffer *buf, struct topa_entry *te) +{ + unsigned long table = (unsigned long)te & ~(PAGE_SIZE - 1); + struct topa_page *tp; + struct topa *topa; + + tp = (struct topa_page *)table; + if (tp->table != te) + return --te; + + topa = &tp->topa; + if (topa == buf->first) + topa = buf->last; + else + topa = list_prev_entry(topa, list); + + tp = topa_to_page(topa); + + return &tp->table[topa->last - 1]; } /** @@ -925,8 +1031,7 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, unsigned long idx, npages, wakeup; /* can't stop in the middle of an output region */ - if (buf->output_off + handle->size + 1 < - sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { + if (buf->output_off + handle->size + 1 < pt_buffer_region_size(buf)) { perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); return -EINVAL; } @@ -937,9 +1042,13 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, return 0; /* clear STOP and INT from current entry */ - buf->topa_index[buf->stop_pos]->stop = 0; - buf->topa_index[buf->stop_pos]->intr = 0; - buf->topa_index[buf->intr_pos]->intr = 0; + if (buf->stop_te) { + buf->stop_te->stop = 0; + buf->stop_te->intr = 0; + } + + if (buf->intr_te) + buf->intr_te->intr = 0; /* how many pages till the STOP marker */ npages = handle->size >> PAGE_SHIFT; @@ -950,7 +1059,12 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, idx = (head >> PAGE_SHIFT) + npages; idx &= buf->nr_pages - 1; - buf->stop_pos = idx; + + if (idx != buf->stop_pos) { + buf->stop_pos = idx; + buf->stop_te = pt_topa_entry_for_page(buf, idx); + buf->stop_te = pt_topa_prev_entry(buf, buf->stop_te); + } wakeup = handle->wakeup >> PAGE_SHIFT; @@ -960,51 +1074,20 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, idx = wakeup; idx &= buf->nr_pages - 1; - buf->intr_pos = idx; + if (idx != buf->intr_pos) { + buf->intr_pos = idx; + buf->intr_te = pt_topa_entry_for_page(buf, idx); + buf->intr_te = pt_topa_prev_entry(buf, buf->intr_te); + } - buf->topa_index[buf->stop_pos]->stop = 1; - buf->topa_index[buf->stop_pos]->intr = 1; - buf->topa_index[buf->intr_pos]->intr = 1; + buf->stop_te->stop = 1; + buf->stop_te->intr = 1; + buf->intr_te->intr = 1; return 0; } /** - * pt_buffer_setup_topa_index() - build topa_index[] table of regions - * @buf: PT buffer. - * - * topa_index[] references output regions indexed by offset into the - * buffer for purposes of quick reverse lookup. - */ -static void pt_buffer_setup_topa_index(struct pt_buffer *buf) -{ - struct topa *cur = buf->first, *prev = buf->last; - struct topa_entry *te_cur = TOPA_ENTRY(cur, 0), - *te_prev = TOPA_ENTRY(prev, prev->last - 1); - int pg = 0, idx = 0; - - while (pg < buf->nr_pages) { - int tidx; - - /* pages within one topa entry */ - for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++) - buf->topa_index[pg] = te_prev; - - te_prev = te_cur; - - if (idx == cur->last - 1) { - /* advance to next topa table */ - idx = 0; - cur = list_entry(cur->list.next, struct topa, list); - } else { - idx++; - } - te_cur = TOPA_ENTRY(cur, idx); - } - -} - -/** * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head * @buf: PT buffer. * @head: Write pointer (aux_head) from AUX buffer. @@ -1021,18 +1104,20 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf) */ static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head) { + struct topa_page *cur_tp; + struct topa_entry *te; int pg; if (buf->snapshot) head &= (buf->nr_pages << PAGE_SHIFT) - 1; pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1); - pg = pt_topa_next_entry(buf, pg); + te = pt_topa_entry_for_page(buf, pg); - buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK); - buf->cur_idx = ((unsigned long)buf->topa_index[pg] - - (unsigned long)buf->cur) / sizeof(struct topa_entry); - buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1); + cur_tp = topa_entry_to_page(te); + buf->cur = &cur_tp->topa; + buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0); + buf->output_off = head & (pt_buffer_region_size(buf) - 1); local64_set(&buf->head, head); local_set(&buf->data_size, 0); @@ -1061,31 +1146,29 @@ static void pt_buffer_fini_topa(struct pt_buffer *buf) * @size: Total size of all regions within this ToPA. * @gfp: Allocation flags. */ -static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages, - gfp_t gfp) +static int pt_buffer_init_topa(struct pt_buffer *buf, int cpu, + unsigned long nr_pages, gfp_t gfp) { struct topa *topa; int err; - topa = topa_alloc(buf->cpu, gfp); + topa = topa_alloc(cpu, gfp); if (!topa) return -ENOMEM; topa_insert_table(buf, topa); while (buf->nr_pages < nr_pages) { - err = topa_insert_pages(buf, gfp); + err = topa_insert_pages(buf, cpu, gfp); if (err) { pt_buffer_fini_topa(buf); return -ENOMEM; } } - pt_buffer_setup_topa_index(buf); - /* link last table to the first one, unless we're double buffering */ if (intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) { - TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT; + TOPA_ENTRY(buf->last, -1)->base = topa_pfn(buf->first); TOPA_ENTRY(buf->last, -1)->end = 1; } @@ -1119,18 +1202,18 @@ pt_buffer_setup_aux(struct perf_event *event, void **pages, cpu = raw_smp_processor_id(); node = cpu_to_node(cpu); - buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]), - GFP_KERNEL, node); + buf = kzalloc_node(sizeof(struct pt_buffer), GFP_KERNEL, node); if (!buf) return NULL; - buf->cpu = cpu; buf->snapshot = snapshot; buf->data_pages = pages; + buf->stop_pos = -1; + buf->intr_pos = -1; INIT_LIST_HEAD(&buf->tables); - ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL); + ret = pt_buffer_init_topa(buf, cpu, nr_pages, GFP_KERNEL); if (ret) { kfree(buf); return NULL; @@ -1296,7 +1379,7 @@ void intel_pt_interrupt(void) return; } - pt_config_buffer(buf->cur->table, buf->cur_idx, + pt_config_buffer(topa_to_page(buf->cur)->table, buf->cur_idx, buf->output_off); pt_config(event); } @@ -1361,7 +1444,7 @@ static void pt_event_start(struct perf_event *event, int mode) WRITE_ONCE(pt->handle_nmi, 1); hwc->state = 0; - pt_config_buffer(buf->cur->table, buf->cur_idx, + pt_config_buffer(topa_to_page(buf->cur)->table, buf->cur_idx, buf->output_off); pt_config(event); @@ -1481,6 +1564,11 @@ void cpu_emergency_stop_pt(void) pt_event_stop(pt->handle.event, PERF_EF_UPDATE); } +int is_intel_pt_event(struct perf_event *event) +{ + return event->pmu == &pt_pmu.pmu; +} + static __init int pt_init(void) { int ret, cpu, prior_warn = 0; diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 63fe4063fbd6..1d2bb7572374 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -53,7 +53,6 @@ struct pt_pmu { /** * struct pt_buffer - buffer configuration; one buffer per task_struct or * cpu, depending on perf event configuration - * @cpu: cpu for per-cpu allocation * @tables: list of ToPA tables in this buffer * @first: shorthand for first topa table * @last: shorthand for last topa table @@ -65,13 +64,14 @@ struct pt_pmu { * @lost: if data was lost/truncated * @head: logical write offset inside the buffer * @snapshot: if this is for a snapshot/overwrite counter - * @stop_pos: STOP topa entry in the buffer - * @intr_pos: INT topa entry in the buffer + * @stop_pos: STOP topa entry index + * @intr_pos: INT topa entry index + * @stop_te: STOP topa entry pointer + * @intr_te: INT topa entry pointer * @data_pages: array of pages from perf * @topa_index: table of topa entries indexed by page offset */ struct pt_buffer { - int cpu; struct list_head tables; struct topa *first, *last, *cur; unsigned int cur_idx; @@ -80,9 +80,9 @@ struct pt_buffer { local_t data_size; local64_t head; bool snapshot; - unsigned long stop_pos, intr_pos; + long stop_pos, intr_pos; + struct topa_entry *stop_te, *intr_te; void **data_pages; - struct topa_entry *topa_index[0]; }; #define PT_FILTERS_NUM 4 diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 64ab51ffdf06..5053a403e4ae 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -634,7 +634,7 @@ static void cleanup_rapl_pmus(void) kfree(rapl_pmus); } -const struct attribute_group *rapl_attr_update[] = { +static const struct attribute_group *rapl_attr_update[] = { &rapl_events_cores_group, &rapl_events_pkg_group, &rapl_events_ram_group, @@ -720,27 +720,27 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, model_snbep), X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, model_snb), X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, model_snbep), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL, model_hsw), X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, model_hsx), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_L, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_G, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_G, model_hsw), X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, model_hsx), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, model_hsx), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_D, model_hsx), X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, model_knl), X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, model_knl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_L, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE, model_skl), X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, model_hsx), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_MOBILE, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_L, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_L, model_skl), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_D, model_hsw), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_DESKTOP, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_L, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE, model_skl), {}, }; diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 3694a5d0703d..6fc2e06ab4c6 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1451,29 +1451,29 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE_EP, nhm_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE, snb_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, ivb_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, hsw_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, hsw_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, hsw_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, bdw_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, bdw_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL, hsw_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_L, hsw_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_G, hsw_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL, bdw_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_G, bdw_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, snbep_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EX, nhmex_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE_EX, nhmex_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, ivbep_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hswep_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, bdx_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, bdx_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_D, bdx_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, knl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP,skl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE, skl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_L, skl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, skx_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, icl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_NNPI, icl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_DESKTOP, icl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ATOM_TREMONT_X, snr_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_L, skl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE, skl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_L, icl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_NNPI, icl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE, icl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ATOM_TREMONT_D, snr_uncore_init), {}, }; diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 9431447541e9..b1afc77f0704 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -59,22 +59,22 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_IVYBRIDGE: case INTEL_FAM6_IVYBRIDGE_X: - case INTEL_FAM6_HASWELL_CORE: + case INTEL_FAM6_HASWELL: case INTEL_FAM6_HASWELL_X: - case INTEL_FAM6_HASWELL_ULT: - case INTEL_FAM6_HASWELL_GT3E: + case INTEL_FAM6_HASWELL_L: + case INTEL_FAM6_HASWELL_G: - case INTEL_FAM6_BROADWELL_CORE: - case INTEL_FAM6_BROADWELL_XEON_D: - case INTEL_FAM6_BROADWELL_GT3E: + case INTEL_FAM6_BROADWELL: + case INTEL_FAM6_BROADWELL_D: + case INTEL_FAM6_BROADWELL_G: case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_ATOM_SILVERMONT: - case INTEL_FAM6_ATOM_SILVERMONT_X: + case INTEL_FAM6_ATOM_SILVERMONT_D: case INTEL_FAM6_ATOM_AIRMONT: case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_GOLDMONT_X: + case INTEL_FAM6_ATOM_GOLDMONT_D: case INTEL_FAM6_ATOM_GOLDMONT_PLUS: @@ -84,12 +84,12 @@ static bool test_intel(int idx, void *data) return true; break; - case INTEL_FAM6_SKYLAKE_MOBILE: - case INTEL_FAM6_SKYLAKE_DESKTOP: + case INTEL_FAM6_SKYLAKE_L: + case INTEL_FAM6_SKYLAKE: case INTEL_FAM6_SKYLAKE_X: - case INTEL_FAM6_KABYLAKE_MOBILE: - case INTEL_FAM6_KABYLAKE_DESKTOP: - case INTEL_FAM6_ICELAKE_MOBILE: + case INTEL_FAM6_KABYLAKE_L: + case INTEL_FAM6_KABYLAKE: + case INTEL_FAM6_ICELAKE_L: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; @@ -167,7 +167,7 @@ static const struct attribute_group *attr_groups[] = { NULL, }; -const struct attribute_group *attr_update[] = { +static const struct attribute_group *attr_update[] = { &group_aperf, &group_mperf, &group_pperf, diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 8751008fc170..ecacfbf4ebc1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -76,6 +76,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) #define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */ #define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */ #define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ +#define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */ struct amd_nb { int nb_id; /* NorthBridge id */ @@ -85,6 +86,11 @@ struct amd_nb { }; #define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1) +#define PEBS_PMI_AFTER_EACH_RECORD BIT_ULL(60) +#define PEBS_OUTPUT_OFFSET 61 +#define PEBS_OUTPUT_MASK (3ull << PEBS_OUTPUT_OFFSET) +#define PEBS_OUTPUT_PT (1ull << PEBS_OUTPUT_OFFSET) +#define PEBS_VIA_PT_MASK (PEBS_OUTPUT_PT | PEBS_PMI_AFTER_EACH_RECORD) /* * Flags PEBS can handle without an PMI. @@ -211,6 +217,8 @@ struct cpu_hw_events { u64 pebs_enabled; int n_pebs; int n_large_pebs; + int n_pebs_via_pt; + int pebs_output; /* Current super set of events hardware configuration */ u64 pebs_data_cfg; @@ -510,6 +518,8 @@ union perf_capabilities { */ u64 full_width_write:1; u64 pebs_baseline:1; + u64 pebs_metrics_available:1; + u64 pebs_output_pt_available:1; }; u64 capabilities; }; @@ -692,6 +702,8 @@ struct x86_pmu { * Check period value for PERF_EVENT_IOC_PERIOD ioctl. */ int (*check_period) (struct perf_event *event, u64 period); + + int (*aux_output_match) (struct perf_event *event); }; struct x86_perf_task_context { @@ -901,6 +913,11 @@ static inline int amd_pmu_init(void) #endif /* CONFIG_CPU_SUP_AMD */ +static inline int is_pebs_pt(struct perf_event *event) +{ + return !!(event->hw.flags & PERF_X86_EVENT_PEBS_VIA_PT); +} + #ifdef CONFIG_CPU_SUP_INTEL static inline bool intel_pmu_has_bts_period(struct perf_event *event, u64 period) diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index e65d7fe6489f..5208ba49c89a 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -37,12 +37,14 @@ static inline int fill_gva_list(u64 gva_list[], int offset, * Lower 12 bits encode the number of additional * pages to flush (in addition to the 'cur' page). */ - if (diff >= HV_TLB_FLUSH_UNIT) + if (diff >= HV_TLB_FLUSH_UNIT) { gva_list[gva_n] |= ~PAGE_MASK; - else if (diff) + cur += HV_TLB_FLUSH_UNIT; + } else if (diff) { gva_list[gva_n] |= (diff - 1) >> PAGE_SHIFT; + cur = end; + } - cur += HV_TLB_FLUSH_UNIT; gva_n++; } while (cur < end); diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 84f848c2541a..7f828fe49797 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -49,8 +49,7 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, #define array_index_mask_nospec array_index_mask_nospec /* Prevent speculative execution past this barrier. */ -#define barrier_nospec() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \ - "lfence", X86_FEATURE_LFENCE_RDTSC) +#define barrier_nospec() alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC) #define dma_rmb() barrier() #define dma_wmb() barrier() diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index ba15d53c1ca7..7d1f6a49bfae 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -45,14 +45,13 @@ * We do the locked ops that don't return the old value as * a mask operation on a byte. */ -#define IS_IMMEDIATE(nr) (__builtin_constant_p(nr)) #define CONST_MASK_ADDR(nr, addr) WBYTE_ADDR((void *)(addr) + ((nr)>>3)) #define CONST_MASK(nr) (1 << ((nr) & 7)) static __always_inline void arch_set_bit(long nr, volatile unsigned long *addr) { - if (IS_IMMEDIATE(nr)) { + if (__builtin_constant_p(nr)) { asm volatile(LOCK_PREFIX "orb %1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" ((u8)CONST_MASK(nr)) @@ -72,7 +71,7 @@ arch___set_bit(long nr, volatile unsigned long *addr) static __always_inline void arch_clear_bit(long nr, volatile unsigned long *addr) { - if (IS_IMMEDIATE(nr)) { + if (__builtin_constant_p(nr)) { asm volatile(LOCK_PREFIX "andb %1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" ((u8)~CONST_MASK(nr))); @@ -123,7 +122,7 @@ arch___change_bit(long nr, volatile unsigned long *addr) static __always_inline void arch_change_bit(long nr, volatile unsigned long *addr) { - if (IS_IMMEDIATE(nr)) { + if (__builtin_constant_p(nr)) { asm volatile(LOCK_PREFIX "xorb %1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" ((u8)CONST_MASK(nr))); diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h index 101eb944f13c..981fe923a59f 100644 --- a/arch/x86/include/asm/bootparam_utils.h +++ b/arch/x86/include/asm/bootparam_utils.h @@ -18,6 +18,20 @@ * Note: efi_info is commonly left uninitialized, but that field has a * private magic, so it is better to leave it unchanged. */ + +#define sizeof_mbr(type, member) ({ sizeof(((type *)0)->member); }) + +#define BOOT_PARAM_PRESERVE(struct_member) \ + { \ + .start = offsetof(struct boot_params, struct_member), \ + .len = sizeof_mbr(struct boot_params, struct_member), \ + } + +struct boot_params_to_save { + unsigned int start; + unsigned int len; +}; + static void sanitize_boot_params(struct boot_params *boot_params) { /* @@ -35,21 +49,41 @@ static void sanitize_boot_params(struct boot_params *boot_params) * problems again. */ if (boot_params->sentinel) { - /* fields in boot_params are left uninitialized, clear them */ - boot_params->acpi_rsdp_addr = 0; - memset(&boot_params->ext_ramdisk_image, 0, - (char *)&boot_params->efi_info - - (char *)&boot_params->ext_ramdisk_image); - memset(&boot_params->kbd_status, 0, - (char *)&boot_params->hdr - - (char *)&boot_params->kbd_status); - memset(&boot_params->_pad7[0], 0, - (char *)&boot_params->edd_mbr_sig_buffer[0] - - (char *)&boot_params->_pad7[0]); - memset(&boot_params->_pad8[0], 0, - (char *)&boot_params->eddbuf[0] - - (char *)&boot_params->_pad8[0]); - memset(&boot_params->_pad9[0], 0, sizeof(boot_params->_pad9)); + static struct boot_params scratch; + char *bp_base = (char *)boot_params; + char *save_base = (char *)&scratch; + int i; + + const struct boot_params_to_save to_save[] = { + BOOT_PARAM_PRESERVE(screen_info), + BOOT_PARAM_PRESERVE(apm_bios_info), + BOOT_PARAM_PRESERVE(tboot_addr), + BOOT_PARAM_PRESERVE(ist_info), + BOOT_PARAM_PRESERVE(hd0_info), + BOOT_PARAM_PRESERVE(hd1_info), + BOOT_PARAM_PRESERVE(sys_desc_table), + BOOT_PARAM_PRESERVE(olpc_ofw_header), + BOOT_PARAM_PRESERVE(efi_info), + BOOT_PARAM_PRESERVE(alt_mem_k), + BOOT_PARAM_PRESERVE(scratch), + BOOT_PARAM_PRESERVE(e820_entries), + BOOT_PARAM_PRESERVE(eddbuf_entries), + BOOT_PARAM_PRESERVE(edd_mbr_sig_buf_entries), + BOOT_PARAM_PRESERVE(edd_mbr_sig_buffer), + BOOT_PARAM_PRESERVE(secure_boot), + BOOT_PARAM_PRESERVE(hdr), + BOOT_PARAM_PRESERVE(e820_table), + BOOT_PARAM_PRESERVE(eddbuf), + }; + + memset(&scratch, 0, sizeof(scratch)); + + for (i = 0; i < ARRAY_SIZE(to_save); i++) { + memcpy(save_base + to_save[i].start, + bp_base + to_save[i].start, to_save[i].len); + } + + memcpy(boot_params, save_base, sizeof(*boot_params)); } } diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 58acda503817..59bf91c57aa8 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -61,6 +61,13 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define CHECK_BIT_IN_MASK_WORD(maskname, word, bit) \ (((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word )) +/* + * {REQUIRED,DISABLED}_MASK_CHECK below may seem duplicated with the + * following BUILD_BUG_ON_ZERO() check but when NCAPINTS gets changed, all + * header macros which use NCAPINTS need to be changed. The duplicated macro + * use causes the compiler to issue errors for all headers so that all usage + * sites can be corrected. + */ #define REQUIRED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 0, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 1, feature_bit) || \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 998c2cc08363..3be4dcb1f80f 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -96,7 +96,6 @@ #define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" MFENCE synchronizes RDTSC */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ @@ -281,6 +280,8 @@ #define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* LLC occupancy monitoring */ #define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* LLC Total MBM monitoring */ #define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ +#define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ +#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ @@ -353,6 +354,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ @@ -394,5 +396,6 @@ #define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */ #define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ #define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ +#define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h index 20a46150e0a8..9b8cb50768c2 100644 --- a/arch/x86/include/asm/div64.h +++ b/arch/x86/include/asm/div64.h @@ -73,6 +73,19 @@ static inline u64 mul_u32_u32(u32 a, u32 b) #else # include <asm-generic/div64.h> + +static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div) +{ + u64 q; + + asm ("mulq %2; divq %3" : "=a" (q) + : "a" (a), "rm" ((u64)mul), "rm" ((u64)div) + : "rdx"); + + return q; +} +#define mul_u64_u32_div mul_u64_u32_div + #endif /* CONFIG_X86_32 */ #endif /* _ASM_X86_DIV64_H */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 606a4b6a9812..43a82e59c59d 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -242,6 +242,7 @@ static inline bool efi_is_64bit(void) __efi_early()->runtime_services), __VA_ARGS__) extern bool efi_reboot_required(void); +extern bool efi_is_table_address(unsigned long phys_addr); #else static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {} @@ -249,6 +250,10 @@ static inline bool efi_reboot_required(void) { return false; } +static inline bool efi_is_table_address(unsigned long phys_addr) +{ + return false; +} #endif /* CONFIG_EFI */ #endif /* _ASM_X86_EFI_H */ diff --git a/arch/x86/include/asm/error-injection.h b/arch/x86/include/asm/error-injection.h deleted file mode 100644 index 47b7a1296245..000000000000 --- a/arch/x86/include/asm/error-injection.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_ERROR_INJECTION_H -#define _ASM_ERROR_INJECTION_H - -#include <linux/compiler.h> -#include <linux/linkage.h> -#include <asm/ptrace.h> -#include <asm-generic/error-injection.h> - -asmlinkage void just_return_func(void); -void override_function_with_return(struct pt_regs *regs); - -#endif /* _ASM_ERROR_INJECTION_H */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 9da8cccdf3fb..0c47aa82e2e2 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -42,8 +42,7 @@ * Because of this, FIXADDR_TOP x86 integration was left as later work. */ #ifdef CONFIG_X86_32 -/* used by vmalloc.c, vsyscall.lds.S. - * +/* * Leave one empty page between vmalloc'ed areas and * the start of the fixmap. */ @@ -120,7 +119,7 @@ enum fixed_addresses { * before ioremap() is functional. * * If necessary we round it up to the next 512 pages boundary so - * that we can have a single pgd entry and a single pte table: + * that we can have a single pmd entry and a single pte table: */ #define NR_FIX_BTMAPS 64 #define FIX_BTMAPS_SLOTS 8 diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 287f1f7b2e52..c38a66661576 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -16,7 +16,6 @@ #define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR #ifndef __ASSEMBLY__ -extern void mcount(void); extern atomic_t modifying_ftrace_code; extern void __fentry__(void); diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 0278aa66ef62..f04622500da3 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -5,12 +5,34 @@ /* * "Big Core" Processors (Branded as Core, Xeon, etc...) * - * The "_X" parts are generally the EP and EX Xeons, or the - * "Extreme" ones, like Broadwell-E, or Atom microserver. - * * While adding a new CPUID for a new microarchitecture, add a new * group to keep logically sorted out in chronological order. Within * that group keep the CPUID for the variants sorted by model number. + * + * The defined symbol names have the following form: + * INTEL_FAM6{OPTFAMILY}_{MICROARCH}{OPTDIFF} + * where: + * OPTFAMILY Describes the family of CPUs that this belongs to. Default + * is assumed to be "_CORE" (and should be omitted). Other values + * currently in use are _ATOM and _XEON_PHI + * MICROARCH Is the code name for the micro-architecture for this core. + * N.B. Not the platform name. + * OPTDIFF If needed, a short string to differentiate by market segment. + * + * Common OPTDIFFs: + * + * - regular client parts + * _L - regular mobile parts + * _G - parts with extra graphics on + * _X - regular server parts + * _D - micro server parts + * + * Historical OPTDIFFs: + * + * _EP - 2 socket server parts + * _EX - 4+ socket server parts + * + * The #define line may optionally include a comment including platform names. */ #define INTEL_FAM6_CORE_YONAH 0x0E @@ -34,30 +56,33 @@ #define INTEL_FAM6_IVYBRIDGE 0x3A #define INTEL_FAM6_IVYBRIDGE_X 0x3E -#define INTEL_FAM6_HASWELL_CORE 0x3C +#define INTEL_FAM6_HASWELL 0x3C #define INTEL_FAM6_HASWELL_X 0x3F -#define INTEL_FAM6_HASWELL_ULT 0x45 -#define INTEL_FAM6_HASWELL_GT3E 0x46 +#define INTEL_FAM6_HASWELL_L 0x45 +#define INTEL_FAM6_HASWELL_G 0x46 -#define INTEL_FAM6_BROADWELL_CORE 0x3D -#define INTEL_FAM6_BROADWELL_GT3E 0x47 +#define INTEL_FAM6_BROADWELL 0x3D +#define INTEL_FAM6_BROADWELL_G 0x47 #define INTEL_FAM6_BROADWELL_X 0x4F -#define INTEL_FAM6_BROADWELL_XEON_D 0x56 +#define INTEL_FAM6_BROADWELL_D 0x56 -#define INTEL_FAM6_SKYLAKE_MOBILE 0x4E -#define INTEL_FAM6_SKYLAKE_DESKTOP 0x5E +#define INTEL_FAM6_SKYLAKE_L 0x4E +#define INTEL_FAM6_SKYLAKE 0x5E #define INTEL_FAM6_SKYLAKE_X 0x55 -#define INTEL_FAM6_KABYLAKE_MOBILE 0x8E -#define INTEL_FAM6_KABYLAKE_DESKTOP 0x9E +#define INTEL_FAM6_KABYLAKE_L 0x8E +#define INTEL_FAM6_KABYLAKE 0x9E -#define INTEL_FAM6_CANNONLAKE_MOBILE 0x66 +#define INTEL_FAM6_CANNONLAKE_L 0x66 #define INTEL_FAM6_ICELAKE_X 0x6A -#define INTEL_FAM6_ICELAKE_XEON_D 0x6C -#define INTEL_FAM6_ICELAKE_DESKTOP 0x7D -#define INTEL_FAM6_ICELAKE_MOBILE 0x7E +#define INTEL_FAM6_ICELAKE_D 0x6C +#define INTEL_FAM6_ICELAKE 0x7D +#define INTEL_FAM6_ICELAKE_L 0x7E #define INTEL_FAM6_ICELAKE_NNPI 0x9D +#define INTEL_FAM6_TIGERLAKE_L 0x8C +#define INTEL_FAM6_TIGERLAKE 0x8D + /* "Small Core" Processors (Atom) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ @@ -68,17 +93,21 @@ #define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */ #define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */ -#define INTEL_FAM6_ATOM_SILVERMONT_X 0x4D /* Avaton, Rangely */ +#define INTEL_FAM6_ATOM_SILVERMONT_D 0x4D /* Avaton, Rangely */ #define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */ #define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */ #define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */ +#define INTEL_FAM6_ATOM_AIRMONT_NP 0x75 /* Lightning Mountain */ #define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */ -#define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */ +#define INTEL_FAM6_ATOM_GOLDMONT_D 0x5F /* Denverton */ + +/* Note: the micro-architecture is "Goldmont Plus" */ #define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */ -#define INTEL_FAM6_ATOM_TREMONT_X 0x86 /* Jacobsville */ +#define INTEL_FAM6_ATOM_TREMONT_D 0x86 /* Jacobsville */ +#define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */ /* Xeon Phi */ diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 634f99b1dc22..423b788f495e 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -28,10 +28,12 @@ enum pt_capabilities { void cpu_emergency_stop_pt(void); extern u32 intel_pt_validate_hw_cap(enum pt_capabilities cap); extern u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities cap); +extern int is_intel_pt_event(struct perf_event *event); #else static inline void cpu_emergency_stop_pt(void) {} static inline u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) { return 0; } static inline u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) { return 0; } +static inline int is_intel_pt_event(struct perf_event *event) { return 0; } #endif #endif /* _ASM_X86_INTEL_PT_H */ diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index baedab8ac538..b91623d521d9 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -4,7 +4,6 @@ extern int force_iommu, no_iommu; extern int iommu_detected; -extern int iommu_pass_through; /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8282b8d41209..bdc16b0aa7c6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -35,6 +35,8 @@ #include <asm/kvm_vcpu_regs.h> #include <asm/hyperv-tlfs.h> +#define __KVM_HAVE_ARCH_VCPU_DEBUGFS + #define KVM_MAX_VCPUS 288 #define KVM_SOFT_MAX_VCPUS 240 #define KVM_MAX_VCPU_ID 1023 @@ -333,6 +335,7 @@ struct kvm_mmu_page { int root_count; /* Currently serving as active root */ unsigned int unsync_children; struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ + unsigned long mmu_valid_gen; DECLARE_BITMAP(unsync_child_bitmap, 512); #ifdef CONFIG_X86_32 @@ -607,15 +610,16 @@ struct kvm_vcpu_arch { /* * QEMU userspace and the guest each have their own FPU state. - * In vcpu_run, we switch between the user, maintained in the - * task_struct struct, and guest FPU contexts. While running a VCPU, - * the VCPU thread will have the guest FPU context. + * In vcpu_run, we switch between the user and guest FPU contexts. + * While running a VCPU, the VCPU thread will have the guest FPU + * context. * * Note that while the PKRU state lives inside the fpu registers, * it is switched out separately at VMENTER and VMEXIT time. The * "guest_fpu" state here contains the guest FPU context, with the * host PRKU bits. */ + struct fpu *user_fpu; struct fpu *guest_fpu; u64 xcr0; @@ -853,6 +857,7 @@ struct kvm_arch { unsigned long n_requested_mmu_pages; unsigned long n_max_mmu_pages; unsigned int indirect_shadow_pages; + unsigned long mmu_valid_gen; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* * Hash table of struct kvm_mmu_page. @@ -1174,6 +1179,7 @@ struct kvm_x86_ops { int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); + bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, bool *expired); diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 9024236693d2..16ae821483c8 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -28,16 +28,16 @@ static inline void paravirt_activate_mm(struct mm_struct *prev, DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key); -static inline void load_mm_cr4(struct mm_struct *mm) +static inline void load_mm_cr4_irqsoff(struct mm_struct *mm) { if (static_branch_unlikely(&rdpmc_always_available_key) || atomic_read(&mm->context.perf_rdpmc_allowed)) - cr4_set_bits(X86_CR4_PCE); + cr4_set_bits_irqsoff(X86_CR4_PCE); else - cr4_clear_bits(X86_CR4_PCE); + cr4_clear_bits_irqsoff(X86_CR4_PCE); } #else -static inline void load_mm_cr4(struct mm_struct *mm) {} +static inline void load_mm_cr4_irqsoff(struct mm_struct *mm) {} #endif #ifdef CONFIG_MODIFY_LDT_SYSCALL diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6b4fc2788078..20ce682a2540 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -375,13 +375,22 @@ /* Alternative perfctr range with full access. */ #define MSR_IA32_PMC0 0x000004c1 -/* AMD64 MSRs. Not complete. See the architecture manual for a more - complete list. */ +/* Auto-reload via MSR instead of DS area */ +#define MSR_RELOAD_PMC0 0x000014c1 +#define MSR_RELOAD_FIXED_CTR0 0x00001309 +/* + * AMD64 MSRs. Not complete. See the architecture manual for a more + * complete list. + */ #define MSR_AMD64_PATCH_LEVEL 0x0000008b #define MSR_AMD64_TSC_RATIO 0xc0000104 #define MSR_AMD64_NB_CFG 0xc001001f +#define MSR_AMD64_CPUID_FN_1 0xc0011004 #define MSR_AMD64_PATCH_LOADER 0xc0010020 +#define MSR_AMD_PERF_CTL 0xc0010062 +#define MSR_AMD_PERF_STATUS 0xc0010063 +#define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 #define MSR_AMD64_OSVW_STATUS 0xc0010141 #define MSR_AMD64_LS_CFG 0xc0011020 @@ -560,9 +569,6 @@ #define MSR_IA32_PERF_STATUS 0x00000198 #define MSR_IA32_PERF_CTL 0x00000199 #define INTEL_PERF_CTL_MASK 0xffff -#define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 -#define MSR_AMD_PERF_STATUS 0xc0010063 -#define MSR_AMD_PERF_CTL 0xc0010062 #define MSR_IA32_MPERF 0x000000e7 #define MSR_IA32_APERF 0x000000e8 diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 5cc3930cb465..86f20d520a07 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -233,8 +233,7 @@ static __always_inline unsigned long long rdtsc_ordered(void) * Thus, use the preferred barrier on the respective CPU, aiming for * RDTSCP as the default. */ - asm volatile(ALTERNATIVE_3("rdtsc", - "mfence; rdtsc", X86_FEATURE_MFENCE_RDTSC, + asm volatile(ALTERNATIVE_2("rdtsc", "lfence; rdtsc", X86_FEATURE_LFENCE_RDTSC, "rdtscp", X86_FEATURE_RDTSCP) : EAX_EDX_RET(val, low, high) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 109f974f9835..80bc209c0708 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -192,7 +192,7 @@ " lfence;\n" \ " jmp 902b;\n" \ " .align 16\n" \ - "903: addl $4, %%esp;\n" \ + "903: lea 4(%%esp), %%esp;\n" \ " pushl %[thunk_target];\n" \ " ret;\n" \ " .align 16\n" \ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 1392d5e6e8d6..ee26e9215f18 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -252,16 +252,20 @@ struct pebs_lbr { #define IBSCTL_LVT_OFFSET_VALID (1ULL<<8) #define IBSCTL_LVT_OFFSET_MASK 0x0F -/* ibs fetch bits/masks */ +/* IBS fetch bits/masks */ #define IBS_FETCH_RAND_EN (1ULL<<57) #define IBS_FETCH_VAL (1ULL<<49) #define IBS_FETCH_ENABLE (1ULL<<48) #define IBS_FETCH_CNT 0xFFFF0000ULL #define IBS_FETCH_MAX_CNT 0x0000FFFFULL -/* ibs op bits/masks */ -/* lower 4 bits of the current count are ignored: */ -#define IBS_OP_CUR_CNT (0xFFFF0ULL<<32) +/* + * IBS op bits/masks + * The lower 7 bits of the current count are random bits + * preloaded by hardware and ignored in software + */ +#define IBS_OP_CUR_CNT (0xFFF80ULL<<32) +#define IBS_OP_CUR_CNT_RAND (0x0007FULL<<32) #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 99a7fa9ab0a3..3d4cb83a8828 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -102,7 +102,7 @@ static __always_inline bool should_resched(int preempt_offset) return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION extern asmlinkage void ___preempt_schedule(void); # define __preempt_schedule() \ asm volatile ("call ___preempt_schedule" : ASM_CALL_CONSTRAINT) diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index bd5ac6cc37db..444d6fd9a6d8 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -63,10 +63,25 @@ static inline bool vcpu_is_preempted(long cpu) #endif #ifdef CONFIG_PARAVIRT +/* + * virt_spin_lock_key - enables (by default) the virt_spin_lock() hijack. + * + * Native (and PV wanting native due to vCPU pinning) should disable this key. + * It is done in this backwards fashion to only have a single direction change, + * which removes ordering between native_pv_spin_init() and HV setup. + */ DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key); void native_pv_lock_init(void) __init; +/* + * Shortcut for the queued_spin_lock_slowpath() function that allows + * virt to hijack it. + * + * Returns: + * true - lock has been negotiated, all done; + * false - queued_spin_lock_slowpath() will do its thing. + */ #define virt_spin_lock virt_spin_lock static inline bool virt_spin_lock(struct qspinlock *lock) { diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index c53682303c9c..09ecc32f6524 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -20,7 +20,6 @@ struct real_mode_header { u32 ro_end; /* SMP trampoline */ u32 trampoline_start; - u32 trampoline_status; u32 trampoline_header; #ifdef CONFIG_X86_64 u32 trampoline_pgd; diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index ae7b909dc242..2ee8e469dcf5 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -40,7 +40,6 @@ int _set_memory_wt(unsigned long addr, int numpages); int _set_memory_wb(unsigned long addr, int numpages); int set_memory_uc(unsigned long addr, int numpages); int set_memory_wc(unsigned long addr, int numpages); -int set_memory_wt(unsigned long addr, int numpages); int set_memory_wb(unsigned long addr, int numpages); int set_memory_np(unsigned long addr, int numpages); int set_memory_4k(unsigned long addr, int numpages); @@ -48,11 +47,6 @@ int set_memory_encrypted(unsigned long addr, int numpages); int set_memory_decrypted(unsigned long addr, int numpages); int set_memory_np_noalias(unsigned long addr, int numpages); -int set_memory_array_uc(unsigned long *addr, int addrinarray); -int set_memory_array_wc(unsigned long *addr, int addrinarray); -int set_memory_array_wt(unsigned long *addr, int addrinarray); -int set_memory_array_wb(unsigned long *addr, int addrinarray); - int set_pages_array_uc(struct page **pages, int addrinarray); int set_pages_array_wc(struct page **pages, int addrinarray); int set_pages_array_wt(struct page **pages, int addrinarray); @@ -80,8 +74,6 @@ int set_pages_array_wb(struct page **pages, int addrinarray); int set_pages_uc(struct page *page, int numpages); int set_pages_wb(struct page *page, int numpages); -int set_pages_x(struct page *page, int numpages); -int set_pages_nx(struct page *page, int numpages); int set_pages_ro(struct page *page, int numpages); int set_pages_rw(struct page *page, int numpages); diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 2dc4a021beea..8db3fdb6102e 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -36,6 +36,10 @@ extern const sys_call_ptr_t sys_call_table[]; extern const sys_call_ptr_t ia32_sys_call_table[]; #endif +#ifdef CONFIG_X86_X32_ABI +extern const sys_call_ptr_t x32_sys_call_table[]; +#endif + /* * Only the low 32 bits of orig_ax are meaningful, so we return int. * This importantly ignores the high bits on 64-bit, so comparisons diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 70c09967a999..5e8319bb207a 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -45,8 +45,8 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len); * no thread can be preempted in the instructions being modified (no iret to an * invalid instruction possible) or if the instructions are changed from a * consistent state to another consistent state atomically. - * On the local CPU you need to be protected again NMI or MCE handlers seeing an - * inconsistent instruction while you patch. + * On the local CPU you need to be protected against NMI or MCE handlers seeing + * an inconsistent instruction while you patch. */ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index dee375831962..6f66d841262d 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -290,26 +290,42 @@ static inline void __cr4_set(unsigned long cr4) } /* Set in this cpu's CR4. */ -static inline void cr4_set_bits(unsigned long mask) +static inline void cr4_set_bits_irqsoff(unsigned long mask) { - unsigned long cr4, flags; + unsigned long cr4; - local_irq_save(flags); cr4 = this_cpu_read(cpu_tlbstate.cr4); if ((cr4 | mask) != cr4) __cr4_set(cr4 | mask); - local_irq_restore(flags); } /* Clear in this cpu's CR4. */ -static inline void cr4_clear_bits(unsigned long mask) +static inline void cr4_clear_bits_irqsoff(unsigned long mask) { - unsigned long cr4, flags; + unsigned long cr4; - local_irq_save(flags); cr4 = this_cpu_read(cpu_tlbstate.cr4); if ((cr4 & ~mask) != cr4) __cr4_set(cr4 & ~mask); +} + +/* Set in this cpu's CR4. */ +static inline void cr4_set_bits(unsigned long mask) +{ + unsigned long flags; + + local_irq_save(flags); + cr4_set_bits_irqsoff(mask); + local_irq_restore(flags); +} + +/* Clear in this cpu's CR4. */ +static inline void cr4_clear_bits(unsigned long mask) +{ + unsigned long flags; + + local_irq_save(flags); + cr4_clear_bits_irqsoff(mask); local_irq_restore(flags); } diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 9c4435307ff8..35c225ede0e4 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -444,8 +444,10 @@ __pu_label: \ ({ \ int __gu_err; \ __inttype(*(ptr)) __gu_val; \ + __typeof__(ptr) __gu_ptr = (ptr); \ + __typeof__(size) __gu_size = (size); \ __uaccess_begin_nospec(); \ - __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ + __get_user_size(__gu_val, __gu_ptr, __gu_size, __gu_err, -EFAULT); \ __uaccess_end(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ __builtin_expect(__gu_err, 0); \ diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 097589753fec..a7dd080749ce 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -5,12 +5,6 @@ #include <uapi/asm/unistd.h> -# ifdef CONFIG_X86_X32_ABI -# define __SYSCALL_MASK (~(__X32_SYSCALL_BIT)) -# else -# define __SYSCALL_MASK (~0) -# endif - # ifdef CONFIG_X86_32 # include <asm/unistd_32.h> diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index e60c45fd3679..6bc6d89d8e2a 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -12,10 +12,12 @@ struct mm_struct; #ifdef CONFIG_X86_UV #include <linux/efi.h> +extern unsigned long uv_systab_phys; + extern enum uv_system_type get_uv_system_type(void); static inline bool is_early_uv_system(void) { - return !((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || !efi.uv_systab); + return uv_systab_phys && uv_systab_phys != EFI_INVALID_TABLE_ADDR; } extern int is_uv_system(void); extern int is_uv_hubless(void); diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index ae91429129a6..ba71a63cdac4 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -96,6 +96,8 @@ long clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) #else +#define VDSO_HAS_32BIT_FALLBACK 1 + static __always_inline long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) { @@ -114,6 +116,23 @@ long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) } static __always_inline +long clock_gettime32_fallback(clockid_t _clkid, struct old_timespec32 *_ts) +{ + long ret; + + asm ( + "mov %%ebx, %%edx \n" + "mov %[clock], %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret), "=m" (*_ts) + : "0" (__NR_clock_gettime), [clock] "g" (_clkid), "c" (_ts) + : "edx"); + + return ret; +} + +static __always_inline long gettimeofday_fallback(struct __kernel_old_timeval *_tv, struct timezone *_tz) { @@ -148,6 +167,23 @@ clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) return ret; } +static __always_inline +long clock_getres32_fallback(clockid_t _clkid, struct old_timespec32 *_ts) +{ + long ret; + + asm ( + "mov %%ebx, %%edx \n" + "mov %[clock], %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret), "=m" (*_ts) + : "0" (__NR_clock_getres), [clock] "g" (_clkid), "c" (_ts) + : "edx"); + + return ret; +} + #endif #ifdef CONFIG_PARAVIRT_CLOCK diff --git a/arch/x86/include/uapi/asm/byteorder.h b/arch/x86/include/uapi/asm/byteorder.h index 484e3cfd7ef2..149143cab9ff 100644 --- a/arch/x86/include/uapi/asm/byteorder.h +++ b/arch/x86/include/uapi/asm/byteorder.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_BYTEORDER_H #define _ASM_X86_BYTEORDER_H diff --git a/arch/x86/include/uapi/asm/errno.h b/arch/x86/include/uapi/asm/errno.h deleted file mode 100644 index 4c82b503d92f..000000000000 --- a/arch/x86/include/uapi/asm/errno.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/errno.h> diff --git a/arch/x86/include/uapi/asm/fcntl.h b/arch/x86/include/uapi/asm/fcntl.h deleted file mode 100644 index 46ab12db5739..000000000000 --- a/arch/x86/include/uapi/asm/fcntl.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/fcntl.h> diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h index 6ebaae90e207..8b2effe6efb8 100644 --- a/arch/x86/include/uapi/asm/hwcap2.h +++ b/arch/x86/include/uapi/asm/hwcap2.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_HWCAP2_H #define _ASM_X86_HWCAP2_H diff --git a/arch/x86/include/uapi/asm/ioctl.h b/arch/x86/include/uapi/asm/ioctl.h deleted file mode 100644 index b279fe06dfe5..000000000000 --- a/arch/x86/include/uapi/asm/ioctl.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/ioctl.h> diff --git a/arch/x86/include/uapi/asm/ioctls.h b/arch/x86/include/uapi/asm/ioctls.h deleted file mode 100644 index ec34c760665e..000000000000 --- a/arch/x86/include/uapi/asm/ioctls.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/ioctls.h> diff --git a/arch/x86/include/uapi/asm/ipcbuf.h b/arch/x86/include/uapi/asm/ipcbuf.h deleted file mode 100644 index 84c7e51cb6d0..000000000000 --- a/arch/x86/include/uapi/asm/ipcbuf.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/ipcbuf.h> diff --git a/arch/x86/include/uapi/asm/param.h b/arch/x86/include/uapi/asm/param.h deleted file mode 100644 index 965d45427975..000000000000 --- a/arch/x86/include/uapi/asm/param.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/param.h> diff --git a/arch/x86/include/uapi/asm/resource.h b/arch/x86/include/uapi/asm/resource.h deleted file mode 100644 index 04bc4db8921b..000000000000 --- a/arch/x86/include/uapi/asm/resource.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/resource.h> diff --git a/arch/x86/include/uapi/asm/sigcontext32.h b/arch/x86/include/uapi/asm/sigcontext32.h index 6b18e88de8a6..7114801d0499 100644 --- a/arch/x86/include/uapi/asm/sigcontext32.h +++ b/arch/x86/include/uapi/asm/sigcontext32.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_SIGCONTEXT32_H #define _ASM_X86_SIGCONTEXT32_H diff --git a/arch/x86/include/uapi/asm/termbits.h b/arch/x86/include/uapi/asm/termbits.h deleted file mode 100644 index 3935b106de79..000000000000 --- a/arch/x86/include/uapi/asm/termbits.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/termbits.h> diff --git a/arch/x86/include/uapi/asm/termios.h b/arch/x86/include/uapi/asm/termios.h deleted file mode 100644 index 280d78a9d966..000000000000 --- a/arch/x86/include/uapi/asm/termios.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/termios.h> diff --git a/arch/x86/include/uapi/asm/types.h b/arch/x86/include/uapi/asm/types.h deleted file mode 100644 index df55e1ddb0c9..000000000000 --- a/arch/x86/include/uapi/asm/types.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_TYPES_H -#define _ASM_X86_TYPES_H - -#include <asm-generic/types.h> - -#endif /* _ASM_X86_TYPES_H */ diff --git a/arch/x86/include/uapi/asm/unistd.h b/arch/x86/include/uapi/asm/unistd.h index 30d7d04d72d6..196fdd02b8b1 100644 --- a/arch/x86/include/uapi/asm/unistd.h +++ b/arch/x86/include/uapi/asm/unistd.h @@ -3,7 +3,7 @@ #define _UAPI_ASM_X86_UNISTD_H /* x32 syscall flag bit */ -#define __X32_SYSCALL_BIT 0x40000000 +#define __X32_SYSCALL_BIT 0x40000000UL #ifndef __KERNEL__ # ifdef __i386__ diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index b0715c3ac18d..7f9ade13bbcf 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -18,8 +18,13 @@ ENTRY(wakeup_long64) movq saved_magic, %rax movq $0x123456789abcdef0, %rdx cmpq %rdx, %rax - jne bogus_64_magic + je 2f + /* stop here on a saved_magic mismatch */ + movq $0xbad6d61676963, %rcx +1: + jmp 1b +2: movw $__KERNEL_DS, %ax movw %ax, %ss movw %ax, %ds @@ -37,9 +42,6 @@ ENTRY(wakeup_long64) jmp *%rax ENDPROC(wakeup_long64) -bogus_64_magic: - jmp bogus_64_magic - ENTRY(do_suspend_lowlevel) FRAME_BEGIN subq $8, %rsp diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ccd32013c47a..9d3a971ea364 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -713,7 +713,7 @@ void __init alternative_instructions(void) * Don't stop machine check exceptions while patching. * MCEs only happen when something got corrupted and in this * case we must do something about the corruption. - * Ignoring it is worse than a unlikely patching race. + * Ignoring it is worse than an unlikely patching race. * Also machine checks tend to be broadcast and if one CPU * goes into machine check the others follow quickly, so we don't * expect a machine check to cause undue problems during to code @@ -753,8 +753,8 @@ void __init alternative_instructions(void) * When you use this code to patch more than one byte of an instruction * you need to make sure that other CPUs cannot execute this code in parallel. * Also no thread must be currently preempted in the middle of these - * instructions. And on the local CPU you need to be protected again NMI or MCE - * handlers seeing an inconsistent instruction while you patch. + * instructions. And on the local CPU you need to be protected against NMI or + * MCE handlers seeing an inconsistent instruction while you patch. */ void __init_or_module text_poke_early(void *addr, const void *opcode, size_t len) diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index d63e63b7d1d9..251c795b4eb3 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -21,6 +21,7 @@ #define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 +#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 /* Protect the PCI config register pairs used for SMN and DF indirect access. */ static DEFINE_MUTEX(smn_mutex); @@ -50,6 +51,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, {} }; EXPORT_SYMBOL_GPL(amd_nb_misc_ids); @@ -63,6 +65,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, {} }; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f5291362da1a..909abb2d59eb 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -590,21 +590,21 @@ static u32 skx_deadline_rev(void) static const struct x86_cpu_id deadline_match[] = { DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev), DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020), - DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev), + DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_D, bdx_deadline_rev), DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_GT3E, 0x17), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL, 0x22), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_L, 0x20), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_G, 0x17), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_CORE, 0x25), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_GT3E, 0x17), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL, 0x25), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_G, 0x17), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_MOBILE, 0xb2), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_DESKTOP, 0xb2), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_L, 0xb2), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE, 0xb2), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_MOBILE, 0x52), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_DESKTOP, 0x52), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_L, 0x52), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE, 0x52), {}, }; @@ -722,7 +722,7 @@ static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; /* - * Temporary interrupt handler. + * Temporary interrupt handler and polled calibration function. */ static void __init lapic_cal_handler(struct clock_event_device *dev) { @@ -834,6 +834,10 @@ bool __init apic_needs_pit(void) if (!boot_cpu_has(X86_FEATURE_APIC)) return true; + /* Virt guests may lack ARAT, but still have DEADLINE */ + if (!boot_cpu_has(X86_FEATURE_ARAT)) + return true; + /* Deadline timer is based on TSC so no further PIT action required */ if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) return false; @@ -851,7 +855,8 @@ bool __init apic_needs_pit(void) static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = this_cpu_ptr(&lapic_events); - void (*real_handler)(struct clock_event_device *dev); + u64 tsc_perj = 0, tsc_start = 0; + unsigned long jif_start; unsigned long deltaj; long delta, deltatsc; int pm_referenced = 0; @@ -878,28 +883,64 @@ static int __init calibrate_APIC_clock(void) apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" "calibrating APIC timer ...\n"); + /* + * There are platforms w/o global clockevent devices. Instead of + * making the calibration conditional on that, use a polling based + * approach everywhere. + */ local_irq_disable(); - /* Replace the global interrupt handler */ - real_handler = global_clock_event->event_handler; - global_clock_event->event_handler = lapic_cal_handler; - /* * Setup the APIC counter to maximum. There is no way the lapic * can underflow in the 100ms detection time frame */ __setup_APIC_LVTT(0xffffffff, 0, 0); - /* Let the interrupts run */ + /* + * Methods to terminate the calibration loop: + * 1) Global clockevent if available (jiffies) + * 2) TSC if available and frequency is known + */ + jif_start = READ_ONCE(jiffies); + + if (tsc_khz) { + tsc_start = rdtsc(); + tsc_perj = div_u64((u64)tsc_khz * 1000, HZ); + } + + /* + * Enable interrupts so the tick can fire, if a global + * clockevent device is available + */ local_irq_enable(); - while (lapic_cal_loops <= LAPIC_CAL_LOOPS) - cpu_relax(); + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) { + /* Wait for a tick to elapse */ + while (1) { + if (tsc_khz) { + u64 tsc_now = rdtsc(); + if ((tsc_now - tsc_start) >= tsc_perj) { + tsc_start += tsc_perj; + break; + } + } else { + unsigned long jif_now = READ_ONCE(jiffies); - local_irq_disable(); + if (time_after(jif_now, jif_start)) { + jif_start = jif_now; + break; + } + } + cpu_relax(); + } - /* Restore the real event handler */ - global_clock_event->event_handler = real_handler; + /* Invoke the calibration routine */ + local_irq_disable(); + lapic_cal_handler(NULL); + local_irq_enable(); + } + + local_irq_disable(); /* Build delta t1-t2 as apic timer counts down */ delta = lapic_cal_t1 - lapic_cal_t2; @@ -943,10 +984,11 @@ static int __init calibrate_APIC_clock(void) levt->features &= ~CLOCK_EVT_FEAT_DUMMY; /* - * PM timer calibration failed or not turned on - * so lets try APIC timer based calibration + * PM timer calibration failed or not turned on so lets try APIC + * timer based calibration, if a global clockevent device is + * available. */ - if (!pm_referenced) { + if (!pm_referenced && global_clock_event) { apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); /* diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index afee386ff711..caedd8d60d36 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -38,32 +38,12 @@ static int bigsmp_early_logical_apicid(int cpu) return early_per_cpu(x86_cpu_to_apicid, cpu); } -static inline unsigned long calculate_ldr(int cpu) -{ - unsigned long val, id; - - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - id = per_cpu(x86_bios_cpu_apicid, cpu); - val |= SET_APIC_LOGICAL_ID(id); - - return val; -} - /* - * Set up the logical destination ID. - * - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... + * bigsmp enables physical destination mode + * and doesn't use LDR and DFR */ static void bigsmp_init_apic_ldr(void) { - unsigned long val; - int cpu = smp_processor_id(); - - apic_write(APIC_DFR, APIC_DFR_FLAT); - val = calculate_ldr(cpu); - apic_write(APIC_LDR, val); } static void bigsmp_setup_apic_routing(void) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c7bb6c69f21c..d6af97fd170a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2438,7 +2438,13 @@ unsigned int arch_dynirq_lower_bound(unsigned int from) * dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use * gsi_top if ioapic_dynirq_base hasn't been initialized yet. */ - return ioapic_initialized ? ioapic_dynirq_base : gsi_top; + if (!ioapic_initialized) + return gsi_top; + /* + * For DT enabled machines ioapic_dynirq_base is irrelevant and not + * updated. So simply return @from if ioapic_dynirq_base == 0. + */ + return ioapic_dynirq_base ? : from; } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 1492799b8f43..ee2d91e382f1 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -184,7 +184,8 @@ void __init default_setup_apic_routing(void) def_to_bigsmp = 0; break; } - /* If P4 and above fall through */ + /* P4 and above */ + /* fall through */ case X86_VENDOR_HYGON: case X86_VENDOR_AMD: def_to_bigsmp = 1; diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index d3d075226c0a..70e97727a26a 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -6,13 +6,28 @@ #include <asm/ia32.h> #define __SYSCALL_64(nr, sym, qual) [nr] = 1, +#define __SYSCALL_X32(nr, sym, qual) static char syscalls_64[] = { #include <asm/syscalls_64.h> }; +#undef __SYSCALL_64 +#undef __SYSCALL_X32 + +#ifdef CONFIG_X86_X32_ABI +#define __SYSCALL_64(nr, sym, qual) +#define __SYSCALL_X32(nr, sym, qual) [nr] = 1, +static char syscalls_x32[] = { +#include <asm/syscalls_64.h> +}; +#undef __SYSCALL_64 +#undef __SYSCALL_X32 +#endif + #define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls_ia32[] = { #include <asm/syscalls_32.h> }; +#undef __SYSCALL_I386 #if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) #include <asm/kvm_para.h> @@ -80,6 +95,11 @@ int main(void) DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); DEFINE(NR_syscalls, sizeof(syscalls_64)); +#ifdef CONFIG_X86_X32_ABI + DEFINE(__NR_syscall_x32_max, sizeof(syscalls_x32) - 1); + DEFINE(X32_NR_syscalls, sizeof(syscalls_x32)); +#endif + DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1); DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 8d4e50428b68..90f75e515876 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -8,6 +8,7 @@ #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/random.h> +#include <linux/topology.h> #include <asm/processor.h> #include <asm/apic.h> #include <asm/cacheinfo.h> @@ -804,6 +805,64 @@ static void init_amd_ln(struct cpuinfo_x86 *c) msr_set_bit(MSR_AMD64_DE_CFG, 31); } +static bool rdrand_force; + +static int __init rdrand_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "force")) + rdrand_force = true; + else + return -EINVAL; + + return 0; +} +early_param("rdrand", rdrand_cmdline); + +static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c) +{ + /* + * Saving of the MSR used to hide the RDRAND support during + * suspend/resume is done by arch/x86/power/cpu.c, which is + * dependent on CONFIG_PM_SLEEP. + */ + if (!IS_ENABLED(CONFIG_PM_SLEEP)) + return; + + /* + * The nordrand option can clear X86_FEATURE_RDRAND, so check for + * RDRAND support using the CPUID function directly. + */ + if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force) + return; + + msr_clear_bit(MSR_AMD64_CPUID_FN_1, 62); + + /* + * Verify that the CPUID change has occurred in case the kernel is + * running virtualized and the hypervisor doesn't support the MSR. + */ + if (cpuid_ecx(1) & BIT(30)) { + pr_info_once("BIOS may not properly restore RDRAND after suspend, but hypervisor does not support hiding RDRAND via CPUID.\n"); + return; + } + + clear_cpu_cap(c, X86_FEATURE_RDRAND); + pr_info_once("BIOS may not properly restore RDRAND after suspend, hiding RDRAND via CPUID. Use rdrand=force to reenable.\n"); +} + +static void init_amd_jg(struct cpuinfo_x86 *c) +{ + /* + * Some BIOS implementations do not restore proper RDRAND support + * across suspend and resume. Check on whether to hide the RDRAND + * instruction support via CPUID. + */ + clear_rdrand_cpuid_bit(c); +} + static void init_amd_bd(struct cpuinfo_x86 *c) { u64 value; @@ -818,12 +877,23 @@ static void init_amd_bd(struct cpuinfo_x86 *c) wrmsrl_safe(MSR_F15H_IC_CFG, value); } } + + /* + * Some BIOS implementations do not restore proper RDRAND support + * across suspend and resume. Check on whether to hide the RDRAND + * instruction support via CPUID. + */ + clear_rdrand_cpuid_bit(c); } static void init_amd_zn(struct cpuinfo_x86 *c) { set_cpu_cap(c, X86_FEATURE_ZEN); +#ifdef CONFIG_NUMA + node_reclaim_distance = 32; +#endif + /* * Fix erratum 1076: CPB feature bit not being set in CPUID. * Always set it, except when running under a hypervisor. @@ -860,6 +930,7 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x10: init_amd_gh(c); break; case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; + case 0x16: init_amd_jg(c); break; case 0x17: init_amd_zn(c); break; } @@ -879,12 +950,8 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_cacheinfo(c); if (cpu_has(c, X86_FEATURE_XMM2)) { - unsigned long long val; - int ret; - /* - * A serializing LFENCE has less overhead than MFENCE, so - * use it for execution serialization. On families which + * Use LFENCE for execution serialization. On families which * don't have that MSR, LFENCE is already serializing. * msr_set_bit() uses the safe accessors, too, even if the MSR * is not present. @@ -892,19 +959,8 @@ static void init_amd(struct cpuinfo_x86 *c) msr_set_bit(MSR_F10H_DECFG, MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); - /* - * Verify that the MSR write was successful (could be running - * under a hypervisor) and only then assume that LFENCE is - * serializing. - */ - ret = rdmsrl_safe(MSR_F10H_DECFG, &val); - if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { - /* A serializing LFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - } else { - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); - } + /* A serializing LFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); } /* diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 66ca906aa790..0b569f10d4a0 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -34,6 +34,7 @@ #include "cpu.h" +static void __init spectre_v1_select_mitigation(void); static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); @@ -98,17 +99,11 @@ void __init check_bugs(void) if (boot_cpu_has(X86_FEATURE_STIBP)) x86_spec_ctrl_mask |= SPEC_CTRL_STIBP; - /* Select the proper spectre mitigation before patching alternatives */ + /* Select the proper CPU mitigations before patching alternatives: */ + spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); - - /* - * Select proper mitigation for any exposure to the Speculative Store - * Bypass vulnerability. - */ ssb_select_mitigation(); - l1tf_select_mitigation(); - mds_select_mitigation(); arch_smt_update(); @@ -274,6 +269,98 @@ static int __init mds_cmdline(char *str) early_param("mds", mds_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "Spectre V1 : " fmt + +enum spectre_v1_mitigation { + SPECTRE_V1_MITIGATION_NONE, + SPECTRE_V1_MITIGATION_AUTO, +}; + +static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init = + SPECTRE_V1_MITIGATION_AUTO; + +static const char * const spectre_v1_strings[] = { + [SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers", + [SPECTRE_V1_MITIGATION_AUTO] = "Mitigation: usercopy/swapgs barriers and __user pointer sanitization", +}; + +/* + * Does SMAP provide full mitigation against speculative kernel access to + * userspace? + */ +static bool smap_works_speculatively(void) +{ + if (!boot_cpu_has(X86_FEATURE_SMAP)) + return false; + + /* + * On CPUs which are vulnerable to Meltdown, SMAP does not + * prevent speculative access to user data in the L1 cache. + * Consider SMAP to be non-functional as a mitigation on these + * CPUs. + */ + if (boot_cpu_has(X86_BUG_CPU_MELTDOWN)) + return false; + + return true; +} + +static void __init spectre_v1_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) { + spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; + return; + } + + if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) { + /* + * With Spectre v1, a user can speculatively control either + * path of a conditional swapgs with a user-controlled GS + * value. The mitigation is to add lfences to both code paths. + * + * If FSGSBASE is enabled, the user can put a kernel address in + * GS, in which case SMAP provides no protection. + * + * [ NOTE: Don't check for X86_FEATURE_FSGSBASE until the + * FSGSBASE enablement patches have been merged. ] + * + * If FSGSBASE is disabled, the user can only put a user space + * address in GS. That makes an attack harder, but still + * possible if there's no SMAP protection. + */ + if (!smap_works_speculatively()) { + /* + * Mitigation can be provided from SWAPGS itself or + * PTI as the CR3 write in the Meltdown mitigation + * is serializing. + * + * If neither is there, mitigate with an LFENCE to + * stop speculation through swapgs. + */ + if (boot_cpu_has_bug(X86_BUG_SWAPGS) && + !boot_cpu_has(X86_FEATURE_PTI)) + setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_USER); + + /* + * Enable lfences in the kernel entry (non-swapgs) + * paths, to prevent user entry from speculatively + * skipping swapgs. + */ + setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_KERNEL); + } + } + + pr_info("%s\n", spectre_v1_strings[spectre_v1_mitigation]); +} + +static int __init nospectre_v1_cmdline(char *str) +{ + spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; + return 0; +} +early_param("nospectre_v1", nospectre_v1_cmdline); + +#undef pr_fmt #define pr_fmt(fmt) "Spectre V2 : " fmt static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = @@ -1097,15 +1184,15 @@ static void override_cache_bits(struct cpuinfo_x86 *c) case INTEL_FAM6_WESTMERE: case INTEL_FAM6_SANDYBRIDGE: case INTEL_FAM6_IVYBRIDGE: - case INTEL_FAM6_HASWELL_CORE: - case INTEL_FAM6_HASWELL_ULT: - case INTEL_FAM6_HASWELL_GT3E: - case INTEL_FAM6_BROADWELL_CORE: - case INTEL_FAM6_BROADWELL_GT3E: - case INTEL_FAM6_SKYLAKE_MOBILE: - case INTEL_FAM6_SKYLAKE_DESKTOP: - case INTEL_FAM6_KABYLAKE_MOBILE: - case INTEL_FAM6_KABYLAKE_DESKTOP: + case INTEL_FAM6_HASWELL: + case INTEL_FAM6_HASWELL_L: + case INTEL_FAM6_HASWELL_G: + case INTEL_FAM6_BROADWELL: + case INTEL_FAM6_BROADWELL_G: + case INTEL_FAM6_SKYLAKE_L: + case INTEL_FAM6_SKYLAKE: + case INTEL_FAM6_KABYLAKE_L: + case INTEL_FAM6_KABYLAKE: if (c->x86_cache_bits < 44) c->x86_cache_bits = 44; break; @@ -1226,7 +1313,7 @@ static ssize_t l1tf_show_state(char *buf) static ssize_t mds_show_state(char *buf) { - if (!hypervisor_is_type(X86_HYPER_NATIVE)) { + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { return sprintf(buf, "%s; SMT Host state unknown\n", mds_strings[mds_mitigation]); } @@ -1290,7 +1377,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr break; case X86_BUG_SPECTRE_V1: - return sprintf(buf, "Mitigation: __user pointer sanitization\n"); + return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); case X86_BUG_SPECTRE_V2: return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 11472178e17f..030e52749a74 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1022,6 +1022,7 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_L1TF BIT(3) #define NO_MDS BIT(4) #define MSBDS_ONLY BIT(5) +#define NO_SWAPGS BIT(6) #define VULNWL(_vendor, _family, _model, _whitelist) \ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } @@ -1048,30 +1049,39 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), - VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY), - VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY), - VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY), - VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY), - VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY), - VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY), + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), + VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), VULNWL_INTEL(CORE_YONAH, NO_SSB), - VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY), + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS), + VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS), - VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF), - VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF), + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS), + VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS), + + /* + * Technically, swapgs isn't serializing on AMD (despite it previously + * being documented as such in the APM). But according to AMD, %gs is + * updated non-speculatively, and the issuing of %gs-relative memory + * operands will be blocked until the %gs update completes, which is + * good enough for our purposes. + */ /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), {} }; @@ -1108,6 +1118,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); } + if (!cpu_matches(NO_SWAPGS)) + setup_force_cpu_bug(X86_BUG_SWAPGS); + if (cpu_matches(NO_MELTDOWN)) return; diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index b5353244749b..3cbe24ca80ab 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -20,54 +20,55 @@ struct cpuid_dep { * but it's difficult to tell that to the init reference checker. */ static const struct cpuid_dep cpuid_deps[] = { - { X86_FEATURE_FXSR, X86_FEATURE_FPU }, - { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, - { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, - { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, - { X86_FEATURE_AVX, X86_FEATURE_XSAVE }, - { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, - { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, - { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, - { X86_FEATURE_CMOV, X86_FEATURE_FXSR }, - { X86_FEATURE_MMX, X86_FEATURE_FXSR }, - { X86_FEATURE_MMXEXT, X86_FEATURE_MMX }, - { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, - { X86_FEATURE_XSAVE, X86_FEATURE_FXSR }, - { X86_FEATURE_XMM, X86_FEATURE_FXSR }, - { X86_FEATURE_XMM2, X86_FEATURE_XMM }, - { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, - { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 }, - { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 }, - { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, - { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 }, - { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, }, - { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, - { X86_FEATURE_AES, X86_FEATURE_XMM2 }, - { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, - { X86_FEATURE_FMA, X86_FEATURE_AVX }, - { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, - { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, - { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, - { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, - { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, - { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, - { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, - { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, - { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, - { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC }, - { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, - { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, - { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, + { X86_FEATURE_FXSR, X86_FEATURE_FPU }, + { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, + { X86_FEATURE_AVX, X86_FEATURE_XSAVE }, + { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, + { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, + { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_CMOV, X86_FEATURE_FXSR }, + { X86_FEATURE_MMX, X86_FEATURE_FXSR }, + { X86_FEATURE_MMXEXT, X86_FEATURE_MMX }, + { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, + { X86_FEATURE_XSAVE, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM2, X86_FEATURE_XMM }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 }, + { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, }, + { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, + { X86_FEATURE_AES, X86_FEATURE_XMM2 }, + { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, + { X86_FEATURE_FMA, X86_FEATURE_AVX }, + { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, + { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VP2INTERSECT, X86_FEATURE_AVX512VL }, + { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, {} }; diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 415621ddb8a2..4e28c1fc8749 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -330,12 +330,8 @@ static void init_hygon(struct cpuinfo_x86 *c) init_hygon_cacheinfo(c); if (cpu_has(c, X86_FEATURE_XMM2)) { - unsigned long long val; - int ret; - /* - * A serializing LFENCE has less overhead than MFENCE, so - * use it for execution serialization. On families which + * Use LFENCE for execution serialization. On families which * don't have that MSR, LFENCE is already serializing. * msr_set_bit() uses the safe accessors, too, even if the MSR * is not present. @@ -343,19 +339,8 @@ static void init_hygon(struct cpuinfo_x86 *c) msr_set_bit(MSR_F10H_DECFG, MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); - /* - * Verify that the MSR write was successful (could be running - * under a hypervisor) and only then assume that LFENCE is - * serializing. - */ - ret = rdmsrl_safe(MSR_F10H_DECFG, &val); - if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { - /* A serializing LFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - } else { - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); - } + /* A serializing LFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); } /* diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 8d6d92ebeb54..c2fdc00df163 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -142,21 +142,21 @@ struct sku_microcode { u32 microcode; }; static const struct sku_microcode spectre_bad_microcodes[] = { - { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x80 }, - { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x80 }, - { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x80 }, - { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x80 }, - { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x80 }, + { INTEL_FAM6_KABYLAKE, 0x0B, 0x80 }, + { INTEL_FAM6_KABYLAKE, 0x0A, 0x80 }, + { INTEL_FAM6_KABYLAKE, 0x09, 0x80 }, + { INTEL_FAM6_KABYLAKE_L, 0x0A, 0x80 }, + { INTEL_FAM6_KABYLAKE_L, 0x09, 0x80 }, { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e }, { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c }, - { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 }, - { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b }, - { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 }, - { INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 }, + { INTEL_FAM6_BROADWELL, 0x04, 0x28 }, + { INTEL_FAM6_BROADWELL_G, 0x01, 0x1b }, + { INTEL_FAM6_BROADWELL_D, 0x02, 0x14 }, + { INTEL_FAM6_BROADWELL_D, 0x03, 0x07000011 }, { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 }, - { INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 }, - { INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 }, - { INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 }, + { INTEL_FAM6_HASWELL_L, 0x01, 0x21 }, + { INTEL_FAM6_HASWELL_G, 0x01, 0x18 }, + { INTEL_FAM6_HASWELL, 0x03, 0x23 }, { INTEL_FAM6_HASWELL_X, 0x02, 0x3b }, { INTEL_FAM6_HASWELL_X, 0x04, 0x10 }, { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a }, @@ -265,9 +265,10 @@ static void early_init_intel(struct cpuinfo_x86 *c) /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ if (c->x86 == 6) { switch (c->x86_model) { - case 0x27: /* Penwell */ - case 0x35: /* Cloverview */ - case 0x4a: /* Merrifield */ + case INTEL_FAM6_ATOM_SALTWELL_MID: + case INTEL_FAM6_ATOM_SALTWELL_TABLET: + case INTEL_FAM6_ATOM_SILVERMONT_MID: + case INTEL_FAM6_ATOM_AIRMONT_NP: set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); break; default: diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index e43eb6732630..88cd9598fa57 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -479,7 +479,7 @@ static void intel_ppin_init(struct cpuinfo_x86 *c) switch (c->x86_model) { case INTEL_FAM6_IVYBRIDGE_X: case INTEL_FAM6_HASWELL_X: - case INTEL_FAM6_BROADWELL_XEON_D: + case INTEL_FAM6_BROADWELL_D: case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_XEON_PHI_KNL: diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 210f1f5db5f7..87bcdc6dc2f0 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -107,11 +107,11 @@ static struct severity { */ MCESEV( AO, "Action optional: memory scrubbing error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) + SER, MASK(MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) ), MCESEV( AO, "Action optional: last level cache writeback error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) + SER, MASK(MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) ), /* ignore OVER for UCNA */ diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 4296c702a3f7..72182809b333 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -98,6 +98,7 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) case 7: if (size < 0x40) break; + /* Else, fall through */ case 6: case 5: case 4: diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c index 6a204e7336c1..32b4dc9030aa 100644 --- a/arch/x86/kernel/cpu/umwait.c +++ b/arch/x86/kernel/cpu/umwait.c @@ -18,6 +18,12 @@ static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE); /* + * Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by + * hardware or BIOS before kernel boot. + */ +static u32 orig_umwait_control_cached __ro_after_init; + +/* * Serialize access to umwait_control_cached and IA32_UMWAIT_CONTROL MSR in * the sysfs write functions. */ @@ -53,6 +59,23 @@ static int umwait_cpu_online(unsigned int cpu) } /* + * The CPU hotplug callback sets the control MSR to the original control + * value. + */ +static int umwait_cpu_offline(unsigned int cpu) +{ + /* + * This code is protected by the CPU hotplug already and + * orig_umwait_control_cached is never changed after it caches + * the original control MSR value in umwait_init(). So there + * is no race condition here. + */ + wrmsr(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached, 0); + + return 0; +} + +/* * On resume, restore IA32_UMWAIT_CONTROL MSR on the boot processor which * is the only active CPU at this time. The MSR is set up on the APs via the * CPU hotplug callback. @@ -185,8 +208,22 @@ static int __init umwait_init(void) if (!boot_cpu_has(X86_FEATURE_WAITPKG)) return -ENODEV; + /* + * Cache the original control MSR value before the control MSR is + * changed. This is the only place where orig_umwait_control_cached + * is modified. + */ + rdmsrl(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached); + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online", - umwait_cpu_online, NULL); + umwait_cpu_online, umwait_cpu_offline); + if (ret < 0) { + /* + * On failure, the control MSR on all CPUs has the + * original control value. + */ + return ret; + } register_syscore_ops(&umwait_syscore_ops); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 2bf70a2fed90..eb651fbde92a 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -225,8 +225,6 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem) if (crashk_low_res.end) { ret = crash_exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end); - if (ret) - return ret; } return ret; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 2b5886401e5f..e07424e19274 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -367,13 +367,18 @@ NOKPROBE_SYMBOL(oops_end); int __die(const char *str, struct pt_regs *regs, long err) { + const char *pr = ""; + /* Save the regs of the first oops for the executive summary later. */ if (!die_counter) exec_summary_regs = *regs; + if (IS_ENABLED(CONFIG_PREEMPTION)) + pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; + printk(KERN_DEFAULT "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, - IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", + pr, IS_ENABLED(CONFIG_SMP) ? " SMP" : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index a6342c899be5..f3d3e9646a99 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -193,10 +193,10 @@ ENTRY(secondary_startup_64) /* Set up %gs. * - * The base of %gs always points to the bottom of the irqstack - * union. If the stack protector canary is enabled, it is - * located at %gs:40. Note that, on SMP, the boot cpu uses - * init data section till per cpu areas are set up. + * The base of %gs always points to fixed_percpu_data. If the + * stack protector canary is enabled, it is located at %gs:40. + * Note that, on SMP, the boot cpu uses init data section until + * the per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx movl initial_gs(%rip),%eax diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index c43e96a938d0..c6f791bc481e 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -827,10 +827,6 @@ int __init hpet_enable(void) if (!hpet_cfg_working()) goto out_nohpet; - /* Validate that the counter is counting */ - if (!hpet_counting()) - goto out_nohpet; - /* * Read the period and check for a sane value: */ @@ -896,6 +892,14 @@ int __init hpet_enable(void) } hpet_print_config(); + /* + * Validate that the counter is counting. This needs to be done + * after sanitizing the config registers to properly deal with + * force enabled HPETs. + */ + if (!hpet_counting()) + goto out_nohpet; + clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); if (id & HPET_ID_LEGSUP) { diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 0e0b08008b5a..43fc13c831af 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -580,7 +580,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, if (setup_detour_execution(p, regs, reenter)) return; -#if !defined(CONFIG_PREEMPT) +#if !defined(CONFIG_PREEMPTION) if (p->ainsn.boostable && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ if (!reenter) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 9d4aedece363..b348dd506d58 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -403,7 +403,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, (u8 *)op->kp.addr + op->optinsn.size); len += RELATIVEJUMP_SIZE; - /* We have to use text_poke for instuction buffer because it is RO */ + /* We have to use text_poke() for instruction buffer because it is RO */ text_poke(slot, buf, len); ret = 0; out: diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b7f34fe2171e..4cc967178bf9 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -308,13 +308,10 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) static void kvm_guest_cpu_init(void) { - if (!kvm_para_available()) - return; - if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION pa |= KVM_ASYNC_PF_SEND_ALWAYS; #endif pa |= KVM_ASYNC_PF_ENABLED; @@ -625,9 +622,6 @@ static void __init kvm_guest_init(void) { int i; - if (!kvm_para_available()) - return; - paravirt_ops_setup(); register_reboot_notifier(&kvm_pv_reboot_nb); for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) @@ -848,8 +842,6 @@ asm( */ void __init kvm_spinlock_init(void) { - if (!kvm_para_available()) - return; /* Does host kernel support KVM_FEATURE_PV_UNHALT? */ if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) return; diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 77854b192fef..7b45e8daad22 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -148,7 +148,7 @@ int machine_kexec_prepare(struct kimage *image) { int error; - set_pages_x(image->control_code_page, 1); + set_memory_x((unsigned long)page_address(image->control_code_page), 1); error = machine_kexec_alloc_page_tables(image); if (error) return error; @@ -162,7 +162,7 @@ int machine_kexec_prepare(struct kimage *image) */ void machine_kexec_cleanup(struct kimage *image) { - set_pages_nx(image->control_code_page, 1); + set_memory_nx((unsigned long)page_address(image->control_code_page), 1); machine_kexec_free_page_tables(image); } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f62b498b18fb..fa4352dce491 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/dma-direct.h> #include <linux/dma-debug.h> +#include <linux/iommu.h> #include <linux/dmar.h> #include <linux/export.h> #include <linux/memblock.h> @@ -34,21 +35,6 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; -/* - * This variable becomes 1 if iommu=pt is passed on the kernel command line. - * If this variable is 1, IOMMU implementations do no DMA translation for - * devices and allow every device to access to whole physical memory. This is - * useful if a user wants to use an IOMMU only for KVM device assignment to - * guests and not for driver dma translation. - * It is also possible to disable by default in kernel config, and enable with - * iommu=nopt at boot time. - */ -#ifdef CONFIG_IOMMU_DEFAULT_PASSTHROUGH -int iommu_pass_through __read_mostly = 1; -#else -int iommu_pass_through __read_mostly; -#endif - extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; void __init pci_iommu_alloc(void) @@ -120,9 +106,9 @@ static __init int iommu_setup(char *p) swiotlb = 1; #endif if (!strncmp(p, "pt", 2)) - iommu_pass_through = 1; + iommu_set_default_passthrough(true); if (!strncmp(p, "nopt", 4)) - iommu_pass_through = 0; + iommu_set_default_translated(true); gart_parse_options(p); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0fdbe89d0754..3c5bbe8e4120 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -201,6 +201,7 @@ static int set_segment_reg(struct task_struct *task, case offsetof(struct user_regs_struct, ss): if (unlikely(value == 0)) return -EIO; + /* Else, fall through */ default: *pt_regs_access(task_pt_regs(task), offset) = value; diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 8451f38ad399..1daf8f2aa21f 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -90,8 +90,6 @@ static void ich_force_hpet_resume(void) BUG(); else printk(KERN_DEBUG "Force enabled HPET at resume\n"); - - return; } static void ich_force_enable_hpet(struct pci_dev *dev) @@ -448,7 +446,6 @@ static void nvidia_force_enable_hpet(struct pci_dev *dev) dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", force_hpet_address); cached_dev = dev; - return; } /* ISA Bridges */ @@ -513,7 +510,6 @@ static void e6xx_force_enable_hpet(struct pci_dev *dev) force_hpet_resume_type = NONE_FORCE_HPET_RESUME; dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " "0x%lx\n", force_hpet_address); - return; } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E6XX_CU, e6xx_force_enable_hpet); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fdbd47ceb84d..497e9b7077c1 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1023,8 +1023,6 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, int *cpu0_nmi_registered) { - volatile u32 *trampoline_status = - (volatile u32 *) __va(real_mode_header->trampoline_status); /* start_ip had better be page-aligned! */ unsigned long start_ip = real_mode_header->trampoline_start; @@ -1116,9 +1114,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, } } - /* mark "stuck" area as not stuck */ - *trampoline_status = 0; - if (x86_platform.legacy.warm_reset) { /* * Cleanup possible dangling ends... diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 4f36d3241faf..2d6898c2cb64 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -100,7 +100,7 @@ copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) { int ret; - if (!access_ok(fp, sizeof(*frame))) + if (__range_not_ok(fp, sizeof(*frame), TASK_SIZE)) return 0; ret = 1; diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c index 8eb67a670b10..653b7f617b61 100644 --- a/arch/x86/kernel/sysfb_efi.c +++ b/arch/x86/kernel/sysfb_efi.c @@ -230,9 +230,55 @@ static const struct dmi_system_id efifb_dmi_system_table[] __initconst = { {}, }; +/* + * Some devices have a portrait LCD but advertise a landscape resolution (and + * pitch). We simply swap width and height for these devices so that we can + * correctly deal with some of them coming with multiple resolutions. + */ +static const struct dmi_system_id efifb_dmi_swap_width_height[] __initconst = { + { + /* + * Lenovo MIIX310-10ICR, only some batches have the troublesome + * 800x1280 portrait screen. Luckily the portrait version has + * its own BIOS version, so we match on that. + */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "MIIX 310-10ICR"), + DMI_EXACT_MATCH(DMI_BIOS_VERSION, "1HCN44WW"), + }, + }, + { + /* Lenovo MIIX 320-10ICR with 800x1280 portrait screen */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, + "Lenovo MIIX 320-10ICR"), + }, + }, + { + /* Lenovo D330 with 800x1280 or 1200x1920 portrait screen */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, + "Lenovo ideapad D330-10IGM"), + }, + }, + {}, +}; + __init void sysfb_apply_efi_quirks(void) { if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI || !(screen_info.capabilities & VIDEO_CAPABILITY_SKIP_QUIRKS)) dmi_check_system(efifb_dmi_system_table); + + if (screen_info.orig_video_isVGA == VIDEO_TYPE_EFI && + dmi_check_system(efifb_dmi_swap_width_height)) { + u16 temp = screen_info.lfb_width; + + screen_info.lfb_width = screen_info.lfb_height; + screen_info.lfb_height = temp; + screen_info.lfb_linelength = 4 * screen_info.lfb_width; + } } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 57d87f79558f..c59454c382fd 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -638,7 +638,7 @@ unsigned long native_calibrate_tsc(void) * clock. */ if (crystal_khz == 0 && - boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_X) + boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_D) crystal_khz = 25000; /* diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 067858fe4db8..e0cbe4f2af49 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -58,6 +58,10 @@ static const struct freq_desc freq_desc_ann = { 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } }; +static const struct freq_desc freq_desc_lgm = { + 1, { 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000 } +}; + static const struct x86_cpu_id tsc_msr_cpu_ids[] = { INTEL_CPU_FAM6(ATOM_SALTWELL_MID, freq_desc_pnw), INTEL_CPU_FAM6(ATOM_SALTWELL_TABLET, freq_desc_clv), @@ -65,6 +69,7 @@ static const struct x86_cpu_id tsc_msr_cpu_ids[] = { INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, freq_desc_tng), INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht), INTEL_CPU_FAM6(ATOM_AIRMONT_MID, freq_desc_ann), + INTEL_CPU_FAM6(ATOM_AIRMONT_NP, freq_desc_lgm), {} }; diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 5b345add550f..548fefed71ee 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -19,7 +19,7 @@ /** DOC: Emulation for User-Mode Instruction Prevention (UMIP) * * The feature User-Mode Instruction Prevention present in recent Intel - * processor prevents a group of instructions (sgdt, sidt, sldt, smsw, and str) + * processor prevents a group of instructions (SGDT, SIDT, SLDT, SMSW and STR) * from being executed with CPL > 0. Otherwise, a general protection fault is * issued. * @@ -36,8 +36,8 @@ * DOSEMU2) rely on this subset of instructions to function. * * The instructions protected by UMIP can be split in two groups. Those which - * return a kernel memory address (sgdt and sidt) and those which return a - * value (sldt, str and smsw). + * return a kernel memory address (SGDT and SIDT) and those which return a + * value (SLDT, STR and SMSW). * * For the instructions that return a kernel memory address, applications * such as WineHQ rely on the result being located in the kernel memory space, @@ -45,15 +45,13 @@ * value that, lies close to the top of the kernel memory. The limit for the GDT * and the IDT are set to zero. * - * Given that sldt and str are not commonly used in programs that run on WineHQ + * Given that SLDT and STR are not commonly used in programs that run on WineHQ * or DOSEMU2, they are not emulated. * * The instruction smsw is emulated to return the value that the register CR0 * has at boot time as set in the head_32. * - * Also, emulation is provided only for 32-bit processes; 64-bit processes - * that attempt to use the instructions that UMIP protects will receive the - * SIGSEGV signal issued as a consequence of the general protection fault. + * Emulation is provided for both 32-bit and 64-bit processes. * * Care is taken to appropriately emulate the results when segmentation is * used. That is, rather than relying on USER_DS and USER_CS, the function @@ -63,17 +61,18 @@ * application uses a local descriptor table. */ -#define UMIP_DUMMY_GDT_BASE 0xfffe0000 -#define UMIP_DUMMY_IDT_BASE 0xffff0000 +#define UMIP_DUMMY_GDT_BASE 0xfffffffffffe0000ULL +#define UMIP_DUMMY_IDT_BASE 0xffffffffffff0000ULL /* * The SGDT and SIDT instructions store the contents of the global descriptor * table and interrupt table registers, respectively. The destination is a * memory operand of X+2 bytes. X bytes are used to store the base address of - * the table and 2 bytes are used to store the limit. In 32-bit processes, the - * only processes for which emulation is provided, X has a value of 4. + * the table and 2 bytes are used to store the limit. In 32-bit processes X + * has a value of 4, in 64-bit processes X has a value of 8. */ -#define UMIP_GDT_IDT_BASE_SIZE 4 +#define UMIP_GDT_IDT_BASE_SIZE_64BIT 8 +#define UMIP_GDT_IDT_BASE_SIZE_32BIT 4 #define UMIP_GDT_IDT_LIMIT_SIZE 2 #define UMIP_INST_SGDT 0 /* 0F 01 /0 */ @@ -189,6 +188,7 @@ static int identify_insn(struct insn *insn) * @umip_inst: A constant indicating the instruction to emulate * @data: Buffer into which the dummy result is stored * @data_size: Size of the emulated result + * @x86_64: true if process is 64-bit, false otherwise * * Emulate an instruction protected by UMIP and provide a dummy result. The * result of the emulation is saved in @data. The size of the results depends @@ -202,11 +202,8 @@ static int identify_insn(struct insn *insn) * 0 on success, -EINVAL on error while emulating. */ static int emulate_umip_insn(struct insn *insn, int umip_inst, - unsigned char *data, int *data_size) + unsigned char *data, int *data_size, bool x86_64) { - unsigned long dummy_base_addr, dummy_value; - unsigned short dummy_limit = 0; - if (!data || !data_size || !insn) return -EINVAL; /* @@ -219,6 +216,9 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst, * is always returned irrespective of the operand size. */ if (umip_inst == UMIP_INST_SGDT || umip_inst == UMIP_INST_SIDT) { + u64 dummy_base_addr; + u16 dummy_limit = 0; + /* SGDT and SIDT do not use registers operands. */ if (X86_MODRM_MOD(insn->modrm.value) == 3) return -EINVAL; @@ -228,13 +228,24 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst, else dummy_base_addr = UMIP_DUMMY_IDT_BASE; - *data_size = UMIP_GDT_IDT_LIMIT_SIZE + UMIP_GDT_IDT_BASE_SIZE; + /* + * 64-bit processes use the entire dummy base address. + * 32-bit processes use the lower 32 bits of the base address. + * dummy_base_addr is always 64 bits, but we memcpy the correct + * number of bytes from it to the destination. + */ + if (x86_64) + *data_size = UMIP_GDT_IDT_BASE_SIZE_64BIT; + else + *data_size = UMIP_GDT_IDT_BASE_SIZE_32BIT; + + memcpy(data + 2, &dummy_base_addr, *data_size); - memcpy(data + 2, &dummy_base_addr, UMIP_GDT_IDT_BASE_SIZE); + *data_size += UMIP_GDT_IDT_LIMIT_SIZE; memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE); } else if (umip_inst == UMIP_INST_SMSW) { - dummy_value = CR0_STATE; + unsigned long dummy_value = CR0_STATE; /* * Even though the CR0 register has 4 bytes, the number @@ -290,11 +301,10 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs) * fixup_umip_exception() - Fixup a general protection fault caused by UMIP * @regs: Registers as saved when entering the #GP handler * - * The instructions sgdt, sidt, str, smsw, sldt cause a general protection - * fault if executed with CPL > 0 (i.e., from user space). If the offending - * user-space process is not in long mode, this function fixes the exception - * up and provides dummy results for sgdt, sidt and smsw; str and sldt are not - * fixed up. Also long mode user-space processes are not fixed up. + * The instructions SGDT, SIDT, STR, SMSW and SLDT cause a general protection + * fault if executed with CPL > 0 (i.e., from user space). This function fixes + * the exception up and provides dummy results for SGDT, SIDT and SMSW; STR + * and SLDT are not fixed up. * * If operands are memory addresses, results are copied to user-space memory as * indicated by the instruction pointed by eIP using the registers indicated in @@ -373,13 +383,14 @@ bool fixup_umip_exception(struct pt_regs *regs) umip_pr_warning(regs, "%s instruction cannot be used by applications.\n", umip_insns[umip_inst]); - /* Do not emulate SLDT, STR or user long mode processes. */ - if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT || user_64bit_mode(regs)) + /* Do not emulate (spoof) SLDT or STR. */ + if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT) return false; umip_pr_warning(regs, "For now, expensive software emulation returns the result.\n"); - if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size)) + if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size, + user_64bit_mode(regs))) return false; /* diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index d8359ebeea70..8cd745ef8c7b 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -508,9 +508,12 @@ struct uprobe_xol_ops { void (*abort)(struct arch_uprobe *, struct pt_regs *); }; -static inline int sizeof_long(void) +static inline int sizeof_long(struct pt_regs *regs) { - return in_ia32_syscall() ? 4 : 8; + /* + * Check registers for mode as in_xxx_syscall() does not apply here. + */ + return user_64bit_mode(regs) ? 8 : 4; } static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) @@ -521,9 +524,9 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) static int emulate_push_stack(struct pt_regs *regs, unsigned long val) { - unsigned long new_sp = regs->sp - sizeof_long(); + unsigned long new_sp = regs->sp - sizeof_long(regs); - if (copy_to_user((void __user *)new_sp, &val, sizeof_long())) + if (copy_to_user((void __user *)new_sp, &val, sizeof_long(regs))) return -EFAULT; regs->sp = new_sp; @@ -556,7 +559,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs long correction = utask->vaddr - utask->xol_vaddr; regs->ip += correction; } else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) { - regs->sp += sizeof_long(); /* Pop incorrect return address */ + regs->sp += sizeof_long(regs); /* Pop incorrect return address */ if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen)) return -ERESTART; } @@ -675,7 +678,7 @@ static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) * "call" insn was executed out-of-line. Just restore ->sp and restart. * We could also restore ->ip and try to call branch_emulate_op() again. */ - regs->sp += sizeof_long(); + regs->sp += sizeof_long(regs); return -ERESTART; } @@ -1056,7 +1059,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) { - int rasize = sizeof_long(), nleft; + int rasize = sizeof_long(regs), nleft; unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */ if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize)) diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 329361b69d5e..018aebce33ff 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -8,11 +8,6 @@ #include <linux/debugfs.h> #include "lapic.h" -bool kvm_arch_has_vcpu_debugfs(void) -{ - return true; -} - static int vcpu_get_timer_advance_ns(void *data, u64 *val) { struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; @@ -48,37 +43,22 @@ static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n"); -int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { - struct dentry *ret; - - ret = debugfs_create_file("tsc-offset", 0444, - vcpu->debugfs_dentry, - vcpu, &vcpu_tsc_offset_fops); - if (!ret) - return -ENOMEM; + debugfs_create_file("tsc-offset", 0444, vcpu->debugfs_dentry, vcpu, + &vcpu_tsc_offset_fops); - if (lapic_in_kernel(vcpu)) { - ret = debugfs_create_file("lapic_timer_advance_ns", 0444, - vcpu->debugfs_dentry, - vcpu, &vcpu_timer_advance_ns_fops); - if (!ret) - return -ENOMEM; - } + if (lapic_in_kernel(vcpu)) + debugfs_create_file("lapic_timer_advance_ns", 0444, + vcpu->debugfs_dentry, vcpu, + &vcpu_timer_advance_ns_fops); if (kvm_has_tsc_control) { - ret = debugfs_create_file("tsc-scaling-ratio", 0444, - vcpu->debugfs_dentry, - vcpu, &vcpu_tsc_scaling_fops); - if (!ret) - return -ENOMEM; - ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444, - vcpu->debugfs_dentry, - vcpu, &vcpu_tsc_scaling_frac_fops); - if (!ret) - return -ENOMEM; - + debugfs_create_file("tsc-scaling-ratio", 0444, + vcpu->debugfs_dentry, vcpu, + &vcpu_tsc_scaling_fops); + debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444, + vcpu->debugfs_dentry, vcpu, + &vcpu_tsc_scaling_frac_fops); } - - return 0; } diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index c10a8b10b203..fff790a3f4ee 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1781,7 +1781,7 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args) int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { - uint16_t evmcs_ver = kvm_x86_ops->nested_get_evmcs_version(vcpu); + uint16_t evmcs_ver = 0; struct kvm_cpuid_entry2 cpuid_entries[] = { { .function = HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS }, { .function = HYPERV_CPUID_INTERFACE }, @@ -1793,6 +1793,9 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, }; int i, nent = ARRAY_SIZE(cpuid_entries); + if (kvm_x86_ops->nested_get_evmcs_version) + evmcs_ver = kvm_x86_ops->nested_get_evmcs_version(vcpu); + /* Skip NESTED_FEATURES if eVMCS is not supported */ if (!evmcs_ver) --nent; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0aa158657f20..e904ff06a83d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -216,6 +216,9 @@ static void recalculate_apic_map(struct kvm *kvm) if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) new->phys_map[xapic_id] = apic; + if (!kvm_apic_sw_enabled(apic)) + continue; + ldr = kvm_lapic_get_reg(apic, APIC_LDR); if (apic_x2apic_mode(apic)) { @@ -258,6 +261,8 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) static_key_slow_dec_deferred(&apic_sw_disabled); else static_key_slow_inc(&apic_sw_disabled.key); + + recalculate_apic_map(apic->vcpu->kvm); } } @@ -1548,7 +1553,6 @@ static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic) static void apic_timer_expired(struct kvm_lapic *apic) { struct kvm_vcpu *vcpu = apic->vcpu; - struct swait_queue_head *q = &vcpu->wq; struct kvm_timer *ktimer = &apic->lapic_timer; if (atomic_read(&apic->lapic_timer.pending)) @@ -1566,13 +1570,6 @@ static void apic_timer_expired(struct kvm_lapic *apic) atomic_inc(&apic->lapic_timer.pending); kvm_set_pending_timer(vcpu); - - /* - * For x86, the atomic_inc() is serialized, thus - * using swait_active() is safe. - */ - if (swait_active(q)) - swake_up_one(q); } static void start_sw_tscdeadline(struct kvm_lapic *apic) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8f72526e2f68..a63964e7cec7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2095,6 +2095,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct if (!direct) sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + + /* + * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() + * depends on valid pages being added to the head of the list. See + * comments in kvm_zap_obsolete_pages(). + */ list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); kvm_mod_used_mmu_pages(vcpu->kvm, +1); return sp; @@ -2244,7 +2250,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, #define for_each_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ - if ((_sp)->role.invalid) { \ + if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \ } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ @@ -2301,6 +2307,11 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } static void mmu_audit_disable(void) { } #endif +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); +} + static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { @@ -2525,6 +2536,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (level > PT_PAGE_TABLE_LEVEL && need_sync) flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } + sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; clear_page(sp->spt); trace_kvm_mmu_get_page(sp, true); @@ -3466,7 +3478,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, /* * Currently, fast page fault only works for direct mapping * since the gfn is not stable for indirect shadow page. See - * Documentation/virtual/kvm/locking.txt to get more detail. + * Documentation/virt/kvm/locking.txt to get more detail. */ fault_handled = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, @@ -4233,6 +4245,13 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, return false; if (cached_root_available(vcpu, new_cr3, new_role)) { + /* + * It is possible that the cached previous root page is + * obsolete because of a change in the MMU generation + * number. However, changing the generation number is + * accompanied by KVM_REQ_MMU_RELOAD, which will free + * the root set here and allocate a new one. + */ kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); if (!skip_tlb_flush) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); @@ -5649,44 +5668,91 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) return alloc_mmu_pages(vcpu); } -static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot, - struct kvm_page_track_notifier_node *node) + +static void kvm_zap_obsolete_pages(struct kvm *kvm) { - struct kvm_mmu_page *sp; + struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); - unsigned long i; - bool flush; - gfn_t gfn; - - spin_lock(&kvm->mmu_lock); - - if (list_empty(&kvm->arch.active_mmu_pages)) - goto out_unlock; - - flush = slot_handle_all_level(kvm, slot, kvm_zap_rmapp, false); + int ign; - for (i = 0; i < slot->npages; i++) { - gfn = slot->base_gfn + i; +restart: + list_for_each_entry_safe_reverse(sp, node, + &kvm->arch.active_mmu_pages, link) { + /* + * No obsolete valid page exists before a newly created page + * since active_mmu_pages is a FIFO list. + */ + if (!is_obsolete_sp(kvm, sp)) + break; - for_each_valid_sp(kvm, sp, gfn) { - if (sp->gfn != gfn) - continue; + /* + * Do not repeatedly zap a root page to avoid unnecessary + * KVM_REQ_MMU_RELOAD, otherwise we may not be able to + * progress: + * vcpu 0 vcpu 1 + * call vcpu_enter_guest(): + * 1): handle KVM_REQ_MMU_RELOAD + * and require mmu-lock to + * load mmu + * repeat: + * 1): zap root page and + * send KVM_REQ_MMU_RELOAD + * + * 2): if (cond_resched_lock(mmu-lock)) + * + * 2): hold mmu-lock and load mmu + * + * 3): see KVM_REQ_MMU_RELOAD bit + * on vcpu->requests is set + * then return 1 to call + * vcpu_enter_guest() again. + * goto repeat; + * + * Since we are reversely walking the list and the invalid + * list will be moved to the head, skip the invalid page + * can help us to avoid the infinity list walking. + */ + if (sp->role.invalid) + continue; - kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - } if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); - flush = false; + kvm_mmu_commit_zap_page(kvm, &invalid_list); cond_resched_lock(&kvm->mmu_lock); + goto restart; } + + if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) + goto restart; } - kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); -out_unlock: + kvm_mmu_commit_zap_page(kvm, &invalid_list); +} + +/* + * Fast invalidate all shadow pages and use lock-break technique + * to zap obsolete pages. + * + * It's required when memslot is being deleted or VM is being + * destroyed, in these cases, we should ensure that KVM MMU does + * not use any resource of the being-deleted slot or all slots + * after calling the function. + */ +static void kvm_mmu_zap_all_fast(struct kvm *kvm) +{ + spin_lock(&kvm->mmu_lock); + kvm->arch.mmu_valid_gen++; + + kvm_zap_obsolete_pages(kvm); spin_unlock(&kvm->mmu_lock); } +static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, + struct kvm_page_track_notifier_node *node) +{ + kvm_mmu_zap_all_fast(kvm); +} + void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 19f69df96758..e0368076a1ef 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1714,7 +1714,6 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) if (!entry) return -EINVAL; - new_entry = READ_ONCE(*entry); new_entry = __sme_set((page_to_phys(svm->avic_backing_page) & AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | AVIC_PHYSICAL_ID_ENTRY_VALID_MASK); @@ -2143,12 +2142,20 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto out; } + svm->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!svm->vcpu.arch.user_fpu) { + printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); + err = -ENOMEM; + goto free_partial_svm; + } + svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL_ACCOUNT); if (!svm->vcpu.arch.guest_fpu) { printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); err = -ENOMEM; - goto free_partial_svm; + goto free_user_fpu; } err = kvm_vcpu_init(&svm->vcpu, kvm, id); @@ -2211,6 +2218,8 @@ uninit: kvm_vcpu_uninit(&svm->vcpu); free_svm: kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); +free_user_fpu: + kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu); free_partial_svm: kmem_cache_free(kvm_vcpu_cache, svm); out: @@ -2241,6 +2250,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); + kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu); kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); kmem_cache_free(kvm_vcpu_cache, svm); } @@ -5179,6 +5189,11 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) kvm_vcpu_wake_up(vcpu); } +static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + return false; +} + static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) { unsigned long flags; @@ -7113,12 +7128,6 @@ failed: return ret; } -static uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) -{ - /* Not supported */ - return 0; -} - static int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version) { @@ -7303,6 +7312,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .pmu_ops = &amd_pmu_ops, .deliver_posted_interrupt = svm_deliver_avic_intr, + .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt, .update_pi_irte = svm_update_pi_irte, .setup_mce = svm_setup_mce, @@ -7316,7 +7326,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .mem_enc_unreg_region = svm_unregister_enc_region, .nested_enable_evmcs = nested_enable_evmcs, - .nested_get_evmcs_version = nested_get_evmcs_version, + .nested_get_evmcs_version = NULL, .need_emulation_on_page_fault = svm_need_emulation_on_page_fault, }; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0f1378789bd0..a3cba321b5c5 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -220,6 +220,8 @@ static void free_nested(struct kvm_vcpu *vcpu) if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) return; + kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + vmx->nested.vmxon = false; vmx->nested.smm.vmxon = false; free_vpid(vmx->nested.vpid02); @@ -232,7 +234,9 @@ static void free_nested(struct kvm_vcpu *vcpu) vmx->vmcs01.shadow_vmcs = NULL; } kfree(vmx->nested.cached_vmcs12); + vmx->nested.cached_vmcs12 = NULL; kfree(vmx->nested.cached_shadow_vmcs12); + vmx->nested.cached_shadow_vmcs12 = NULL; /* Unpin physical memory we referred to in the vmcs02 */ if (vmx->nested.apic_access_page) { kvm_release_page_dirty(vmx->nested.apic_access_page); @@ -4536,6 +4540,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) int len; gva_t gva = 0; struct vmcs12 *vmcs12; + struct x86_exception e; short offset; if (!nested_vmx_check_permission(vcpu)) @@ -4584,7 +4589,8 @@ static int handle_vmread(struct kvm_vcpu *vcpu) vmx_instruction_info, true, len, &gva)) return 1; /* _system ok, nested_vmx_check_permission has verified cpl=0 */ - kvm_write_guest_virt_system(vcpu, gva, &field_value, len, NULL); + if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e)) + kvm_inject_page_fault(vcpu, &e); } return nested_vmx_succeed(vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a279447eb75b..c030c96fc81a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6117,6 +6117,11 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) return max_irr; } +static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + return pi_test_on(vcpu_to_pi_desc(vcpu)); +} + static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) @@ -6598,6 +6603,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); + kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); kmem_cache_free(kvm_vcpu_cache, vmx); } @@ -6613,12 +6619,20 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx) return ERR_PTR(-ENOMEM); + vmx->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vmx->vcpu.arch.user_fpu) { + printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); + err = -ENOMEM; + goto free_partial_vcpu; + } + vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL_ACCOUNT); if (!vmx->vcpu.arch.guest_fpu) { printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); err = -ENOMEM; - goto free_partial_vcpu; + goto free_user_fpu; } vmx->vpid = allocate_vpid(); @@ -6721,6 +6735,8 @@ uninit_vcpu: free_vcpu: free_vpid(vmx->vpid); kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); +free_user_fpu: + kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); free_partial_vcpu: kmem_cache_free(kvm_vcpu_cache, vmx); return ERR_PTR(err); @@ -7715,6 +7731,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_posted_interrupt = vmx_deliver_posted_interrupt, + .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt, .set_tss_addr = vmx_set_tss_addr, .set_identity_map_addr = vmx_set_identity_map_addr, @@ -7780,6 +7797,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .set_nested_state = NULL, .get_vmcs12_pages = NULL, .nested_enable_evmcs = NULL, + .nested_get_evmcs_version = NULL, .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 58305cf81182..91602d310a3f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3306,6 +3306,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->vcpu_load(vcpu, cpu); + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); + /* Apply any externally detected TSC adjustments (due to suspend) */ if (unlikely(vcpu->arch.tsc_offset_adjustment)) { adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); @@ -5308,6 +5312,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, /* kvm_write_guest_virt_system can pull in tons of pages. */ vcpu->arch.l1tf_flush_l1d = true; + /* + * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED + * is returned, but our callers are not ready for that and they blindly + * call kvm_inject_page_fault. Ensure that they at least do not leak + * uninitialized kernel stack memory into cr2 and error code. + */ + memset(exception, 0, sizeof(*exception)); return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, PFERR_WRITE_MASK, exception); } @@ -6590,12 +6601,13 @@ restart: unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - kvm_rip_write(vcpu, ctxt->eip); - if (r == EMULATE_DONE && ctxt->tf) - kvm_vcpu_do_singlestep(vcpu, &r); if (!ctxt->have_exception || - exception_type(ctxt->exception.vector) == EXCPT_TRAP) + exception_type(ctxt->exception.vector) == EXCPT_TRAP) { + kvm_rip_write(vcpu, ctxt->eip); + if (r == EMULATE_DONE && ctxt->tf) + kvm_vcpu_do_singlestep(vcpu, &r); __kvm_set_rflags(vcpu, ctxt->eflags); + } /* * For STI, interrupts are shadowed; so KVM_REQ_EVENT will @@ -7202,7 +7214,7 @@ static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id) rcu_read_unlock(); - if (target) + if (target && READ_ONCE(target->ready)) kvm_vcpu_yield_to(target); } @@ -7242,6 +7254,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) break; case KVM_HC_KICK_CPU: kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); + kvm_sched_yield(vcpu->kvm, a1); ret = 0; break; #ifdef CONFIG_X86_64 @@ -7990,9 +8003,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) trace_kvm_entry(vcpu->vcpu_id); guest_enter_irqoff(); - fpregs_assert_state_consistent(); - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - switch_fpu_return(); + /* The preempt notifier should have taken care of the FPU already. */ + WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD)); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); @@ -8270,7 +8282,7 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { fpregs_lock(); - copy_fpregs_to_fpstate(¤t->thread.fpu); + copy_fpregs_to_fpstate(vcpu->arch.user_fpu); /* PKRU is separately restored in kvm_x86_ops->run. */ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, ~XFEATURE_MASK_PKRU); @@ -8287,7 +8299,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) fpregs_lock(); copy_fpregs_to_fpstate(vcpu->arch.guest_fpu); - copy_kernel_to_fpregs(¤t->thread.fpu.state); + copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state); fpregs_mark_activate(); fpregs_unlock(); @@ -9694,6 +9706,22 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); } +bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +{ + if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) + return true; + + if (kvm_test_request(KVM_REQ_NMI, vcpu) || + kvm_test_request(KVM_REQ_SMI, vcpu) || + kvm_test_request(KVM_REQ_EVENT, vcpu)) + return true; + + if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu)) + return true; + + return false; +} + bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { return vcpu->arch.preempted_in_kernel; diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 4fe1601dbc5d..86976b55ae74 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -33,7 +33,7 @@ 102: .section .fixup,"ax" 103: addl %ecx,%edx /* ecx is zerorest also */ - jmp copy_user_handle_tail + jmp .Lcopy_user_handle_tail .previous _ASM_EXTABLE_UA(100b, 103b) @@ -113,7 +113,7 @@ ENTRY(copy_user_generic_unrolled) 40: leal (%rdx,%rcx,8),%edx jmp 60f 50: movl %ecx,%edx -60: jmp copy_user_handle_tail /* ecx is zerorest also */ +60: jmp .Lcopy_user_handle_tail /* ecx is zerorest also */ .previous _ASM_EXTABLE_UA(1b, 30b) @@ -177,7 +177,7 @@ ENTRY(copy_user_generic_string) .section .fixup,"ax" 11: leal (%rdx,%rcx,8),%ecx 12: movl %ecx,%edx /* ecx is zerorest also */ - jmp copy_user_handle_tail + jmp .Lcopy_user_handle_tail .previous _ASM_EXTABLE_UA(1b, 11b) @@ -210,7 +210,7 @@ ENTRY(copy_user_enhanced_fast_string) .section .fixup,"ax" 12: movl %ecx,%edx /* ecx is zerorest also */ - jmp copy_user_handle_tail + jmp .Lcopy_user_handle_tail .previous _ASM_EXTABLE_UA(1b, 12b) @@ -231,7 +231,7 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) * eax uncopied bytes or 0 if successful. */ ALIGN; -copy_user_handle_tail: +.Lcopy_user_handle_tail: movl %edx,%ecx 1: rep movsb 2: mov %ecx,%eax @@ -239,7 +239,7 @@ copy_user_handle_tail: ret _ASM_EXTABLE_UA(1b, 2b) -END(copy_user_handle_tail) +END(.Lcopy_user_handle_tail) /* * copy_user_nocache - Uncached memory copy with exception handling @@ -364,7 +364,7 @@ ENTRY(__copy_user_nocache) movl %ecx,%edx .L_fixup_handle_tail: sfence - jmp copy_user_handle_tail + jmp .Lcopy_user_handle_tail .previous _ASM_EXTABLE_UA(1b, .L_fixup_4x8b_copy) diff --git a/arch/x86/lib/cpu.c b/arch/x86/lib/cpu.c index 04967cdce5d1..7ad68917a51e 100644 --- a/arch/x86/lib/cpu.c +++ b/arch/x86/lib/cpu.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/types.h> #include <linux/export.h> +#include <asm/cpu.h> unsigned int x86_family(unsigned int sig) { diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 304f958c27b2..9578eb88fc87 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -115,7 +115,7 @@ ENDPROC(__get_user_8) EXPORT_SYMBOL(__get_user_8) -bad_get_user_clac: +.Lbad_get_user_clac: ASM_CLAC bad_get_user: xor %edx,%edx @@ -123,7 +123,7 @@ bad_get_user: ret #ifdef CONFIG_X86_32 -bad_get_user_8_clac: +.Lbad_get_user_8_clac: ASM_CLAC bad_get_user_8: xor %edx,%edx @@ -132,12 +132,12 @@ bad_get_user_8: ret #endif - _ASM_EXTABLE_UA(1b, bad_get_user_clac) - _ASM_EXTABLE_UA(2b, bad_get_user_clac) - _ASM_EXTABLE_UA(3b, bad_get_user_clac) + _ASM_EXTABLE_UA(1b, .Lbad_get_user_clac) + _ASM_EXTABLE_UA(2b, .Lbad_get_user_clac) + _ASM_EXTABLE_UA(3b, .Lbad_get_user_clac) #ifdef CONFIG_X86_64 - _ASM_EXTABLE_UA(4b, bad_get_user_clac) + _ASM_EXTABLE_UA(4b, .Lbad_get_user_clac) #else - _ASM_EXTABLE_UA(4b, bad_get_user_8_clac) - _ASM_EXTABLE_UA(5b, bad_get_user_8_clac) + _ASM_EXTABLE_UA(4b, .Lbad_get_user_8_clac) + _ASM_EXTABLE_UA(5b, .Lbad_get_user_8_clac) #endif diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 14bf78341d3c..126dd6a9ec9b 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -37,7 +37,7 @@ ENTRY(__put_user_1) ENTER cmp TASK_addr_limit(%_ASM_BX),%_ASM_CX - jae bad_put_user + jae .Lbad_put_user ASM_STAC 1: movb %al,(%_ASM_CX) xor %eax,%eax @@ -51,7 +51,7 @@ ENTRY(__put_user_2) mov TASK_addr_limit(%_ASM_BX),%_ASM_BX sub $1,%_ASM_BX cmp %_ASM_BX,%_ASM_CX - jae bad_put_user + jae .Lbad_put_user ASM_STAC 2: movw %ax,(%_ASM_CX) xor %eax,%eax @@ -65,7 +65,7 @@ ENTRY(__put_user_4) mov TASK_addr_limit(%_ASM_BX),%_ASM_BX sub $3,%_ASM_BX cmp %_ASM_BX,%_ASM_CX - jae bad_put_user + jae .Lbad_put_user ASM_STAC 3: movl %eax,(%_ASM_CX) xor %eax,%eax @@ -79,7 +79,7 @@ ENTRY(__put_user_8) mov TASK_addr_limit(%_ASM_BX),%_ASM_BX sub $7,%_ASM_BX cmp %_ASM_BX,%_ASM_CX - jae bad_put_user + jae .Lbad_put_user ASM_STAC 4: mov %_ASM_AX,(%_ASM_CX) #ifdef CONFIG_X86_32 @@ -91,16 +91,16 @@ ENTRY(__put_user_8) ENDPROC(__put_user_8) EXPORT_SYMBOL(__put_user_8) -bad_put_user_clac: +.Lbad_put_user_clac: ASM_CLAC -bad_put_user: +.Lbad_put_user: movl $-EFAULT,%eax RET - _ASM_EXTABLE_UA(1b, bad_put_user_clac) - _ASM_EXTABLE_UA(2b, bad_put_user_clac) - _ASM_EXTABLE_UA(3b, bad_put_user_clac) - _ASM_EXTABLE_UA(4b, bad_put_user_clac) + _ASM_EXTABLE_UA(1b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(2b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(3b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(4b, .Lbad_put_user_clac) #ifdef CONFIG_X86_32 - _ASM_EXTABLE_UA(5b, bad_put_user_clac) + _ASM_EXTABLE_UA(5b, .Lbad_put_user_clac) #endif diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c index 6b468517ab71..73dc66d887f3 100644 --- a/arch/x86/math-emu/errors.c +++ b/arch/x86/math-emu/errors.c @@ -178,13 +178,15 @@ void FPU_printall(void) for (i = 0; i < 8; i++) { FPU_REG *r = &st(i); u_char tagi = FPU_gettagi(i); + switch (tagi) { case TAG_Empty: continue; - break; case TAG_Zero: case TAG_Special: + /* Update tagi for the printk below */ tagi = FPU_Special(r); + /* fall through */ case TAG_Valid: printk("st(%d) %c .%04lx %04lx %04lx %04lx e%+-6d ", i, getsign(r) ? '-' : '+', @@ -198,7 +200,6 @@ void FPU_printall(void) printk("Whoops! Error in errors.c: tag%d is %d ", i, tagi); continue; - break; } printk("%s\n", tag_desc[(int)(unsigned)tagi]); } diff --git a/arch/x86/math-emu/fpu_trig.c b/arch/x86/math-emu/fpu_trig.c index 783c509f957a..127ea54122d7 100644 --- a/arch/x86/math-emu/fpu_trig.c +++ b/arch/x86/math-emu/fpu_trig.c @@ -1352,7 +1352,7 @@ static void fyl2xp1(FPU_REG *st0_ptr, u_char st0_tag) case TW_Denormal: if (denormal_operand() < 0) return; - + /* fall through */ case TAG_Zero: case TAG_Valid: setsign(st0_ptr, getsign(st0_ptr) ^ getsign(st1_ptr)); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6c46095cd0d9..9ceacd1156db 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -177,13 +177,14 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) pmd = pmd_offset(pud, address); pmd_k = pmd_offset(pud_k, address); - if (!pmd_present(*pmd_k)) - return NULL; - if (!pmd_present(*pmd)) + if (pmd_present(*pmd) != pmd_present(*pmd_k)) set_pmd(pmd, *pmd_k); + + if (!pmd_present(*pmd_k)) + return NULL; else - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k)); return pmd_k; } @@ -203,17 +204,13 @@ void vmalloc_sync_all(void) spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { spinlock_t *pgt_lock; - pmd_t *ret; /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); - ret = vmalloc_sync_one(page_address(page), address); + vmalloc_sync_one(page_address(page), address); spin_unlock(pgt_lock); - - if (!ret) - break; } spin_unlock(&pgd_lock); } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 4068abb9427f..930edeb41ec3 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -916,7 +916,7 @@ static void mark_nxdata_nx(void) if (__supported_pte_mask & _PAGE_NX) printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10); - set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT); + set_memory_nx(start, size >> PAGE_SHIFT); } void mark_rodata_ro(void) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 63e99f15d7cf..a39dcdb5ae34 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -19,6 +19,7 @@ #include <asm/set_memory.h> #include <asm/e820/api.h> +#include <asm/efi.h> #include <asm/fixmap.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index e6dad600614c..4123100e0eaf 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -861,9 +861,9 @@ void numa_remove_cpu(int cpu) */ const struct cpumask *cpumask_of_node(int node) { - if (node >= nr_node_ids) { + if ((unsigned)node >= nr_node_ids) { printk(KERN_WARNING - "cpumask_of_node(%d): node > nr_node_ids(%u)\n", + "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n", node, nr_node_ids); dump_stack(); return cpu_none_mask; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 6a9a77a403c9..0d09cc5aad61 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -516,7 +516,7 @@ static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val, */ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start, unsigned long pfn, unsigned long npg, - int warnlvl) + unsigned long lpsize, int warnlvl) { pgprotval_t forbidden, res; unsigned long end; @@ -535,9 +535,17 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start, check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX"); forbidden = res; - res = protect_kernel_text_ro(start, end); - check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO"); - forbidden |= res; + /* + * Special case to preserve a large page. If the change spawns the + * full large page mapping then there is no point to split it + * up. Happens with ftrace and is going to be removed once ftrace + * switched to text_poke(). + */ + if (lpsize != (npg * PAGE_SIZE) || (start & (lpsize - 1))) { + res = protect_kernel_text_ro(start, end); + check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO"); + forbidden |= res; + } /* Check the PFN directly */ res = protect_pci_bios(pfn, pfn + npg - 1); @@ -819,7 +827,7 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, * extra conditional required here. */ chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages, - CPA_CONFLICT); + psize, CPA_CONFLICT); if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) { /* @@ -855,7 +863,7 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, * protection requirement in the large page. */ new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages, - CPA_DETECT); + psize, CPA_DETECT); /* * If there is a conflict, split the large page. @@ -906,7 +914,8 @@ static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn, if (!cpa->force_static_prot) goto set; - prot = static_protections(ref_prot, address, pfn, npg, CPA_PROTECT); + /* Hand in lpsize = 0 to enforce the protection mechanism */ + prot = static_protections(ref_prot, address, pfn, npg, 0, CPA_PROTECT); if (pgprot_val(prot) == pgprot_val(ref_prot)) goto set; @@ -1503,7 +1512,8 @@ repeat: pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); cpa_inc_4k_install(); - new_prot = static_protections(new_prot, address, pfn, 1, + /* Hand in lpsize = 0 to enforce the protection mechanism */ + new_prot = static_protections(new_prot, address, pfn, 1, 0, CPA_PROTECT); new_prot = pgprot_clear_protnone_bits(new_prot); @@ -1809,63 +1819,6 @@ out_err: } EXPORT_SYMBOL(set_memory_uc); -static int _set_memory_array(unsigned long *addr, int numpages, - enum page_cache_mode new_type) -{ - enum page_cache_mode set_type; - int i, j; - int ret; - - for (i = 0; i < numpages; i++) { - ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, - new_type, NULL); - if (ret) - goto out_free; - } - - /* If WC, set to UC- first and then WC */ - set_type = (new_type == _PAGE_CACHE_MODE_WC) ? - _PAGE_CACHE_MODE_UC_MINUS : new_type; - - ret = change_page_attr_set(addr, numpages, - cachemode2pgprot(set_type), 1); - - if (!ret && new_type == _PAGE_CACHE_MODE_WC) - ret = change_page_attr_set_clr(addr, numpages, - cachemode2pgprot( - _PAGE_CACHE_MODE_WC), - __pgprot(_PAGE_CACHE_MASK), - 0, CPA_ARRAY, NULL); - if (ret) - goto out_free; - - return 0; - -out_free: - for (j = 0; j < i; j++) - free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE); - - return ret; -} - -int set_memory_array_uc(unsigned long *addr, int numpages) -{ - return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_UC_MINUS); -} -EXPORT_SYMBOL(set_memory_array_uc); - -int set_memory_array_wc(unsigned long *addr, int numpages) -{ - return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_WC); -} -EXPORT_SYMBOL(set_memory_array_wc); - -int set_memory_array_wt(unsigned long *addr, int numpages) -{ - return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_WT); -} -EXPORT_SYMBOL_GPL(set_memory_array_wt); - int _set_memory_wc(unsigned long addr, int numpages) { int ret; @@ -1905,23 +1858,6 @@ int _set_memory_wt(unsigned long addr, int numpages) cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0); } -int set_memory_wt(unsigned long addr, int numpages) -{ - int ret; - - ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, - _PAGE_CACHE_MODE_WT, NULL); - if (ret) - return ret; - - ret = _set_memory_wt(addr, numpages); - if (ret) - free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); - - return ret; -} -EXPORT_SYMBOL_GPL(set_memory_wt); - int _set_memory_wb(unsigned long addr, int numpages) { /* WB cache mode is hard wired to all cache attribute bits being 0 */ @@ -1942,24 +1878,6 @@ int set_memory_wb(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_wb); -int set_memory_array_wb(unsigned long *addr, int numpages) -{ - int i; - int ret; - - /* WB cache mode is hard wired to all cache attribute bits being 0 */ - ret = change_page_attr_clear(addr, numpages, - __pgprot(_PAGE_CACHE_MASK), 1); - if (ret) - return ret; - - for (i = 0; i < numpages; i++) - free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); - - return 0; -} -EXPORT_SYMBOL(set_memory_array_wb); - int set_memory_x(unsigned long addr, int numpages) { if (!(__supported_pte_mask & _PAGE_NX)) @@ -1967,7 +1885,6 @@ int set_memory_x(unsigned long addr, int numpages) return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); } -EXPORT_SYMBOL(set_memory_x); int set_memory_nx(unsigned long addr, int numpages) { @@ -1976,7 +1893,6 @@ int set_memory_nx(unsigned long addr, int numpages) return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); } -EXPORT_SYMBOL(set_memory_nx); int set_memory_ro(unsigned long addr, int numpages) { @@ -2180,22 +2096,6 @@ int set_pages_array_wb(struct page **pages, int numpages) } EXPORT_SYMBOL(set_pages_array_wb); -int set_pages_x(struct page *page, int numpages) -{ - unsigned long addr = (unsigned long)page_address(page); - - return set_memory_x(addr, numpages); -} -EXPORT_SYMBOL(set_pages_x); - -int set_pages_nx(struct page *page, int numpages) -{ - unsigned long addr = (unsigned long)page_address(page); - - return set_memory_nx(addr, numpages); -} -EXPORT_SYMBOL(set_pages_nx); - int set_pages_ro(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 4de9704c4aaf..e6a9edc5baaf 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -440,7 +440,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); if (next != real_prev) { - load_mm_cr4(next); + load_mm_cr4_irqsoff(next); switch_ldt(real_prev, next); } } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index eaaed5bfc4a4..991549a1c5f3 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -390,8 +390,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, emit_prologue(&prog, bpf_prog->aux->stack_depth, bpf_prog_was_classic(bpf_prog)); + addrs[0] = prog - temp; - for (i = 0; i < insn_cnt; i++, insn++) { + for (i = 1; i <= insn_cnt; i++, insn++) { const s32 imm32 = insn->imm; u32 dst_reg = insn->dst_reg; u32 src_reg = insn->src_reg; @@ -1105,7 +1106,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) extra_pass = true; goto skip_init_addrs; } - addrs = kmalloc_array(prog->len, sizeof(*addrs), GFP_KERNEL); + addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); if (!addrs) { prog = orig_prog; goto out_addrs; @@ -1115,7 +1116,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) * Before first pass, make a rough estimation of addrs[] * each BPF instruction is translated to less than 64 bytes */ - for (proglen = 0, i = 0; i < prog->len; i++) { + for (proglen = 0, i = 0; i <= prog->len; i++) { proglen += 64; addrs[i] = proglen; } @@ -1180,7 +1181,7 @@ out_image: if (!image || !prog->is_func || extra_pass) { if (image) - bpf_prog_fill_jited_linfo(prog, addrs); + bpf_prog_fill_jited_linfo(prog, addrs + 1); out_addrs: kfree(addrs); kfree(jit_data); diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 7389db538c30..6fa42e9c4e6f 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -29,6 +29,7 @@ static bool pci_mmcfg_running_state; static bool pci_mmcfg_arch_init_failed; static DEFINE_MUTEX(pci_mmcfg_lock); +#define pci_mmcfg_lock_held() lock_is_held(&(pci_mmcfg_lock).dep_map) LIST_HEAD(pci_mmcfg_list); @@ -54,7 +55,7 @@ static void list_add_sorted(struct pci_mmcfg_region *new) struct pci_mmcfg_region *cfg; /* keep list sorted by segment and starting bus number */ - list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) { + list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list, pci_mmcfg_lock_held()) { if (cfg->segment > new->segment || (cfg->segment == new->segment && cfg->start_bus >= new->start_bus)) { @@ -118,7 +119,7 @@ struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus) { struct pci_mmcfg_region *cfg; - list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) + list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list, pci_mmcfg_lock_held()) if (cfg->segment == segment && cfg->start_bus <= bus && bus <= cfg->end_bus) return cfg; diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index a7189a3b4d70..c202e1b07e29 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -59,11 +59,34 @@ static efi_system_table_t efi_systab __initdata; static efi_config_table_type_t arch_tables[] __initdata = { #ifdef CONFIG_X86_UV - {UV_SYSTEM_TABLE_GUID, "UVsystab", &efi.uv_systab}, + {UV_SYSTEM_TABLE_GUID, "UVsystab", &uv_systab_phys}, #endif {NULL_GUID, NULL, NULL}, }; +static const unsigned long * const efi_tables[] = { + &efi.mps, + &efi.acpi, + &efi.acpi20, + &efi.smbios, + &efi.smbios3, + &efi.boot_info, + &efi.hcdp, + &efi.uga, +#ifdef CONFIG_X86_UV + &uv_systab_phys, +#endif + &efi.fw_vendor, + &efi.runtime, + &efi.config_table, + &efi.esrt, + &efi.properties_table, + &efi.mem_attr_table, +#ifdef CONFIG_EFI_RCI2_TABLE + &rci2_table_phys, +#endif +}; + u64 efi_setup; /* efi setup_data physical address */ static int add_efi_memmap __initdata; @@ -1049,3 +1072,17 @@ static int __init arch_parse_efi_cmdline(char *str) return 0; } early_param("efi", arch_parse_efi_cmdline); + +bool efi_is_table_address(unsigned long phys_addr) +{ + unsigned int i; + + if (phys_addr == EFI_INVALID_TABLE_ADDR) + return false; + + for (i = 0; i < ARRAY_SIZE(efi_tables); i++) + if (*(efi_tables[i]) == phys_addr) + return true; + + return false; +} diff --git a/arch/x86/platform/intel/iosf_mbi.c b/arch/x86/platform/intel/iosf_mbi.c index 2e796b54cbde..9e2444500428 100644 --- a/arch/x86/platform/intel/iosf_mbi.c +++ b/arch/x86/platform/intel/iosf_mbi.c @@ -17,6 +17,7 @@ #include <linux/debugfs.h> #include <linux/capability.h> #include <linux/pm_qos.h> +#include <linux/wait.h> #include <asm/iosf_mbi.h> @@ -201,23 +202,45 @@ EXPORT_SYMBOL(iosf_mbi_available); #define PUNIT_SEMAPHORE_BIT BIT(0) #define PUNIT_SEMAPHORE_ACQUIRE BIT(1) -static DEFINE_MUTEX(iosf_mbi_punit_mutex); -static DEFINE_MUTEX(iosf_mbi_block_punit_i2c_access_count_mutex); +static DEFINE_MUTEX(iosf_mbi_pmic_access_mutex); static BLOCKING_NOTIFIER_HEAD(iosf_mbi_pmic_bus_access_notifier); -static u32 iosf_mbi_block_punit_i2c_access_count; +static DECLARE_WAIT_QUEUE_HEAD(iosf_mbi_pmic_access_waitq); +static u32 iosf_mbi_pmic_punit_access_count; +static u32 iosf_mbi_pmic_i2c_access_count; static u32 iosf_mbi_sem_address; static unsigned long iosf_mbi_sem_acquired; static struct pm_qos_request iosf_mbi_pm_qos; void iosf_mbi_punit_acquire(void) { - mutex_lock(&iosf_mbi_punit_mutex); + /* Wait for any I2C PMIC accesses from in kernel drivers to finish. */ + mutex_lock(&iosf_mbi_pmic_access_mutex); + while (iosf_mbi_pmic_i2c_access_count != 0) { + mutex_unlock(&iosf_mbi_pmic_access_mutex); + wait_event(iosf_mbi_pmic_access_waitq, + iosf_mbi_pmic_i2c_access_count == 0); + mutex_lock(&iosf_mbi_pmic_access_mutex); + } + /* + * We do not need to do anything to allow the PUNIT to safely access + * the PMIC, other then block in kernel accesses to the PMIC. + */ + iosf_mbi_pmic_punit_access_count++; + mutex_unlock(&iosf_mbi_pmic_access_mutex); } EXPORT_SYMBOL(iosf_mbi_punit_acquire); void iosf_mbi_punit_release(void) { - mutex_unlock(&iosf_mbi_punit_mutex); + bool do_wakeup; + + mutex_lock(&iosf_mbi_pmic_access_mutex); + iosf_mbi_pmic_punit_access_count--; + do_wakeup = iosf_mbi_pmic_punit_access_count == 0; + mutex_unlock(&iosf_mbi_pmic_access_mutex); + + if (do_wakeup) + wake_up(&iosf_mbi_pmic_access_waitq); } EXPORT_SYMBOL(iosf_mbi_punit_release); @@ -256,34 +279,32 @@ static void iosf_mbi_reset_semaphore(void) * already blocked P-Unit accesses because it wants them blocked over multiple * i2c-transfers, for e.g. read-modify-write of an I2C client register. * - * The P-Unit accesses already being blocked is tracked through the - * iosf_mbi_block_punit_i2c_access_count variable which is protected by the - * iosf_mbi_block_punit_i2c_access_count_mutex this mutex is hold for the - * entire duration of the function. - * - * If access is not blocked yet, this function takes the following steps: + * To allow safe PMIC i2c bus accesses this function takes the following steps: * * 1) Some code sends request to the P-Unit which make it access the PMIC * I2C bus. Testing has shown that the P-Unit does not check its internal * PMIC bus semaphore for these requests. Callers of these requests call * iosf_mbi_punit_acquire()/_release() around their P-Unit accesses, these - * functions lock/unlock the iosf_mbi_punit_mutex. - * As the first step we lock the iosf_mbi_punit_mutex, to wait for any in - * flight requests to finish and to block any new requests. + * functions increase/decrease iosf_mbi_pmic_punit_access_count, so first + * we wait for iosf_mbi_pmic_punit_access_count to become 0. + * + * 2) Check iosf_mbi_pmic_i2c_access_count, if access has already + * been blocked by another caller, we only need to increment + * iosf_mbi_pmic_i2c_access_count and we can skip the other steps. * - * 2) Some code makes such P-Unit requests from atomic contexts where it + * 3) Some code makes such P-Unit requests from atomic contexts where it * cannot call iosf_mbi_punit_acquire() as that may sleep. * As the second step we call a notifier chain which allows any code * needing P-Unit resources from atomic context to acquire them before * we take control over the PMIC I2C bus. * - * 3) When CPU cores enter C6 or C7 the P-Unit needs to talk to the PMIC + * 4) When CPU cores enter C6 or C7 the P-Unit needs to talk to the PMIC * if this happens while the kernel itself is accessing the PMIC I2C bus * the SoC hangs. * As the third step we call pm_qos_update_request() to disallow the CPU * to enter C6 or C7. * - * 4) The P-Unit has a PMIC bus semaphore which we can request to stop + * 5) The P-Unit has a PMIC bus semaphore which we can request to stop * autonomous P-Unit tasks from accessing the PMIC I2C bus while we hold it. * As the fourth and final step we request this semaphore and wait for our * request to be acknowledged. @@ -297,12 +318,18 @@ int iosf_mbi_block_punit_i2c_access(void) if (WARN_ON(!mbi_pdev || !iosf_mbi_sem_address)) return -ENXIO; - mutex_lock(&iosf_mbi_block_punit_i2c_access_count_mutex); + mutex_lock(&iosf_mbi_pmic_access_mutex); - if (iosf_mbi_block_punit_i2c_access_count > 0) + while (iosf_mbi_pmic_punit_access_count != 0) { + mutex_unlock(&iosf_mbi_pmic_access_mutex); + wait_event(iosf_mbi_pmic_access_waitq, + iosf_mbi_pmic_punit_access_count == 0); + mutex_lock(&iosf_mbi_pmic_access_mutex); + } + + if (iosf_mbi_pmic_i2c_access_count > 0) goto success; - mutex_lock(&iosf_mbi_punit_mutex); blocking_notifier_call_chain(&iosf_mbi_pmic_bus_access_notifier, MBI_PMIC_BUS_ACCESS_BEGIN, NULL); @@ -330,10 +357,6 @@ int iosf_mbi_block_punit_i2c_access(void) iosf_mbi_sem_acquired = jiffies; dev_dbg(&mbi_pdev->dev, "P-Unit semaphore acquired after %ums\n", jiffies_to_msecs(jiffies - start)); - /* - * Success, keep iosf_mbi_punit_mutex locked till - * iosf_mbi_unblock_punit_i2c_access() gets called. - */ goto success; } @@ -344,15 +367,13 @@ int iosf_mbi_block_punit_i2c_access(void) dev_err(&mbi_pdev->dev, "Error P-Unit semaphore timed out, resetting\n"); error: iosf_mbi_reset_semaphore(); - mutex_unlock(&iosf_mbi_punit_mutex); - if (!iosf_mbi_get_sem(&sem)) dev_err(&mbi_pdev->dev, "P-Unit semaphore: %d\n", sem); success: if (!WARN_ON(ret)) - iosf_mbi_block_punit_i2c_access_count++; + iosf_mbi_pmic_i2c_access_count++; - mutex_unlock(&iosf_mbi_block_punit_i2c_access_count_mutex); + mutex_unlock(&iosf_mbi_pmic_access_mutex); return ret; } @@ -360,17 +381,20 @@ EXPORT_SYMBOL(iosf_mbi_block_punit_i2c_access); void iosf_mbi_unblock_punit_i2c_access(void) { - mutex_lock(&iosf_mbi_block_punit_i2c_access_count_mutex); + bool do_wakeup = false; - iosf_mbi_block_punit_i2c_access_count--; - if (iosf_mbi_block_punit_i2c_access_count == 0) { + mutex_lock(&iosf_mbi_pmic_access_mutex); + iosf_mbi_pmic_i2c_access_count--; + if (iosf_mbi_pmic_i2c_access_count == 0) { iosf_mbi_reset_semaphore(); - mutex_unlock(&iosf_mbi_punit_mutex); dev_dbg(&mbi_pdev->dev, "punit semaphore held for %ums\n", jiffies_to_msecs(jiffies - iosf_mbi_sem_acquired)); + do_wakeup = true; } + mutex_unlock(&iosf_mbi_pmic_access_mutex); - mutex_unlock(&iosf_mbi_block_punit_i2c_access_count_mutex); + if (do_wakeup) + wake_up(&iosf_mbi_pmic_access_waitq); } EXPORT_SYMBOL(iosf_mbi_unblock_punit_i2c_access); @@ -379,10 +403,10 @@ int iosf_mbi_register_pmic_bus_access_notifier(struct notifier_block *nb) int ret; /* Wait for the bus to go inactive before registering */ - mutex_lock(&iosf_mbi_punit_mutex); + iosf_mbi_punit_acquire(); ret = blocking_notifier_chain_register( &iosf_mbi_pmic_bus_access_notifier, nb); - mutex_unlock(&iosf_mbi_punit_mutex); + iosf_mbi_punit_release(); return ret; } @@ -403,9 +427,9 @@ int iosf_mbi_unregister_pmic_bus_access_notifier(struct notifier_block *nb) int ret; /* Wait for the bus to go inactive before unregistering */ - mutex_lock(&iosf_mbi_punit_mutex); + iosf_mbi_punit_acquire(); ret = iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(nb); - mutex_unlock(&iosf_mbi_punit_mutex); + iosf_mbi_punit_release(); return ret; } @@ -413,7 +437,7 @@ EXPORT_SYMBOL(iosf_mbi_unregister_pmic_bus_access_notifier); void iosf_mbi_assert_punit_acquired(void) { - WARN_ON(!mutex_is_locked(&iosf_mbi_punit_mutex)); + WARN_ON(iosf_mbi_pmic_punit_access_count == 0); } EXPORT_SYMBOL(iosf_mbi_assert_punit_acquired); diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 7c69652ffeea..c2ee31953372 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -14,6 +14,8 @@ #include <asm/uv/bios.h> #include <asm/uv/uv_hub.h> +unsigned long uv_systab_phys __ro_after_init = EFI_INVALID_TABLE_ADDR; + struct uv_systab *uv_systab; static s64 __uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, @@ -185,13 +187,13 @@ EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); void uv_bios_init(void) { uv_systab = NULL; - if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || - !efi.uv_systab || efi_runtime_disabled()) { + if ((uv_systab_phys == EFI_INVALID_TABLE_ADDR) || + !uv_systab_phys || efi_runtime_disabled()) { pr_crit("UV: UVsystab: missing\n"); return; } - uv_systab = ioremap(efi.uv_systab, sizeof(struct uv_systab)); + uv_systab = ioremap(uv_systab_phys, sizeof(struct uv_systab)); if (!uv_systab || strncmp(uv_systab->signature, UV_SYSTAB_SIG, 4)) { pr_err("UV: UVsystab: bad signature!\n"); iounmap(uv_systab); @@ -203,7 +205,7 @@ void uv_bios_init(void) int size = uv_systab->size; iounmap(uv_systab); - uv_systab = ioremap(efi.uv_systab, size); + uv_systab = ioremap(uv_systab_phys, size); if (!uv_systab) { pr_err("UV: UVsystab: ioremap(%d) failed!\n", size); return; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 20c389a91b80..5f0a96bf27a1 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1804,9 +1804,9 @@ static void pq_init(int node, int pnode) plsize = (DEST_Q_SIZE + 1) * sizeof(struct bau_pq_entry); vp = kmalloc_node(plsize, GFP_KERNEL, node); - pqp = (struct bau_pq_entry *)vp; - BUG_ON(!pqp); + BUG_ON(!vp); + pqp = (struct bau_pq_entry *)vp; cp = (char *)pqp + 31; pqp = (struct bau_pq_entry *)(((unsigned long)cp >> 5) << 5); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 24b079e94bc2..c9ef6a7a4a1a 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -12,6 +12,7 @@ #include <linux/smp.h> #include <linux/perf_event.h> #include <linux/tboot.h> +#include <linux/dmi.h> #include <asm/pgtable.h> #include <asm/proto.h> @@ -23,7 +24,7 @@ #include <asm/debugreg.h> #include <asm/cpu.h> #include <asm/mmu_context.h> -#include <linux/dmi.h> +#include <asm/cpu_device_id.h> #ifdef CONFIG_X86_32 __visible unsigned long saved_context_ebx; @@ -397,15 +398,14 @@ static int __init bsp_pm_check_init(void) core_initcall(bsp_pm_check_init); -static int msr_init_context(const u32 *msr_id, const int total_num) +static int msr_build_context(const u32 *msr_id, const int num) { - int i = 0; + struct saved_msrs *saved_msrs = &saved_context.saved_msrs; struct saved_msr *msr_array; + int total_num; + int i, j; - if (saved_context.saved_msrs.array || saved_context.saved_msrs.num > 0) { - pr_err("x86/pm: MSR quirk already applied, please check your DMI match table.\n"); - return -EINVAL; - } + total_num = saved_msrs->num + num; msr_array = kmalloc_array(total_num, sizeof(struct saved_msr), GFP_KERNEL); if (!msr_array) { @@ -413,19 +413,30 @@ static int msr_init_context(const u32 *msr_id, const int total_num) return -ENOMEM; } - for (i = 0; i < total_num; i++) { - msr_array[i].info.msr_no = msr_id[i]; + if (saved_msrs->array) { + /* + * Multiple callbacks can invoke this function, so copy any + * MSR save requests from previous invocations. + */ + memcpy(msr_array, saved_msrs->array, + sizeof(struct saved_msr) * saved_msrs->num); + + kfree(saved_msrs->array); + } + + for (i = saved_msrs->num, j = 0; i < total_num; i++, j++) { + msr_array[i].info.msr_no = msr_id[j]; msr_array[i].valid = false; msr_array[i].info.reg.q = 0; } - saved_context.saved_msrs.num = total_num; - saved_context.saved_msrs.array = msr_array; + saved_msrs->num = total_num; + saved_msrs->array = msr_array; return 0; } /* - * The following section is a quirk framework for problematic BIOSen: + * The following sections are a quirk framework for problematic BIOSen: * Sometimes MSRs are modified by the BIOSen after suspended to * RAM, this might cause unexpected behavior after wakeup. * Thus we save/restore these specified MSRs across suspend/resume @@ -440,7 +451,7 @@ static int msr_initialize_bdw(const struct dmi_system_id *d) u32 bdw_msr_id[] = { MSR_IA32_THERM_CONTROL }; pr_info("x86/pm: %s detected, MSR saving is needed during suspending.\n", d->ident); - return msr_init_context(bdw_msr_id, ARRAY_SIZE(bdw_msr_id)); + return msr_build_context(bdw_msr_id, ARRAY_SIZE(bdw_msr_id)); } static const struct dmi_system_id msr_save_dmi_table[] = { @@ -455,9 +466,58 @@ static const struct dmi_system_id msr_save_dmi_table[] = { {} }; +static int msr_save_cpuid_features(const struct x86_cpu_id *c) +{ + u32 cpuid_msr_id[] = { + MSR_AMD64_CPUID_FN_1, + }; + + pr_info("x86/pm: family %#hx cpu detected, MSR saving is needed during suspending.\n", + c->family); + + return msr_build_context(cpuid_msr_id, ARRAY_SIZE(cpuid_msr_id)); +} + +static const struct x86_cpu_id msr_save_cpu_table[] = { + { + .vendor = X86_VENDOR_AMD, + .family = 0x15, + .model = X86_MODEL_ANY, + .feature = X86_FEATURE_ANY, + .driver_data = (kernel_ulong_t)msr_save_cpuid_features, + }, + { + .vendor = X86_VENDOR_AMD, + .family = 0x16, + .model = X86_MODEL_ANY, + .feature = X86_FEATURE_ANY, + .driver_data = (kernel_ulong_t)msr_save_cpuid_features, + }, + {} +}; + +typedef int (*pm_cpu_match_t)(const struct x86_cpu_id *); +static int pm_cpu_check(const struct x86_cpu_id *c) +{ + const struct x86_cpu_id *m; + int ret = 0; + + m = x86_match_cpu(msr_save_cpu_table); + if (m) { + pm_cpu_match_t fn; + + fn = (pm_cpu_match_t)m->driver_data; + ret = fn(m); + } + + return ret; +} + static int pm_check_save_msr(void) { dmi_check_system(msr_save_dmi_table); + pm_cpu_check(msr_save_cpu_table); + return 0; } diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 3cf302b26332..10fb42da0007 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -6,6 +6,9 @@ purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string targets += $(purgatory-y) PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) +$(obj)/string.o: $(srctree)/arch/x86/boot/compressed/string.c FORCE + $(call if_changed_rule,cc_o_c) + $(obj)/sha256.o: $(srctree)/lib/sha256.c FORCE $(call if_changed_rule,cc_o_c) @@ -15,13 +18,39 @@ targets += purgatory.ro KASAN_SANITIZE := n KCOV_INSTRUMENT := n +# These are adjustments to the compiler flags used for objects that +# make up the standalone purgatory.ro + +PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel +PURGATORY_CFLAGS := -mcmodel=large -ffreestanding -fno-zero-initialized-in-bss + # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That # in turn leaves some undefined symbols like __fentry__ in purgatory and not -# sure how to relocate those. Like kexec-tools, use custom flags. +# sure how to relocate those. +ifdef CONFIG_FUNCTION_TRACER +PURGATORY_CFLAGS_REMOVE += $(CC_FLAGS_FTRACE) +endif + +ifdef CONFIG_STACKPROTECTOR +PURGATORY_CFLAGS_REMOVE += -fstack-protector +endif + +ifdef CONFIG_STACKPROTECTOR_STRONG +PURGATORY_CFLAGS_REMOVE += -fstack-protector-strong +endif + +ifdef CONFIG_RETPOLINE +PURGATORY_CFLAGS_REMOVE += $(RETPOLINE_CFLAGS) +endif + +CFLAGS_REMOVE_purgatory.o += $(PURGATORY_CFLAGS_REMOVE) +CFLAGS_purgatory.o += $(PURGATORY_CFLAGS) + +CFLAGS_REMOVE_sha256.o += $(PURGATORY_CFLAGS_REMOVE) +CFLAGS_sha256.o += $(PURGATORY_CFLAGS) -KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fno-builtin -ffreestanding -c -Os -mcmodel=large -KBUILD_CFLAGS += -m$(BITS) -KBUILD_CFLAGS += $(call cc-option,-fno-PIE) +CFLAGS_REMOVE_string.o += $(PURGATORY_CFLAGS_REMOVE) +CFLAGS_string.o += $(PURGATORY_CFLAGS) $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE $(call if_changed,ld) diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index 6d8d5a34c377..b607bda786f6 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -68,3 +68,9 @@ void purgatory(void) } copy_backup_region(); } + +/* + * Defined in order to reuse memcpy() and memset() from + * arch/x86/boot/compressed/string.c + */ +void warn(const char *msg) {} diff --git a/arch/x86/purgatory/string.c b/arch/x86/purgatory/string.c deleted file mode 100644 index 01ad43873ad9..000000000000 --- a/arch/x86/purgatory/string.c +++ /dev/null @@ -1,23 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Simple string functions. - * - * Copyright (C) 2014 Red Hat Inc. - * - * Author: - * Vivek Goyal <vgoyal@redhat.com> - */ - -#include <linux/types.h> - -#include "../boot/string.c" - -void *memcpy(void *dst, const void *src, size_t len) -{ - return __builtin_memcpy(dst, src, len); -} - -void *memset(void *dst, int c, size_t len) -{ - return __builtin_memset(dst, c, len); -} diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index 30b0d30d861a..6363761cc74c 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -19,7 +19,6 @@ GLOBAL(real_mode_header) .long pa_ro_end /* SMP trampoline */ .long pa_trampoline_start - .long pa_trampoline_status .long pa_trampoline_header #ifdef CONFIG_X86_64 .long pa_trampoline_pgd; diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S index 2dd866c9e21e..1868b158480d 100644 --- a/arch/x86/realmode/rm/trampoline_32.S +++ b/arch/x86/realmode/rm/trampoline_32.S @@ -41,9 +41,6 @@ ENTRY(trampoline_start) movl tr_start, %eax # where we need to go - movl $0xA5A5A5A5, trampoline_status - # write marker for master knows we're running - /* * GDT tables in non default location kernel can be beyond 16MB and * lgdt will not be able to load the address as in real mode default diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 24bb7598774e..aee2b45d83b8 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -49,9 +49,6 @@ ENTRY(trampoline_start) mov %ax, %es mov %ax, %ss - movl $0xA5A5A5A5, trampoline_status - # write marker for master knows we're running - # Setup stack movl $rm_stack_end, %esp diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S index 7c706772ab59..8d8208dcca24 100644 --- a/arch/x86/realmode/rm/trampoline_common.S +++ b/arch/x86/realmode/rm/trampoline_common.S @@ -2,7 +2,3 @@ .section ".rodata","a" .balign 16 tr_idt: .fill 1, 6, 0 - - .bss - .balign 4 -GLOBAL(trampoline_status) .space 4 |