summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/boot/compressed/acpi.c2
-rw-r--r--arch/x86/coco/tdx/tdx.c1
-rw-r--r--arch/x86/entry/common.c93
-rw-r--r--arch/x86/entry/entry_64_compat.S77
-rw-r--r--arch/x86/events/intel/core.c2
-rw-r--r--arch/x86/hyperv/hv_init.c25
-rw-r--r--arch/x86/include/asm/acpi.h14
-rw-r--r--arch/x86/include/asm/ia32.h7
-rw-r--r--arch/x86/include/asm/idtentry.h4
-rw-r--r--arch/x86/include/asm/proto.h4
-rw-r--r--arch/x86/include/asm/syscall_wrapper.h34
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h9
-rw-r--r--arch/x86/kernel/acpi/boot.c34
-rw-r--r--arch/x86/kernel/alternative.c14
-rw-r--r--arch/x86/kernel/cpu/amd.c3
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c39
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c15
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c17
-rw-r--r--arch/x86/kernel/cpu/microcode/internal.h14
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c5
-rw-r--r--arch/x86/kernel/head_64.S16
-rw-r--r--arch/x86/kernel/idt.c2
-rw-r--r--arch/x86/kernel/sev.c11
-rw-r--r--arch/x86/kernel/signal_64.c6
-rw-r--r--arch/x86/kvm/Kconfig7
-rw-r--r--arch/x86/kvm/cpuid.c4
-rw-r--r--arch/x86/kvm/debugfs.c1
-rw-r--r--arch/x86/kvm/svm/sev.c19
-rw-r--r--arch/x86/kvm/svm/svm.c9
-rw-r--r--arch/x86/kvm/svm/svm.h2
-rw-r--r--arch/x86/kvm/x86.c9
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c11
-rw-r--r--arch/x86/net/bpf_jit_comp.c46
-rw-r--r--arch/x86/xen/Kconfig1
-rw-r--r--arch/x86/xen/enlighten.c6
-rw-r--r--arch/x86/xen/enlighten_pv.c2
-rw-r--r--arch/x86/xen/xen-asm.S2
-rw-r--r--arch/x86/xen/xen-ops.h2
38 files changed, 355 insertions, 214 deletions
diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
index 55c98fdd67d2..18d15d1ce87d 100644
--- a/arch/x86/boot/compressed/acpi.c
+++ b/arch/x86/boot/compressed/acpi.c
@@ -178,7 +178,7 @@ static unsigned long get_cmdline_acpi_rsdp(void)
{
unsigned long addr = 0;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
char val[MAX_ADDR_LEN] = { };
int ret;
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 1b5d17a9f70d..cf1f13c82175 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -10,6 +10,7 @@
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
+#include <asm/ia32.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/pgtable.h>
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index d813160b14d8..6356060caaf3 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -26,6 +26,7 @@
#include <xen/events.h>
#endif
+#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
@@ -167,7 +168,96 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
}
}
-/* Handles int $0x80 */
+#ifdef CONFIG_IA32_EMULATION
+static __always_inline bool int80_is_external(void)
+{
+ const unsigned int offs = (0x80 / 32) * 0x10;
+ const u32 bit = BIT(0x80 % 32);
+
+ /* The local APIC on XENPV guests is fake */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /*
+ * If vector 0x80 is set in the APIC ISR then this is an external
+ * interrupt. Either from broken hardware or injected by a VMM.
+ *
+ * Note: In guest mode this is only valid for secure guests where
+ * the secure module fully controls the vAPIC exposed to the guest.
+ */
+ return apic_read(APIC_ISR + offs) & bit;
+}
+
+/**
+ * int80_emulation - 32-bit legacy syscall entry
+ *
+ * This entry point can be used by 32-bit and 64-bit programs to perform
+ * 32-bit system calls. Instances of INT $0x80 can be found inline in
+ * various programs and libraries. It is also used by the vDSO's
+ * __kernel_vsyscall fallback for hardware that doesn't support a faster
+ * entry method. Restarted 32-bit system calls also fall back to INT
+ * $0x80 regardless of what instruction was originally used to do the
+ * system call.
+ *
+ * This is considered a slow path. It is not used by most libc
+ * implementations on modern hardware except during process startup.
+ *
+ * The arguments for the INT $0x80 based syscall are on stack in the
+ * pt_regs structure:
+ * eax: system call number
+ * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6
+ */
+DEFINE_IDTENTRY_RAW(int80_emulation)
+{
+ int nr;
+
+ /* Kernel does not use INT $0x80! */
+ if (unlikely(!user_mode(regs))) {
+ irqentry_enter(regs);
+ instrumentation_begin();
+ panic("Unexpected external interrupt 0x80\n");
+ }
+
+ /*
+ * Establish kernel context for instrumentation, including for
+ * int80_is_external() below which calls into the APIC driver.
+ * Identical for soft and external interrupts.
+ */
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ add_random_kstack_offset();
+
+ /* Validate that this is a soft interrupt to the extent possible */
+ if (unlikely(int80_is_external()))
+ panic("Unexpected external interrupt 0x80\n");
+
+ /*
+ * The low level idtentry code pushed -1 into regs::orig_ax
+ * and regs::ax contains the syscall number.
+ *
+ * User tracing code (ptrace or signal handlers) might assume
+ * that the regs::orig_ax contains a 32-bit number on invoking
+ * a 32-bit syscall.
+ *
+ * Establish the syscall convention by saving the 32bit truncated
+ * syscall number in regs::orig_ax and by invalidating regs::ax.
+ */
+ regs->orig_ax = regs->ax & GENMASK(31, 0);
+ regs->ax = -ENOSYS;
+
+ nr = syscall_32_enter(regs);
+
+ local_irq_enable();
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+#else /* CONFIG_IA32_EMULATION */
+
+/* Handles int $0x80 on a 32bit kernel */
__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
int nr = syscall_32_enter(regs);
@@ -186,6 +276,7 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
instrumentation_end();
syscall_exit_to_user_mode(regs);
}
+#endif /* !CONFIG_IA32_EMULATION */
static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 27c05d08558a..de94e2e84ecc 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -275,80 +275,3 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
int3
SYM_CODE_END(entry_SYSCALL_compat)
-
-/*
- * 32-bit legacy system call entry.
- *
- * 32-bit x86 Linux system calls traditionally used the INT $0x80
- * instruction. INT $0x80 lands here.
- *
- * This entry point can be used by 32-bit and 64-bit programs to perform
- * 32-bit system calls. Instances of INT $0x80 can be found inline in
- * various programs and libraries. It is also used by the vDSO's
- * __kernel_vsyscall fallback for hardware that doesn't support a faster
- * entry method. Restarted 32-bit system calls also fall back to INT
- * $0x80 regardless of what instruction was originally used to do the
- * system call.
- *
- * This is considered a slow path. It is not used by most libc
- * implementations on modern hardware except during process startup.
- *
- * Arguments:
- * eax system call number
- * ebx arg1
- * ecx arg2
- * edx arg3
- * esi arg4
- * edi arg5
- * ebp arg6
- */
-SYM_CODE_START(entry_INT80_compat)
- UNWIND_HINT_ENTRY
- ENDBR
- /*
- * Interrupts are off on entry.
- */
- ASM_CLAC /* Do this early to minimize exposure */
- ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV
-
- /*
- * User tracing code (ptrace or signal handlers) might assume that
- * the saved RAX contains a 32-bit number when we're invoking a 32-bit
- * syscall. Just in case the high bits are nonzero, zero-extend
- * the syscall number. (This could almost certainly be deleted
- * with no ill effects.)
- */
- movl %eax, %eax
-
- /* switch to thread stack expects orig_ax and rdi to be pushed */
- pushq %rax /* pt_regs->orig_ax */
-
- /* Need to switch before accessing the thread stack. */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
-
- /* In the Xen PV case we already run on the thread stack. */
- ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
-
- movq %rsp, %rax
- movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
-
- pushq 5*8(%rax) /* regs->ss */
- pushq 4*8(%rax) /* regs->rsp */
- pushq 3*8(%rax) /* regs->eflags */
- pushq 2*8(%rax) /* regs->cs */
- pushq 1*8(%rax) /* regs->ip */
- pushq 0*8(%rax) /* regs->orig_ax */
-.Lint80_keep_stack:
-
- PUSH_AND_CLEAR_REGS rax=$-ENOSYS
- UNWIND_HINT_REGS
-
- cld
-
- IBRS_ENTER
- UNTRAIN_RET
-
- movq %rsp, %rdi
- call do_int80_syscall_32
- jmp swapgs_restore_regs_and_return_to_usermode
-SYM_CODE_END(entry_INT80_compat)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index a08f794a0e79..ce1c777227b4 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4660,7 +4660,7 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu)
if (pmu->intel_cap.pebs_output_pt_available)
pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT;
else
- pmu->pmu.capabilities |= ~PERF_PMU_CAP_AUX_OUTPUT;
+ pmu->pmu.capabilities &= ~PERF_PMU_CAP_AUX_OUTPUT;
intel_pmu_check_event_constraints(pmu->event_constraints,
pmu->num_counters,
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 21556ad87f4b..8f3a4d16bb79 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -15,6 +15,7 @@
#include <linux/io.h>
#include <asm/apic.h>
#include <asm/desc.h>
+#include <asm/e820/api.h>
#include <asm/sev.h>
#include <asm/ibt.h>
#include <asm/hypervisor.h>
@@ -286,15 +287,31 @@ static int hv_cpu_die(unsigned int cpu)
static int __init hv_pci_init(void)
{
- int gen2vm = efi_enabled(EFI_BOOT);
+ bool gen2vm = efi_enabled(EFI_BOOT);
/*
- * For Generation-2 VM, we exit from pci_arch_init() by returning 0.
- * The purpose is to suppress the harmless warning:
+ * A Generation-2 VM doesn't support legacy PCI/PCIe, so both
+ * raw_pci_ops and raw_pci_ext_ops are NULL, and pci_subsys_init() ->
+ * pcibios_init() doesn't call pcibios_resource_survey() ->
+ * e820__reserve_resources_late(); as a result, any emulated persistent
+ * memory of E820_TYPE_PRAM (12) via the kernel parameter
+ * memmap=nn[KMG]!ss is not added into iomem_resource and hence can't be
+ * detected by register_e820_pmem(). Fix this by directly calling
+ * e820__reserve_resources_late() here: e820__reserve_resources_late()
+ * depends on e820__reserve_resources(), which has been called earlier
+ * from setup_arch(). Note: e820__reserve_resources_late() also adds
+ * any memory of E820_TYPE_PMEM (7) into iomem_resource, and
+ * acpi_nfit_register_region() -> acpi_nfit_insert_resource() ->
+ * region_intersects() returns REGION_INTERSECTS, so the memory of
+ * E820_TYPE_PMEM won't get added twice.
+ *
+ * We return 0 here so that pci_arch_init() won't print the warning:
* "PCI: Fatal: No config space access function found"
*/
- if (gen2vm)
+ if (gen2vm) {
+ e820__reserve_resources_late();
return 0;
+ }
/* For Generation-1 VM, we'll proceed in pci_arch_init(). */
return 1;
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index c8a7fc23f63c..f896eed4516c 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -16,6 +16,9 @@
#include <asm/x86_init.h>
#include <asm/cpufeature.h>
#include <asm/irq_vectors.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
#ifdef CONFIG_ACPI_APEI
# include <asm/pgtable_types.h>
@@ -127,6 +130,17 @@ static inline void arch_acpi_set_proc_cap_bits(u32 *cap)
if (!cpu_has(c, X86_FEATURE_MWAIT) ||
boot_option_idle_override == IDLE_NOMWAIT)
*cap &= ~(ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH);
+
+ if (xen_initial_domain()) {
+ /*
+ * When Linux is running as Xen dom0, the hypervisor is the
+ * entity in charge of the processor power management, and so
+ * Xen needs to check the OS capabilities reported in the
+ * processor capabilities buffer matches what the hypervisor
+ * driver supports.
+ */
+ xen_sanitize_proc_cap_bits(cap);
+ }
}
static inline bool acpi_has_cpu_in_madt(void)
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index 5a2ae24b1204..9805629479d9 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -75,6 +75,11 @@ static inline bool ia32_enabled(void)
return __ia32_enabled;
}
+static inline void ia32_disable(void)
+{
+ __ia32_enabled = false;
+}
+
#else /* !CONFIG_IA32_EMULATION */
static inline bool ia32_enabled(void)
@@ -82,6 +87,8 @@ static inline bool ia32_enabled(void)
return IS_ENABLED(CONFIG_X86_32);
}
+static inline void ia32_disable(void) {}
+
#endif
#endif /* _ASM_X86_IA32_H */
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 05fd175cec7d..13639e57e1f8 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -569,6 +569,10 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_UD, exc_invalid_op);
DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3);
DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF, exc_page_fault);
+#if defined(CONFIG_IA32_EMULATION)
+DECLARE_IDTENTRY_RAW(IA32_SYSCALL_VECTOR, int80_emulation);
+#endif
+
#ifdef CONFIG_X86_MCE
#ifdef CONFIG_X86_64
DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check);
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 4d84122bd643..484f4f0131a5 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -32,10 +32,6 @@ void entry_SYSCALL_compat(void);
void entry_SYSCALL_compat_safe_stack(void);
void entry_SYSRETL_compat_unsafe_stack(void);
void entry_SYSRETL_compat_end(void);
-void entry_INT80_compat(void);
-#ifdef CONFIG_XEN_PV
-void xen_entry_INT80_compat(void);
-#endif
#else /* !CONFIG_IA32_EMULATION */
#define entry_SYSCALL_compat NULL
#define entry_SYSENTER_compat NULL
diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h
index fd2669b1cb2d..21f9407be5d3 100644
--- a/arch/x86/include/asm/syscall_wrapper.h
+++ b/arch/x86/include/asm/syscall_wrapper.h
@@ -86,9 +86,6 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
return sys_ni_syscall(); \
}
-#define __SYS_NI(abi, name) \
- SYSCALL_ALIAS(__##abi##_##name, sys_ni_posix_timers);
-
#ifdef CONFIG_X86_64
#define __X64_SYS_STUB0(name) \
__SYS_STUB0(x64, sys_##name)
@@ -100,13 +97,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
#define __X64_COND_SYSCALL(name) \
__COND_SYSCALL(x64, sys_##name)
-#define __X64_SYS_NI(name) \
- __SYS_NI(x64, sys_##name)
#else /* CONFIG_X86_64 */
#define __X64_SYS_STUB0(name)
#define __X64_SYS_STUBx(x, name, ...)
#define __X64_COND_SYSCALL(name)
-#define __X64_SYS_NI(name)
#endif /* CONFIG_X86_64 */
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
@@ -120,13 +114,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
#define __IA32_COND_SYSCALL(name) \
__COND_SYSCALL(ia32, sys_##name)
-#define __IA32_SYS_NI(name) \
- __SYS_NI(ia32, sys_##name)
#else /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
#define __IA32_SYS_STUB0(name)
#define __IA32_SYS_STUBx(x, name, ...)
#define __IA32_COND_SYSCALL(name)
-#define __IA32_SYS_NI(name)
#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
#ifdef CONFIG_IA32_EMULATION
@@ -135,8 +126,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
* additional wrappers (aptly named __ia32_sys_xyzzy) which decode the
* ia32 regs in the proper order for shared or "common" syscalls. As some
* syscalls may not be implemented, we need to expand COND_SYSCALL in
- * kernel/sys_ni.c and SYS_NI in kernel/time/posix-stubs.c to cover this
- * case as well.
+ * kernel/sys_ni.c to cover this case as well.
*/
#define __IA32_COMPAT_SYS_STUB0(name) \
__SYS_STUB0(ia32, compat_sys_##name)
@@ -148,14 +138,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
#define __IA32_COMPAT_COND_SYSCALL(name) \
__COND_SYSCALL(ia32, compat_sys_##name)
-#define __IA32_COMPAT_SYS_NI(name) \
- __SYS_NI(ia32, compat_sys_##name)
-
#else /* CONFIG_IA32_EMULATION */
#define __IA32_COMPAT_SYS_STUB0(name)
#define __IA32_COMPAT_SYS_STUBx(x, name, ...)
#define __IA32_COMPAT_COND_SYSCALL(name)
-#define __IA32_COMPAT_SYS_NI(name)
#endif /* CONFIG_IA32_EMULATION */
@@ -175,13 +161,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
#define __X32_COMPAT_COND_SYSCALL(name) \
__COND_SYSCALL(x64, compat_sys_##name)
-#define __X32_COMPAT_SYS_NI(name) \
- __SYS_NI(x64, compat_sys_##name)
#else /* CONFIG_X86_X32_ABI */
#define __X32_COMPAT_SYS_STUB0(name)
#define __X32_COMPAT_SYS_STUBx(x, name, ...)
#define __X32_COMPAT_COND_SYSCALL(name)
-#define __X32_COMPAT_SYS_NI(name)
#endif /* CONFIG_X86_X32_ABI */
@@ -212,17 +195,12 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
/*
* As some compat syscalls may not be implemented, we need to expand
- * COND_SYSCALL_COMPAT in kernel/sys_ni.c and COMPAT_SYS_NI in
- * kernel/time/posix-stubs.c to cover this case as well.
+ * COND_SYSCALL_COMPAT in kernel/sys_ni.c to cover this case as well.
*/
#define COND_SYSCALL_COMPAT(name) \
__IA32_COMPAT_COND_SYSCALL(name) \
__X32_COMPAT_COND_SYSCALL(name)
-#define COMPAT_SYS_NI(name) \
- __IA32_COMPAT_SYS_NI(name) \
- __X32_COMPAT_SYS_NI(name)
-
#endif /* CONFIG_COMPAT */
#define __SYSCALL_DEFINEx(x, name, ...) \
@@ -243,8 +221,8 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
* As the generic SYSCALL_DEFINE0() macro does not decode any parameters for
* obvious reasons, and passing struct pt_regs *regs to it in %rdi does not
* hurt, we only need to re-define it here to keep the naming congruent to
- * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() and SYS_NI()
- * macros to work correctly.
+ * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() macro
+ * to work correctly.
*/
#define SYSCALL_DEFINE0(sname) \
SYSCALL_METADATA(_##sname, 0); \
@@ -257,10 +235,6 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
__X64_COND_SYSCALL(name) \
__IA32_COND_SYSCALL(name)
-#define SYS_NI(name) \
- __X64_SYS_NI(name) \
- __IA32_SYS_NI(name)
-
/*
* For VSYSCALLS, we need to declare these three syscalls with the new
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 7048dfacc04b..a9088250770f 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -100,4 +100,13 @@ static inline void leave_lazy(enum xen_lazy_mode mode)
enum xen_lazy_mode xen_get_lazy_mode(void);
+#if defined(CONFIG_XEN_DOM0) && defined(CONFIG_ACPI)
+void xen_sanitize_proc_cap_bits(uint32_t *buf);
+#else
+static inline void xen_sanitize_proc_cap_bits(uint32_t *buf)
+{
+ BUG();
+}
+#endif
+
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index d0918a75cb00..85a3ce2a3666 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -63,6 +63,7 @@ int acpi_fix_pin2_polarity __initdata;
#ifdef CONFIG_X86_LOCAL_APIC
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
+static bool has_lapic_cpus __initdata;
static bool acpi_support_online_capable;
#endif
@@ -233,6 +234,14 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end)
return 0;
/*
+ * According to https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#processor-local-x2apic-structure
+ * when MADT provides both valid LAPIC and x2APIC entries, the APIC ID
+ * in x2APIC must be equal or greater than 0xff.
+ */
+ if (has_lapic_cpus && apic_id < 0xff)
+ return 0;
+
+ /*
* We need to register disabled CPU as well to permit
* counting disabled CPUs. This allows us to size
* cpus_possible_map more accurately, to permit
@@ -284,6 +293,7 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end)
processor->processor_id, /* ACPI ID */
processor->lapic_flags & ACPI_MADT_ENABLED);
+ has_lapic_cpus = true;
return 0;
}
@@ -1114,10 +1124,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
static int __init acpi_parse_madt_lapic_entries(void)
{
- int count;
- int x2count = 0;
- int ret;
- struct acpi_subtable_proc madt_proc[2];
+ int count, x2count = 0;
if (!boot_cpu_has(X86_FEATURE_APIC))
return -ENODEV;
@@ -1126,21 +1133,10 @@ static int __init acpi_parse_madt_lapic_entries(void)
acpi_parse_sapic, MAX_LOCAL_APIC);
if (!count) {
- memset(madt_proc, 0, sizeof(madt_proc));
- madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC;
- madt_proc[0].handler = acpi_parse_lapic;
- madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC;
- madt_proc[1].handler = acpi_parse_x2apic;
- ret = acpi_table_parse_entries_array(ACPI_SIG_MADT,
- sizeof(struct acpi_table_madt),
- madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC);
- if (ret < 0) {
- pr_err("Error parsing LAPIC/X2APIC entries\n");
- return ret;
- }
-
- count = madt_proc[0].count;
- x2count = madt_proc[1].count;
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
+ acpi_parse_lapic, MAX_LOCAL_APIC);
+ x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
+ acpi_parse_x2apic, MAX_LOCAL_APIC);
}
if (!count && !x2count) {
pr_err("No LAPIC entries present\n");
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 73be3931e4f0..aae7456ece07 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -255,6 +255,16 @@ static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
}
}
+static void __init_or_module noinline optimize_nops_inplace(u8 *instr, size_t len)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ optimize_nops(instr, len);
+ sync_core();
+ local_irq_restore(flags);
+}
+
/*
* In this context, "source" is where the instructions are placed in the
* section .altinstr_replacement, for example during kernel build by the
@@ -438,7 +448,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* patch if feature is *NOT* present.
*/
if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
- optimize_nops(instr, a->instrlen);
+ optimize_nops_inplace(instr, a->instrlen);
continue;
}
@@ -1685,8 +1695,8 @@ void __init_or_module text_poke_early(void *addr, const void *opcode,
} else {
local_irq_save(flags);
memcpy(addr, opcode, len);
- local_irq_restore(flags);
sync_core();
+ local_irq_restore(flags);
/*
* Could also do a CLFLUSH here to speed up CPU recovery; but
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a7eab05e5f29..f322ebd053a9 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1320,6 +1320,9 @@ static void zenbleed_check_cpu(void *unused)
void amd_check_microcode(void)
{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return;
+
on_each_cpu(zenbleed_check_cpu, NULL, 1);
}
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 9373ec01c5ae..13b45b9c806d 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -104,8 +104,6 @@ struct cont_desc {
size_t size;
};
-static u32 ucode_new_rev;
-
/*
* Microcode patch container file is prepended to the initrd in cpio
* format. See Documentation/arch/x86/microcode.rst
@@ -442,12 +440,11 @@ static int __apply_microcode_amd(struct microcode_amd *mc)
*
* Returns true if container found (sets @desc), false otherwise.
*/
-static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size)
+static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, size_t size)
{
struct cont_desc desc = { 0 };
struct microcode_amd *mc;
bool ret = false;
- u32 rev, dummy;
desc.cpuid_1_eax = cpuid_1_eax;
@@ -457,22 +454,15 @@ static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size)
if (!mc)
return ret;
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
/*
* Allow application of the same revision to pick up SMT-specific
* changes even if the revision of the other SMT thread is already
* up-to-date.
*/
- if (rev > mc->hdr.patch_id)
+ if (old_rev > mc->hdr.patch_id)
return ret;
- if (!__apply_microcode_amd(mc)) {
- ucode_new_rev = mc->hdr.patch_id;
- ret = true;
- }
-
- return ret;
+ return !__apply_microcode_amd(mc);
}
static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
@@ -506,9 +496,12 @@ static void __init find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpi
*ret = cp;
}
-void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax)
+void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_eax)
{
struct cpio_data cp = { };
+ u32 dummy;
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->old_rev, dummy);
/* Needed in load_microcode_amd() */
ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax;
@@ -517,7 +510,8 @@ void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax)
if (!(cp.data && cp.size))
return;
- early_apply_microcode(cpuid_1_eax, cp.data, cp.size);
+ if (early_apply_microcode(cpuid_1_eax, ed->old_rev, cp.data, cp.size))
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->new_rev, dummy);
}
static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size);
@@ -625,10 +619,8 @@ void reload_ucode_amd(unsigned int cpu)
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
if (rev < mc->hdr.patch_id) {
- if (!__apply_microcode_amd(mc)) {
- ucode_new_rev = mc->hdr.patch_id;
- pr_info("reload patch_level=0x%08x\n", ucode_new_rev);
- }
+ if (!__apply_microcode_amd(mc))
+ pr_info_once("reload revision: 0x%08x\n", mc->hdr.patch_id);
}
}
@@ -649,8 +641,6 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
if (p && (p->patch_id == csig->rev))
uci->mc = p->data;
- pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
-
return 0;
}
@@ -691,8 +681,6 @@ static enum ucode_state apply_microcode_amd(int cpu)
rev = mc_amd->hdr.patch_id;
ret = UCODE_UPDATED;
- pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
-
out:
uci->cpu_sig.rev = rev;
c->microcode = rev;
@@ -935,11 +923,6 @@ struct microcode_ops * __init init_amd_microcode(void)
pr_warn("AMD CPU family 0x%x not supported\n", c->x86);
return NULL;
}
-
- if (ucode_new_rev)
- pr_info_once("microcode updated early to new patch_level=0x%08x\n",
- ucode_new_rev);
-
return &microcode_amd_ops;
}
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 666d25bbc5ad..232026a239a6 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -41,8 +41,6 @@
#include "internal.h"
-#define DRIVER_VERSION "2.2"
-
static struct microcode_ops *microcode_ops;
bool dis_ucode_ldr = true;
@@ -77,6 +75,8 @@ static u32 final_levels[] = {
0, /* T-101 terminator */
};
+struct early_load_data early_data;
+
/*
* Check the current patch level on this CPU.
*
@@ -155,9 +155,9 @@ void __init load_ucode_bsp(void)
return;
if (intel)
- load_ucode_intel_bsp();
+ load_ucode_intel_bsp(&early_data);
else
- load_ucode_amd_bsp(cpuid_1_eax);
+ load_ucode_amd_bsp(&early_data, cpuid_1_eax);
}
void load_ucode_ap(void)
@@ -828,6 +828,11 @@ static int __init microcode_init(void)
if (!microcode_ops)
return -ENODEV;
+ pr_info_once("Current revision: 0x%08x\n", (early_data.new_rev ?: early_data.old_rev));
+
+ if (early_data.new_rev)
+ pr_info_once("Updated early from: 0x%08x\n", early_data.old_rev);
+
microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0);
if (IS_ERR(microcode_pdev))
return PTR_ERR(microcode_pdev);
@@ -846,8 +851,6 @@ static int __init microcode_init(void)
cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
mc_cpu_online, mc_cpu_down_prep);
- pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION);
-
return 0;
out_pdev:
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 6024feb98d29..070426b9895f 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -339,16 +339,9 @@ static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci,
static enum ucode_state apply_microcode_early(struct ucode_cpu_info *uci)
{
struct microcode_intel *mc = uci->mc;
- enum ucode_state ret;
- u32 cur_rev, date;
+ u32 cur_rev;
- ret = __apply_microcode(uci, mc, &cur_rev);
- if (ret == UCODE_UPDATED) {
- date = mc->hdr.date;
- pr_info_once("updated early: 0x%x -> 0x%x, date = %04x-%02x-%02x\n",
- cur_rev, mc->hdr.rev, date & 0xffff, date >> 24, (date >> 16) & 0xff);
- }
- return ret;
+ return __apply_microcode(uci, mc, &cur_rev);
}
static __init bool load_builtin_intel_microcode(struct cpio_data *cp)
@@ -413,13 +406,17 @@ static int __init save_builtin_microcode(void)
early_initcall(save_builtin_microcode);
/* Load microcode on BSP from initrd or builtin blobs */
-void __init load_ucode_intel_bsp(void)
+void __init load_ucode_intel_bsp(struct early_load_data *ed)
{
struct ucode_cpu_info uci;
+ ed->old_rev = intel_get_microcode_revision();
+
uci.mc = get_microcode_blob(&uci, false);
if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED)
ucode_patch_va = UCODE_BSP_LOADED;
+
+ ed->new_rev = uci.cpu_sig.rev;
}
void load_ucode_intel_ap(void)
diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h
index f8047b12329a..21776c529fa9 100644
--- a/arch/x86/kernel/cpu/microcode/internal.h
+++ b/arch/x86/kernel/cpu/microcode/internal.h
@@ -37,6 +37,12 @@ struct microcode_ops {
use_nmi : 1;
};
+struct early_load_data {
+ u32 old_rev;
+ u32 new_rev;
+};
+
+extern struct early_load_data early_data;
extern struct ucode_cpu_info ucode_cpu_info[];
struct cpio_data find_microcode_in_initrd(const char *path);
@@ -92,14 +98,14 @@ extern bool dis_ucode_ldr;
extern bool force_minrev;
#ifdef CONFIG_CPU_SUP_AMD
-void load_ucode_amd_bsp(unsigned int family);
+void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family);
void load_ucode_amd_ap(unsigned int family);
int save_microcode_in_initrd_amd(unsigned int family);
void reload_ucode_amd(unsigned int cpu);
struct microcode_ops *init_amd_microcode(void);
void exit_amd_microcode(void);
#else /* CONFIG_CPU_SUP_AMD */
-static inline void load_ucode_amd_bsp(unsigned int family) { }
+static inline void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family) { }
static inline void load_ucode_amd_ap(unsigned int family) { }
static inline int save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; }
static inline void reload_ucode_amd(unsigned int cpu) { }
@@ -108,12 +114,12 @@ static inline void exit_amd_microcode(void) { }
#endif /* !CONFIG_CPU_SUP_AMD */
#ifdef CONFIG_CPU_SUP_INTEL
-void load_ucode_intel_bsp(void);
+void load_ucode_intel_bsp(struct early_load_data *ed);
void load_ucode_intel_ap(void);
void reload_ucode_intel(void);
struct microcode_ops *init_intel_microcode(void);
#else /* CONFIG_CPU_SUP_INTEL */
-static inline void load_ucode_intel_bsp(void) { }
+static inline void load_ucode_intel_bsp(struct early_load_data *ed) { }
static inline void load_ucode_intel_ap(void) { }
static inline void reload_ucode_intel(void) { }
static inline struct microcode_ops *init_intel_microcode(void) { return NULL; }
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index e6bba12c759c..01fa06dd06b6 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -262,11 +262,14 @@ static uint32_t __init ms_hyperv_platform(void)
static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
{
static atomic_t nmi_cpu = ATOMIC_INIT(-1);
+ unsigned int old_cpu, this_cpu;
if (!unknown_nmi_panic)
return NMI_DONE;
- if (atomic_cmpxchg(&nmi_cpu, -1, raw_smp_processor_id()) != -1)
+ old_cpu = -1;
+ this_cpu = raw_smp_processor_id();
+ if (!atomic_try_cmpxchg(&nmi_cpu, &old_cpu, this_cpu))
return NMI_HANDLED;
return NMI_DONE;
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 086a2c3aaaa0..0f8103240fda 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -255,6 +255,22 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
testl $X2APIC_ENABLE, %eax
jnz .Lread_apicid_msr
+#ifdef CONFIG_X86_X2APIC
+ /*
+ * If system is in X2APIC mode then MMIO base might not be
+ * mapped causing the MMIO read below to fault. Faults can't
+ * be handled at that point.
+ */
+ cmpl $0, x2apic_mode(%rip)
+ jz .Lread_apicid_mmio
+
+ /* Force the AP into X2APIC mode. */
+ orl $X2APIC_ENABLE, %eax
+ wrmsr
+ jmp .Lread_apicid_msr
+#endif
+
+.Lread_apicid_mmio:
/* Read the APIC ID from the fix-mapped MMIO space. */
movq apic_mmio_base(%rip), %rcx
addq $APIC_ID, %rcx
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 8857abc706e4..660b601f1d6c 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -121,7 +121,7 @@ static const __initconst struct idt_data def_idts[] = {
static const struct idt_data ia32_idt[] __initconst = {
#if defined(CONFIG_IA32_EMULATION)
- SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat),
+ SYSG(IA32_SYSCALL_VECTOR, asm_int80_emulation),
#elif defined(CONFIG_X86_32)
SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32),
#endif
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 70472eebe719..c67285824e82 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -1234,10 +1234,6 @@ void setup_ghcb(void)
if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
return;
- /* First make sure the hypervisor talks a supported protocol. */
- if (!sev_es_negotiate_protocol())
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
-
/*
* Check whether the runtime #VC exception handler is active. It uses
* the per-CPU GHCB page which is set up by sev_es_init_vc_handling().
@@ -1255,6 +1251,13 @@ void setup_ghcb(void)
}
/*
+ * Make sure the hypervisor talks a supported protocol.
+ * This gets called only in the BSP boot phase.
+ */
+ if (!sev_es_negotiate_protocol())
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+
+ /*
* Clear the boot_ghcb. The first exception comes in before the bss
* section is cleared.
*/
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index cacf2ede6217..23d8aaf8d9fd 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -175,9 +175,6 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp);
uc_flags = frame_uc_flags(regs);
- if (setup_signal_shadow_stack(ksig))
- return -EFAULT;
-
if (!user_access_begin(frame, sizeof(*frame)))
return -EFAULT;
@@ -198,6 +195,9 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
return -EFAULT;
}
+ if (setup_signal_shadow_stack(ksig))
+ return -EFAULT;
+
/* Set up registers for signal handler */
regs->di = ksig->sig;
/* In case the signal handler was declared without prototypes */
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 34f2f47cadf2..b9d35b6b19f7 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -23,17 +23,15 @@ config KVM
depends on HAVE_KVM
depends on HIGH_RES_TIMERS
depends on X86_LOCAL_APIC
- select PREEMPT_NOTIFIERS
+ select KVM_COMMON
select KVM_GENERIC_MMU_NOTIFIER
select HAVE_KVM_IRQCHIP
select HAVE_KVM_PFNCACHE
- select HAVE_KVM_IRQFD
select HAVE_KVM_DIRTY_RING_TSO
select HAVE_KVM_DIRTY_RING_ACQ_REL
select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_IRQ_ROUTING
- select HAVE_KVM_EVENTFD
select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER
select KVM_MMIO
@@ -46,7 +44,6 @@ config KVM
select KVM_XFER_TO_GUEST_WORK
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select KVM_VFIO
- select INTERVAL_TREE
select HAVE_KVM_PM_NOTIFIER if PM
select KVM_GENERIC_HARDWARE_ENABLING
help
@@ -80,7 +77,7 @@ config KVM_WERROR
config KVM_SW_PROTECTED_VM
bool "Enable support for KVM software-protected VMs"
depends on EXPERT
- depends on X86_64
+ depends on KVM && X86_64
select KVM_GENERIC_PRIVATE_MEM
help
Enable support for KVM software-protected VMs. Currently "protected"
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 1b278a3f0689..cab20216921b 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -475,7 +475,7 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
return -E2BIG;
if (cpuid->nent) {
- e = vmemdup_user(entries, array_size(sizeof(*e), cpuid->nent));
+ e = vmemdup_array_user(entries, cpuid->nent, sizeof(*e));
if (IS_ERR(e))
return PTR_ERR(e);
@@ -519,7 +519,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
return -E2BIG;
if (cpuid->nent) {
- e2 = vmemdup_user(entries, array_size(sizeof(*e2), cpuid->nent));
+ e2 = vmemdup_array_user(entries, cpuid->nent, sizeof(*e2));
if (IS_ERR(e2))
return PTR_ERR(e2);
}
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index 42026b3f3ff3..95ea1a1f7403 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -182,6 +182,7 @@ static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file)
}
static const struct file_operations mmu_rmaps_stat_fops = {
+ .owner = THIS_MODULE,
.open = kvm_mmu_rmaps_stat_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 4900c078045a..6ee925d66648 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2972,6 +2972,25 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm)
set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux);
}
+
+ /*
+ * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if
+ * the host/guest supports its use.
+ *
+ * guest_can_use() checks a number of requirements on the host/guest to
+ * ensure that MSR_IA32_XSS is available, but it might report true even
+ * if X86_FEATURE_XSAVES isn't configured in the guest to ensure host
+ * MSR_IA32_XSS is always properly restored. For SEV-ES, it is better
+ * to further check that the guest CPUID actually supports
+ * X86_FEATURE_XSAVES so that accesses to MSR_IA32_XSS by misbehaved
+ * guests will still get intercepted and caught in the normal
+ * kvm_emulate_rdmsr()/kvm_emulated_wrmsr() paths.
+ */
+ if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1);
+ else
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 0, 0);
}
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 712146312358..a8bd4e909a1e 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -103,6 +103,7 @@ static const struct svm_direct_access_msrs {
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
{ .index = MSR_IA32_LASTINTTOIP, .always = false },
+ { .index = MSR_IA32_XSS, .always = false },
{ .index = MSR_EFER, .always = false },
{ .index = MSR_IA32_CR_PAT, .always = false },
{ .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
@@ -1855,15 +1856,17 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
bool old_paging = is_paging(vcpu);
#ifdef CONFIG_X86_64
- if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
+ if (vcpu->arch.efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
vcpu->arch.efer |= EFER_LMA;
- svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
+ if (!vcpu->arch.guest_state_protected)
+ svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
}
if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
vcpu->arch.efer &= ~EFER_LMA;
- svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
+ if (!vcpu->arch.guest_state_protected)
+ svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
}
}
#endif
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 59adff7bbf55..8ef95139cd24 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -30,7 +30,7 @@
#define IOPM_SIZE PAGE_SIZE * 3
#define MSRPM_SIZE PAGE_SIZE * 2
-#define MAX_DIRECT_ACCESS_MSRS 46
+#define MAX_DIRECT_ACCESS_MSRS 47
#define MSRPM_OFFSETS 32
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 598b057611e0..b6fca43cb455 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5546,8 +5546,8 @@ static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
- return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
- sizeof(guest_xsave->region));
+ kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
+ sizeof(guest_xsave->region));
}
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
@@ -13081,7 +13081,10 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_state_protected)
return true;
- return vcpu->arch.preempted_in_kernel;
+ if (vcpu != kvm_get_running_vcpu())
+ return vcpu->arch.preempted_in_kernel;
+
+ return static_call(kvm_x86_get_cpl)(vcpu) == 0;
}
unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index a68f2dda0948..70b91de2e053 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -32,6 +32,7 @@
#include <asm/msr.h>
#include <asm/cmdline.h>
#include <asm/sev.h>
+#include <asm/ia32.h>
#include "mm_internal.h"
@@ -481,6 +482,16 @@ void __init sme_early_init(void)
*/
if (sev_status & MSR_AMD64_SEV_ES_ENABLED)
x86_cpuinit.parallel_bringup = false;
+
+ /*
+ * The VMM is capable of injecting interrupt 0x80 and triggering the
+ * compatibility syscall path.
+ *
+ * By default, the 32-bit emulation is disabled in order to ensure
+ * the safety of the VM.
+ */
+ if (sev_status & MSR_AMD64_SEV_ENABLED)
+ ia32_disable();
}
void __init mem_encrypt_free_decrypted_mem(void)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 8c10d9abc239..e89e415aa743 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -3025,3 +3025,49 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp
#endif
WARN(1, "verification of programs using bpf_throw should have failed\n");
}
+
+void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
+ struct bpf_prog *new, struct bpf_prog *old)
+{
+ u8 *old_addr, *new_addr, *old_bypass_addr;
+ int ret;
+
+ old_bypass_addr = old ? NULL : poke->bypass_addr;
+ old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
+ new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;
+
+ /*
+ * On program loading or teardown, the program's kallsym entry
+ * might not be in place, so we use __bpf_arch_text_poke to skip
+ * the kallsyms check.
+ */
+ if (new) {
+ ret = __bpf_arch_text_poke(poke->tailcall_target,
+ BPF_MOD_JUMP,
+ old_addr, new_addr);
+ BUG_ON(ret < 0);
+ if (!old) {
+ ret = __bpf_arch_text_poke(poke->tailcall_bypass,
+ BPF_MOD_JUMP,
+ poke->bypass_addr,
+ NULL);
+ BUG_ON(ret < 0);
+ }
+ } else {
+ ret = __bpf_arch_text_poke(poke->tailcall_bypass,
+ BPF_MOD_JUMP,
+ old_bypass_addr,
+ poke->bypass_addr);
+ BUG_ON(ret < 0);
+ /* let other CPUs finish the execution of program
+ * so that it will not possible to expose them
+ * to invalid nop, stack unwind, nop state
+ */
+ if (!ret)
+ synchronize_rcu();
+ ret = __bpf_arch_text_poke(poke->tailcall_target,
+ BPF_MOD_JUMP,
+ old_addr, NULL);
+ BUG_ON(ret < 0);
+ }
+}
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 9b1ec5d8c99c..a65fc2ae15b4 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -9,6 +9,7 @@ config XEN
select PARAVIRT_CLOCK
select X86_HV_CALLBACK_VECTOR
depends on X86_64 || (X86_32 && X86_PAE)
+ depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MCORE2 || MATOM || MK8)
depends on X86_LOCAL_APIC && X86_TSC
help
This is the Linux Xen port. Enabling this will allow the
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0337392a3121..3c61bb98c10e 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,9 +33,12 @@ EXPORT_SYMBOL_GPL(hypercall_page);
* and xen_vcpu_setup for details. By default it points to share_info->vcpu_info
* but during boot it is switched to point to xen_vcpu_info.
* The pointer is used in xen_evtchn_do_upcall to acknowledge pending events.
+ * Make sure that xen_vcpu_info doesn't cross a page boundary by making it
+ * cache-line aligned (the struct is guaranteed to have a size of 64 bytes,
+ * which matches the cache line size of 64-bit x86 processors).
*/
DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
-DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+DEFINE_PER_CPU_ALIGNED(struct vcpu_info, xen_vcpu_info);
/* Linux <-> Xen vCPU id mapping */
DEFINE_PER_CPU(uint32_t, xen_vcpu_id);
@@ -160,6 +163,7 @@ void xen_vcpu_setup(int cpu)
int err;
struct vcpu_info *vcpup;
+ BUILD_BUG_ON(sizeof(*vcpup) > SMP_CACHE_BYTES);
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
/*
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index bbbfdd495ebd..aeb33e0a3f76 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -704,7 +704,7 @@ static struct trap_array_entry trap_array[] = {
TRAP_ENTRY(exc_int3, false ),
TRAP_ENTRY(exc_overflow, false ),
#ifdef CONFIG_IA32_EMULATION
- { entry_INT80_compat, xen_entry_INT80_compat, false },
+ TRAP_ENTRY(int80_emulation, false ),
#endif
TRAP_ENTRY(exc_page_fault, false ),
TRAP_ENTRY(exc_divide_error, false ),
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 9e5e68008785..1a9cd18dfbd3 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -156,7 +156,7 @@ xen_pv_trap asm_xenpv_exc_machine_check
#endif /* CONFIG_X86_MCE */
xen_pv_trap asm_exc_simd_coprocessor_error
#ifdef CONFIG_IA32_EMULATION
-xen_pv_trap entry_INT80_compat
+xen_pv_trap asm_int80_emulation
#endif
xen_pv_trap asm_exc_xen_unknown_trap
xen_pv_trap asm_exc_xen_hypervisor_callback
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 408a2aa66c69..a87ab36889e7 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -21,7 +21,7 @@ extern void *xen_initial_gdt;
struct trap_info;
void xen_copy_trap_info(struct trap_info *traps);
-DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+DECLARE_PER_CPU_ALIGNED(struct vcpu_info, xen_vcpu_info);
DECLARE_PER_CPU(unsigned long, xen_cr3);
DECLARE_PER_CPU(unsigned long, xen_current_cr3);