summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
authorPaolo Bonzini <pbonzini@redhat.com>2021-12-08 05:29:16 -0500
committerPaolo Bonzini <pbonzini@redhat.com>2021-12-08 05:30:48 -0500
commit93b350f884c451d3cb75e6e3814266cf2d2afab1 (patch)
tree4fc17bfc4e31b6348df3c36117bd64a0b4520f88 /arch/x86
parentadbfb12d4c4517a8adde23a7fc46538953d56eea (diff)
parent250552b925ce400c17d166422fde9bb215958481 (diff)
Merge branch 'kvm-on-hv-msrbm-fix' into HEAD
Merge bugfix for enlightened MSR Bitmap, before adding support to KVM for exposing the feature to nested guests.
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/entry/entry_64.S35
-rw-r--r--arch/x86/include/asm/fpu/api.h6
-rw-r--r--arch/x86/include/asm/intel-family.h2
-rw-r--r--arch/x86/include/asm/xen/hypercall.h4
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h1
-rw-r--r--arch/x86/kernel/fpu/signal.c2
-rw-r--r--arch/x86/kernel/setup.c2
-rw-r--r--arch/x86/kernel/sev.c57
-rw-r--r--arch/x86/kernel/tsc.c28
-rw-r--r--arch/x86/kernel/tsc_sync.c41
-rw-r--r--arch/x86/kvm/vmx/vmx.c22
-rw-r--r--arch/x86/realmode/init.c12
-rw-r--r--arch/x86/xen/xen-asm.S20
13 files changed, 171 insertions, 61 deletions
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e38a4cf795d9..97b1f84bb53f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -574,6 +574,10 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
ud2
1:
#endif
+#ifdef CONFIG_XEN_PV
+ ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+#endif
+
POP_REGS pop_rdi=0
/*
@@ -890,6 +894,7 @@ SYM_CODE_START_LOCAL(paranoid_entry)
.Lparanoid_entry_checkgs:
/* EBX = 1 -> kernel GSBASE active, no restore required */
movl $1, %ebx
+
/*
* The kernel-enforced convention is a negative GSBASE indicates
* a kernel value. No SWAPGS needed on entry and exit.
@@ -897,21 +902,14 @@ SYM_CODE_START_LOCAL(paranoid_entry)
movl $MSR_GS_BASE, %ecx
rdmsr
testl %edx, %edx
- jns .Lparanoid_entry_swapgs
- ret
+ js .Lparanoid_kernel_gsbase
-.Lparanoid_entry_swapgs:
+ /* EBX = 0 -> SWAPGS required on exit */
+ xorl %ebx, %ebx
swapgs
+.Lparanoid_kernel_gsbase:
- /*
- * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
- * unconditional CR3 write, even in the PTI case. So do an lfence
- * to prevent GS speculation, regardless of whether PTI is enabled.
- */
FENCE_SWAPGS_KERNEL_ENTRY
-
- /* EBX = 0 -> SWAPGS required on exit */
- xorl %ebx, %ebx
ret
SYM_CODE_END(paranoid_entry)
@@ -993,11 +991,6 @@ SYM_CODE_START_LOCAL(error_entry)
pushq %r12
ret
-.Lerror_entry_done_lfence:
- FENCE_SWAPGS_KERNEL_ENTRY
-.Lerror_entry_done:
- ret
-
/*
* There are two places in the kernel that can potentially fault with
* usergs. Handle them here. B stepping K8s sometimes report a
@@ -1020,8 +1013,14 @@ SYM_CODE_START_LOCAL(error_entry)
* .Lgs_change's error handler with kernel gsbase.
*/
SWAPGS
- FENCE_SWAPGS_USER_ENTRY
- jmp .Lerror_entry_done
+
+ /*
+ * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
+ * kernel or user gsbase.
+ */
+.Lerror_entry_done_lfence:
+ FENCE_SWAPGS_KERNEL_ENTRY
+ ret
.Lbstep_iret:
/* Fix truncated RIP */
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 6053674f9132..c2767a6a387e 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -102,12 +102,6 @@ extern void switch_fpu_return(void);
*/
extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name);
-/*
- * Tasks that are not using SVA have mm->pasid set to zero to note that they
- * will not have the valid bit set in MSR_IA32_PASID while they are running.
- */
-#define PASID_DISABLED 0
-
/* Trap handling */
extern int fpu__exception_code(struct fpu *fpu, int trap_nr);
extern void fpu_sync_fpstate(struct fpu *fpu);
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 5a0bcf8b78d7..048b6d5aff50 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -108,7 +108,7 @@
#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */
#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */
-#define INTEL_FAM6_RAPTOR_LAKE 0xB7
+#define INTEL_FAM6_RAPTORLAKE 0xB7
/* "Small Core" Processors (Atom) */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 0575f5863b7f..e5e0fe10c692 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -281,13 +281,13 @@ HYPERVISOR_callback_op(int cmd, void *arg)
return _hypercall2(int, callback_op, cmd, arg);
}
-static inline int
+static __always_inline int
HYPERVISOR_set_debugreg(int reg, unsigned long value)
{
return _hypercall2(int, set_debugreg, reg, value);
}
-static inline unsigned long
+static __always_inline unsigned long
HYPERVISOR_get_debugreg(int reg)
{
return _hypercall1(unsigned long, get_debugreg, reg);
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 4957f59deb40..5adab895127e 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -64,6 +64,7 @@ void xen_arch_unregister_cpu(int num);
#ifdef CONFIG_PVH
void __init xen_pvh_init(struct boot_params *boot_params);
+void __init mem_map_via_hcall(struct boot_params *boot_params_p);
#endif
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index d5958278eba6..91d4b6de58ab 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -118,7 +118,7 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
struct fpstate *fpstate)
{
struct xregs_state __user *x = buf;
- struct _fpx_sw_bytes sw_bytes;
+ struct _fpx_sw_bytes sw_bytes = {};
u32 xfeatures;
int err;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c410be738ae7..6a190c7f4d71 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -742,7 +742,7 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
return 0;
}
-static char *prepare_command_line(void)
+static char * __init prepare_command_line(void)
{
#ifdef CONFIG_CMDLINE_BOOL
#ifdef CONFIG_CMDLINE_OVERRIDE
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 74f0ec955384..a9fc2ac7a8bd 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -294,11 +294,6 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
char *dst, char *buf, size_t size)
{
unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
- char __user *target = (char __user *)dst;
- u64 d8;
- u32 d4;
- u16 d2;
- u8 d1;
/*
* This function uses __put_user() independent of whether kernel or user
@@ -320,26 +315,42 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
* instructions here would cause infinite nesting.
*/
switch (size) {
- case 1:
+ case 1: {
+ u8 d1;
+ u8 __user *target = (u8 __user *)dst;
+
memcpy(&d1, buf, 1);
if (__put_user(d1, target))
goto fault;
break;
- case 2:
+ }
+ case 2: {
+ u16 d2;
+ u16 __user *target = (u16 __user *)dst;
+
memcpy(&d2, buf, 2);
if (__put_user(d2, target))
goto fault;
break;
- case 4:
+ }
+ case 4: {
+ u32 d4;
+ u32 __user *target = (u32 __user *)dst;
+
memcpy(&d4, buf, 4);
if (__put_user(d4, target))
goto fault;
break;
- case 8:
+ }
+ case 8: {
+ u64 d8;
+ u64 __user *target = (u64 __user *)dst;
+
memcpy(&d8, buf, 8);
if (__put_user(d8, target))
goto fault;
break;
+ }
default:
WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
return ES_UNSUPPORTED;
@@ -362,11 +373,6 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
char *src, char *buf, size_t size)
{
unsigned long error_code = X86_PF_PROT;
- char __user *s = (char __user *)src;
- u64 d8;
- u32 d4;
- u16 d2;
- u8 d1;
/*
* This function uses __get_user() independent of whether kernel or user
@@ -388,26 +394,41 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
* instructions here would cause infinite nesting.
*/
switch (size) {
- case 1:
+ case 1: {
+ u8 d1;
+ u8 __user *s = (u8 __user *)src;
+
if (__get_user(d1, s))
goto fault;
memcpy(buf, &d1, 1);
break;
- case 2:
+ }
+ case 2: {
+ u16 d2;
+ u16 __user *s = (u16 __user *)src;
+
if (__get_user(d2, s))
goto fault;
memcpy(buf, &d2, 2);
break;
- case 4:
+ }
+ case 4: {
+ u32 d4;
+ u32 __user *s = (u32 __user *)src;
+
if (__get_user(d4, s))
goto fault;
memcpy(buf, &d4, 4);
break;
- case 8:
+ }
+ case 8: {
+ u64 d8;
+ u64 __user *s = (u64 __user *)src;
if (__get_user(d8, s))
goto fault;
memcpy(buf, &d8, 8);
break;
+ }
default:
WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
return ES_UNSUPPORTED;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 2e076a459a0c..a698196377be 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1180,6 +1180,12 @@ void mark_tsc_unstable(char *reason)
EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+static void __init tsc_disable_clocksource_watchdog(void)
+{
+ clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+ clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+}
+
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
@@ -1196,6 +1202,23 @@ static void __init check_system_tsc_reliable(void)
#endif
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
tsc_clocksource_reliable = 1;
+
+ /*
+ * Disable the clocksource watchdog when the system has:
+ * - TSC running at constant frequency
+ * - TSC which does not stop in C-States
+ * - the TSC_ADJUST register which allows to detect even minimal
+ * modifications
+ * - not more than two sockets. As the number of sockets cannot be
+ * evaluated at the early boot stage where this has to be
+ * invoked, check the number of online memory nodes as a
+ * fallback solution which is an reasonable estimate.
+ */
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+ boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
+ nr_online_nodes <= 2)
+ tsc_disable_clocksource_watchdog();
}
/*
@@ -1387,9 +1410,6 @@ static int __init init_tsc_clocksource(void)
if (tsc_unstable)
goto unreg;
- if (tsc_clocksource_reliable || no_tsc_watchdog)
- clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
-
if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
@@ -1527,7 +1547,7 @@ void __init tsc_init(void)
}
if (tsc_clocksource_reliable || no_tsc_watchdog)
- clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+ tsc_disable_clocksource_watchdog();
clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
detect_art();
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 50a4515fe0ad..9452dc9664b5 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -30,6 +30,7 @@ struct tsc_adjust {
};
static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+static struct timer_list tsc_sync_check_timer;
/*
* TSC's on different sockets may be reset asynchronously.
@@ -77,6 +78,46 @@ void tsc_verify_tsc_adjust(bool resume)
}
}
+/*
+ * Normally the tsc_sync will be checked every time system enters idle
+ * state, but there is still caveat that a system won't enter idle,
+ * either because it's too busy or configured purposely to not enter
+ * idle.
+ *
+ * So setup a periodic timer (every 10 minutes) to make sure the check
+ * is always on.
+ */
+
+#define SYNC_CHECK_INTERVAL (HZ * 600)
+
+static void tsc_sync_check_timer_fn(struct timer_list *unused)
+{
+ int next_cpu;
+
+ tsc_verify_tsc_adjust(false);
+
+ /* Run the check for all onlined CPUs in turn */
+ next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(cpu_online_mask);
+
+ tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL;
+ add_timer_on(&tsc_sync_check_timer, next_cpu);
+}
+
+static int __init start_sync_check_timer(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable)
+ return 0;
+
+ timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0);
+ tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL;
+ add_timer(&tsc_sync_check_timer);
+
+ return 0;
+}
+late_initcall(start_sync_check_timer);
+
static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
unsigned int cpu, bool bootcpu)
{
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index cee69d895370..9d7f26eaf05f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2652,15 +2652,6 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
if (!loaded_vmcs->msr_bitmap)
goto out_vmcs;
memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
-
- if (IS_ENABLED(CONFIG_HYPERV) &&
- static_branch_unlikely(&enable_evmcs) &&
- (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
- struct hv_enlightened_vmcs *evmcs =
- (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
-
- evmcs->hv_enlightenments_control.msr_bitmap = 1;
- }
}
memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
@@ -6851,6 +6842,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
if (err < 0)
goto free_pml;
+ /*
+ * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
+ * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
+ * feature only for vmcs01, KVM currently isn't equipped to realize any
+ * performance benefits from enabling it for vmcs02.
+ */
+ if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
+ (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ evmcs->hv_enlightenments_control.msr_bitmap = 1;
+ }
+
/* The MSR bitmap starts with all ones */
bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 4a3da7592b99..38d24d2ab38b 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -72,6 +72,7 @@ static void __init setup_real_mode(void)
#ifdef CONFIG_X86_64
u64 *trampoline_pgd;
u64 efer;
+ int i;
#endif
base = (unsigned char *)real_mode_header;
@@ -128,8 +129,17 @@ static void __init setup_real_mode(void)
trampoline_header->flags = 0;
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
+
+ /* Map the real mode stub as virtual == physical */
trampoline_pgd[0] = trampoline_pgd_entry.pgd;
- trampoline_pgd[511] = init_top_pgt[511].pgd;
+
+ /*
+ * Include the entirety of the kernel mapping into the trampoline
+ * PGD. This way, all mappings present in the normal kernel page
+ * tables are usable while running on trampoline_pgd.
+ */
+ for (i = pgd_index(__PAGE_OFFSET); i < PTRS_PER_PGD; i++)
+ trampoline_pgd[i] = init_top_pgt[i].pgd;
#endif
sme_sev_setup_real_mode(trampoline_header);
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 220dd9678494..444d824775f6 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -20,6 +20,7 @@
#include <linux/init.h>
#include <linux/linkage.h>
+#include <../entry/calling.h>
.pushsection .noinstr.text, "ax"
/*
@@ -193,6 +194,25 @@ SYM_CODE_START(xen_iret)
SYM_CODE_END(xen_iret)
/*
+ * XEN pv doesn't use trampoline stack, PER_CPU_VAR(cpu_tss_rw + TSS_sp0) is
+ * also the kernel stack. Reusing swapgs_restore_regs_and_return_to_usermode()
+ * in XEN pv would cause %rsp to move up to the top of the kernel stack and
+ * leave the IRET frame below %rsp, which is dangerous to be corrupted if #NMI
+ * interrupts. And swapgs_restore_regs_and_return_to_usermode() pushing the IRET
+ * frame at the same address is useless.
+ */
+SYM_CODE_START(xenpv_restore_regs_and_return_to_usermode)
+ UNWIND_HINT_REGS
+ POP_REGS
+
+ /* stackleak_erase() can work safely on the kernel stack. */
+ STACKLEAK_ERASE_NOCLOBBER
+
+ addq $8, %rsp /* skip regs->orig_ax */
+ jmp xen_iret
+SYM_CODE_END(xenpv_restore_regs_and_return_to_usermode)
+
+/*
* Xen handles syscall callbacks much like ordinary exceptions, which
* means we have:
* - kernel gs