summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/vmx/vmx.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/vmx/vmx.c')
-rw-r--r--arch/x86/kvm/vmx/vmx.c1610
1 files changed, 857 insertions, 753 deletions
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 893366e53732..4cbe8c84b636 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -28,7 +28,6 @@
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>
-#include <linux/entry-kvm.h>
#include <asm/apic.h>
#include <asm/asm.h>
@@ -46,6 +45,7 @@
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
+#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/spec-ctrl.h>
#include <asm/vmx.h>
@@ -53,6 +53,7 @@
#include <trace/events/ipi.h>
#include "capabilities.h"
+#include "common.h"
#include "cpuid.h"
#include "hyperv.h"
#include "kvm_onhyperv.h"
@@ -73,6 +74,8 @@
#include "vmx_onhyperv.h"
#include "posted_intr.h"
+#include "mmu/spte.h"
+
MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
MODULE_LICENSE("GPL");
@@ -111,10 +114,10 @@ static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, 0444);
module_param(enable_apicv, bool, 0444);
-
-bool __read_mostly enable_ipiv = true;
module_param(enable_ipiv, bool, 0444);
+module_param(enable_device_posted_irqs, bool, 0444);
+
/*
* If nested=1, nested virtualization is supported, i.e., guests may use
* VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -164,31 +167,6 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
RTIT_STATUS_BYTECNT))
/*
- * List of MSRs that can be directly passed to the guest.
- * In addition to these x2apic, PT and LBR MSRs are handled specially.
- */
-static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
- MSR_IA32_SPEC_CTRL,
- MSR_IA32_PRED_CMD,
- MSR_IA32_FLUSH_CMD,
- MSR_IA32_TSC,
-#ifdef CONFIG_X86_64
- MSR_FS_BASE,
- MSR_GS_BASE,
- MSR_KERNEL_GS_BASE,
- MSR_IA32_XFD,
- MSR_IA32_XFD_ERR,
-#endif
- MSR_IA32_SYSENTER_CS,
- MSR_IA32_SYSENTER_ESP,
- MSR_IA32_SYSENTER_EIP,
- MSR_CORE_C1_RES,
- MSR_CORE_C3_RESIDENCY,
- MSR_CORE_C6_RESIDENCY,
- MSR_CORE_C7_RESIDENCY,
-};
-
-/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* ple_gap: upper bound on the amount of time between two successive
* executions of PAUSE in a loop. Also indicate if ple enabled.
@@ -225,6 +203,7 @@ module_param(pt_mode, int, S_IRUGO);
struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;
+#ifdef CONFIG_CPU_MITIGATIONS
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);
@@ -247,7 +226,7 @@ static const struct {
#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;
-static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
+static int __vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
struct page *page;
unsigned int i;
@@ -273,6 +252,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
case L1TF_MITIGATION_OFF:
l1tf = VMENTER_L1D_FLUSH_NEVER;
break;
+ case L1TF_MITIGATION_AUTO:
case L1TF_MITIGATION_FLUSH_NOWARN:
case L1TF_MITIGATION_FLUSH:
case L1TF_MITIGATION_FLUSH_NOSMT:
@@ -323,6 +303,26 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
return 0;
}
+static int vmx_setup_l1d_flush(void)
+{
+ /*
+ * Hand the parameter mitigation value in which was stored in the pre
+ * module init parser. If no parameter was given, it will contain
+ * 'auto' which will be turned into the default 'cond' mitigation mode.
+ */
+ return __vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+}
+
+static void vmx_cleanup_l1d_flush(void)
+{
+ if (vmx_l1d_flush_pages) {
+ free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
+ vmx_l1d_flush_pages = NULL;
+ }
+ /* Restore state so sysfs ignores VMX */
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+}
+
static int vmentry_l1d_flush_parse(const char *s)
{
unsigned int i;
@@ -360,7 +360,7 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
}
mutex_lock(&vmx_l1d_flush_mutex);
- ret = vmx_setup_l1d_flush(l1tf);
+ ret = __vmx_setup_l1d_flush(l1tf);
mutex_unlock(&vmx_l1d_flush_mutex);
return ret;
}
@@ -373,6 +373,101 @@ static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}
+/*
+ * Software based L1D cache flush which is used when microcode providing
+ * the cache control MSR is not loaded.
+ *
+ * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
+ * flush it is required to read in 64 KiB because the replacement algorithm
+ * is not exactly LRU. This could be sized at runtime via topology
+ * information but as all relevant affected CPUs have 32KiB L1D cache size
+ * there is no point in doing so.
+ */
+static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ if (!static_branch_unlikely(&vmx_l1d_should_flush))
+ return;
+
+ /*
+ * This code is only executed when the flush mode is 'cond' or
+ * 'always'
+ */
+ if (static_branch_likely(&vmx_l1d_flush_cond)) {
+ /*
+ * Clear the per-cpu flush bit, it gets set again if the vCPU
+ * is reloaded, i.e. if the vCPU is scheduled out or if KVM
+ * exits to userspace, or if KVM reaches one of the unsafe
+ * VMEXIT handlers, e.g. if KVM calls into the emulator,
+ * or from the interrupt handlers.
+ */
+ if (!kvm_get_cpu_l1tf_flush_l1d())
+ return;
+ kvm_clear_cpu_l1tf_flush_l1d();
+ }
+
+ vcpu->stat.l1d_flush++;
+
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ return;
+ }
+
+ asm volatile(
+ /* First ensure the pages are in the TLB */
+ "xorl %%eax, %%eax\n"
+ ".Lpopulate_tlb:\n\t"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $4096, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lpopulate_tlb\n\t"
+ "xorl %%eax, %%eax\n\t"
+ "cpuid\n\t"
+ /* Now fill the cache */
+ "xorl %%eax, %%eax\n"
+ ".Lfill_cache:\n"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $64, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lfill_cache\n\t"
+ "lfence\n"
+ :: [flush_pages] "r" (vmx_l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ebx", "ecx", "edx");
+}
+
+#else /* CONFIG_CPU_MITIGATIONS*/
+static int vmx_setup_l1d_flush(void)
+{
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NEVER;
+ return 0;
+}
+static void vmx_cleanup_l1d_flush(void)
+{
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+}
+static __always_inline void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+
+}
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+ pr_warn_once("Kernel compiled without mitigations, ignoring vmentry_l1d_flush\n");
+ return 0;
+}
+static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
+{
+ return sysfs_emit(s, "never\n");
+}
+#endif
+
+static const struct kernel_param_ops vmentry_l1d_flush_ops = {
+ .set = vmentry_l1d_flush_set,
+ .get = vmentry_l1d_flush_get,
+};
+module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
+
static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
{
u64 msr;
@@ -380,9 +475,9 @@ static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
if (!vmx->disable_fb_clear)
return;
- msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
+ msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL);
msr |= FB_CLEAR_DIS;
- native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+ native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr);
/* Cache the MSR value to avoid reading it later */
vmx->msr_ia32_mcu_opt_ctrl = msr;
}
@@ -393,7 +488,7 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
return;
vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
- native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
+ native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
}
static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
@@ -425,12 +520,6 @@ static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
vmx->disable_fb_clear = false;
}
-static const struct kernel_param_ops vmentry_l1d_flush_ops = {
- .set = vmentry_l1d_flush_set,
- .get = vmentry_l1d_flush_get,
-};
-module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
-
static u32 vmx_segment_access_rights(struct kvm_segment *var);
void vmx_vmexit(void);
@@ -669,40 +758,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
return flexpriority_enabled && lapic_in_kernel(vcpu);
}
-static int vmx_get_passthrough_msr_slot(u32 msr)
-{
- int i;
-
- switch (msr) {
- case 0x800 ... 0x8ff:
- /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
- return -ENOENT;
- case MSR_IA32_RTIT_STATUS:
- case MSR_IA32_RTIT_OUTPUT_BASE:
- case MSR_IA32_RTIT_OUTPUT_MASK:
- case MSR_IA32_RTIT_CR3_MATCH:
- case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
- /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
- case MSR_LBR_SELECT:
- case MSR_LBR_TOS:
- case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
- case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
- case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
- case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
- case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
- /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
- return -ENOENT;
- }
-
- for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
- if (vmx_possible_passthrough_msrs[i] == msr)
- return i;
- }
-
- WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
- return -ENOENT;
-}
-
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
int i;
@@ -769,8 +824,11 @@ void vmx_emergency_disable_virtualization_cpu(void)
return;
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
- loaded_vmcss_on_cpu_link)
+ loaded_vmcss_on_cpu_link) {
vmcs_clear(v->vmcs);
+ if (v->shadow_vmcs)
+ vmcs_clear(v->shadow_vmcs);
+ }
kvm_cpu_vmxoff();
}
@@ -804,7 +862,7 @@ static void __loaded_vmcs_clear(void *arg)
loaded_vmcs->launched = 0;
}
-void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
+static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
int cpu = loaded_vmcs->cpu;
@@ -955,6 +1013,10 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
flags |= VMX_RUN_SAVE_SPEC_CTRL;
+ if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) &&
+ kvm_vcpu_can_access_host_mmio(&vmx->vcpu))
+ flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;
+
return flags;
}
@@ -1063,7 +1125,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
* provide that period, so a CPU could write host's record into
* guest's memory.
*/
- wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+ wrmsrq(MSR_IA32_PEBS_ENABLE, 0);
}
i = vmx_find_loadstore_msr_slot(&m->guest, msr);
@@ -1192,13 +1254,13 @@ static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
u32 i;
- wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
- wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
- wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
- wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+ wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
+ wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+ wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+ wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
for (i = 0; i < addr_range; i++) {
- wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
- wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+ wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+ wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
}
}
@@ -1206,13 +1268,13 @@ static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
{
u32 i;
- rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
- rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
- rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
- rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+ rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
+ rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+ rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+ rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
for (i = 0; i < addr_range; i++) {
- rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
- rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+ rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+ rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
}
}
@@ -1225,9 +1287,9 @@ static void pt_guest_enter(struct vcpu_vmx *vmx)
* GUEST_IA32_RTIT_CTL is already set in the VMCS.
* Save host state before VM entry.
*/
- rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+ rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
- wrmsrl(MSR_IA32_RTIT_CTL, 0);
+ wrmsrq(MSR_IA32_RTIT_CTL, 0);
pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
}
@@ -1248,7 +1310,7 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
* i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary.
*/
if (vmx->pt_desc.host.ctl)
- wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+ wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
@@ -1281,6 +1343,7 @@ void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
int cpu = raw_smp_processor_id();
@@ -1309,7 +1372,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
if (vmx->nested.need_vmcs12_to_shadow_sync)
nested_sync_vmcs12_to_shadow(vcpu);
- if (vmx->guest_state_loaded)
+ if (vt->guest_state_loaded)
return;
host_state = &vmx->loaded_vmcs->host_state;
@@ -1330,15 +1393,15 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
fs_sel = current->thread.fsindex;
gs_sel = current->thread.gsindex;
fs_base = current->thread.fsbase;
- vmx->msr_host_kernel_gs_base = current->thread.gsbase;
+ vt->msr_host_kernel_gs_base = current->thread.gsbase;
} else {
savesegment(fs, fs_sel);
savesegment(gs, gs_sel);
fs_base = read_msr(MSR_FS_BASE);
- vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+ vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
}
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
savesegment(fs, fs_sel);
savesegment(gs, gs_sel);
@@ -1347,14 +1410,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
#endif
vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
- vmx->guest_state_loaded = true;
+ vt->guest_state_loaded = true;
}
static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
struct vmcs_host_state *host_state;
- if (!vmx->guest_state_loaded)
+ if (!vmx->vt.guest_state_loaded)
return;
host_state = &vmx->loaded_vmcs->host_state;
@@ -1362,7 +1425,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
++vmx->vcpu.stat.host_state_reload;
#ifdef CONFIG_X86_64
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
kvm_load_ldt(host_state->ldt_sel);
@@ -1382,30 +1445,43 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
#endif
invalidate_tss_limit();
#ifdef CONFIG_X86_64
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base);
#endif
load_fixmap_gdt(raw_smp_processor_id());
- vmx->guest_state_loaded = false;
+ vmx->vt.guest_state_loaded = false;
vmx->guest_uret_msrs_loaded = false;
}
#ifdef CONFIG_X86_64
-static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
+static u64 vmx_read_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 *cache)
{
preempt_disable();
- if (vmx->guest_state_loaded)
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ if (vmx->vt.guest_state_loaded)
+ *cache = read_msr(msr);
preempt_enable();
- return vmx->msr_guest_kernel_gs_base;
+ return *cache;
}
-static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
+static void vmx_write_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 data,
+ u64 *cache)
{
preempt_disable();
- if (vmx->guest_state_loaded)
- wrmsrl(MSR_KERNEL_GS_BASE, data);
+ if (vmx->vt.guest_state_loaded)
+ wrmsrns(msr, data);
preempt_enable();
- vmx->msr_guest_kernel_gs_base = data;
+ *cache = data;
+}
+
+static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
+{
+ return vmx_read_guest_host_msr(vmx, MSR_KERNEL_GS_BASE,
+ &vmx->msr_guest_kernel_gs_base);
+}
+
+static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
+{
+ vmx_write_guest_host_msr(vmx, MSR_KERNEL_GS_BASE, data,
+ &vmx->msr_guest_kernel_gs_base);
}
#endif
@@ -1441,8 +1517,7 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
}
}
-void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
- struct loaded_vmcs *buddy)
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
@@ -1469,16 +1544,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
if (prev != vmx->loaded_vmcs->vmcs) {
per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
vmcs_load(vmx->loaded_vmcs->vmcs);
-
- /*
- * No indirect branch prediction barrier needed when switching
- * the active VMCS within a vCPU, unless IBRS is advertised to
- * the vCPU. To minimize the number of IBPBs executed, KVM
- * performs IBPB on nested VM-Exit (a single nested transition
- * may switch the active VMCS multiple times).
- */
- if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
- indirect_branch_prediction_barrier();
}
if (!already_loaded) {
@@ -1514,16 +1579,12 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
*/
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
- vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
+ vmx_vcpu_load_vmcs(vcpu, cpu);
vmx_vcpu_pi_load(vcpu, cpu);
-
- vmx->host_debugctlmsr = get_debugctlmsr();
}
void vmx_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1582,7 +1643,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmcs_writel(GUEST_RFLAGS, rflags);
if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
- vmx->emulation_required = vmx_emulation_required(vcpu);
+ vmx->vt.emulation_required = vmx_emulation_required(vcpu);
}
bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
@@ -1636,7 +1697,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
* result in a #GP unless the same write also clears TraceEn.
*/
if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
- ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
+ (data & RTIT_CTL_TRACEEN) &&
+ data != vmx->pt_desc.guest.ctl)
return 1;
/*
@@ -1701,16 +1763,22 @@ int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
* so that guest userspace can't DoS the guest simply by triggering
* emulation (enclaves are CPL3 only).
*/
- if (to_vmx(vcpu)->exit_reason.enclave_mode) {
+ if (vmx_get_exit_reason(vcpu).enclave_mode) {
kvm_queue_exception(vcpu, UD_VECTOR);
return X86EMUL_PROPAGATE_FAULT;
}
+
+ /* Check that emulation is possible during event vectoring */
+ if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ !kvm_can_emulate_event_vectoring(emul_type))
+ return X86EMUL_UNHANDLEABLE_VECTORING;
+
return X86EMUL_CONTINUE;
}
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
- union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
+ union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
unsigned long rip, orig_rip;
u32 instr_len;
@@ -1857,7 +1925,7 @@ void vmx_inject_exception(struct kvm_vcpu *vcpu)
return;
}
- WARN_ON_ONCE(vmx->emulation_required);
+ WARN_ON_ONCE(vmx->vt.emulation_required);
if (kvm_exception_is_soft(ex->vector)) {
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
@@ -1908,8 +1976,8 @@ static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
- guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
- guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
+ guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID));
/*
* hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
@@ -2062,7 +2130,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
@@ -2078,13 +2146,13 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
return 1;
msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
break;
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
- if (!guest_can_use(vcpu, X86_FEATURE_VMX))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
return 1;
if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
&msr_info->data))
@@ -2097,7 +2165,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* sanity checking and refuse to boot. Filter all unsupported
* features out.
*/
- if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu))
+ if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu))
nested_evmcs_filter_control_msr(vcpu, msr_info->index,
&msr_info->data);
#endif
@@ -2147,8 +2215,17 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
else
msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
break;
+ case MSR_IA32_S_CET:
+ msr_info->data = vmcs_readl(GUEST_S_CET);
+ break;
+ case MSR_KVM_INTERNAL_GUEST_SSP:
+ msr_info->data = vmcs_readl(GUEST_SSP);
+ break;
+ case MSR_IA32_INT_SSP_TAB:
+ msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
+ break;
case MSR_IA32_DEBUGCTLMSR:
- msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ msr_info->data = vmx_guest_debugctl_read();
break;
default:
find_uret_msr:
@@ -2167,27 +2244,43 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
u64 data)
{
#ifdef CONFIG_X86_64
- if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
return (u32)data;
#endif
return (unsigned long)data;
}
-static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
+u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
{
u64 debugctl = 0;
if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
- (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
+ (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
- if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) &&
+ if ((kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT) &&
(host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+ if (boot_cpu_has(X86_FEATURE_RTM) &&
+ (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM)))
+ debugctl |= DEBUGCTLMSR_RTM_DEBUG;
+
return debugctl;
}
+bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+{
+ u64 invalid;
+
+ invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
+ if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
+ kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
+ invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
+ }
+ return !invalid;
+}
+
/*
* Writes msr value into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
@@ -2256,33 +2349,26 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
- case MSR_IA32_DEBUGCTLMSR: {
- u64 invalid;
-
- invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
- if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
- kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
- data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
- invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
- }
-
- if (invalid)
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
return 1;
+ data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+
if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
VM_EXIT_SAVE_DEBUG_CONTROLS)
get_vmcs12(vcpu)->guest_ia32_debugctl = data;
- vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ vmx_guest_debugctl_write(vcpu, data);
+
if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
(data & DEBUGCTLMSR_LBR))
intel_pmu_create_guest_lbr_event(vcpu);
return 0;
- }
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
return 1;
if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) ||
(data & MSR_IA32_BNDCFGS_RSVD))
@@ -2384,7 +2470,7 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* behavior, but it's close enough.
*/
if (!msr_info->host_initiated &&
- (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
+ (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) ||
((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
!(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
return 1;
@@ -2394,7 +2480,7 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!msr_info->host_initiated)
return 1; /* they are read-only */
- if (!guest_can_use(vcpu, X86_FEATURE_VMX))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
return 1;
return vmx_set_vmx_msr(vcpu, msr_index, data);
case MSR_IA32_RTIT_CTL:
@@ -2456,10 +2542,19 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
else
vmx->pt_desc.guest.addr_a[index / 2] = data;
break;
+ case MSR_IA32_S_CET:
+ vmcs_writel(GUEST_S_CET, data);
+ break;
+ case MSR_KVM_INTERNAL_GUEST_SSP:
+ vmcs_writel(GUEST_SSP, data);
+ break;
+ case MSR_IA32_INT_SSP_TAB:
+ vmcs_writel(GUEST_INTR_SSP_TABLE, data);
+ break;
case MSR_IA32_PERF_CAPABILITIES:
- if (data & PMU_CAP_LBR_FMT) {
- if ((data & PMU_CAP_LBR_FMT) !=
- (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT))
+ if (data & PERF_CAP_LBR_FMT) {
+ if ((data & PERF_CAP_LBR_FMT) !=
+ (kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT))
return 1;
if (!cpuid_model_is_consistent(vcpu))
return 1;
@@ -2468,9 +2563,9 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if ((data & PERF_CAP_PEBS_MASK) !=
(kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
return 1;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS))
return 1;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64))
return 1;
if (!cpuid_model_is_consistent(vcpu))
return 1;
@@ -2570,11 +2665,39 @@ static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
{
u64 allowed;
- rdmsrl(msr, allowed);
+ rdmsrq(msr, allowed);
return ctl_opt & allowed;
}
+#define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \
+({ \
+ int i, r = 0; \
+ \
+ BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \
+ BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \
+ \
+ for (i = 0; i < ARRAY_SIZE(pairs); i++) { \
+ typeof(entry_controls) n_ctrl = pairs[i].entry_control; \
+ typeof(exit_controls) x_ctrl = pairs[i].exit_control; \
+ \
+ if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \
+ continue; \
+ \
+ pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \
+ "entry = %llx (%llx), exit = %llx (%llx)\n", \
+ (u64)(entry_controls & n_ctrl), (u64)n_ctrl, \
+ (u64)(exit_controls & x_ctrl), (u64)x_ctrl); \
+ \
+ if (error_on_inconsistent_vmcs_config) \
+ r = -EIO; \
+ \
+ entry_controls &= ~n_ctrl; \
+ exit_controls &= ~x_ctrl; \
+ } \
+ r; \
+})
+
static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
struct vmx_capability *vmx_cap)
{
@@ -2586,7 +2709,6 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
u32 _vmentry_control = 0;
u64 basic_msr;
u64 misc_msr;
- int i;
/*
* LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
@@ -2602,6 +2724,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
{ VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER },
{ VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS },
{ VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL },
+ { VM_ENTRY_LOAD_CET_STATE, VM_EXIT_LOAD_CET_STATE },
};
memset(vmcs_conf, 0, sizeof(*vmcs_conf));
@@ -2690,22 +2813,9 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
&_vmentry_control))
return -EIO;
- for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) {
- u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control;
- u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control;
-
- if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl))
- continue;
-
- pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n",
- _vmentry_control & n_ctrl, _vmexit_control & x_ctrl);
-
- if (error_on_inconsistent_vmcs_config)
- return -EIO;
-
- _vmentry_control &= ~n_ctrl;
- _vmexit_control &= ~x_ctrl;
- }
+ if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs,
+ _vmentry_control, _vmexit_control))
+ return -EIO;
/*
* Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
@@ -2728,7 +2838,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
break;
}
- rdmsrl(MSR_IA32_VMX_BASIC, basic_msr);
+ rdmsrq(MSR_IA32_VMX_BASIC, basic_msr);
/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)
@@ -2748,7 +2858,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB)
return -EIO;
- rdmsrl(MSR_IA32_VMX_MISC, misc_msr);
+ rdmsrq(MSR_IA32_VMX_MISC, misc_msr);
vmcs_conf->basic = basic_msr;
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
@@ -2832,7 +2942,7 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer)
fault:
WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
- rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
+ rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
cr4_clear_bits(X86_CR4_VMXE);
return -EFAULT;
@@ -3219,6 +3329,40 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
return to_vmx(vcpu)->vpid;
}
+static u64 construct_eptp(hpa_t root_hpa)
+{
+ u64 eptp = root_hpa | VMX_EPTP_MT_WB;
+ struct kvm_mmu_page *root;
+
+ if (kvm_mmu_is_dummy_root(root_hpa))
+ return eptp | VMX_EPTP_PWL_4;
+
+ /*
+ * EPT roots should always have an associated MMU page. Return a "bad"
+ * EPTP to induce VM-Fail instead of continuing on in a unknown state.
+ */
+ root = root_to_sp(root_hpa);
+ if (WARN_ON_ONCE(!root))
+ return INVALID_PAGE;
+
+ eptp |= (root->role.level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
+
+ if (enable_ept_ad_bits && !root->role.ad_disabled)
+ eptp |= VMX_EPTP_AD_ENABLE_BIT;
+
+ return eptp;
+}
+
+static void vmx_flush_tlb_ept_root(hpa_t root_hpa)
+{
+ u64 eptp = construct_eptp(root_hpa);
+
+ if (VALID_PAGE(eptp))
+ ept_sync_context(eptp);
+ else
+ ept_sync_global();
+}
+
void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
@@ -3229,8 +3373,7 @@ void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
return;
if (enable_ept)
- ept_sync_context(construct_eptp(vcpu, root_hpa,
- mmu->root_role.level));
+ vmx_flush_tlb_ept_root(root_hpa);
else
vpid_sync_context(vmx_get_current_vpid(vcpu));
}
@@ -3386,7 +3529,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
}
/* depends on vcpu->arch.cr0 to be set to a new value */
- vmx->emulation_required = vmx_emulation_required(vcpu);
+ vmx->vt.emulation_required = vmx_emulation_required(vcpu);
}
static int vmx_get_max_ept_level(void)
@@ -3396,30 +3539,16 @@ static int vmx_get_max_ept_level(void)
return 4;
}
-u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
-{
- u64 eptp = VMX_EPTP_MT_WB;
-
- eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
-
- if (enable_ept_ad_bits &&
- (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
- eptp |= VMX_EPTP_AD_ENABLE_BIT;
- eptp |= root_hpa;
-
- return eptp;
-}
-
void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
struct kvm *kvm = vcpu->kvm;
bool update_guest_cr3 = true;
unsigned long guest_cr3;
- u64 eptp;
if (enable_ept) {
- eptp = construct_eptp(vcpu, root_hpa, root_level);
- vmcs_write64(EPT_POINTER, eptp);
+ KVM_MMU_WARN_ON(root_to_sp(root_hpa) &&
+ root_level != root_to_sp(root_hpa)->role.level);
+ vmcs_write64(EPT_POINTER, construct_eptp(root_hpa));
hv_track_root_tdp(vcpu, root_hpa);
@@ -3516,7 +3645,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vmcs_writel(GUEST_CR4, hw_cr4);
if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
- kvm_update_cpuid_runtime(vcpu);
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
}
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
@@ -3649,7 +3778,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
__vmx_set_segment(vcpu, var, seg);
- to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
+ to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu);
}
void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -3998,76 +4127,29 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
vmx->nested.force_msr_bitmap_recalc = true;
}
-void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
- int idx;
if (!cpu_has_vmx_msr_bitmap())
return;
vmx_msr_bitmap_l01_changed(vmx);
- /*
- * Mark the desired intercept state in shadow bitmap, this is needed
- * for resync when the MSR filters change.
- */
- idx = vmx_get_passthrough_msr_slot(msr);
- if (idx >= 0) {
- if (type & MSR_TYPE_R)
- clear_bit(idx, vmx->shadow_msr_intercept.read);
- if (type & MSR_TYPE_W)
- clear_bit(idx, vmx->shadow_msr_intercept.write);
- }
-
- if ((type & MSR_TYPE_R) &&
- !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
- vmx_set_msr_bitmap_read(msr_bitmap, msr);
- type &= ~MSR_TYPE_R;
- }
-
- if ((type & MSR_TYPE_W) &&
- !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
- vmx_set_msr_bitmap_write(msr_bitmap, msr);
- type &= ~MSR_TYPE_W;
+ if (type & MSR_TYPE_R) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+ vmx_clear_msr_bitmap_read(msr_bitmap, msr);
+ else
+ vmx_set_msr_bitmap_read(msr_bitmap, msr);
}
- if (type & MSR_TYPE_R)
- vmx_clear_msr_bitmap_read(msr_bitmap, msr);
-
- if (type & MSR_TYPE_W)
- vmx_clear_msr_bitmap_write(msr_bitmap, msr);
-}
-
-void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
- int idx;
-
- if (!cpu_has_vmx_msr_bitmap())
- return;
-
- vmx_msr_bitmap_l01_changed(vmx);
-
- /*
- * Mark the desired intercept state in shadow bitmap, this is needed
- * for resync when the MSR filter changes.
- */
- idx = vmx_get_passthrough_msr_slot(msr);
- if (idx >= 0) {
- if (type & MSR_TYPE_R)
- set_bit(idx, vmx->shadow_msr_intercept.read);
- if (type & MSR_TYPE_W)
- set_bit(idx, vmx->shadow_msr_intercept.write);
+ if (type & MSR_TYPE_W) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+ vmx_clear_msr_bitmap_write(msr_bitmap, msr);
+ else
+ vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
-
- if (type & MSR_TYPE_R)
- vmx_set_msr_bitmap_read(msr_bitmap, msr);
-
- if (type & MSR_TYPE_W)
- vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
@@ -4146,79 +4228,81 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
}
}
-void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
+static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 i;
+ bool intercept;
if (!cpu_has_vmx_msr_bitmap())
return;
- /*
- * Redo intercept permissions for MSRs that KVM is passing through to
- * the guest. Disabling interception will check the new MSR filter and
- * ensure that KVM enables interception if usersepace wants to filter
- * the MSR. MSRs that KVM is already intercepting don't need to be
- * refreshed since KVM is going to intercept them regardless of what
- * userspace wants.
- */
- for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
- u32 msr = vmx_possible_passthrough_msrs[i];
-
- if (!test_bit(i, vmx->shadow_msr_intercept.read))
- vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R);
-
- if (!test_bit(i, vmx->shadow_msr_intercept.write))
- vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
+#ifdef CONFIG_X86_64
+ vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+#endif
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+ if (kvm_cstate_in_guest(vcpu->kvm)) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+ }
+ if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
}
/* PT MSRs can be passed through iff PT is exposed to the guest. */
if (vmx_pt_mode_is_host_guest())
pt_update_intercept_for_msr(vcpu);
-}
-static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
- int pi_vec)
-{
-#ifdef CONFIG_SMP
- if (vcpu->mode == IN_GUEST_MODE) {
- /*
- * The vector of the virtual has already been set in the PIR.
- * Send a notification event to deliver the virtual interrupt
- * unless the vCPU is the currently running vCPU, i.e. the
- * event is being sent from a fastpath VM-Exit handler, in
- * which case the PIR will be synced to the vIRR before
- * re-entering the guest.
- *
- * When the target is not the running vCPU, the following
- * possibilities emerge:
- *
- * Case 1: vCPU stays in non-root mode. Sending a notification
- * event posts the interrupt to the vCPU.
- *
- * Case 2: vCPU exits to root mode and is still runnable. The
- * PIR will be synced to the vIRR before re-entering the guest.
- * Sending a notification event is ok as the host IRQ handler
- * will ignore the spurious event.
- *
- * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
- * has already synced PIR to vIRR and never blocks the vCPU if
- * the vIRR is not empty. Therefore, a blocked vCPU here does
- * not wait for any requested interrupts in PIR, and sending a
- * notification event also results in a benign, spurious event.
- */
+ if (vcpu->arch.xfd_no_write_intercept)
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW);
- if (vcpu != kvm_get_running_vcpu())
- __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
- return;
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !to_vmx(vcpu)->spec_ctrl);
+
+ if (kvm_cpu_cap_has(X86_FEATURE_XFD))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
+
+ if (cpu_feature_enabled(X86_FEATURE_IBPB))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
+ !guest_has_pred_cmd_msr(vcpu));
+
+ if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
+
+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
+ intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
+
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, intercept);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, intercept);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, intercept);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, intercept);
}
-#endif
+
+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) || kvm_cpu_cap_has(X86_FEATURE_IBT)) {
+ intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
+
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, intercept);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept);
+ }
+
/*
- * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
- * otherwise do nothing as KVM will grab the highest priority pending
- * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
+ * x2APIC and LBR MSR intercepts are modified on-demand and cannot be
+ * filtered by userspace.
*/
- kvm_vcpu_wake_up(vcpu);
+}
+
+void vmx_recalc_intercepts(struct kvm_vcpu *vcpu)
+{
+ vmx_recalc_msr_intercepts(vcpu);
}
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -4269,7 +4353,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
*/
static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
int r;
r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
@@ -4280,20 +4364,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
if (!vcpu->arch.apic->apicv_active)
return -1;
- if (pi_test_and_set_pir(vector, &vmx->pi_desc))
- return 0;
-
- /* If a previous notification has sent the IPI, nothing to do. */
- if (pi_test_and_set_on(&vmx->pi_desc))
- return 0;
-
- /*
- * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
- * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
- * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
- * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
- */
- kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
+ __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector);
return 0;
}
@@ -4373,7 +4444,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
- rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
+ rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl);
vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
@@ -4383,6 +4454,21 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
if (cpu_has_load_ia32_efer())
vmcs_write64(HOST_IA32_EFER, kvm_host.efer);
+
+ /*
+ * Supervisor shadow stack is not enabled on host side, i.e.,
+ * host IA32_S_CET.SHSTK_EN bit is guaranteed to 0 now, per SDM
+ * description(RDSSP instruction), SSP is not readable in CPL0,
+ * so resetting the two registers to 0s at VM-Exit does no harm
+ * to kernel execution. When execution flow exits to userspace,
+ * SSP is reloaded from IA32_PL3_SSP. Check SDM Vol.2A/B Chapter
+ * 3 and 4 for details.
+ */
+ if (cpu_has_load_cet_ctrl()) {
+ vmcs_writel(HOST_S_CET, kvm_host.s_cet);
+ vmcs_writel(HOST_SSP, 0);
+ vmcs_writel(HOST_INTR_SSP_TABLE, 0);
+ }
}
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
@@ -4417,7 +4503,7 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
return pin_based_exec_ctrl;
}
-static u32 vmx_vmentry_ctrl(void)
+static u32 vmx_get_initial_vmentry_ctrl(void)
{
u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
@@ -4434,7 +4520,7 @@ static u32 vmx_vmentry_ctrl(void)
return vmentry_ctrl;
}
-static u32 vmx_vmexit_ctrl(void)
+static u32 vmx_get_initial_vmexit_ctrl(void)
{
u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
@@ -4464,19 +4550,13 @@ void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
- if (kvm_vcpu_apicv_active(vcpu)) {
- secondary_exec_controls_setbit(vmx,
- SECONDARY_EXEC_APIC_REGISTER_VIRT |
- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
- if (enable_ipiv)
- tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT);
- } else {
- secondary_exec_controls_clearbit(vmx,
- SECONDARY_EXEC_APIC_REGISTER_VIRT |
- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
- if (enable_ipiv)
- tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT);
- }
+ secondary_exec_controls_changebit(vmx,
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY,
+ kvm_vcpu_apicv_active(vcpu));
+ if (enable_ipiv)
+ tertiary_exec_controls_changebit(vmx, TERTIARY_EXEC_IPI_VIRT,
+ kvm_vcpu_apicv_active(vcpu));
vmx_update_msr_bitmap_x2apic(vcpu);
}
@@ -4590,10 +4670,7 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
bool __enabled; \
\
if (cpu_has_vmx_##name()) { \
- if (kvm_is_governed_feature(X86_FEATURE_##feat_name)) \
- __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name); \
- else \
- __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name); \
+ __enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \
vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
__enabled, exiting); \
} \
@@ -4669,8 +4746,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
*/
if (cpu_has_vmx_rdtscp()) {
bool rdpid_or_rdtscp_enabled =
- guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
- guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
+ guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID);
vmx_adjust_secondary_exec_control(vmx, &exec_control,
SECONDARY_EXEC_ENABLE_RDTSCP,
@@ -4763,7 +4840,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmcs_write16(GUEST_INTR_STATUS, 0);
vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
- vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
+ vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc)));
}
if (vmx_can_use_ipiv(&vmx->vcpu)) {
@@ -4802,10 +4879,10 @@ static void init_vmcs(struct vcpu_vmx *vmx)
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
- vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
+ vm_exit_controls_set(vmx, vmx_get_initial_vmexit_ctrl());
/* 22.2.1, 20.8.1 */
- vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
+ vm_entry_controls_set(vmx, vmx_get_initial_vmentry_ctrl());
vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
@@ -4820,7 +4897,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
if (enable_pml) {
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
- vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
}
vmx_write_encls_bitmap(&vmx->vcpu, NULL);
@@ -4835,7 +4912,8 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmcs_write32(GUEST_SYSENTER_CS, 0);
vmcs_writel(GUEST_SYSENTER_ESP, 0);
vmcs_writel(GUEST_SYSENTER_EIP, 0);
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+ vmx_guest_debugctl_write(&vmx->vcpu, 0);
if (cpu_has_vmx_tpr_shadow()) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
@@ -4876,8 +4954,8 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
* Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
* or POSTED_INTR_WAKEUP_VECTOR.
*/
- vmx->pi_desc.nv = POSTED_INTR_VECTOR;
- __pi_set_sn(&vmx->pi_desc);
+ vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
+ __pi_set_sn(&vmx->vt.pi_desc);
}
void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -4932,6 +5010,14 @@ void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
+ vmcs_writel(GUEST_SSP, 0);
+ vmcs_writel(GUEST_INTR_SSP_TABLE, 0);
+ }
+ if (kvm_cpu_cap_has(X86_FEATURE_IBT) ||
+ kvm_cpu_cap_has(X86_FEATURE_SHSTK))
+ vmcs_writel(GUEST_S_CET, 0);
+
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
vpid_sync_context(vmx->vpid);
@@ -5211,6 +5297,12 @@ bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
(kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
}
+static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.guest_fpu.fpstate->xfd &&
+ !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS);
+}
+
static int handle_exception_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5237,7 +5329,8 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
* point.
*/
if (is_nm_fault(intr_info)) {
- kvm_queue_exception(vcpu, NM_VECTOR);
+ kvm_queue_exception_p(vcpu, NM_VECTOR,
+ is_xfd_nm_fault(vcpu) ? vcpu->arch.guest_fpu.xfd_err : 0);
return 1;
}
@@ -5781,11 +5874,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
static int handle_ept_violation(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification;
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
gpa_t gpa;
- u64 error_code;
-
- exit_qualification = vmx_get_exit_qual(vcpu);
/*
* EPT violation happened while executing iret from NMI,
@@ -5801,23 +5891,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(vcpu, gpa, exit_qualification);
- /* Is it a read fault? */
- error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
- ? PFERR_USER_MASK : 0;
- /* Is it a write fault? */
- error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
- ? PFERR_WRITE_MASK : 0;
- /* Is it a fetch fault? */
- error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
- ? PFERR_FETCH_MASK : 0;
- /* ept page table entry is present? */
- error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
- ? PFERR_PRESENT_MASK : 0;
-
- if (error_code & EPT_VIOLATION_GVA_IS_VALID)
- error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
- PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
-
/*
* Check that the GPA doesn't exceed physical memory limits, as that is
* a guest page fault. We have to emulate the instruction here, because
@@ -5829,7 +5902,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa)))
return kvm_emulate_instruction(vcpu, 0);
- return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
+ return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification);
}
static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
@@ -5865,11 +5938,35 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
return 1;
}
-static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
+/*
+ * Returns true if emulation is required (due to the vCPU having invalid state
+ * with unsrestricted guest mode disabled) and KVM can't faithfully emulate the
+ * current vCPU state.
+ */
+static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- return vmx->emulation_required && !vmx->rmode.vm86_active &&
+ if (!vmx->vt.emulation_required)
+ return false;
+
+ /*
+ * It is architecturally impossible for emulation to be required when a
+ * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if
+ * guest state is invalid and unrestricted guest is disabled, i.e. KVM
+ * should synthesize VM-Fail instead emulation L2 code. This path is
+ * only reachable if userspace modifies L2 guest state after KVM has
+ * performed the nested VM-Enter consistency checks.
+ */
+ if (vmx->nested.nested_run_pending)
+ return true;
+
+ /*
+ * KVM only supports emulating exceptions if the vCPU is in Real Mode.
+ * If emulation is required, KVM can't perform a successful VM-Enter to
+ * inject the exception.
+ */
+ return !vmx->rmode.vm86_active &&
(kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
}
@@ -5882,17 +5979,24 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
intr_window_requested = exec_controls_get(vmx) &
CPU_BASED_INTR_WINDOW_EXITING;
- while (vmx->emulation_required && count-- != 0) {
+ while (vmx->vt.emulation_required && count-- != 0) {
if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
return handle_interrupt_window(&vmx->vcpu);
if (kvm_test_request(KVM_REQ_EVENT, vcpu))
return 1;
+ /*
+ * Ensure that any updates to kvm->buses[] observed by the
+ * previous instruction (emulated or otherwise) are also
+ * visible to the instruction KVM is about to emulate.
+ */
+ smp_rmb();
+
if (!kvm_emulate_instruction(vcpu, 0))
return 0;
- if (vmx_emulation_required_with_pending_exception(vcpu)) {
+ if (vmx_unhandleable_emulation_required(vcpu)) {
kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
@@ -5916,7 +6020,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
- if (vmx_emulation_required_with_pending_exception(vcpu)) {
+ if (vmx_unhandleable_emulation_required(vcpu)) {
kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
@@ -5959,7 +6063,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
} operand;
int gpr_index;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -6049,7 +6153,7 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
/*
* When nested=0, all VMX instruction VM Exits filter here. The handlers
- * are overwritten by nested_vmx_setup() when nested=1.
+ * are overwritten by nested_vmx_hardware_setup() when nested=1.
*/
static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
{
@@ -6057,6 +6161,12 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_tdx_instruction(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
#ifndef CONFIG_X86_SGX_KVM
static int handle_encls(struct kvm_vcpu *vcpu)
{
@@ -6077,7 +6187,7 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
* VM-Exits. Unconditionally set the flag here and leave the handling to
* vmx_handle_exit().
*/
- to_vmx(vcpu)->exit_reason.bus_lock_detected = true;
+ to_vt(vcpu)->exit_reason.bus_lock_detected = true;
return 1;
}
@@ -6107,6 +6217,23 @@ static int handle_notify(struct kvm_vcpu *vcpu)
return 1;
}
+static int vmx_get_msr_imm_reg(struct kvm_vcpu *vcpu)
+{
+ return vmx_get_instr_info_reg(vmcs_read32(VMX_INSTRUCTION_INFO));
+}
+
+static int handle_rdmsr_imm(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_rdmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_get_msr_imm_reg(vcpu));
+}
+
+static int handle_wrmsr_imm(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_get_msr_imm_reg(vcpu));
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -6165,6 +6292,10 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_ENCLS] = handle_encls,
[EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
[EXIT_REASON_NOTIFY] = handle_notify,
+ [EXIT_REASON_SEAMCALL] = handle_tdx_instruction,
+ [EXIT_REASON_TDCALL] = handle_tdx_instruction,
+ [EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm,
+ [EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm,
};
static const int kvm_vmx_max_exit_handlers =
@@ -6175,9 +6306,9 @@ void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- *reason = vmx->exit_reason.full;
+ *reason = vmx->vt.exit_reason.full;
*info1 = vmx_get_exit_qual(vcpu);
- if (!(vmx->exit_reason.failed_vmentry)) {
+ if (!(vmx->vt.exit_reason.failed_vmentry)) {
*info2 = vmx->idt_vectoring_info;
*intr_info = vmx_get_intr_info(vcpu);
if (is_exception_with_error_code(*intr_info))
@@ -6191,6 +6322,15 @@ void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
}
}
+void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code)
+{
+ *intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+ if (is_exception_with_error_code(*intr_info))
+ *error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+ else
+ *error_code = 0;
+}
+
static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
{
if (vmx->pml_pg) {
@@ -6202,32 +6342,40 @@ static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u16 pml_idx, pml_tail_index;
u64 *pml_buf;
- u16 pml_idx;
+ int i;
pml_idx = vmcs_read16(GUEST_PML_INDEX);
/* Do nothing if PML buffer is empty */
- if (pml_idx == (PML_ENTITY_NUM - 1))
+ if (pml_idx == PML_HEAD_INDEX)
return;
+ /*
+ * PML index always points to the next available PML buffer entity
+ * unless PML log has just overflowed.
+ */
+ pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1;
- /* PML index always points to next available PML buffer entity */
- if (pml_idx >= PML_ENTITY_NUM)
- pml_idx = 0;
- else
- pml_idx++;
-
+ /*
+ * PML log is written backwards: the CPU first writes the entry 511
+ * then the entry 510, and so on.
+ *
+ * Read the entries in the same order they were written, to ensure that
+ * the dirty ring is filled in the same order the CPU wrote them.
+ */
pml_buf = page_address(vmx->pml_pg);
- for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
+
+ for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) {
u64 gpa;
- gpa = pml_buf[pml_idx];
+ gpa = pml_buf[i];
WARN_ON(gpa & (PAGE_SIZE - 1));
kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
}
/* reset PML index */
- vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
}
static void vmx_dump_sel(char *name, uint32_t sel)
@@ -6352,6 +6500,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
+ if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE)
+ pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
+ vmcs_readl(GUEST_S_CET), vmcs_readl(GUEST_SSP),
+ vmcs_readl(GUEST_INTR_SSP_TABLE));
pr_err("*** Host State ***\n");
pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
@@ -6382,6 +6534,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
+ if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE)
+ pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
+ vmcs_readl(HOST_S_CET), vmcs_readl(HOST_SSP),
+ vmcs_readl(HOST_INTR_SSP_TABLE));
pr_err("*** Control State ***\n");
pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
@@ -6456,7 +6612,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- union vmx_exit_reason exit_reason = vmx->exit_reason;
+ union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
u32 vectoring_info = vmx->idt_vectoring_info;
u16 exit_handler_index;
@@ -6512,7 +6668,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
* the least awful solution for the userspace case without
* risking false positives.
*/
- if (vmx->emulation_required) {
+ if (vmx->vt.emulation_required) {
nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
return 1;
}
@@ -6522,7 +6678,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
}
/* If guest state is invalid, start emulating. L2 is handled above. */
- if (vmx->emulation_required)
+ if (vmx->vt.emulation_required)
return handle_invalid_guest_state(vcpu);
if (exit_reason.failed_vmentry) {
@@ -6543,33 +6699,15 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return 0;
}
- /*
- * Note:
- * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by
- * delivery event since it indicates guest is accessing MMIO.
- * The vm-exit can be triggered again after return to guest that
- * will cause infinite loop.
- */
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
(exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
exit_reason.basic != EXIT_REASON_PML_FULL &&
exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
- exit_reason.basic != EXIT_REASON_NOTIFY)) {
- int ndata = 3;
-
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
- vcpu->run->internal.data[0] = vectoring_info;
- vcpu->run->internal.data[1] = exit_reason.full;
- vcpu->run->internal.data[2] = vmx_get_exit_qual(vcpu);
- if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
- vcpu->run->internal.data[ndata++] =
- vmcs_read64(GUEST_PHYSICAL_ADDRESS);
- }
- vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
- vcpu->run->internal.ndata = ndata;
+ exit_reason.basic != EXIT_REASON_NOTIFY &&
+ exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) {
+ kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA);
return 0;
}
@@ -6600,6 +6738,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
#ifdef CONFIG_MITIGATION_RETPOLINE
if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
return kvm_emulate_wrmsr(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
+ return handle_wrmsr_imm(vcpu);
else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
return handle_preemption_timer(vcpu);
else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
@@ -6620,15 +6760,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
unexpected_vmexit:
- vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
- exit_reason.full);
dump_vmcs(vcpu);
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror =
- KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = exit_reason.full;
- vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ kvm_prepare_unexpected_reason_exit(vcpu, exit_reason.full);
return 0;
}
@@ -6640,7 +6773,7 @@ int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
* Exit to user space when bus lock detected to inform that there is
* a bus lock in guest.
*/
- if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
+ if (vmx_get_exit_reason(vcpu).bus_lock_detected) {
if (ret > 0)
vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
@@ -6650,77 +6783,6 @@ int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return ret;
}
-/*
- * Software based L1D cache flush which is used when microcode providing
- * the cache control MSR is not loaded.
- *
- * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
- * flush it is required to read in 64 KiB because the replacement algorithm
- * is not exactly LRU. This could be sized at runtime via topology
- * information but as all relevant affected CPUs have 32KiB L1D cache size
- * there is no point in doing so.
- */
-static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
-{
- int size = PAGE_SIZE << L1D_CACHE_ORDER;
-
- /*
- * This code is only executed when the flush mode is 'cond' or
- * 'always'
- */
- if (static_branch_likely(&vmx_l1d_flush_cond)) {
- bool flush_l1d;
-
- /*
- * Clear the per-vcpu flush bit, it gets set again if the vCPU
- * is reloaded, i.e. if the vCPU is scheduled out or if KVM
- * exits to userspace, or if KVM reaches one of the unsafe
- * VMEXIT handlers, e.g. if KVM calls into the emulator.
- */
- flush_l1d = vcpu->arch.l1tf_flush_l1d;
- vcpu->arch.l1tf_flush_l1d = false;
-
- /*
- * Clear the per-cpu flush bit, it gets set again from
- * the interrupt handlers.
- */
- flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
- kvm_clear_cpu_l1tf_flush_l1d();
-
- if (!flush_l1d)
- return;
- }
-
- vcpu->stat.l1d_flush++;
-
- if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
- native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
- return;
- }
-
- asm volatile(
- /* First ensure the pages are in the TLB */
- "xorl %%eax, %%eax\n"
- ".Lpopulate_tlb:\n\t"
- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
- "addl $4096, %%eax\n\t"
- "cmpl %%eax, %[size]\n\t"
- "jne .Lpopulate_tlb\n\t"
- "xorl %%eax, %%eax\n\t"
- "cpuid\n\t"
- /* Now fill the cache */
- "xorl %%eax, %%eax\n"
- ".Lfill_cache:\n"
- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
- "addl $64, %%eax\n\t"
- "cmpl %%eax, %[size]\n\t"
- "jne .Lfill_cache\n\t"
- "lfence\n"
- :: [flush_pages] "r" (vmx_l1d_flush_pages),
- [size] "r" (size)
- : "eax", "ebx", "ecx", "edx");
-}
-
void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -6862,11 +6924,32 @@ void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
read_unlock(&vcpu->kvm->mmu_lock);
}
-void vmx_hwapic_isr_update(int max_isr)
+void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
u16 status;
u8 old;
+ /*
+ * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI
+ * is only relevant for if and only if Virtual Interrupt Delivery is
+ * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's
+ * vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested
+ * VM-Exit, otherwise L1 with run with a stale SVI.
+ */
+ if (is_guest_mode(vcpu)) {
+ /*
+ * KVM is supposed to forward intercepted L2 EOIs to L1 if VID
+ * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC.
+ * Note, userspace can stuff state while L2 is active; assert
+ * that VID is disabled if and only if the vCPU is in KVM_RUN
+ * to avoid false positives if userspace is setting APIC state.
+ */
+ WARN_ON_ONCE(vcpu->wants_to_run &&
+ nested_cpu_has_vid(get_vmcs12(vcpu)));
+ to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true;
+ return;
+ }
+
if (max_isr == -1)
max_isr = 0;
@@ -6896,38 +6979,24 @@ static void vmx_set_rvi(int vector)
}
}
-void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
-{
- /*
- * When running L2, updating RVI is only relevant when
- * vmcs12 virtual-interrupt-delivery enabled.
- * However, it can be enabled only when L1 also
- * intercepts external-interrupts and in that case
- * we should not update vmcs02 RVI but instead intercept
- * interrupt. Therefore, do nothing when running L2.
- */
- if (!is_guest_mode(vcpu))
- vmx_set_rvi(max_irr);
-}
-
int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
int max_irr;
bool got_posted_interrupt;
if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
return -EIO;
- if (pi_test_on(&vmx->pi_desc)) {
- pi_clear_on(&vmx->pi_desc);
+ if (pi_test_on(&vt->pi_desc)) {
+ pi_clear_on(&vt->pi_desc);
/*
* IOMMU can write to PID.ON, so the barrier matters even on UP.
* But on x86 this is just a compiler barrier anyway.
*/
smp_mb__after_atomic();
got_posted_interrupt =
- kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
+ kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr);
} else {
max_irr = kvm_lapic_find_highest_irr(vcpu);
got_posted_interrupt = false;
@@ -6967,14 +7036,6 @@ void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
}
-void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- pi_clear_on(&vmx->pi_desc);
- memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
-}
-
void vmx_do_interrupt_irqoff(unsigned long entry);
void vmx_do_nmi_irqoff(void);
@@ -6985,17 +7046,16 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
* MSR value is not clobbered by the host activity before the guest
* has chance to consume it.
*
- * Do not blindly read xfd_err here, since this exception might
- * be caused by L1 interception on a platform which doesn't
- * support xfd at all.
+ * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM
+ * interception may have been caused by L1 interception. Per the SDM,
+ * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1.
*
- * Do it conditionally upon guest_fpu::xfd. xfd_err matters
- * only when xfd contains a non-zero value.
- *
- * Queuing exception is done in vmx_handle_exit. See comment there.
+ * Note, XFD_ERR is updated _before_ the #NM interception check, i.e.
+ * unlike CR2 and DR6, the value is not a payload that is attached to
+ * the #NM exception.
*/
- if (vcpu->arch.guest_fpu.fpstate->xfd)
- rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
+ if (is_xfd_nm_fault(vcpu))
+ rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
}
static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
@@ -7020,8 +7080,14 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
"unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
+ /*
+ * Invoke the kernel's IRQ handler for the vector. Use the FRED path
+ * when it's available even if FRED isn't fully enabled, e.g. even if
+ * FRED isn't supported in hardware, in order to avoid the indirect
+ * CALL in the non-FRED path.
+ */
kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
- if (cpu_feature_enabled(X86_FEATURE_FRED))
+ if (IS_ENABLED(CONFIG_X86_FRED))
fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
else
vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
@@ -7032,15 +7098,22 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (vmx->emulation_required)
+ if (to_vt(vcpu)->emulation_required)
return;
- if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
+ switch (vmx_get_exit_reason(vcpu).basic) {
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
- else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
+ break;
+ case EXIT_REASON_EXCEPTION_NMI:
handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu));
+ break;
+ case EXIT_REASON_MCE_DURING_VMENTRY:
+ kvm_machine_check();
+ break;
+ default:
+ break;
+ }
}
/*
@@ -7145,13 +7218,17 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
case INTR_TYPE_SOFT_EXCEPTION:
vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
fallthrough;
- case INTR_TYPE_HARD_EXCEPTION:
- if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
- u32 err = vmcs_read32(error_code_field);
- kvm_requeue_exception_e(vcpu, vector, err);
- } else
- kvm_requeue_exception(vcpu, vector);
+ case INTR_TYPE_HARD_EXCEPTION: {
+ u32 error_code = 0;
+
+ if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK)
+ error_code = vmcs_read32(error_code_field);
+
+ kvm_requeue_exception(vcpu, vector,
+ idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK,
+ error_code);
break;
+ }
case INTR_TYPE_SOFT_INTR:
vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
fallthrough;
@@ -7246,7 +7323,7 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
return;
if (flags & VMX_RUN_SAVE_SPEC_CTRL)
- vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL);
+ vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL);
/*
* If the guest/host SPEC_CTRL values differ, restore the host value.
@@ -7257,7 +7334,7 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
*/
if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
vmx->spec_ctrl != hostval)
- native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval);
+ native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval);
barrier_nospec();
}
@@ -7270,21 +7347,40 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
* the fastpath even, all other exits must use the slow path.
*/
if (is_guest_mode(vcpu) &&
- to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER)
+ vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER)
return EXIT_FASTPATH_NONE;
- switch (to_vmx(vcpu)->exit_reason.basic) {
+ switch (vmx_get_exit_reason(vcpu).basic) {
case EXIT_REASON_MSR_WRITE:
- return handle_fastpath_set_msr_irqoff(vcpu);
+ return handle_fastpath_wrmsr(vcpu);
+ case EXIT_REASON_MSR_WRITE_IMM:
+ return handle_fastpath_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_get_msr_imm_reg(vcpu));
case EXIT_REASON_PREEMPTION_TIMER:
return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
case EXIT_REASON_HLT:
return handle_fastpath_hlt(vcpu);
+ case EXIT_REASON_INVD:
+ return handle_fastpath_invd(vcpu);
default:
return EXIT_FASTPATH_NONE;
}
}
+noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu)
+{
+ if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI ||
+ !is_nmi(vmx_get_intr_info(vcpu)))
+ return;
+
+ kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
+ else
+ vmx_do_nmi_irqoff();
+ kvm_after_interrupt(vcpu);
+}
+
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
unsigned int flags)
{
@@ -7292,17 +7388,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
guest_state_enter_irqoff();
- /*
- * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW
- * mitigation for MDS is done late in VMentry and is still
- * executed in spite of L1D Flush. This is because an extra VERW
- * should not matter much after the big hammer L1D Flush.
- */
- if (static_branch_unlikely(&vmx_l1d_should_flush))
- vmx_l1d_flush(vcpu);
- else if (static_branch_unlikely(&mmio_stale_data_clear) &&
- kvm_arch_has_assigned_device(vcpu->kvm))
- mds_clear_cpu_buffers();
+ vmx_l1d_flush(vcpu);
vmx_disable_fb_clear(vmx);
@@ -7320,30 +7406,23 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
vmx_enable_fb_clear(vmx);
if (unlikely(vmx->fail)) {
- vmx->exit_reason.full = 0xdead;
+ vmx->vt.exit_reason.full = 0xdead;
goto out;
}
- vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
- if (likely(!vmx->exit_reason.failed_vmentry))
+ vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+ if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry))
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
- if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
- is_nmi(vmx_get_intr_info(vcpu))) {
- kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
- if (cpu_feature_enabled(X86_FEATURE_FRED))
- fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
- else
- vmx_do_nmi_irqoff();
- kvm_after_interrupt(vcpu);
- }
+ vmx_handle_nmi(vcpu);
out:
guest_state_exit_irqoff();
}
-fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
+ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
@@ -7357,15 +7436,15 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
* start emulation until we arrive back to a valid state. Synthesize a
* consistency check VM-Exit due to invalid guest state and bail.
*/
- if (unlikely(vmx->emulation_required)) {
+ if (unlikely(vmx->vt.emulation_required)) {
vmx->fail = 0;
- vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
- vmx->exit_reason.failed_vmentry = 1;
+ vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE;
+ vmx->vt.exit_reason.failed_vmentry = 1;
kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
- vmx->exit_qualification = ENTRY_FAIL_DEFAULT;
+ vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT;
kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
- vmx->exit_intr_info = 0;
+ vmx->vt.exit_intr_info = 0;
return EXIT_FASTPATH_NONE;
}
@@ -7388,6 +7467,12 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
vcpu->arch.regs_dirty = 0;
+ if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
+ set_debugreg(vcpu->arch.dr6, 6);
+
+ if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
+ vmx_reload_guest_debugctl(vcpu);
+
/*
* Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
* prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
@@ -7407,10 +7492,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
- /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- set_debugreg(vcpu->arch.dr6, 6);
-
/* When single-stepping over STI and MOV SS, we must clear the
* corresponding interruptibility bits in the guest state. Otherwise
* vmentry fails as it then expects bit 14 (BS) in pending debug
@@ -7419,8 +7500,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
- kvm_load_guest_xsave_state(vcpu);
-
pt_guest_enter(vmx);
atomic_switch_perf_msrs(vmx);
@@ -7446,8 +7525,8 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
}
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
- if (vmx->host_debugctlmsr)
- update_debugctlmsr(vmx->host_debugctlmsr);
+ if (vcpu->arch.host_debugctl)
+ update_debugctlmsr(vcpu->arch.host_debugctl);
#ifndef CONFIG_X86_64
/*
@@ -7464,15 +7543,13 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
pt_guest_exit(vmx);
- kvm_load_host_xsave_state(vcpu);
-
if (is_guest_mode(vcpu)) {
/*
* Track VMLAUNCH/VMRESUME that have made past guest state
* checking.
*/
if (vmx->nested.nested_run_pending &&
- !vmx->exit_reason.failed_vmentry)
+ !vmx_get_exit_reason(vcpu).failed_vmentry)
++vcpu->stat.nested_run;
vmx->nested.nested_run_pending = 0;
@@ -7481,12 +7558,9 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
if (unlikely(vmx->fail))
return EXIT_FASTPATH_NONE;
- if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
- kvm_machine_check();
-
trace_kvm_exit(vcpu, KVM_ISA_VMX);
- if (unlikely(vmx->exit_reason.failed_vmentry))
+ if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry))
return EXIT_FASTPATH_NONE;
vmx->loaded_vmcs->launched = 1;
@@ -7518,7 +7592,7 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu)
BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
vmx = to_vmx(vcpu);
- INIT_LIST_HEAD(&vmx->pi_wakeup_list);
+ INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list);
err = -ENOMEM;
@@ -7566,26 +7640,6 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu)
evmcs->hv_enlightenments_control.msr_bitmap = 1;
}
- /* The MSR bitmap starts with all ones */
- bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
- bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
-
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
-#ifdef CONFIG_X86_64
- vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
-#endif
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
- if (kvm_cstate_in_guest(vcpu->kvm)) {
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
- }
-
vmx->loaded_vmcs = &vmx->vmcs01;
if (cpu_need_virtualize_apic_accesses(vcpu)) {
@@ -7616,7 +7670,7 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu)
if (vmx_can_use_ipiv(vcpu))
WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
- __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
+ __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID);
return 0;
@@ -7635,7 +7689,7 @@ free_vpid:
int vmx_vm_init(struct kvm *kvm)
{
if (!ple_gap)
- kvm->arch.pause_in_guest = true;
+ kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
switch (l1tf_mitigation) {
@@ -7643,6 +7697,7 @@ int vmx_vm_init(struct kvm *kvm)
case L1TF_MITIGATION_FLUSH_NOWARN:
/* 'I explicitly don't care' is set */
break;
+ case L1TF_MITIGATION_AUTO:
case L1TF_MITIGATION_FLUSH:
case L1TF_MITIGATION_FLUSH_NOSMT:
case L1TF_MITIGATION_FULL:
@@ -7660,9 +7715,23 @@ int vmx_vm_init(struct kvm *kvm)
break;
}
}
+
+ if (enable_pml)
+ kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES;
return 0;
}
+static inline bool vmx_ignore_guest_pat(struct kvm *kvm)
+{
+ /*
+ * Non-coherent DMA devices need the guest to flush CPU properly.
+ * In that case it is not possible to map all guest RAM as WB, so
+ * always trust guest PAT.
+ */
+ return !kvm_arch_has_noncoherent_dma(kvm) &&
+ kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT);
+}
+
u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
/*
@@ -7672,13 +7741,8 @@ u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
if (is_mmio)
return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
- /*
- * Force WB and ignore guest PAT if the VM does NOT have a non-coherent
- * device attached. Letting the guest control memory types on Intel
- * CPUs may result in unexpected behavior, and so KVM's ABI is to trust
- * the guest to behave only as a last resort.
- */
- if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ /* Force WB if ignoring guest PAT */
+ if (vmx_ignore_guest_pat(vcpu->kvm))
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
@@ -7743,6 +7807,8 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU));
cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP));
cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57));
+ cr4_fixed1_update(X86_CR4_CET, ecx, feature_bit(SHSTK));
+ cr4_fixed1_update(X86_CR4_CET, edx, feature_bit(IBT));
entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1);
cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM));
@@ -7828,12 +7894,8 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
* to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
* set if and only if XSAVE is supported.
*/
- if (boot_cpu_has(X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES);
-
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX);
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LAM);
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
+ guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES);
vmx_setup_uret_msrs(vmx);
@@ -7841,7 +7903,7 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmcs_set_secondary_exec_control(vmx,
vmx_secondary_exec_control(vmx));
- if (guest_can_use(vcpu, X86_FEATURE_VMX))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
vmx->msr_ia32_feature_control_valid_bits |=
FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
@@ -7850,43 +7912,31 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
- if (guest_can_use(vcpu, X86_FEATURE_VMX))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
nested_vmx_cr_fixed1_bits_update(vcpu);
if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
- guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
+ guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT))
update_intel_pt_cfg(vcpu);
if (boot_cpu_has(X86_FEATURE_RTM)) {
struct vmx_uret_msr *msr;
msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
if (msr) {
- bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
+ bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM);
vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
}
}
- if (kvm_cpu_cap_has(X86_FEATURE_XFD))
- vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
- !guest_cpuid_has(vcpu, X86_FEATURE_XFD));
-
- if (boot_cpu_has(X86_FEATURE_IBPB))
- vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
- !guest_has_pred_cmd_msr(vcpu));
-
- if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
- vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
- !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
-
set_cr4_guest_host_mask(vmx);
vmx_write_encls_bitmap(vcpu, NULL);
- if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX))
vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
else
vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
- if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
vmx->msr_ia32_feature_control_valid_bits |=
FEAT_CTL_SGX_LC_ENABLED;
else
@@ -7899,14 +7949,14 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
static __init u64 vmx_get_perf_capabilities(void)
{
- u64 perf_cap = PMU_CAP_FW_WRITES;
+ u64 perf_cap = PERF_CAP_FW_WRITES;
u64 host_perf_cap = 0;
if (!enable_pmu)
return 0;
if (boot_cpu_has(X86_FEATURE_PDCM))
- rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
+ rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) {
x86_perf_get_lbr(&vmx_lbr_caps);
@@ -7919,7 +7969,7 @@ static __init u64 vmx_get_perf_capabilities(void)
if (!vmx_lbr_caps.has_callstack)
memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps));
else if (vmx_lbr_caps.nr)
- perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
+ perf_cap |= host_perf_cap & PERF_CAP_LBR_FMT;
}
if (vmx_pebs_supported()) {
@@ -7987,7 +8037,6 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_UMIP);
/* CPUID 0xD.1 */
- kvm_caps.supported_xss = 0;
if (!cpu_has_vmx_xsaves())
kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
@@ -7999,40 +8048,64 @@ static __init void vmx_set_cpu_caps(void)
if (cpu_has_vmx_waitpkg())
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
+
+ /*
+ * Disable CET if unrestricted_guest is unsupported as KVM doesn't
+ * enforce CET HW behaviors in emulator. On platforms with
+ * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code
+ * fails, so disable CET in this case too.
+ */
+ if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest ||
+ !cpu_has_vmx_basic_no_hw_errcode_cc()) {
+ kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
+ kvm_cpu_cap_clear(X86_FEATURE_IBT);
+ }
}
-static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
- struct x86_instruction_info *info)
+static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ unsigned long *exit_qualification)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
unsigned short port;
- bool intercept;
int size;
+ bool imm;
+
+ /*
+ * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
+ * VM-exits depend on the 'unconditional IO exiting' VM-execution
+ * control.
+ *
+ * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
+ */
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
if (info->intercept == x86_intercept_in ||
info->intercept == x86_intercept_ins) {
port = info->src_val;
size = info->dst_bytes;
+ imm = info->src_type == OP_IMM;
} else {
port = info->dst_val;
size = info->src_bytes;
+ imm = info->dst_type == OP_IMM;
}
- /*
- * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
- * VM-exits depend on the 'unconditional IO exiting' VM-execution
- * control.
- *
- * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
- */
- if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
- intercept = nested_cpu_has(vmcs12,
- CPU_BASED_UNCOND_IO_EXITING);
- else
- intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
- /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
- return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+ *exit_qualification = ((unsigned long)port << 16) | (size - 1);
+
+ if (info->intercept == x86_intercept_ins ||
+ info->intercept == x86_intercept_outs)
+ *exit_qualification |= BIT(4);
+
+ if (info->rep_prefix)
+ *exit_qualification |= BIT(5);
+
+ if (imm)
+ *exit_qualification |= BIT(6);
+
+ return nested_vmx_check_io_bitmaps(vcpu, port, size);
}
int vmx_check_intercept(struct kvm_vcpu *vcpu,
@@ -8041,26 +8114,34 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu,
struct x86_exception *exception)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned long exit_qualification = 0;
+ u32 vm_exit_reason;
+ u64 exit_insn_len;
switch (info->intercept) {
- /*
- * RDPID causes #UD if disabled through secondary execution controls.
- * Because it is marked as EmulateOnUD, we need to intercept it here.
- * Note, RDPID is hidden behind ENABLE_RDTSCP.
- */
case x86_intercept_rdpid:
+ /*
+ * RDPID causes #UD if not enabled through secondary execution
+ * controls (ENABLE_RDTSCP). Note, the implicit MSR access to
+ * TSC_AUX is NOT subject to interception, i.e. checking only
+ * the dedicated execution control is architecturally correct.
+ */
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
exception->vector = UD_VECTOR;
exception->error_code_valid = false;
return X86EMUL_PROPAGATE_FAULT;
}
- break;
+ return X86EMUL_CONTINUE;
case x86_intercept_in:
case x86_intercept_ins:
case x86_intercept_out:
case x86_intercept_outs:
- return vmx_check_intercept_io(vcpu, info);
+ if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification))
+ return X86EMUL_CONTINUE;
+
+ vm_exit_reason = EXIT_REASON_IO_INSTRUCTION;
+ break;
case x86_intercept_lgdt:
case x86_intercept_lidt:
@@ -8073,7 +8154,24 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu,
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
return X86EMUL_CONTINUE;
- /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
+ if (info->intercept == x86_intercept_lldt ||
+ info->intercept == x86_intercept_ltr ||
+ info->intercept == x86_intercept_sldt ||
+ info->intercept == x86_intercept_str)
+ vm_exit_reason = EXIT_REASON_LDTR_TR;
+ else
+ vm_exit_reason = EXIT_REASON_GDTR_IDTR;
+ /*
+ * FIXME: Decode the ModR/M to generate the correct exit
+ * qualification for memory operands.
+ */
+ break;
+
+ case x86_intercept_hlt:
+ if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING))
+ return X86EMUL_CONTINUE;
+
+ vm_exit_reason = EXIT_REASON_HLT;
break;
case x86_intercept_pause:
@@ -8086,17 +8184,24 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu,
* the PAUSE.
*/
if ((info->rep_prefix != REPE_PREFIX) ||
- !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING))
+ !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING))
return X86EMUL_CONTINUE;
+ vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION;
break;
/* TODO: check more intercepts... */
default:
- break;
+ return X86EMUL_UNHANDLEABLE;
}
- return X86EMUL_UNHANDLEABLE;
+ exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip);
+ if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH)
+ return X86EMUL_UNHANDLEABLE;
+
+ __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification,
+ exit_insn_len);
+ return X86EMUL_INTERCEPTED;
}
#ifdef CONFIG_X86_64
@@ -8404,14 +8509,12 @@ __init int vmx_hardware_setup(void)
vmx_setup_user_return_msrs();
- if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
- return -EIO;
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
if (boot_cpu_has(X86_FEATURE_MPX)) {
- rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
+ rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs);
WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
}
@@ -8435,6 +8538,14 @@ __init int vmx_hardware_setup(void)
return -EOPNOTSUPP;
}
+ /*
+ * Shadow paging doesn't have a (further) performance penalty
+ * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it
+ * by default
+ */
+ if (!enable_ept)
+ allow_smaller_maxphyaddr = true;
+
if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
enable_ept_ad_bits = 0;
@@ -8500,6 +8611,8 @@ __init int vmx_hardware_setup(void)
if (enable_ept)
kvm_mmu_set_ept_masks(enable_ept_ad_bits,
cpu_has_vmx_ept_execute_only());
+ else
+ vt_x86_ops.get_mt_mask = NULL;
/*
* Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
@@ -8517,9 +8630,6 @@ __init int vmx_hardware_setup(void)
if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
enable_pml = 0;
- if (!enable_pml)
- vt_x86_ops.cpu_dirty_log_size = 0;
-
if (!cpu_has_vmx_preemption_timer())
enable_preemption_timer = false;
@@ -8561,6 +8671,13 @@ __init int vmx_hardware_setup(void)
setup_default_sgx_lepubkeyhash();
+ vmx_set_cpu_caps();
+
+ /*
+ * Configure nested capabilities after core CPU capabilities so that
+ * nested support can be conditional on base support, e.g. so that KVM
+ * can hide/show features based on kvm_cpu_cap_has().
+ */
if (nested) {
nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept);
@@ -8569,68 +8686,75 @@ __init int vmx_hardware_setup(void)
return r;
}
- vmx_set_cpu_caps();
-
r = alloc_kvm_area();
if (r && nested)
nested_vmx_hardware_unsetup();
kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
- return r;
-}
+ /*
+ * On Intel CPUs that lack self-snoop feature, letting the guest control
+ * memory types may result in unexpected behavior. So always ignore guest
+ * PAT on those CPUs and map VM as writeback, not allowing userspace to
+ * disable the quirk.
+ *
+ * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is
+ * supported, UC is slow enough to cause issues with some older guests (e.g.
+ * an old version of bochs driver uses ioremap() instead of ioremap_wc() to
+ * map the video RAM, causing wayland desktop to fail to get started
+ * correctly). To avoid breaking those older guests that rely on KVM to force
+ * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the
+ * safer (for performance) default behavior.
+ *
+ * On top of this, non-coherent DMA devices need the guest to flush CPU
+ * caches properly. This also requires honoring guest PAT, and is forced
+ * independent of the quirk in vmx_ignore_guest_pat().
+ */
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
-static void vmx_cleanup_l1d_flush(void)
-{
- if (vmx_l1d_flush_pages) {
- free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
- vmx_l1d_flush_pages = NULL;
- }
- /* Restore state so sysfs ignores VMX */
- l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+ kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
+
+ return r;
}
-static void __vmx_exit(void)
+void vmx_exit(void)
{
allow_smaller_maxphyaddr = false;
vmx_cleanup_l1d_flush();
-}
-static void vmx_exit(void)
-{
- kvm_exit();
- __vmx_exit();
kvm_x86_vendor_exit();
-
}
-module_exit(vmx_exit);
-static int __init vmx_init(void)
+int __init vmx_init(void)
{
int r, cpu;
+ KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx);
+
if (!kvm_is_vmx_supported())
return -EOPNOTSUPP;
/*
- * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing
- * to unwind if a later step fails.
+ * Note, VMCS and eVMCS configuration only touch VMX knobs/variables,
+ * i.e. there's nothing to unwind if a later step fails.
*/
hv_init_evmcs();
+ /*
+ * Parse the VMCS config and VMX capabilities before anything else, so
+ * that the information is available to all setup flows.
+ */
+ if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
+ return -EIO;
+
r = kvm_x86_vendor_init(&vt_init_ops);
if (r)
return r;
- /*
- * Must be called after common x86 init so enable_ept is properly set
- * up. Hand the parameter mitigation value in which was stored in
- * the pre module init parser. If no parameter was given, it will
- * contain 'auto' which will be turned into the default 'cond'
- * mitigation mode.
- */
- r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+ /* Must be called after common x86 init so enable_ept is setup. */
+ r = vmx_setup_l1d_flush();
if (r)
goto err_l1d_flush;
@@ -8642,29 +8766,9 @@ static int __init vmx_init(void)
vmx_check_vmcs12_offsets();
- /*
- * Shadow paging doesn't have a (further) performance penalty
- * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it
- * by default
- */
- if (!enable_ept)
- allow_smaller_maxphyaddr = true;
-
- /*
- * Common KVM initialization _must_ come last, after this, /dev/kvm is
- * exposed to userspace!
- */
- r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
- THIS_MODULE);
- if (r)
- goto err_kvm_init;
-
return 0;
-err_kvm_init:
- __vmx_exit();
err_l1d_flush:
kvm_x86_vendor_exit();
return r;
}
-module_init(vmx_init);