summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/vmx/vmx.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/vmx/vmx.c')
-rw-r--r--arch/x86/kvm/vmx/vmx.c4893
1 files changed, 2828 insertions, 2065 deletions
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 927a552393b9..4cbe8c84b636 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -12,6 +12,7 @@
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/highmem.h>
#include <linux/hrtimer.h>
@@ -27,7 +28,6 @@
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>
-#include <linux/entry-kvm.h>
#include <asm/apic.h>
#include <asm/asm.h>
@@ -35,22 +35,26 @@
#include <asm/cpu_device_id.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/xstate.h>
+#include <asm/fred.h>
#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
-#include <asm/kexec.h>
+#include <asm/reboot.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
+#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/spec-ctrl.h>
-#include <asm/virtext.h>
#include <asm/vmx.h>
+#include <trace/events/ipi.h>
+
#include "capabilities.h"
+#include "common.h"
#include "cpuid.h"
-#include "evmcs.h"
#include "hyperv.h"
#include "kvm_onhyperv.h"
#include "irq.h"
@@ -65,8 +69,15 @@
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"
+#include "x86_ops.h"
+#include "smm.h"
+#include "vmx_onhyperv.h"
+#include "posted_intr.h"
+
+#include "mmu/spte.h"
MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
MODULE_LICENSE("GPL");
#ifdef MODULE
@@ -81,28 +92,31 @@ bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);
static bool __read_mostly enable_vnmi = 1;
-module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
+module_param_named(vnmi, enable_vnmi, bool, 0444);
bool __read_mostly flexpriority_enabled = 1;
-module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
+module_param_named(flexpriority, flexpriority_enabled, bool, 0444);
bool __read_mostly enable_ept = 1;
-module_param_named(ept, enable_ept, bool, S_IRUGO);
+module_param_named(ept, enable_ept, bool, 0444);
bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
- enable_unrestricted_guest, bool, S_IRUGO);
+ enable_unrestricted_guest, bool, 0444);
bool __read_mostly enable_ept_ad_bits = 1;
-module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
+module_param_named(eptad, enable_ept_ad_bits, bool, 0444);
static bool __read_mostly emulate_invalid_guest_state = true;
-module_param(emulate_invalid_guest_state, bool, S_IRUGO);
+module_param(emulate_invalid_guest_state, bool, 0444);
static bool __read_mostly fasteoi = 1;
-module_param(fasteoi, bool, S_IRUGO);
+module_param(fasteoi, bool, 0444);
-module_param(enable_apicv, bool, S_IRUGO);
+module_param(enable_apicv, bool, 0444);
+module_param(enable_ipiv, bool, 0444);
+
+module_param(enable_device_posted_irqs, bool, 0444);
/*
* If nested=1, nested virtualization is supported, i.e., guests may use
@@ -110,10 +124,13 @@ module_param(enable_apicv, bool, S_IRUGO);
* use VMX instructions.
*/
static bool __read_mostly nested = 1;
-module_param(nested, bool, S_IRUGO);
+module_param(nested, bool, 0444);
bool __read_mostly enable_pml = 1;
-module_param_named(pml, enable_pml, bool, S_IRUGO);
+module_param_named(pml, enable_pml, bool, 0444);
+
+static bool __read_mostly error_on_inconsistent_vmcs_config = true;
+module_param(error_on_inconsistent_vmcs_config, bool, 0444);
static bool __read_mostly dump_invalid_vmcs = 0;
module_param(dump_invalid_vmcs, bool, 0644);
@@ -136,8 +153,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON \
- (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
- X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
+ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
@@ -151,28 +167,6 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
RTIT_STATUS_BYTECNT))
/*
- * List of MSRs that can be directly passed to the guest.
- * In addition to these x2apic and PT MSRs are handled specially.
- */
-static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
- MSR_IA32_SPEC_CTRL,
- MSR_IA32_PRED_CMD,
- MSR_IA32_TSC,
-#ifdef CONFIG_X86_64
- MSR_FS_BASE,
- MSR_GS_BASE,
- MSR_KERNEL_GS_BASE,
-#endif
- MSR_IA32_SYSENTER_CS,
- MSR_IA32_SYSENTER_ESP,
- MSR_IA32_SYSENTER_EIP,
- MSR_CORE_C1_RES,
- MSR_CORE_C3_RESIDENCY,
- MSR_CORE_C6_RESIDENCY,
- MSR_CORE_C7_RESIDENCY,
-};
-
-/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* ple_gap: upper bound on the amount of time between two successive
* executions of PAUSE in a loop. Also indicate if ple enabled.
@@ -201,10 +195,15 @@ module_param(ple_window_shrink, uint, 0444);
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);
-/* Default is SYSTEM mode, 1 for host-guest mode */
+/* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
+#ifdef CONFIG_BROKEN
module_param(pt_mode, int, S_IRUGO);
+#endif
+struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;
+
+#ifdef CONFIG_CPU_MITIGATIONS
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);
@@ -227,7 +226,7 @@ static const struct {
#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;
-static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
+static int __vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
struct page *page;
unsigned int i;
@@ -242,14 +241,9 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
return 0;
}
- if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
- u64 msr;
-
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
- if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
- l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
- return 0;
- }
+ if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+ return 0;
}
/* If set to auto use the default l1tf mitigation method */
@@ -258,6 +252,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
case L1TF_MITIGATION_OFF:
l1tf = VMENTER_L1D_FLUSH_NEVER;
break;
+ case L1TF_MITIGATION_AUTO:
case L1TF_MITIGATION_FLUSH_NOWARN:
case L1TF_MITIGATION_FLUSH:
case L1TF_MITIGATION_FLUSH_NOSMT:
@@ -308,6 +303,26 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
return 0;
}
+static int vmx_setup_l1d_flush(void)
+{
+ /*
+ * Hand the parameter mitigation value in which was stored in the pre
+ * module init parser. If no parameter was given, it will contain
+ * 'auto' which will be turned into the default 'cond' mitigation mode.
+ */
+ return __vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+}
+
+static void vmx_cleanup_l1d_flush(void)
+{
+ if (vmx_l1d_flush_pages) {
+ free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
+ vmx_l1d_flush_pages = NULL;
+ }
+ /* Restore state so sysfs ignores VMX */
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+}
+
static int vmentry_l1d_flush_parse(const char *s)
{
unsigned int i;
@@ -345,7 +360,7 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
}
mutex_lock(&vmx_l1d_flush_mutex);
- ret = vmx_setup_l1d_flush(l1tf);
+ ret = __vmx_setup_l1d_flush(l1tf);
mutex_unlock(&vmx_l1d_flush_mutex);
return ret;
}
@@ -353,17 +368,158 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
- return sprintf(s, "???\n");
+ return sysfs_emit(s, "???\n");
- return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
+ return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}
+/*
+ * Software based L1D cache flush which is used when microcode providing
+ * the cache control MSR is not loaded.
+ *
+ * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
+ * flush it is required to read in 64 KiB because the replacement algorithm
+ * is not exactly LRU. This could be sized at runtime via topology
+ * information but as all relevant affected CPUs have 32KiB L1D cache size
+ * there is no point in doing so.
+ */
+static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ if (!static_branch_unlikely(&vmx_l1d_should_flush))
+ return;
+
+ /*
+ * This code is only executed when the flush mode is 'cond' or
+ * 'always'
+ */
+ if (static_branch_likely(&vmx_l1d_flush_cond)) {
+ /*
+ * Clear the per-cpu flush bit, it gets set again if the vCPU
+ * is reloaded, i.e. if the vCPU is scheduled out or if KVM
+ * exits to userspace, or if KVM reaches one of the unsafe
+ * VMEXIT handlers, e.g. if KVM calls into the emulator,
+ * or from the interrupt handlers.
+ */
+ if (!kvm_get_cpu_l1tf_flush_l1d())
+ return;
+ kvm_clear_cpu_l1tf_flush_l1d();
+ }
+
+ vcpu->stat.l1d_flush++;
+
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ return;
+ }
+
+ asm volatile(
+ /* First ensure the pages are in the TLB */
+ "xorl %%eax, %%eax\n"
+ ".Lpopulate_tlb:\n\t"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $4096, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lpopulate_tlb\n\t"
+ "xorl %%eax, %%eax\n\t"
+ "cpuid\n\t"
+ /* Now fill the cache */
+ "xorl %%eax, %%eax\n"
+ ".Lfill_cache:\n"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $64, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lfill_cache\n\t"
+ "lfence\n"
+ :: [flush_pages] "r" (vmx_l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ebx", "ecx", "edx");
+}
+
+#else /* CONFIG_CPU_MITIGATIONS*/
+static int vmx_setup_l1d_flush(void)
+{
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NEVER;
+ return 0;
+}
+static void vmx_cleanup_l1d_flush(void)
+{
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+}
+static __always_inline void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+
+}
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+ pr_warn_once("Kernel compiled without mitigations, ignoring vmentry_l1d_flush\n");
+ return 0;
+}
+static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
+{
+ return sysfs_emit(s, "never\n");
+}
+#endif
+
static const struct kernel_param_ops vmentry_l1d_flush_ops = {
.set = vmentry_l1d_flush_set,
.get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
+static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
+{
+ u64 msr;
+
+ if (!vmx->disable_fb_clear)
+ return;
+
+ msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL);
+ msr |= FB_CLEAR_DIS;
+ native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr);
+ /* Cache the MSR value to avoid reading it later */
+ vmx->msr_ia32_mcu_opt_ctrl = msr;
+}
+
+static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
+{
+ if (!vmx->disable_fb_clear)
+ return;
+
+ vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
+ native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
+}
+
+static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
+{
+ /*
+ * Disable VERW's behavior of clearing CPU buffers for the guest if the
+ * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled
+ * the mitigation. Disabling the clearing behavior provides a
+ * performance boost for guests that aren't aware that manually clearing
+ * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry
+ * and VM-Exit.
+ */
+ vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
+ (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
+ !boot_cpu_has_bug(X86_BUG_MDS) &&
+ !boot_cpu_has_bug(X86_BUG_TAA);
+
+ /*
+ * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
+ * at VMEntry. Skip the MSR read/write when a guest has no use case to
+ * execute VERW.
+ */
+ if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
+ ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
+ vmx->disable_fb_clear = false;
+}
+
static u32 vmx_segment_access_rights(struct kvm_segment *var);
void vmx_vmexit(void);
@@ -374,40 +530,51 @@ do { \
pr_warn_ratelimited(fmt); \
} while (0)
-asmlinkage void vmread_error(unsigned long field, bool fault)
+noinline void vmread_error(unsigned long field)
{
- if (fault)
+ vmx_insn_failed("vmread failed: field=%lx\n", field);
+}
+
+#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+noinstr void vmread_error_trampoline2(unsigned long field, bool fault)
+{
+ if (fault) {
kvm_spurious_fault();
- else
- vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
+ } else {
+ instrumentation_begin();
+ vmread_error(field);
+ instrumentation_end();
+ }
}
+#endif
noinline void vmwrite_error(unsigned long field, unsigned long value)
{
- vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
+ vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
- vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
+ vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
+ vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
- vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
+ vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
+ vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
- vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
+ vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
ext, vpid, gva);
}
-noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
+noinline void invept_error(unsigned long ext, u64 eptp)
{
- vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
- ext, eptp, gpa);
+ vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp);
}
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
@@ -421,8 +588,8 @@ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
-struct vmcs_config vmcs_config;
-struct vmx_capability vmx_capability;
+struct vmcs_config vmcs_config __ro_after_init;
+struct vmx_capability vmx_capability __ro_after_init;
#define VMX_SEGMENT_FIELD(seg) \
[VCPU_SREG_##seg] = { \
@@ -448,10 +615,6 @@ static const struct kvm_vmx_segment_field {
VMX_SEGMENT_FIELD(LDTR),
};
-static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
-{
- vmx->segment_cache.bitmask = 0;
-}
static unsigned long host_idt_base;
@@ -459,31 +622,87 @@ static unsigned long host_idt_base;
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
-static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
+static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
{
struct hv_enlightened_vmcs *evmcs;
- struct hv_partition_assist_pg **p_hv_pa_pg =
- &to_kvm_hv(vcpu->kvm)->hv_pa_pg;
- /*
- * Synthetic VM-Exit is not enabled in current code and so All
- * evmcs in singe VM shares same assist page.
- */
- if (!*p_hv_pa_pg)
- *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
+ hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu);
- if (!*p_hv_pa_pg)
+ if (partition_assist_page == INVALID_PAGE)
return -ENOMEM;
evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
- evmcs->partition_assist_page =
- __pa(*p_hv_pa_pg);
+ evmcs->partition_assist_page = partition_assist_page;
evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
return 0;
}
+static __init void hv_init_evmcs(void)
+{
+ int cpu;
+
+ if (!enlightened_vmcs)
+ return;
+
+ /*
+ * Enlightened VMCS usage should be recommended and the host needs
+ * to support eVMCS v1 or above.
+ */
+ if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
+ (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
+ KVM_EVMCS_VERSION) {
+
+ /* Check that we have assist pages on all online CPUs */
+ for_each_online_cpu(cpu) {
+ if (!hv_get_vp_assist_page(cpu)) {
+ enlightened_vmcs = false;
+ break;
+ }
+ }
+
+ if (enlightened_vmcs) {
+ pr_info("Using Hyper-V Enlightened VMCS\n");
+ static_branch_enable(&__kvm_is_using_evmcs);
+ }
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
+ vt_x86_ops.enable_l2_tlb_flush
+ = hv_enable_l2_tlb_flush;
+ } else {
+ enlightened_vmcs = false;
+ }
+}
+
+static void hv_reset_evmcs(void)
+{
+ struct hv_vp_assist_page *vp_ap;
+
+ if (!kvm_is_using_evmcs())
+ return;
+
+ /*
+ * KVM should enable eVMCS if and only if all CPUs have a VP assist
+ * page, and should reject CPU onlining if eVMCS is enabled the CPU
+ * doesn't have a VP assist page allocated.
+ */
+ vp_ap = hv_get_vp_assist_page(smp_processor_id());
+ if (WARN_ON_ONCE(!vp_ap))
+ return;
+
+ /*
+ * Reset everything to support using non-enlightened VMCS access later
+ * (e.g. when we reload the module with enlightened_vmcs=0)
+ */
+ vp_ap->nested_control.features.directhypercall = 0;
+ vp_ap->current_nested_vmcs = 0;
+ vp_ap->enlighten_vmentry = 0;
+}
+
+#else /* IS_ENABLED(CONFIG_HYPERV) */
+static void hv_init_evmcs(void) {}
+static void hv_reset_evmcs(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
/*
@@ -539,54 +758,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
return flexpriority_enabled && lapic_in_kernel(vcpu);
}
-static inline bool report_flexpriority(void)
-{
- return flexpriority_enabled;
-}
-
-static int possible_passthrough_msr_slot(u32 msr)
-{
- u32 i;
-
- for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
- if (vmx_possible_passthrough_msrs[i] == msr)
- return i;
-
- return -ENOENT;
-}
-
-static bool is_valid_passthrough_msr(u32 msr)
-{
- bool r;
-
- switch (msr) {
- case 0x800 ... 0x8ff:
- /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
- return true;
- case MSR_IA32_RTIT_STATUS:
- case MSR_IA32_RTIT_OUTPUT_BASE:
- case MSR_IA32_RTIT_OUTPUT_MASK:
- case MSR_IA32_RTIT_CR3_MATCH:
- case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
- /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
- case MSR_LBR_SELECT:
- case MSR_LBR_TOS:
- case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
- case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
- case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
- case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
- case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
- /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
- return true;
- }
-
- r = possible_passthrough_msr_slot(msr) != -ENOENT;
-
- WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
-
- return r;
-}
-
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
int i;
@@ -603,29 +774,64 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
unsigned int slot = msr - vmx->guest_uret_msrs;
int ret = 0;
- u64 old_msr_data = msr->data;
- msr->data = data;
if (msr->load_into_hardware) {
preempt_disable();
- ret = kvm_set_user_return_msr(slot, msr->data, msr->mask);
+ ret = kvm_set_user_return_msr(slot, data, msr->mask);
preempt_enable();
- if (ret)
- msr->data = old_msr_data;
}
+ if (!ret)
+ msr->data = data;
return ret;
}
-#ifdef CONFIG_KEXEC_CORE
-static void crash_vmclear_local_loaded_vmcss(void)
+/*
+ * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
+ *
+ * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
+ * atomically track post-VMXON state, e.g. this may be called in NMI context.
+ * Eat all faults as all other faults on VMXOFF faults are mode related, i.e.
+ * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
+ * magically in RM, VM86, compat mode, or at CPL>0.
+ */
+static int kvm_cpu_vmxoff(void)
+{
+ asm goto("1: vmxoff\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ ::: "cc", "memory" : fault);
+
+ cr4_clear_bits(X86_CR4_VMXE);
+ return 0;
+
+fault:
+ cr4_clear_bits(X86_CR4_VMXE);
+ return -EIO;
+}
+
+void vmx_emergency_disable_virtualization_cpu(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v;
+ kvm_rebooting = true;
+
+ /*
+ * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
+ * set in task context. If this races with VMX is disabled by an NMI,
+ * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to
+ * kvm_rebooting set.
+ */
+ if (!(__read_cr4() & X86_CR4_VMXE))
+ return;
+
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
- loaded_vmcss_on_cpu_link)
+ loaded_vmcss_on_cpu_link) {
vmcs_clear(v->vmcs);
+ if (v->shadow_vmcs)
+ vmcs_clear(v->shadow_vmcs);
+ }
+
+ kvm_cpu_vmxoff();
}
-#endif /* CONFIG_KEXEC_CORE */
static void __loaded_vmcs_clear(void *arg)
{
@@ -645,10 +851,10 @@ static void __loaded_vmcs_clear(void *arg)
/*
* Ensure all writes to loaded_vmcs, including deleting it from its
- * current percpu list, complete before setting loaded_vmcs->vcpu to
- * -1, otherwise a different cpu can see vcpu == -1 first and add
- * loaded_vmcs to its percpu list before it's deleted from this cpu's
- * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
+ * current percpu list, complete before setting loaded_vmcs->cpu to
+ * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
+ * and add loaded_vmcs to its percpu list before it's deleted from this
+ * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
*/
smp_wmb();
@@ -656,7 +862,7 @@ static void __loaded_vmcs_clear(void *arg)
loaded_vmcs->launched = 0;
}
-void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
+static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
int cpu = loaded_vmcs->cpu;
@@ -723,6 +929,12 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
(1u << DB_VECTOR) | (1u << AC_VECTOR);
/*
+ * #VE isn't used for VMX. To test against unexpected changes
+ * related to #VE for VMX, intercept unexpected #VE and warn on it.
+ */
+ if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+ eb |= 1u << VE_VECTOR;
+ /*
* Guest access to VMware backdoor ports could legitimately
* trigger #GP because of TSS I/O permission bitmap.
* We intercept those #GP and allow access to them anyway
@@ -746,7 +958,7 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
*/
if (is_guest_mode(vcpu))
eb |= get_vmcs12(vcpu)->exception_bitmap;
- else {
+ else {
int mask = 0, match = 0;
if (enable_ept && (eb & (1u << PF_VECTOR))) {
@@ -764,33 +976,51 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
}
+ /*
+ * Disabling xfd interception indicates that dynamic xfeatures
+ * might be used in the guest. Always trap #NM in this case
+ * to save guest xfd_err timely.
+ */
+ if (vcpu->arch.xfd_no_write_intercept)
+ eb |= (1u << NM_VECTOR);
+
vmcs_write32(EXCEPTION_BITMAP, eb);
}
/*
* Check if MSR is intercepted for currently loaded MSR bitmap.
*/
-static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
{
- unsigned long *msr_bitmap;
- int f = sizeof(unsigned long);
-
- if (!cpu_has_vmx_msr_bitmap())
+ if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
return true;
- msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
+ return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);
+}
- if (msr <= 0x1fff) {
- return !!test_bit(msr, msr_bitmap + 0x800 / f);
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- return !!test_bit(msr, msr_bitmap + 0xc00 / f);
- }
+unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
+{
+ unsigned int flags = 0;
- return true;
+ if (vmx->loaded_vmcs->launched)
+ flags |= VMX_RUN_VMRESUME;
+
+ /*
+ * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
+ * to change it directly without causing a vmexit. In that case read
+ * it after vmexit and store it in vmx->spec_ctrl.
+ */
+ if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
+ flags |= VMX_RUN_SAVE_SPEC_CTRL;
+
+ if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) &&
+ kvm_vcpu_can_access_host_mmio(&vmx->vcpu))
+ flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;
+
+ return flags;
}
-static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
unsigned long entry, unsigned long exit)
{
vm_entry_controls_clearbit(vmx, entry);
@@ -848,7 +1078,7 @@ skip_guest:
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}
-static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
unsigned long entry, unsigned long exit,
unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
u64 guest_val, u64 host_val)
@@ -895,7 +1125,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
* provide that period, so a CPU could write host's record into
* guest's memory.
*/
- wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+ wrmsrq(MSR_IA32_PEBS_ENABLE, 0);
}
i = vmx_find_loadstore_msr_slot(&m->guest, msr);
@@ -953,12 +1183,12 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
* atomically, since it's faster than switching it manually.
*/
if (cpu_has_load_ia32_efer() ||
- (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
+ (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
if (!(guest_efer & EFER_LMA))
guest_efer &= ~EFER_LME;
- if (guest_efer != host_efer)
+ if (guest_efer != kvm_host.efer)
add_atomic_switch_msr(vmx, MSR_EFER,
- guest_efer, host_efer, false);
+ guest_efer, kvm_host.efer, false);
else
clear_atomic_switch_msr(vmx, MSR_EFER);
return false;
@@ -971,7 +1201,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
clear_atomic_switch_msr(vmx, MSR_EFER);
guest_efer &= ~ignore_bits;
- guest_efer |= host_efer & ignore_bits;
+ guest_efer |= kvm_host.efer & ignore_bits;
vmx->guest_uret_msrs[i].data = guest_efer;
vmx->guest_uret_msrs[i].mask = ~ignore_bits;
@@ -1024,13 +1254,13 @@ static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
u32 i;
- wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
- wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
- wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
- wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+ wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
+ wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+ wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+ wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
for (i = 0; i < addr_range; i++) {
- wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
- wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+ wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+ wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
}
}
@@ -1038,13 +1268,13 @@ static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
{
u32 i;
- rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
- rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
- rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
- rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+ rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
+ rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+ rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+ rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
for (i = 0; i < addr_range; i++) {
- rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
- rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+ rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+ rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
}
}
@@ -1057,11 +1287,11 @@ static void pt_guest_enter(struct vcpu_vmx *vmx)
* GUEST_IA32_RTIT_CTL is already set in the VMCS.
* Save host state before VM entry.
*/
- rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+ rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
- wrmsrl(MSR_IA32_RTIT_CTL, 0);
- pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
- pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
+ wrmsrq(MSR_IA32_RTIT_CTL, 0);
+ pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
+ pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
}
}
@@ -1071,12 +1301,16 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
return;
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
- pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
- pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
+ pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
+ pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
}
- /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
- wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+ /*
+ * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
+ * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary.
+ */
+ if (vmx->pt_desc.host.ctl)
+ wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
@@ -1109,6 +1343,7 @@ void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
int cpu = raw_smp_processor_id();
@@ -1117,8 +1352,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
u16 fs_sel, gs_sel;
int i;
- vmx->req_immediate_exit = false;
-
/*
* Note that guest MSRs to be saved/restored can also be changed
* when guest state is loaded. This happens when guest transitions
@@ -1136,10 +1369,10 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
}
}
- if (vmx->nested.need_vmcs12_to_shadow_sync)
+ if (vmx->nested.need_vmcs12_to_shadow_sync)
nested_sync_vmcs12_to_shadow(vcpu);
- if (vmx->guest_state_loaded)
+ if (vt->guest_state_loaded)
return;
host_state = &vmx->loaded_vmcs->host_state;
@@ -1160,15 +1393,15 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
fs_sel = current->thread.fsindex;
gs_sel = current->thread.gsindex;
fs_base = current->thread.fsbase;
- vmx->msr_host_kernel_gs_base = current->thread.gsbase;
+ vt->msr_host_kernel_gs_base = current->thread.gsbase;
} else {
savesegment(fs, fs_sel);
savesegment(gs, gs_sel);
fs_base = read_msr(MSR_FS_BASE);
- vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+ vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
}
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
savesegment(fs, fs_sel);
savesegment(gs, gs_sel);
@@ -1177,14 +1410,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
#endif
vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
- vmx->guest_state_loaded = true;
+ vt->guest_state_loaded = true;
}
static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
struct vmcs_host_state *host_state;
- if (!vmx->guest_state_loaded)
+ if (!vmx->vt.guest_state_loaded)
return;
host_state = &vmx->loaded_vmcs->host_state;
@@ -1192,7 +1425,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
++vmx->vcpu.stat.host_state_reload;
#ifdef CONFIG_X86_64
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
kvm_load_ldt(host_state->ldt_sel);
@@ -1212,35 +1445,79 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
#endif
invalidate_tss_limit();
#ifdef CONFIG_X86_64
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base);
#endif
load_fixmap_gdt(raw_smp_processor_id());
- vmx->guest_state_loaded = false;
+ vmx->vt.guest_state_loaded = false;
vmx->guest_uret_msrs_loaded = false;
}
#ifdef CONFIG_X86_64
-static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
+static u64 vmx_read_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 *cache)
{
preempt_disable();
- if (vmx->guest_state_loaded)
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ if (vmx->vt.guest_state_loaded)
+ *cache = read_msr(msr);
preempt_enable();
- return vmx->msr_guest_kernel_gs_base;
+ return *cache;
}
-static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
+static void vmx_write_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 data,
+ u64 *cache)
{
preempt_disable();
- if (vmx->guest_state_loaded)
- wrmsrl(MSR_KERNEL_GS_BASE, data);
+ if (vmx->vt.guest_state_loaded)
+ wrmsrns(msr, data);
preempt_enable();
- vmx->msr_guest_kernel_gs_base = data;
+ *cache = data;
+}
+
+static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
+{
+ return vmx_read_guest_host_msr(vmx, MSR_KERNEL_GS_BASE,
+ &vmx->msr_guest_kernel_gs_base);
+}
+
+static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
+{
+ vmx_write_guest_host_msr(vmx, MSR_KERNEL_GS_BASE, data,
+ &vmx->msr_guest_kernel_gs_base);
}
#endif
-void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
- struct loaded_vmcs *buddy)
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned int old = vmx->ple_window;
+
+ vmx->ple_window = __grow_ple_window(old, ple_window,
+ ple_window_grow,
+ ple_window_max);
+
+ if (vmx->ple_window != old) {
+ vmx->ple_window_dirty = true;
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
+ }
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned int old = vmx->ple_window;
+
+ vmx->ple_window = __shrink_ple_window(old, ple_window,
+ ple_window_shrink,
+ ple_window);
+
+ if (vmx->ple_window != old) {
+ vmx->ple_window_dirty = true;
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
+ }
+}
+
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
@@ -1267,19 +1544,10 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
if (prev != vmx->loaded_vmcs->vmcs) {
per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
vmcs_load(vmx->loaded_vmcs->vmcs);
-
- /*
- * No indirect branch prediction barrier needed when switching
- * the active VMCS within a guest, e.g. on nested VM-Enter.
- * The L1 VMM can protect itself with retpolines, IBPB or IBRS.
- */
- if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
- indirect_branch_prediction_barrier();
}
if (!already_loaded) {
void *gdt = get_current_gdt_ro();
- unsigned long sysenter_esp;
/*
* Flush all EPTP/VPID contexts, the new pCPU may have stale
@@ -1295,8 +1563,11 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
(unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
- rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
- vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
+ /* 22.2.3 */
+ vmcs_writel(HOST_IA32_SYSENTER_ESP,
+ (unsigned long)(cpu_entry_stack(cpu) + 1));
+ }
vmx->loaded_vmcs->cpu = cpu;
}
@@ -1306,25 +1577,24 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
*/
-static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
+ shrink_ple_window(vcpu);
- vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
+ vmx_vcpu_load_vmcs(vcpu, cpu);
vmx_vcpu_pi_load(vcpu, cpu);
-
- vmx->host_debugctlmsr = get_debugctlmsr();
}
-static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
vmx_vcpu_pi_put(vcpu);
vmx_prepare_switch_to_host(to_vmx(vcpu));
}
-static bool emulation_required(struct kvm_vcpu *vcpu)
+bool vmx_emulation_required(struct kvm_vcpu *vcpu)
{
return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
}
@@ -1352,6 +1622,11 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long old_rflags;
+ /*
+ * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
+ * is an unrestricted guest in order to mark L2 as needing emulation
+ * if L1 runs L2 as a restricted guest.
+ */
if (is_unrestricted_guest(vcpu)) {
kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
vmx->rflags = rflags;
@@ -1368,7 +1643,12 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmcs_writel(GUEST_RFLAGS, rflags);
if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
- vmx->emulation_required = emulation_required(vcpu);
+ vmx->vt.emulation_required = vmx_emulation_required(vcpu);
+}
+
+bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
}
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
@@ -1417,7 +1697,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
* result in a #GP unless the same write also clears TraceEn.
*/
if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
- ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
+ (data & RTIT_CTL_TRACEEN) &&
+ data != vmx->pt_desc.guest.ctl)
return 1;
/*
@@ -1457,40 +1738,47 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
* cause a #GP fault.
*/
value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
return 1;
return 0;
}
-static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
+int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
/*
* Emulation of instructions in SGX enclaves is impossible as RIP does
- * not point tthe failing instruction, and even if it did, the code
+ * not point at the failing instruction, and even if it did, the code
* stream is inaccessible. Inject #UD instead of exiting to userspace
* so that guest userspace can't DoS the guest simply by triggering
* emulation (enclaves are CPL3 only).
*/
- if (to_vmx(vcpu)->exit_reason.enclave_mode) {
+ if (vmx_get_exit_reason(vcpu).enclave_mode) {
kvm_queue_exception(vcpu, UD_VECTOR);
- return false;
+ return X86EMUL_PROPAGATE_FAULT;
}
- return true;
+
+ /* Check that emulation is possible during event vectoring */
+ if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ !kvm_can_emulate_event_vectoring(emul_type))
+ return X86EMUL_UNHANDLEABLE_VECTORING;
+
+ return X86EMUL_CONTINUE;
}
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
- union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
+ union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
unsigned long rip, orig_rip;
u32 instr_len;
@@ -1525,8 +1813,8 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
if (!instr_len)
goto rip_updated;
- WARN(exit_reason.enclave_mode,
- "KVM: skipping instruction after SGX enclave VM-Exit");
+ WARN_ONCE(exit_reason.enclave_mode,
+ "skipping instruction after SGX enclave VM-Exit");
orig_rip = kvm_rip_read(vcpu);
rip = orig_rip + instr_len;
@@ -1556,7 +1844,7 @@ rip_updated:
* Recognizes a pending MTF VM-exit and records the nested state for later
* delivery.
*/
-static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
+void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1566,20 +1854,28 @@ static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
/*
* Per the SDM, MTF takes priority over debug-trap exceptions besides
- * T-bit traps. As instruction emulation is completed (i.e. at the
- * instruction boundary), any #DB exception pending delivery must be a
- * debug-trap. Record the pending MTF state to be delivered in
+ * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps
+ * or ICEBP (in the emulator proper), and skipping of ICEBP after an
+ * intercepted #DB deliberately avoids single-step #DB and MTF updates
+ * as ICEBP is higher priority than both. As instruction emulation is
+ * completed at this point (i.e. KVM is at the instruction boundary),
+ * any #DB exception pending delivery must be a debug-trap of lower
+ * priority than MTF. Record the pending MTF state to be delivered in
* vmx_check_nested_events().
*/
if (nested_cpu_has_mtf(vmcs12) &&
(!vcpu->arch.exception.pending ||
- vcpu->arch.exception.nr == DB_VECTOR))
+ vcpu->arch.exception.vector == DB_VECTOR) &&
+ (!vcpu->arch.exception_vmexit.pending ||
+ vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
vmx->nested.mtf_pending = true;
- else
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ } else {
vmx->nested.mtf_pending = false;
+ }
}
-static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
vmx_update_emulated_instruction(vcpu);
return skip_emulated_instruction(vcpu);
@@ -1598,32 +1894,40 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}
-static void vmx_queue_exception(struct kvm_vcpu *vcpu)
+void vmx_inject_exception(struct kvm_vcpu *vcpu)
{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception;
+ u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned nr = vcpu->arch.exception.nr;
- bool has_error_code = vcpu->arch.exception.has_error_code;
- u32 error_code = vcpu->arch.exception.error_code;
- u32 intr_info = nr | INTR_INFO_VALID_MASK;
- kvm_deliver_exception_payload(vcpu);
+ kvm_deliver_exception_payload(vcpu, ex);
- if (has_error_code) {
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+ if (ex->has_error_code) {
+ /*
+ * Despite the error code being architecturally defined as 32
+ * bits, and the VMCS field being 32 bits, Intel CPUs and thus
+ * VMX don't actually supporting setting bits 31:16. Hardware
+ * will (should) never provide a bogus error code, but AMD CPUs
+ * do generate error codes with bits 31:16 set, and so KVM's
+ * ABI lets userspace shove in arbitrary 32-bit values. Drop
+ * the upper bits to avoid VM-Fail, losing information that
+ * doesn't really exist is preferable to killing the VM.
+ */
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
if (vmx->rmode.vm86_active) {
int inc_eip = 0;
- if (kvm_exception_is_soft(nr))
+ if (kvm_exception_is_soft(ex->vector))
inc_eip = vcpu->arch.event_exit_inst_len;
- kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
+ kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
return;
}
- WARN_ON_ONCE(vmx->emulation_required);
+ WARN_ON_ONCE(vmx->vt.emulation_required);
- if (kvm_exception_is_soft(nr)) {
+ if (kvm_exception_is_soft(ex->vector)) {
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
vmx->vcpu.arch.event_exit_inst_len);
intr_info |= INTR_TYPE_SOFT_EXCEPTION;
@@ -1648,11 +1952,12 @@ static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
}
/*
- * Set up the vmcs to automatically save and restore system
- * msrs. Don't touch the 64-bit msrs if the guest is in legacy
- * mode, as fiddling with msrs is very expensive.
+ * Configuring user return MSRs to automatically save, load, and restore MSRs
+ * that need to be shoved into hardware when running the guest. Note, omitting
+ * an MSR here does _NOT_ mean it's not emulated, only that it will not be
+ * loaded into hardware when running the guest.
*/
-static void setup_msrs(struct vcpu_vmx *vmx)
+static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
{
#ifdef CONFIG_X86_64
bool load_syscall_msrs;
@@ -1671,8 +1976,8 @@ static void setup_msrs(struct vcpu_vmx *vmx)
vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
- guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
- guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
+ guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID));
/*
* hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
@@ -1682,9 +1987,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
*/
vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
- if (cpu_has_vmx_msr_bitmap())
- vmx_update_msr_bitmap(&vmx->vcpu);
-
/*
* The set of MSRs to load may have changed, reload MSRs before the
* next VM-Enter.
@@ -1710,59 +2012,75 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
return vmcs12->tsc_multiplier;
- return kvm_default_tsc_scaling_ratio;
+ return kvm_caps.default_tsc_scaling_ratio;
}
-static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
{
- vmcs_write64(TSC_OFFSET, offset);
+ vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
}
-static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
{
- vmcs_write64(TSC_MULTIPLIER, multiplier);
+ vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
}
/*
- * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
- * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
- * all guests if the "nested" module option is off, and can also be disabled
- * for a single guest by disabling its VMX cpuid bit.
+ * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
+ * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain
+ * backwards compatibility even though KVM doesn't support emulating SMX. And
+ * because userspace set "VMX in SMX", the guest must also be allowed to set it,
+ * e.g. if the MSR is left unlocked and the guest does a RMW operation.
*/
-bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
-{
- return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
-}
+#define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \
+ FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
+ FEAT_CTL_SGX_LC_ENABLED | \
+ FEAT_CTL_SGX_ENABLED | \
+ FEAT_CTL_LMCE_ENABLED)
-static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
- uint64_t val)
+static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
+ struct msr_data *msr)
{
- uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
+ uint64_t valid_bits;
- return !(val & ~valid_bits);
+ /*
+ * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
+ * exposed to the guest.
+ */
+ WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
+ ~KVM_SUPPORTED_FEATURE_CONTROL);
+
+ if (!msr->host_initiated &&
+ (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
+ return false;
+
+ if (msr->host_initiated)
+ valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
+ else
+ valid_bits = vmx->msr_ia32_feature_control_valid_bits;
+
+ return !(msr->data & ~valid_bits);
}
-static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
+int vmx_get_feature_msr(u32 msr, u64 *data)
{
- switch (msr->index) {
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ switch (msr) {
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!nested)
return 1;
- return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
- case MSR_IA32_PERF_CAPABILITIES:
- msr->data = vmx_get_perf_capabilities();
- return 0;
+ return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
default:
- return KVM_MSR_RET_INVALID;
+ return KVM_MSR_RET_UNSUPPORTED;
}
}
/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmx_uret_msr *msr;
@@ -1812,7 +2130,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
@@ -1828,27 +2146,29 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
return 1;
msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
break;
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
- if (!nested_vmx_allowed(vcpu))
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
return 1;
if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
&msr_info->data))
return 1;
+#ifdef CONFIG_KVM_HYPERV
/*
- * Enlightened VMCS v1 doesn't have certain fields, but buggy
- * Hyper-V versions are still trying to use corresponding
- * features when they are exposed. Filter out the essential
- * minimum.
+ * Enlightened VMCS v1 doesn't have certain VMCS fields but
+ * instead of just ignoring the features, different Hyper-V
+ * versions are either trying to use them and fail or do some
+ * sanity checking and refuse to boot. Filter all unsupported
+ * features out.
*/
- if (!msr_info->host_initiated &&
- vmx->nested.enlightened_vmcs_enabled)
- nested_evmcs_filter_control_msr(msr_info->index,
+ if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu))
+ nested_evmcs_filter_control_msr(vcpu, msr_info->index,
&msr_info->data);
+#endif
break;
case MSR_IA32_RTIT_CTL:
if (!vmx_pt_mode_is_host_guest())
@@ -1888,16 +2208,24 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
if (!vmx_pt_mode_is_host_guest() ||
- (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_num_address_ranges)))
+ (index >= 2 * vmx->pt_desc.num_address_ranges))
return 1;
if (index % 2)
msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
else
msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
break;
+ case MSR_IA32_S_CET:
+ msr_info->data = vmcs_readl(GUEST_S_CET);
+ break;
+ case MSR_KVM_INTERNAL_GUEST_SSP:
+ msr_info->data = vmcs_readl(GUEST_SSP);
+ break;
+ case MSR_IA32_INT_SSP_TAB:
+ msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
+ break;
case MSR_IA32_DEBUGCTLMSR:
- msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ msr_info->data = vmx_guest_debugctl_read();
break;
default:
find_uret_msr:
@@ -1916,31 +2244,49 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
u64 data)
{
#ifdef CONFIG_X86_64
- if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
return (u32)data;
#endif
return (unsigned long)data;
}
-static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
+u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
{
- u64 debugctl = vmx_supported_debugctl();
+ u64 debugctl = 0;
- if (!intel_pmu_lbr_is_enabled(vcpu))
- debugctl &= ~DEBUGCTLMSR_LBR_MASK;
+ if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
+ (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
+ debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
- debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
+ if ((kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT) &&
+ (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
+ debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+
+ if (boot_cpu_has(X86_FEATURE_RTM) &&
+ (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM)))
+ debugctl |= DEBUGCTLMSR_RTM_DEBUG;
return debugctl;
}
+bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+{
+ u64 invalid;
+
+ invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
+ if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
+ kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
+ invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
+ }
+ return !invalid;
+}
+
/*
* Writes msr value into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmx_uret_msr *msr;
@@ -1965,6 +2311,24 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_KERNEL_GS_BASE:
vmx_write_guest_kernel_gs_base(vmx, data);
break;
+ case MSR_IA32_XFD:
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ /*
+ * Always intercepting WRMSR could incur non-negligible
+ * overhead given xfd might be changed frequently in
+ * guest context switch. Disable write interception
+ * upon the first write with a non-zero value (indicating
+ * potential usage on dynamic xfeatures). Also update
+ * exception bitmap to trap #NM for proper virtualization
+ * of guest xfd_err.
+ */
+ if (!ret && data) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
+ MSR_TYPE_RW);
+ vcpu->arch.xfd_no_write_intercept = true;
+ vmx_update_exception_bitmap(vcpu);
+ }
+ break;
#endif
case MSR_IA32_SYSENTER_CS:
if (is_guest_mode(vcpu))
@@ -1985,37 +2349,36 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
- case MSR_IA32_DEBUGCTLMSR: {
- u64 invalid = data & ~vcpu_supported_debugctl(vcpu);
- if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
- if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
- __func__, data);
- data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
- invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
- }
-
- if (invalid)
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
return 1;
+ data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+
if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
VM_EXIT_SAVE_DEBUG_CONTROLS)
get_vmcs12(vcpu)->guest_ia32_debugctl = data;
- vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ vmx_guest_debugctl_write(vcpu, data);
+
if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
(data & DEBUGCTLMSR_LBR))
intel_pmu_create_guest_lbr_event(vcpu);
return 0;
- }
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
return 1;
- if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
+ if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) ||
(data & MSR_IA32_BNDCFGS_RSVD))
return 1;
+
+ if (is_guest_mode(vcpu) &&
+ ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
+ (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
+ get_vmcs12(vcpu)->guest_bndcfgs = data;
+
vmcs_write64(GUEST_BNDCFGS, data);
break;
case MSR_IA32_UMWAIT_CONTROL:
@@ -2063,50 +2426,17 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
return 1;
goto find_uret_msr;
- case MSR_IA32_PRED_CMD:
- if (!msr_info->host_initiated &&
- !guest_has_pred_cmd_msr(vcpu))
- return 1;
-
- if (data & ~PRED_CMD_IBPB)
- return 1;
- if (!boot_cpu_has(X86_FEATURE_IBPB))
- return 1;
- if (!data)
- break;
-
- wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
-
- /*
- * For non-nested:
- * When it's written (to non-zero) for the first time, pass
- * it through.
- *
- * For nested:
- * The handling of the MSR bitmap for L2 guests is done in
- * nested_vmx_prepare_msr_bitmap. We should not touch the
- * vmcs02.msr_bitmap here since it gets completely overwritten
- * in the merging.
- */
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W);
- break;
case MSR_IA32_CR_PAT:
- if (!kvm_pat_valid(data))
- return 1;
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ if (ret)
+ break;
if (is_guest_mode(vcpu) &&
get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
get_vmcs12(vcpu)->guest_ia32_pat = data;
- if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
vmcs_write64(GUEST_IA32_PAT, data);
- vcpu->arch.pat = data;
- break;
- }
- ret = kvm_set_msr_common(vcpu, msr_info);
- break;
- case MSR_IA32_TSC_ADJUST:
- ret = kvm_set_msr_common(vcpu, msr_info);
break;
case MSR_IA32_MCG_EXT_CTL:
if ((!msr_info->host_initiated &&
@@ -2117,10 +2447,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.mcg_ext_ctl = data;
break;
case MSR_IA32_FEAT_CTL:
- if (!vmx_feature_control_msr_valid(vcpu, data) ||
- (to_vmx(vcpu)->msr_ia32_feature_control &
- FEAT_CTL_LOCKED && !msr_info->host_initiated))
+ if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
return 1;
+
vmx->msr_ia32_feature_control = data;
if (msr_info->host_initiated && data == 0)
vmx_leave_nested(vcpu);
@@ -2141,17 +2470,17 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* behavior, but it's close enough.
*/
if (!msr_info->host_initiated &&
- (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
+ (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) ||
((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
!(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
return 1;
vmx->msr_ia32_sgxlepubkeyhash
[msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
break;
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!msr_info->host_initiated)
return 1; /* they are read-only */
- if (!nested_vmx_allowed(vcpu))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
return 1;
return vmx_set_vmx_msr(vcpu, msr_index, data);
case MSR_IA32_RTIT_CTL:
@@ -2204,24 +2533,41 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!pt_can_write_msr(vmx))
return 1;
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
- if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_num_address_ranges))
+ if (index >= 2 * vmx->pt_desc.num_address_ranges)
return 1;
- if (is_noncanonical_address(data, vcpu))
+ if (is_noncanonical_msr_address(data, vcpu))
return 1;
if (index % 2)
vmx->pt_desc.guest.addr_b[index / 2] = data;
else
vmx->pt_desc.guest.addr_a[index / 2] = data;
break;
+ case MSR_IA32_S_CET:
+ vmcs_writel(GUEST_S_CET, data);
+ break;
+ case MSR_KVM_INTERNAL_GUEST_SSP:
+ vmcs_writel(GUEST_SSP, data);
+ break;
+ case MSR_IA32_INT_SSP_TAB:
+ vmcs_writel(GUEST_INTR_SSP_TABLE, data);
+ break;
case MSR_IA32_PERF_CAPABILITIES:
- if (data && !vcpu_to_pmu(vcpu)->version)
- return 1;
- if (data & PMU_CAP_LBR_FMT) {
- if ((data & PMU_CAP_LBR_FMT) !=
- (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT))
+ if (data & PERF_CAP_LBR_FMT) {
+ if ((data & PERF_CAP_LBR_FMT) !=
+ (kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT))
return 1;
- if (!intel_pmu_lbr_is_compatible(vcpu))
+ if (!cpuid_model_is_consistent(vcpu))
+ return 1;
+ }
+ if (data & PERF_CAP_PEBS_FORMAT) {
+ if ((data & PERF_CAP_PEBS_MASK) !=
+ (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
+ return 1;
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS))
+ return 1;
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64))
+ return 1;
+ if (!cpuid_model_is_consistent(vcpu))
return 1;
}
ret = kvm_set_msr_common(vcpu, msr_info);
@@ -2236,10 +2582,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
ret = kvm_set_msr_common(vcpu, msr_info);
}
+ /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
+ if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
+ vmx_update_fb_clear_dis(vcpu, vmx);
+
return ret;
}
-static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
unsigned long guest_owned_bits;
@@ -2263,8 +2613,11 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
break;
case VCPU_EXREG_CR3:
- if (is_unrestricted_guest(vcpu) ||
- (enable_ept && is_paging(vcpu)))
+ /*
+ * When intercepting CR3 loads, e.g. for shadowing paging, KVM's
+ * CR3 is loaded into hardware, not the guest's CR3.
+ */
+ if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
break;
case VCPU_EXREG_CR4:
@@ -2274,93 +2627,11 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
break;
default:
- WARN_ON_ONCE(1);
+ KVM_BUG_ON(1, vcpu->kvm);
break;
}
}
-static __init int cpu_has_kvm_support(void)
-{
- return cpu_has_vmx();
-}
-
-static __init int vmx_disabled_by_bios(void)
-{
- return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
- !boot_cpu_has(X86_FEATURE_VMX);
-}
-
-static int kvm_cpu_vmxon(u64 vmxon_pointer)
-{
- u64 msr;
-
- cr4_set_bits(X86_CR4_VMXE);
-
- asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
- _ASM_EXTABLE(1b, %l[fault])
- : : [vmxon_pointer] "m"(vmxon_pointer)
- : : fault);
- return 0;
-
-fault:
- WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
- rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
- cr4_clear_bits(X86_CR4_VMXE);
-
- return -EFAULT;
-}
-
-static int hardware_enable(void)
-{
- int cpu = raw_smp_processor_id();
- u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- int r;
-
- if (cr4_read_shadow() & X86_CR4_VMXE)
- return -EBUSY;
-
- /*
- * This can happen if we hot-added a CPU but failed to allocate
- * VP assist page for it.
- */
- if (static_branch_unlikely(&enable_evmcs) &&
- !hv_get_vp_assist_page(cpu))
- return -EFAULT;
-
- intel_pt_handle_vmx(1);
-
- r = kvm_cpu_vmxon(phys_addr);
- if (r) {
- intel_pt_handle_vmx(0);
- return r;
- }
-
- if (enable_ept)
- ept_sync_global();
-
- return 0;
-}
-
-static void vmclear_local_loaded_vmcss(void)
-{
- int cpu = raw_smp_processor_id();
- struct loaded_vmcs *v, *n;
-
- list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
- loaded_vmcss_on_cpu_link)
- __loaded_vmcs_clear(v);
-}
-
-static void hardware_disable(void)
-{
- vmclear_local_loaded_vmcss();
-
- if (cpu_vmxoff())
- kvm_spurious_fault();
-
- intel_pt_handle_vmx(0);
-}
-
/*
* There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
* directly instead of going through cpu_has(), to ensure KVM is trapping
@@ -2372,8 +2643,7 @@ static bool cpu_has_sgx(void)
return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
}
-static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
- u32 msr, u32 *result)
+static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
{
u32 vmx_msr_low, vmx_msr_high;
u32 ctl = ctl_min | ctl_opt;
@@ -2391,76 +2661,89 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
return 0;
}
-static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
- struct vmx_capability *vmx_cap)
+static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
+{
+ u64 allowed;
+
+ rdmsrq(msr, allowed);
+
+ return ctl_opt & allowed;
+}
+
+#define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \
+({ \
+ int i, r = 0; \
+ \
+ BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \
+ BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \
+ \
+ for (i = 0; i < ARRAY_SIZE(pairs); i++) { \
+ typeof(entry_controls) n_ctrl = pairs[i].entry_control; \
+ typeof(exit_controls) x_ctrl = pairs[i].exit_control; \
+ \
+ if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \
+ continue; \
+ \
+ pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \
+ "entry = %llx (%llx), exit = %llx (%llx)\n", \
+ (u64)(entry_controls & n_ctrl), (u64)n_ctrl, \
+ (u64)(exit_controls & x_ctrl), (u64)x_ctrl); \
+ \
+ if (error_on_inconsistent_vmcs_config) \
+ r = -EIO; \
+ \
+ entry_controls &= ~n_ctrl; \
+ exit_controls &= ~x_ctrl; \
+ } \
+ r; \
+})
+
+static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
+ struct vmx_capability *vmx_cap)
{
- u32 vmx_msr_low, vmx_msr_high;
- u32 min, opt, min2, opt2;
u32 _pin_based_exec_control = 0;
u32 _cpu_based_exec_control = 0;
u32 _cpu_based_2nd_exec_control = 0;
+ u64 _cpu_based_3rd_exec_control = 0;
u32 _vmexit_control = 0;
u32 _vmentry_control = 0;
+ u64 basic_msr;
+ u64 misc_msr;
+
+ /*
+ * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
+ * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
+ * intercepts writes to PAT and EFER, i.e. never enables those controls.
+ */
+ struct {
+ u32 entry_control;
+ u32 exit_control;
+ } const vmcs_entry_exit_pairs[] = {
+ { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
+ { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT },
+ { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER },
+ { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS },
+ { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL },
+ { VM_ENTRY_LOAD_CET_STATE, VM_EXIT_LOAD_CET_STATE },
+ };
memset(vmcs_conf, 0, sizeof(*vmcs_conf));
- min = CPU_BASED_HLT_EXITING |
-#ifdef CONFIG_X86_64
- CPU_BASED_CR8_LOAD_EXITING |
- CPU_BASED_CR8_STORE_EXITING |
-#endif
- CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_UNCOND_IO_EXITING |
- CPU_BASED_MOV_DR_EXITING |
- CPU_BASED_USE_TSC_OFFSETTING |
- CPU_BASED_MWAIT_EXITING |
- CPU_BASED_MONITOR_EXITING |
- CPU_BASED_INVLPG_EXITING |
- CPU_BASED_RDPMC_EXITING;
-
- opt = CPU_BASED_TPR_SHADOW |
- CPU_BASED_USE_MSR_BITMAPS |
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
- &_cpu_based_exec_control) < 0)
+
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
+ KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
+ MSR_IA32_VMX_PROCBASED_CTLS,
+ &_cpu_based_exec_control))
return -EIO;
-#ifdef CONFIG_X86_64
- if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
- _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
- ~CPU_BASED_CR8_STORE_EXITING;
-#endif
if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
- min2 = 0;
- opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
- SECONDARY_EXEC_WBINVD_EXITING |
- SECONDARY_EXEC_ENABLE_VPID |
- SECONDARY_EXEC_ENABLE_EPT |
- SECONDARY_EXEC_UNRESTRICTED_GUEST |
- SECONDARY_EXEC_PAUSE_LOOP_EXITING |
- SECONDARY_EXEC_DESC |
- SECONDARY_EXEC_ENABLE_RDTSCP |
- SECONDARY_EXEC_ENABLE_INVPCID |
- SECONDARY_EXEC_APIC_REGISTER_VIRT |
- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
- SECONDARY_EXEC_SHADOW_VMCS |
- SECONDARY_EXEC_XSAVES |
- SECONDARY_EXEC_RDSEED_EXITING |
- SECONDARY_EXEC_RDRAND_EXITING |
- SECONDARY_EXEC_ENABLE_PML |
- SECONDARY_EXEC_TSC_SCALING |
- SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
- SECONDARY_EXEC_PT_USE_GPA |
- SECONDARY_EXEC_PT_CONCEAL_VMX |
- SECONDARY_EXEC_ENABLE_VMFUNC |
- SECONDARY_EXEC_BUS_LOCK_DETECTION;
- if (cpu_has_sgx())
- opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
- if (adjust_vmx_controls(min2, opt2,
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
+ KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
MSR_IA32_VMX_PROCBASED_CTLS2,
- &_cpu_based_2nd_exec_control) < 0)
+ &_cpu_based_2nd_exec_control))
return -EIO;
}
+ if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
+
#ifndef CONFIG_X86_64
if (!(_cpu_based_2nd_exec_control &
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
@@ -2476,43 +2759,46 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
&vmx_cap->ept, &vmx_cap->vpid);
- if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
- /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
- enabled */
- _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_INVLPG_EXITING);
- } else if (vmx_cap->ept) {
- vmx_cap->ept = 0;
+ if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
+ vmx_cap->ept) {
pr_warn_once("EPT CAP should not exist if not support "
"1-setting enable EPT VM-execution control\n");
+
+ if (error_on_inconsistent_vmcs_config)
+ return -EIO;
+
+ vmx_cap->ept = 0;
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
}
if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
- vmx_cap->vpid) {
- vmx_cap->vpid = 0;
+ vmx_cap->vpid) {
pr_warn_once("VPID CAP should not exist if not support "
"1-setting enable VPID VM-execution control\n");
+
+ if (error_on_inconsistent_vmcs_config)
+ return -EIO;
+
+ vmx_cap->vpid = 0;
}
- min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
-#ifdef CONFIG_X86_64
- min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
-#endif
- opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
- VM_EXIT_LOAD_IA32_PAT |
- VM_EXIT_LOAD_IA32_EFER |
- VM_EXIT_CLEAR_BNDCFGS |
- VM_EXIT_PT_CONCEAL_PIP |
- VM_EXIT_CLEAR_IA32_RTIT_CTL;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
- &_vmexit_control) < 0)
+ if (!cpu_has_sgx())
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
+
+ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
+ _cpu_based_3rd_exec_control =
+ adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
+ MSR_IA32_VMX_PROCBASED_CTLS3);
+
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
+ KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
+ MSR_IA32_VMX_EXIT_CTLS,
+ &_vmexit_control))
return -EIO;
- min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
- opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
- PIN_BASED_VMX_PREEMPTION_TIMER;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
- &_pin_based_exec_control) < 0)
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
+ KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
+ MSR_IA32_VMX_PINBASED_CTLS,
+ &_pin_based_exec_control))
return -EIO;
if (cpu_has_broken_vmx_preemption_timer())
@@ -2521,15 +2807,14 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
- min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
- opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
- VM_ENTRY_LOAD_IA32_PAT |
- VM_ENTRY_LOAD_IA32_EFER |
- VM_ENTRY_LOAD_BNDCFGS |
- VM_ENTRY_PT_CONCEAL_PIP |
- VM_ENTRY_LOAD_IA32_RTIT_CTL;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
- &_vmentry_control) < 0)
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
+ KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
+ MSR_IA32_VMX_ENTRY_CTLS,
+ &_vmentry_control))
+ return -EIO;
+
+ if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs,
+ _vmentry_control, _vmexit_control))
return -EIO;
/*
@@ -2538,51 +2823,51 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
* IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the
* MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
*/
- if (boot_cpu_data.x86 == 0x6) {
- switch (boot_cpu_data.x86_model) {
- case 26: /* AAK155 */
- case 30: /* AAP115 */
- case 37: /* AAT100 */
- case 44: /* BC86,AAY89,BD102 */
- case 46: /* BA97 */
- _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
- _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
- pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
- "does not work properly. Using workaround\n");
- break;
- default:
- break;
- }
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_NEHALEM_EP: /* AAK155 */
+ case INTEL_NEHALEM: /* AAP115 */
+ case INTEL_WESTMERE: /* AAT100 */
+ case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */
+ case INTEL_NEHALEM_EX: /* BA97 */
+ _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
+ "does not work properly. Using workaround\n");
+ break;
+ default:
+ break;
}
-
- rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
+ rdmsrq(MSR_IA32_VMX_BASIC, basic_msr);
/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
- if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
+ if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)
return -EIO;
#ifdef CONFIG_X86_64
- /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
- if (vmx_msr_high & (1u<<16))
+ /*
+ * KVM expects to be able to shove all legal physical addresses into
+ * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always
+ * 0 for processors that support Intel 64 architecture".
+ */
+ if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
return -EIO;
#endif
/* Require Write-Back (WB) memory type for VMCS accesses. */
- if (((vmx_msr_high >> 18) & 15) != 6)
+ if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB)
return -EIO;
- vmcs_conf->size = vmx_msr_high & 0x1fff;
- vmcs_conf->order = get_order(vmcs_conf->size);
- vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
-
- vmcs_conf->revision_id = vmx_msr_low;
+ rdmsrq(MSR_IA32_VMX_MISC, misc_msr);
+ vmcs_conf->basic = basic_msr;
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
+ vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
+ vmcs_conf->misc = misc_msr;
#if IS_ENABLED(CONFIG_HYPERV)
if (enlightened_vmcs)
@@ -2592,23 +2877,143 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return 0;
}
+static bool __kvm_is_vmx_supported(void)
+{
+ int cpu = smp_processor_id();
+
+ if (!(cpuid_ecx(1) & feature_bit(VMX))) {
+ pr_err("VMX not supported by CPU %d\n", cpu);
+ return false;
+ }
+
+ if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ !this_cpu_has(X86_FEATURE_VMX)) {
+ pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
+ return false;
+ }
+
+ return true;
+}
+
+static bool kvm_is_vmx_supported(void)
+{
+ bool supported;
+
+ migrate_disable();
+ supported = __kvm_is_vmx_supported();
+ migrate_enable();
+
+ return supported;
+}
+
+int vmx_check_processor_compat(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct vmcs_config vmcs_conf;
+ struct vmx_capability vmx_cap;
+
+ if (!__kvm_is_vmx_supported())
+ return -EIO;
+
+ if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
+ pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
+ return -EIO;
+ }
+ if (nested)
+ nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
+ if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
+ pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
+ return -EIO;
+ }
+ return 0;
+}
+
+static int kvm_cpu_vmxon(u64 vmxon_pointer)
+{
+ u64 msr;
+
+ cr4_set_bits(X86_CR4_VMXE);
+
+ asm goto("1: vmxon %[vmxon_pointer]\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ : : [vmxon_pointer] "m"(vmxon_pointer)
+ : : fault);
+ return 0;
+
+fault:
+ WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
+ rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
+ cr4_clear_bits(X86_CR4_VMXE);
+
+ return -EFAULT;
+}
+
+int vmx_enable_virtualization_cpu(void)
+{
+ int cpu = raw_smp_processor_id();
+ u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+ int r;
+
+ if (cr4_read_shadow() & X86_CR4_VMXE)
+ return -EBUSY;
+
+ /*
+ * This can happen if we hot-added a CPU but failed to allocate
+ * VP assist page for it.
+ */
+ if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu))
+ return -EFAULT;
+
+ intel_pt_handle_vmx(1);
+
+ r = kvm_cpu_vmxon(phys_addr);
+ if (r) {
+ intel_pt_handle_vmx(0);
+ return r;
+ }
+
+ return 0;
+}
+
+static void vmclear_local_loaded_vmcss(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct loaded_vmcs *v, *n;
+
+ list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link)
+ __loaded_vmcs_clear(v);
+}
+
+void vmx_disable_virtualization_cpu(void)
+{
+ vmclear_local_loaded_vmcss();
+
+ if (kvm_cpu_vmxoff())
+ kvm_spurious_fault();
+
+ hv_reset_evmcs();
+
+ intel_pt_handle_vmx(0);
+}
+
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
{
int node = cpu_to_node(cpu);
struct page *pages;
struct vmcs *vmcs;
- pages = __alloc_pages_node(node, flags, vmcs_config.order);
+ pages = __alloc_pages_node(node, flags, 0);
if (!pages)
return NULL;
vmcs = page_address(pages);
- memset(vmcs, 0, vmcs_config.size);
+ memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic));
/* KVM supports Enlightened VMCS v1 only */
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
else
- vmcs->hdr.revision_id = vmcs_config.revision_id;
+ vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
if (shadow)
vmcs->hdr.shadow_vmcs = 1;
@@ -2617,7 +3022,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
void free_vmcs(struct vmcs *vmcs)
{
- free_pages((unsigned long)vmcs, vmcs_config.order);
+ free_page((unsigned long)vmcs);
}
/*
@@ -2654,15 +3059,6 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
if (!loaded_vmcs->msr_bitmap)
goto out_vmcs;
memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
-
- if (IS_ENABLED(CONFIG_HYPERV) &&
- static_branch_unlikely(&enable_evmcs) &&
- (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
- struct hv_enlightened_vmcs *evmcs =
- (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
-
- evmcs->hv_enlightenments_control.msr_bitmap = 1;
- }
}
memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
@@ -2709,8 +3105,8 @@ static __init int alloc_kvm_area(void)
* still be marked with revision_id reported by
* physical CPU.
*/
- if (static_branch_unlikely(&enable_evmcs))
- vmcs->hdr.revision_id = vmcs_config.revision_id;
+ if (kvm_is_using_evmcs())
+ vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
per_cpu(vmxarea, cpu) = vmcs;
}
@@ -2733,7 +3129,7 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
save->dpl = save->selector & SEGMENT_RPL_MASK;
save->s = 1;
}
- vmx_set_segment(vcpu, save, seg);
+ __vmx_set_segment(vcpu, save, seg);
}
static void enter_pmode(struct kvm_vcpu *vcpu)
@@ -2754,7 +3150,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
vmx->rmode.vm86_active = 0;
- vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
+ __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
flags = vmcs_readl(GUEST_RFLAGS);
flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
@@ -2796,9 +3192,8 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save)
var.type = 0x3;
var.avl = 0;
if (save->base & 0xf)
- printk_once(KERN_WARNING "kvm: segment base is not "
- "paragraph aligned when entering "
- "protected mode (seg=%d)", seg);
+ pr_warn_once("segment base is not paragraph aligned "
+ "when entering protected mode (seg=%d)", seg);
}
vmcs_write16(sf->selector, var.selector);
@@ -2813,6 +3208,15 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
+ /*
+ * KVM should never use VM86 to virtualize Real Mode when L2 is active,
+ * as using VM86 is unnecessary if unrestricted guest is enabled, and
+ * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0
+ * should VM-Fail and KVM should reject userspace attempts to stuff
+ * CR0.PG=0 when L2 is active.
+ */
+ WARN_ON_ONCE(is_guest_mode(vcpu));
+
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
@@ -2823,14 +3227,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmx->rmode.vm86_active = 1;
- /*
- * Very old userspace does not call KVM_SET_TSS_ADDR before entering
- * vcpu. Warn the user that an update is overdue.
- */
- if (!kvm_vmx->tss_addr)
- printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
- "called before entering vcpu\n");
-
vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
@@ -2852,29 +3248,28 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
-
- kvm_mmu_reset_context(vcpu);
}
int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER);
/* Nothing to do if hardware doesn't support EFER. */
- if (!msr)
+ if (!vmx_find_uret_msr(vmx, MSR_EFER))
return 0;
vcpu->arch.efer = efer;
- if (efer & EFER_LMA) {
- vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
- msr->data = efer;
- } else {
- vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
+#ifdef CONFIG_X86_64
+ if (efer & EFER_LMA)
+ vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
+ else
+ vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
+#else
+ if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
+ return 1;
+#endif
- msr->data = efer & ~EFER_LME;
- }
- setup_msrs(vmx);
+ vmx_setup_uret_msrs(vmx);
return 0;
}
@@ -2899,13 +3294,12 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
static void exit_lmode(struct kvm_vcpu *vcpu)
{
- vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
}
#endif
-static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2928,43 +3322,81 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
}
}
-static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
+static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu)))
+ return nested_get_vpid02(vcpu);
+ return to_vmx(vcpu)->vpid;
+}
+
+static u64 construct_eptp(hpa_t root_hpa)
+{
+ u64 eptp = root_hpa | VMX_EPTP_MT_WB;
+ struct kvm_mmu_page *root;
+
+ if (kvm_mmu_is_dummy_root(root_hpa))
+ return eptp | VMX_EPTP_PWL_4;
+
+ /*
+ * EPT roots should always have an associated MMU page. Return a "bad"
+ * EPTP to induce VM-Fail instead of continuing on in a unknown state.
+ */
+ root = root_to_sp(root_hpa);
+ if (WARN_ON_ONCE(!root))
+ return INVALID_PAGE;
+
+ eptp |= (root->role.level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
+
+ if (enable_ept_ad_bits && !root->role.ad_disabled)
+ eptp |= VMX_EPTP_AD_ENABLE_BIT;
+
+ return eptp;
+}
+
+static void vmx_flush_tlb_ept_root(hpa_t root_hpa)
+{
+ u64 eptp = construct_eptp(root_hpa);
+
+ if (VALID_PAGE(eptp))
+ ept_sync_context(eptp);
+ else
+ ept_sync_global();
+}
+
+void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
- u64 root_hpa = mmu->root_hpa;
+ u64 root_hpa = mmu->root.hpa;
/* No flush required if the current context is invalid. */
if (!VALID_PAGE(root_hpa))
return;
if (enable_ept)
- ept_sync_context(construct_eptp(vcpu, root_hpa,
- mmu->shadow_root_level));
- else if (!is_guest_mode(vcpu))
- vpid_sync_context(to_vmx(vcpu)->vpid);
+ vmx_flush_tlb_ept_root(root_hpa);
else
- vpid_sync_context(nested_get_vpid02(vcpu));
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
}
-static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
/*
- * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in
+ * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
* vmx_flush_tlb_guest() for an explanation of why this is ok.
*/
- vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr);
+ vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
}
-static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
/*
- * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0
- * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit
- * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is
+ * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
+ * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
+ * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
* disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
* i.e. no explicit INVVPID is necessary.
*/
- vpid_sync_context(to_vmx(vcpu)->vpid);
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
}
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
@@ -2994,45 +3426,38 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
- kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
}
-static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
- unsigned long cr0,
- struct kvm_vcpu *vcpu)
+#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
+ CPU_BASED_CR3_STORE_EXITING)
+
+bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (is_guest_mode(vcpu))
+ return nested_guest_cr0_valid(vcpu, cr0);
- if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
- vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
- if (!(cr0 & X86_CR0_PG)) {
- /* From paging/starting to nonpaging */
- exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING);
- vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
- } else if (!is_paging(vcpu)) {
- /* From nonpaging to paging */
- exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING);
- vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
- }
+ if (to_vmx(vcpu)->nested.vmxon)
+ return nested_host_cr0_valid(vcpu, cr0);
- if (!(cr0 & X86_CR0_WP))
- *hw_cr0 &= ~X86_CR0_WP;
+ return true;
}
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long hw_cr0;
+ unsigned long hw_cr0, old_cr0_pg;
+ u32 tmp;
+
+ old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
- if (is_unrestricted_guest(vcpu))
+ if (enable_unrestricted_guest)
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
else {
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
+ if (!enable_ept)
+ hw_cr0 |= X86_CR0_WP;
if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
enter_pmode(vcpu);
@@ -3041,83 +3466,114 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
enter_rmode(vcpu);
}
+ vmcs_writel(CR0_READ_SHADOW, cr0);
+ vmcs_writel(GUEST_CR0, hw_cr0);
+ vcpu->arch.cr0 = cr0;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
+
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME) {
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
+ if (!old_cr0_pg && (cr0 & X86_CR0_PG))
enter_lmode(vcpu);
- if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
+ else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
exit_lmode(vcpu);
}
#endif
- if (enable_ept && !is_unrestricted_guest(vcpu))
- ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
+ if (enable_ept && !enable_unrestricted_guest) {
+ /*
+ * Ensure KVM has an up-to-date snapshot of the guest's CR3. If
+ * the below code _enables_ CR3 exiting, vmx_cache_reg() will
+ * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
+ * KVM's CR3 is installed.
+ */
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
+ vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
- vmcs_writel(CR0_READ_SHADOW, cr0);
- vmcs_writel(GUEST_CR0, hw_cr0);
- vcpu->arch.cr0 = cr0;
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
+ /*
+ * When running with EPT but not unrestricted guest, KVM must
+ * intercept CR3 accesses when paging is _disabled_. This is
+ * necessary because restricted guests can't actually run with
+ * paging disabled, and so KVM stuffs its own CR3 in order to
+ * run the guest when identity mapped page tables.
+ *
+ * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
+ * update, it may be stale with respect to CR3 interception,
+ * e.g. after nested VM-Enter.
+ *
+ * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
+ * stores to forward them to L1, even if KVM does not need to
+ * intercept them to preserve its identity mapped page tables.
+ */
+ if (!(cr0 & X86_CR0_PG)) {
+ exec_controls_setbit(vmx, CR3_EXITING_BITS);
+ } else if (!is_guest_mode(vcpu)) {
+ exec_controls_clearbit(vmx, CR3_EXITING_BITS);
+ } else {
+ tmp = exec_controls_get(vmx);
+ tmp &= ~CR3_EXITING_BITS;
+ tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
+ exec_controls_set(vmx, tmp);
+ }
+
+ /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
+ if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
+ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
+
+ /*
+ * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
+ * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
+ */
+ if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ }
/* depends on vcpu->arch.cr0 to be set to a new value */
- vmx->emulation_required = emulation_required(vcpu);
+ vmx->vt.emulation_required = vmx_emulation_required(vcpu);
}
-static int vmx_get_max_tdp_level(void)
+static int vmx_get_max_ept_level(void)
{
if (cpu_has_vmx_ept_5levels())
return 5;
return 4;
}
-u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
-{
- u64 eptp = VMX_EPTP_MT_WB;
-
- eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
-
- if (enable_ept_ad_bits &&
- (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
- eptp |= VMX_EPTP_AD_ENABLE_BIT;
- eptp |= root_hpa;
-
- return eptp;
-}
-
-static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
- int root_level)
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
struct kvm *kvm = vcpu->kvm;
bool update_guest_cr3 = true;
unsigned long guest_cr3;
- u64 eptp;
if (enable_ept) {
- eptp = construct_eptp(vcpu, root_hpa, root_level);
- vmcs_write64(EPT_POINTER, eptp);
+ KVM_MMU_WARN_ON(root_to_sp(root_hpa) &&
+ root_level != root_to_sp(root_hpa)->role.level);
+ vmcs_write64(EPT_POINTER, construct_eptp(root_hpa));
hv_track_root_tdp(vcpu, root_hpa);
if (!enable_unrestricted_guest && !is_paging(vcpu))
guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
- else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
guest_cr3 = vcpu->arch.cr3;
- else /* vmcs01.GUEST_CR3 is already up-to-date. */
+ else /* vmcs.GUEST_CR3 is already up-to-date. */
update_guest_cr3 = false;
vmx_ept_load_pdptrs(vcpu);
} else {
- guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
+ guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) |
+ kvm_get_active_cr3_lam_bits(vcpu);
}
if (update_guest_cr3)
vmcs_writel(GUEST_CR3, guest_cr3);
}
-static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
/*
* We operate under the default treatment of SMM, so VMX cannot be
- * enabled under SMM. Note, whether or not VMXE is allowed at all is
- * handled by kvm_is_valid_cr4().
+ * enabled under SMM. Note, whether or not VMXE is allowed at all,
+ * i.e. is a reserved bit, is handled by common x86 code.
*/
if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
return false;
@@ -3130,24 +3586,24 @@ static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long old_cr4 = vcpu->arch.cr4;
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long hw_cr4;
+
/*
* Pass through host's Machine Check Enable value to hw_cr4, which
* is in force while we are in guest mode. Do not let guests control
* this bit, even if host CR4.MCE == 0.
*/
- unsigned long hw_cr4;
-
hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
- if (is_unrestricted_guest(vcpu))
+ if (enable_unrestricted_guest)
hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
else if (vmx->rmode.vm86_active)
hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
else
hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
- if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
+ if (vmx_umip_emulated()) {
if (cr4 & X86_CR4_UMIP) {
secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
hw_cr4 &= ~X86_CR4_UMIP;
@@ -3160,7 +3616,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vcpu->arch.cr4 = cr4;
kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
- if (!is_unrestricted_guest(vcpu)) {
+ if (!enable_unrestricted_guest) {
if (enable_ept) {
if (!is_paging(vcpu)) {
hw_cr4 &= ~X86_CR4_PAE;
@@ -3189,7 +3645,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vmcs_writel(GUEST_CR4, hw_cr4);
if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
- kvm_update_cpuid_runtime(vcpu);
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
}
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
@@ -3228,7 +3684,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
var->g = (ar >> 15) & 1;
}
-static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
struct kvm_segment s;
@@ -3239,39 +3695,49 @@ static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
}
-int vmx_get_cpl(struct kvm_vcpu *vcpu)
+static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int ar;
if (unlikely(vmx->rmode.vm86_active))
return 0;
- else {
- int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
- return VMX_AR_DPL(ar);
- }
+
+ if (no_cache)
+ ar = vmcs_read32(GUEST_SS_AR_BYTES);
+ else
+ ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
+ return VMX_AR_DPL(ar);
+}
+
+int vmx_get_cpl(struct kvm_vcpu *vcpu)
+{
+ return __vmx_get_cpl(vcpu, false);
+}
+
+int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu)
+{
+ return __vmx_get_cpl(vcpu, true);
}
static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
u32 ar;
- if (var->unusable || !var->present)
- ar = 1 << 16;
- else {
- ar = var->type & 15;
- ar |= (var->s & 1) << 4;
- ar |= (var->dpl & 3) << 5;
- ar |= (var->present & 1) << 7;
- ar |= (var->avl & 1) << 12;
- ar |= (var->l & 1) << 13;
- ar |= (var->db & 1) << 14;
- ar |= (var->g & 1) << 15;
- }
+ ar = var->type & 15;
+ ar |= (var->s & 1) << 4;
+ ar |= (var->dpl & 3) << 5;
+ ar |= (var->present & 1) << 7;
+ ar |= (var->avl & 1) << 12;
+ ar |= (var->l & 1) << 13;
+ ar |= (var->db & 1) << 14;
+ ar |= (var->g & 1) << 15;
+ ar |= (var->unusable || !var->present) << 16;
return ar;
}
-void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -3284,7 +3750,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
vmcs_write16(sf->selector, var->selector);
else if (var->s)
fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
- goto out;
+ return;
}
vmcs_writel(sf->base, var->base);
@@ -3306,12 +3772,16 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
var->type |= 0x1; /* Accessed */
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
+}
-out:
- vmx->emulation_required = emulation_required(vcpu);
+void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+{
+ __vmx_set_segment(vcpu, var, seg);
+
+ to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu);
}
-static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
@@ -3319,25 +3789,25 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
*l = (ar >> 13) & 1;
}
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
dt->address = vmcs_readl(GUEST_IDTR_BASE);
}
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
vmcs_writel(GUEST_IDTR_BASE, dt->address);
}
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
dt->address = vmcs_readl(GUEST_GDTR_BASE);
}
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
vmcs_writel(GUEST_GDTR_BASE, dt->address);
@@ -3584,7 +4054,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
}
/* Set up identity-mapping pagetable for EPT in real mode */
- for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
+ for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) {
tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
@@ -3614,39 +4084,6 @@ static void seg_setup(int seg)
vmcs_write32(sf->ar_bytes, ar);
}
-static int alloc_apic_access_page(struct kvm *kvm)
-{
- struct page *page;
- void __user *hva;
- int ret = 0;
-
- mutex_lock(&kvm->slots_lock);
- if (kvm->arch.apic_access_memslot_enabled)
- goto out;
- hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
- APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
- if (IS_ERR(hva)) {
- ret = PTR_ERR(hva);
- goto out;
- }
-
- page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
- if (is_error_page(page)) {
- ret = -EFAULT;
- goto out;
- }
-
- /*
- * Do not pin the page in memory, so that memory hot-unplug
- * is able to migrate it.
- */
- put_page(page);
- kvm->arch.apic_access_memslot_enabled = true;
-out:
- mutex_unlock(&kvm->slots_lock);
- return ret;
-}
-
int allocate_vpid(void)
{
int vpid;
@@ -3672,47 +4109,25 @@ void free_vpid(int vpid)
spin_unlock(&vmx_vpid_lock);
}
-static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
+static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
{
- int f = sizeof(unsigned long);
-
- if (msr <= 0x1fff)
- __clear_bit(msr, msr_bitmap + 0x000 / f);
- else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
- __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
-}
-
-static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
-{
- int f = sizeof(unsigned long);
-
- if (msr <= 0x1fff)
- __clear_bit(msr, msr_bitmap + 0x800 / f);
- else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
- __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
-}
-
-static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
-{
- int f = sizeof(unsigned long);
-
- if (msr <= 0x1fff)
- __set_bit(msr, msr_bitmap + 0x000 / f);
- else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
- __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
-}
+ /*
+ * When KVM is a nested hypervisor on top of Hyper-V and uses
+ * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
+ * bitmap has changed.
+ */
+ if (kvm_is_using_evmcs()) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
-static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
-{
- int f = sizeof(unsigned long);
+ if (evmcs->hv_enlightenments_control.msr_bitmap)
+ evmcs->hv_clean_fields &=
+ ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+ }
- if (msr <= 0x1fff)
- __set_bit(msr, msr_bitmap + 0x800 / f);
- else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
- __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
+ vmx->nested.force_msr_bitmap_recalc = true;
}
-void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
@@ -3720,114 +4135,66 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
if (!cpu_has_vmx_msr_bitmap())
return;
- if (static_branch_unlikely(&enable_evmcs))
- evmcs_touch_msr_bitmap();
-
- /*
- * Mark the desired intercept state in shadow bitmap, this is needed
- * for resync when the MSR filters change.
- */
- if (is_valid_passthrough_msr(msr)) {
- int idx = possible_passthrough_msr_slot(msr);
-
- if (idx != -ENOENT) {
- if (type & MSR_TYPE_R)
- clear_bit(idx, vmx->shadow_msr_intercept.read);
- if (type & MSR_TYPE_W)
- clear_bit(idx, vmx->shadow_msr_intercept.write);
- }
- }
+ vmx_msr_bitmap_l01_changed(vmx);
- if ((type & MSR_TYPE_R) &&
- !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
- vmx_set_msr_bitmap_read(msr_bitmap, msr);
- type &= ~MSR_TYPE_R;
+ if (type & MSR_TYPE_R) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+ vmx_clear_msr_bitmap_read(msr_bitmap, msr);
+ else
+ vmx_set_msr_bitmap_read(msr_bitmap, msr);
}
- if ((type & MSR_TYPE_W) &&
- !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
- vmx_set_msr_bitmap_write(msr_bitmap, msr);
- type &= ~MSR_TYPE_W;
+ if (type & MSR_TYPE_W) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+ vmx_clear_msr_bitmap_write(msr_bitmap, msr);
+ else
+ vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
-
- if (type & MSR_TYPE_R)
- vmx_clear_msr_bitmap_read(msr_bitmap, msr);
-
- if (type & MSR_TYPE_W)
- vmx_clear_msr_bitmap_write(msr_bitmap, msr);
}
-void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
+static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
{
+ /*
+ * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
+ * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0,
+ * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
+ */
+ const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
+ const int write_idx = read_idx + (0x800 / sizeof(u64));
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+ u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
+ u8 mode;
- if (!cpu_has_vmx_msr_bitmap())
+ if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
return;
- if (static_branch_unlikely(&enable_evmcs))
- evmcs_touch_msr_bitmap();
-
- /*
- * Mark the desired intercept state in shadow bitmap, this is needed
- * for resync when the MSR filter changes.
- */
- if (is_valid_passthrough_msr(msr)) {
- int idx = possible_passthrough_msr_slot(msr);
-
- if (idx != -ENOENT) {
- if (type & MSR_TYPE_R)
- set_bit(idx, vmx->shadow_msr_intercept.read);
- if (type & MSR_TYPE_W)
- set_bit(idx, vmx->shadow_msr_intercept.write);
- }
- }
-
- if (type & MSR_TYPE_R)
- vmx_set_msr_bitmap_read(msr_bitmap, msr);
-
- if (type & MSR_TYPE_W)
- vmx_set_msr_bitmap_write(msr_bitmap, msr);
-}
-
-static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
-{
- u8 mode = 0;
-
if (cpu_has_secondary_exec_ctrls() &&
- (secondary_exec_controls_get(to_vmx(vcpu)) &
+ (secondary_exec_controls_get(vmx) &
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
- mode |= MSR_BITMAP_MODE_X2APIC;
+ mode = MSR_BITMAP_MODE_X2APIC;
if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
mode |= MSR_BITMAP_MODE_X2APIC_APICV;
+ } else {
+ mode = 0;
}
- return mode;
-}
-
-static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)
-{
- unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
- unsigned long read_intercept;
- int msr;
-
- read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
-
- for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
- unsigned int read_idx = msr / BITS_PER_LONG;
- unsigned int write_idx = read_idx + (0x800 / sizeof(long));
-
- msr_bitmap[read_idx] = read_intercept;
- msr_bitmap[write_idx] = ~0ul;
- }
-}
-
-static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode)
-{
- if (!cpu_has_vmx_msr_bitmap())
+ if (mode == vmx->x2apic_msr_bitmap_mode)
return;
- vmx_reset_x2apic_msrs(vcpu, mode);
+ vmx->x2apic_msr_bitmap_mode = mode;
+
+ /*
+ * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended
+ * registers (0x840 and above) intercepted, KVM doesn't support them.
+ * Intercept all writes by default and poke holes as needed. Pass
+ * through reads for all valid registers by default in x2APIC+APICv
+ * mode, only the current timer count needs on-demand emulation by KVM.
+ */
+ if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
+ msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic);
+ else
+ msr_bitmap[read_idx] = ~0ull;
+ msr_bitmap[write_idx] = ~0ull;
/*
* TPR reads and writes can be virtualized even if virtual interrupt
@@ -3840,24 +4207,11 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode)
vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
+ if (enable_ipiv)
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
}
}
-void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u8 mode = vmx_msr_bitmap_mode(vcpu);
- u8 changed = mode ^ vmx->msr_bitmap_mode;
-
- if (!changed)
- return;
-
- if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
- vmx_update_msr_bitmap_x2apic(vcpu, mode);
-
- vmx->msr_bitmap_mode = mode;
-}
-
void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3868,92 +4222,87 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
- for (i = 0; i < vmx->pt_desc.addr_range; i++) {
+ for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
}
}
-static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
+static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- void *vapic_page;
- u32 vppr;
- int rvi;
+ bool intercept;
- if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
- !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
- WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
- return false;
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
+#ifdef CONFIG_X86_64
+ vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+#endif
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+ if (kvm_cstate_in_guest(vcpu->kvm)) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+ }
+ if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
+ }
- rvi = vmx_get_rvi();
+ /* PT MSRs can be passed through iff PT is exposed to the guest. */
+ if (vmx_pt_mode_is_host_guest())
+ pt_update_intercept_for_msr(vcpu);
- vapic_page = vmx->nested.virtual_apic_map.hva;
- vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
+ if (vcpu->arch.xfd_no_write_intercept)
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW);
- return ((rvi & 0xf0) > (vppr & 0xf0));
-}
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !to_vmx(vcpu)->spec_ctrl);
-static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 i;
+ if (kvm_cpu_cap_has(X86_FEATURE_XFD))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
- /*
- * Set intercept permissions for all potentially passed through MSRs
- * again. They will automatically get filtered through the MSR filter,
- * so we are back in sync after this.
- */
- for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
- u32 msr = vmx_possible_passthrough_msrs[i];
- bool read = test_bit(i, vmx->shadow_msr_intercept.read);
- bool write = test_bit(i, vmx->shadow_msr_intercept.write);
+ if (cpu_feature_enabled(X86_FEATURE_IBPB))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
+ !guest_has_pred_cmd_msr(vcpu));
- vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read);
- vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write);
- }
+ if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
- pt_update_intercept_for_msr(vcpu);
- vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu));
-}
+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
+ intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
-static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
- bool nested)
-{
-#ifdef CONFIG_SMP
- int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, intercept);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, intercept);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, intercept);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, intercept);
+ }
- if (vcpu->mode == IN_GUEST_MODE) {
- /*
- * The vector of interrupt to be delivered to vcpu had
- * been set in PIR before this function.
- *
- * Following cases will be reached in this block, and
- * we always send a notification event in all cases as
- * explained below.
- *
- * Case 1: vcpu keeps in non-root mode. Sending a
- * notification event posts the interrupt to vcpu.
- *
- * Case 2: vcpu exits to root mode and is still
- * runnable. PIR will be synced to vIRR before the
- * next vcpu entry. Sending a notification event in
- * this case has no effect, as vcpu is not in root
- * mode.
- *
- * Case 3: vcpu exits to root mode and is blocked.
- * vcpu_block() has already synced PIR to vIRR and
- * never blocks vcpu if vIRR is not cleared. Therefore,
- * a blocked vcpu here does not wait for any requested
- * interrupts in PIR, and sending a notification event
- * which has no effect is safe here.
- */
+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) || kvm_cpu_cap_has(X86_FEATURE_IBT)) {
+ intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
- apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
- return true;
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, intercept);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept);
}
-#endif
- return false;
+
+ /*
+ * x2APIC and LBR MSR intercepts are modified on-demand and cannot be
+ * filtered by userspace.
+ */
+}
+
+void vmx_recalc_intercepts(struct kvm_vcpu *vcpu)
+{
+ vmx_recalc_msr_intercepts(vcpu);
}
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -3961,6 +4310,13 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /*
+ * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated
+ * and freed, and must not be accessed outside of vcpu->mutex. The
+ * vCPU's cached PI NV is valid if and only if posted interrupts
+ * enabled in its vmcs12, i.e. checking the vector also checks that
+ * L1 has enabled posted interrupts for L2.
+ */
if (is_guest_mode(vcpu) &&
vector == vmx->nested.posted_intr_nv) {
/*
@@ -3969,9 +4325,21 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
*/
vmx->nested.pi_pending = true;
kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * This pairs with the smp_mb_*() after setting vcpu->mode in
+ * vcpu_enter_guest() to guarantee the vCPU sees the event
+ * request if triggering a posted interrupt "fails" because
+ * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as
+ * the smb_wmb() in kvm_make_request() only ensures everything
+ * done before making the request is visible when the request
+ * is visible, it doesn't ensure ordering between the store to
+ * vcpu->requests and the load from vcpu->mode.
+ */
+ smp_mb__after_atomic();
+
/* the PIR and ON have been set by L1. */
- if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
- kvm_vcpu_kick(vcpu);
+ kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
return 0;
}
return -1;
@@ -3985,28 +4353,34 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
*/
static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
int r;
r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
if (!r)
return 0;
- if (!vcpu->arch.apicv_active)
+ /* Note, this is called iff the local APIC is in-kernel. */
+ if (!vcpu->arch.apic->apicv_active)
return -1;
- if (pi_test_and_set_pir(vector, &vmx->pi_desc))
- return 0;
+ __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector);
+ return 0;
+}
- /* If a previous notification has sent the IPI, nothing to do. */
- if (pi_test_and_set_on(&vmx->pi_desc))
- return 0;
+void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
- if (vcpu != kvm_get_running_vcpu() &&
- !kvm_vcpu_trigger_posted_interrupt(vcpu, false))
+ if (vmx_deliver_posted_interrupt(vcpu, vector)) {
+ kvm_lapic_set_irr(vector, apic);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
-
- return 0;
+ } else {
+ trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
+ trig_mode, vector);
+ }
}
/*
@@ -4060,7 +4434,17 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
- rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
+
+ /*
+ * SYSENTER is used for 32-bit system calls on either 32-bit or
+ * 64-bit kernels. It is always zero If neither is allowed, otherwise
+ * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
+ * have already done so!).
+ */
+ if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
+ vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
+
+ rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl);
vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
@@ -4069,7 +4453,22 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
}
if (cpu_has_load_ia32_efer())
- vmcs_write64(HOST_IA32_EFER, host_efer);
+ vmcs_write64(HOST_IA32_EFER, kvm_host.efer);
+
+ /*
+ * Supervisor shadow stack is not enabled on host side, i.e.,
+ * host IA32_S_CET.SHSTK_EN bit is guaranteed to 0 now, per SDM
+ * description(RDSSP instruction), SSP is not readable in CPL0,
+ * so resetting the two registers to 0s at VM-Exit does no harm
+ * to kernel execution. When execution flow exits to userspace,
+ * SSP is reloaded from IA32_PL3_SSP. Check SDM Vol.2A/B Chapter
+ * 3 and 4 for details.
+ */
+ if (cpu_has_load_cet_ctrl()) {
+ vmcs_writel(HOST_S_CET, kvm_host.s_cet);
+ vmcs_writel(HOST_SSP, 0);
+ vmcs_writel(HOST_INTR_SSP_TABLE, 0);
+ }
}
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
@@ -4078,15 +4477,17 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
~vcpu->arch.cr4_guest_rsvd_bits;
- if (!enable_ept)
- vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PGE;
+ if (!enable_ept) {
+ vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
+ vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
+ }
if (is_guest_mode(&vmx->vcpu))
vcpu->arch.cr4_guest_owned_bits &=
~get_vmcs12(vcpu)->cr4_guest_host_mask;
vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
}
-u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
+static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
{
u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
@@ -4102,44 +4503,100 @@ u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
return pin_based_exec_ctrl;
}
-static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+static u32 vmx_get_initial_vmentry_ctrl(void)
+{
+ u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
+
+ if (vmx_pt_mode_is_system())
+ vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
+ VM_ENTRY_LOAD_IA32_RTIT_CTL);
+ /*
+ * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically.
+ */
+ vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
+ VM_ENTRY_LOAD_IA32_EFER |
+ VM_ENTRY_IA32E_MODE);
+
+ return vmentry_ctrl;
+}
+
+static u32 vmx_get_initial_vmexit_ctrl(void)
+{
+ u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
+
+ /*
+ * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for
+ * nested virtualization and thus allowed to be set in vmcs12.
+ */
+ vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER |
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
+
+ if (vmx_pt_mode_is_system())
+ vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
+ VM_EXIT_CLEAR_IA32_RTIT_CTL);
+ /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
+ return vmexit_ctrl &
+ ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
+}
+
+void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
- if (cpu_has_secondary_exec_ctrls()) {
- if (kvm_vcpu_apicv_active(vcpu))
- secondary_exec_controls_setbit(vmx,
- SECONDARY_EXEC_APIC_REGISTER_VIRT |
- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
- else
- secondary_exec_controls_clearbit(vmx,
- SECONDARY_EXEC_APIC_REGISTER_VIRT |
- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ if (is_guest_mode(vcpu)) {
+ vmx->nested.update_vmcs01_apicv_status = true;
+ return;
}
- if (cpu_has_vmx_msr_bitmap())
- vmx_update_msr_bitmap(vcpu);
+ pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
+
+ secondary_exec_controls_changebit(vmx,
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY,
+ kvm_vcpu_apicv_active(vcpu));
+ if (enable_ipiv)
+ tertiary_exec_controls_changebit(vmx, TERTIARY_EXEC_IPI_VIRT,
+ kvm_vcpu_apicv_active(vcpu));
+
+ vmx_update_msr_bitmap_x2apic(vcpu);
}
-u32 vmx_exec_control(struct vcpu_vmx *vmx)
+static u32 vmx_exec_control(struct vcpu_vmx *vmx)
{
u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
+ /*
+ * Not used by KVM, but fully supported for nesting, i.e. are allowed in
+ * vmcs12 and propagated to vmcs02 when set in vmcs12.
+ */
+ exec_control &= ~(CPU_BASED_RDTSC_EXITING |
+ CPU_BASED_USE_IO_BITMAPS |
+ CPU_BASED_MONITOR_TRAP_FLAG |
+ CPU_BASED_PAUSE_EXITING);
+
+ /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */
+ exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING |
+ CPU_BASED_NMI_WINDOW_EXITING);
+
if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
exec_control &= ~CPU_BASED_MOV_DR_EXITING;
- if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
+ if (!cpu_need_tpr_shadow(&vmx->vcpu))
exec_control &= ~CPU_BASED_TPR_SHADOW;
+
#ifdef CONFIG_X86_64
+ if (exec_control & CPU_BASED_TPR_SHADOW)
+ exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING |
+ CPU_BASED_CR8_STORE_EXITING);
+ else
exec_control |= CPU_BASED_CR8_STORE_EXITING |
CPU_BASED_CR8_LOAD_EXITING;
#endif
- }
- if (!enable_ept)
- exec_control |= CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_INVLPG_EXITING;
+ /* No need to intercept CR3 access or INVPLG when using EPT. */
+ if (enable_ept)
+ exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_INVLPG_EXITING);
if (kvm_mwait_in_guest(vmx->vcpu.kvm))
exec_control &= ~(CPU_BASED_MWAIT_EXITING |
CPU_BASED_MONITOR_EXITING);
@@ -4148,6 +4605,20 @@ u32 vmx_exec_control(struct vcpu_vmx *vmx)
return exec_control;
}
+static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
+{
+ u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;
+
+ /*
+ * IPI virtualization relies on APICv. Disable IPI virtualization if
+ * APICv is inhibited.
+ */
+ if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
+ exec_control &= ~TERTIARY_EXEC_IPI_VIRT;
+
+ return exec_control;
+}
+
/*
* Adjust a single secondary execution control bit to intercept/allow an
* instruction in the guest. This is usually done based on whether or not a
@@ -4172,7 +4643,15 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
* Update the nested MSR settings so that a nested VMM can/can't set
* controls for features that are/aren't exposed to the guest.
*/
- if (nested) {
+ if (nested &&
+ kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) {
+ /*
+ * All features that can be added or removed to VMX MSRs must
+ * be supported in the first place for nested virtualization.
+ */
+ if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
+ enabled = false;
+
if (enabled)
vmx->nested.msrs.secondary_ctls_high |= control;
else
@@ -4185,16 +4664,16 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
* based on a single guest CPUID bit, with a dedicated feature bit. This also
* verifies that the control is actually supported by KVM and hardware.
*/
-#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
-({ \
- bool __enabled; \
- \
- if (cpu_has_vmx_##name()) { \
- __enabled = guest_cpuid_has(&(vmx)->vcpu, \
- X86_FEATURE_##feat_name); \
- vmx_adjust_secondary_exec_control(vmx, exec_control, \
- SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
- } \
+#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
+({ \
+ struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \
+ bool __enabled; \
+ \
+ if (cpu_has_vmx_##name()) { \
+ __enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \
+ vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
+ __enabled, exiting); \
+ } \
})
/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
@@ -4204,7 +4683,7 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
-static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
+static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
{
struct kvm_vcpu *vcpu = &vmx->vcpu;
@@ -4218,6 +4697,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
if (!enable_ept) {
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
enable_unrestricted_guest = 0;
}
if (!enable_unrestricted_guest)
@@ -4229,6 +4709,12 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+ /*
+ * KVM doesn't support VMFUNC for L1, but the control is set in KVM's
+ * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
+ */
+ exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
+
/* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
* in vmx_set_cr4. */
exec_control &= ~SECONDARY_EXEC_DESC;
@@ -4245,22 +4731,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
* it needs to be set here when dirty logging is already active, e.g.
* if this vCPU was created after dirty logging was enabled.
*/
- if (!vcpu->kvm->arch.cpu_dirty_logging_count)
+ if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
- if (cpu_has_vmx_xsaves()) {
- /* Exposing XSAVES only when XSAVE is exposed */
- bool xsaves_enabled =
- boot_cpu_has(X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
-
- vcpu->arch.xsaves_enabled = xsaves_enabled;
-
- vmx_adjust_secondary_exec_control(vmx, &exec_control,
- SECONDARY_EXEC_XSAVES,
- xsaves_enabled, false);
- }
+ vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES);
/*
* RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
@@ -4272,13 +4746,14 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
*/
if (cpu_has_vmx_rdtscp()) {
bool rdpid_or_rdtscp_enabled =
- guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
- guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
+ guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID);
vmx_adjust_secondary_exec_control(vmx, &exec_control,
SECONDARY_EXEC_ENABLE_RDTSCP,
rdpid_or_rdtscp_enabled, false);
}
+
vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
@@ -4290,24 +4765,56 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
if (!vcpu->kvm->arch.bus_lock_detection_enabled)
exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
- vmx->secondary_exec_control = exec_control;
+ if (!kvm_notify_vmexit_enabled(vcpu->kvm))
+ exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
+
+ return exec_control;
+}
+
+static inline int vmx_get_pid_table_order(struct kvm *kvm)
+{
+ return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
+}
+
+static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
+{
+ struct page *pages;
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+
+ if (!irqchip_in_kernel(kvm) || !enable_ipiv)
+ return 0;
+
+ if (kvm_vmx->pid_table)
+ return 0;
+
+ pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ vmx_get_pid_table_order(kvm));
+ if (!pages)
+ return -ENOMEM;
+
+ kvm_vmx->pid_table = (void *)page_address(pages);
+ return 0;
+}
+
+int vmx_vcpu_precreate(struct kvm *kvm)
+{
+ return vmx_alloc_ipiv_pid_table(kvm);
}
#define VMX_XSS_EXIT_BITMAP 0
-/*
- * Noting that the initialization of Guest-state Area of VMCS is in
- * vmx_vcpu_reset().
- */
static void init_vmcs(struct vcpu_vmx *vmx)
{
+ struct kvm *kvm = vmx->vcpu.kvm;
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+
if (nested)
nested_vmx_set_vmcs_shadowing_bitmap();
if (cpu_has_vmx_msr_bitmap())
vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
- vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
+ vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */
/* Control */
pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
@@ -4315,11 +4822,16 @@ static void init_vmcs(struct vcpu_vmx *vmx)
exec_controls_set(vmx, vmx_exec_control(vmx));
if (cpu_has_secondary_exec_ctrls()) {
- vmx_compute_secondary_exec_control(vmx);
- secondary_exec_controls_set(vmx, vmx->secondary_exec_control);
+ secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
+ if (vmx->ve_info)
+ vmcs_write64(VE_INFORMATION_ADDRESS,
+ __pa(vmx->ve_info));
}
- if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
+ if (cpu_has_tertiary_exec_ctrls())
+ tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
+
+ if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
vmcs_write64(EOI_EXIT_BITMAP0, 0);
vmcs_write64(EOI_EXIT_BITMAP1, 0);
vmcs_write64(EOI_EXIT_BITMAP2, 0);
@@ -4328,15 +4840,23 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmcs_write16(GUEST_INTR_STATUS, 0);
vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
- vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
+ vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc)));
}
- if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
+ if (vmx_can_use_ipiv(&vmx->vcpu)) {
+ vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
+ vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
+ }
+
+ if (!kvm_pause_in_guest(kvm)) {
vmcs_write32(PLE_GAP, ple_gap);
vmx->ple_window = ple_window;
vmx->ple_window_dirty = true;
}
+ if (kvm_notify_vmexit_enabled(kvm))
+ vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
+
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
@@ -4359,12 +4879,12 @@ static void init_vmcs(struct vcpu_vmx *vmx)
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
- vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
+ vm_exit_controls_set(vmx, vmx_get_initial_vmexit_ctrl());
/* 22.2.1, 20.8.1 */
- vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
+ vm_entry_controls_set(vmx, vmx_get_initial_vmentry_ctrl());
- vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
+ vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
set_cr4_guest_host_mask(vmx);
@@ -4377,7 +4897,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
if (enable_pml) {
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
- vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
}
vmx_write_encls_bitmap(&vmx->vcpu, NULL);
@@ -4388,34 +4908,71 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmx->pt_desc.guest.output_mask = 0x7F;
vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
}
+
+ vmcs_write32(GUEST_SYSENTER_CS, 0);
+ vmcs_writel(GUEST_SYSENTER_ESP, 0);
+ vmcs_writel(GUEST_SYSENTER_EIP, 0);
+
+ vmx_guest_debugctl_write(&vmx->vcpu, 0);
+
+ if (cpu_has_vmx_tpr_shadow()) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+ if (cpu_need_tpr_shadow(&vmx->vcpu))
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+ __pa(vmx->vcpu.arch.apic->regs));
+ vmcs_write32(TPR_THRESHOLD, 0);
+ }
+
+ vmx_setup_uret_msrs(vmx);
+}
+
+static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ init_vmcs(vmx);
+
+ if (nested &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
+ memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
+
+ vcpu_setup_sgx_lepubkeyhash(vcpu);
+
+ vmx->nested.posted_intr_nv = -1;
+ vmx->nested.vmxon_ptr = INVALID_GPA;
+ vmx->nested.current_vmptr = INVALID_GPA;
+
+#ifdef CONFIG_KVM_HYPERV
+ vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
+#endif
+
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
+ vcpu->arch.microcode_version = 0x100000000ULL;
+ vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
+
+ /*
+ * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
+ * or POSTED_INTR_WAKEUP_VECTOR.
+ */
+ vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
+ __pi_set_sn(&vmx->vt.pi_desc);
}
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct msr_data apic_base_msr;
- u64 cr0;
+
+ if (!init_event)
+ __vmx_vcpu_reset(vcpu);
vmx->rmode.vm86_active = 0;
vmx->spec_ctrl = 0;
vmx->msr_ia32_umwait_control = 0;
- vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
vmx->hv_deadline_tsc = -1;
kvm_set_cr8(vcpu, 0);
- if (!init_event) {
- apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
- MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_reset_bsp(vcpu))
- apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
- apic_base_msr.host_initiated = true;
- kvm_set_apic_base(vcpu, &apic_base_msr);
- }
-
- vmx_segment_cache_clear(vmx);
-
seg_setup(VCPU_SREG_CS);
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
@@ -4436,61 +4993,44 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
- if (!init_event) {
- vmcs_write32(GUEST_SYSENTER_CS, 0);
- vmcs_writel(GUEST_SYSENTER_ESP, 0);
- vmcs_writel(GUEST_SYSENTER_EIP, 0);
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
- }
-
- kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
- kvm_rip_write(vcpu, 0xfff0);
-
vmcs_writel(GUEST_GDTR_BASE, 0);
vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
vmcs_writel(GUEST_IDTR_BASE, 0);
vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
+ vmx_segment_cache_clear(vmx);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
+
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
if (kvm_mpx_supported())
vmcs_write64(GUEST_BNDCFGS, 0);
- setup_msrs(vmx);
-
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
- if (cpu_has_vmx_tpr_shadow() && !init_event) {
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
- if (cpu_need_tpr_shadow(vcpu))
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
- __pa(vcpu->arch.apic->regs));
- vmcs_write32(TPR_THRESHOLD, 0);
+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
+ vmcs_writel(GUEST_SSP, 0);
+ vmcs_writel(GUEST_INTR_SSP_TABLE, 0);
}
+ if (kvm_cpu_cap_has(X86_FEATURE_IBT) ||
+ kvm_cpu_cap_has(X86_FEATURE_SHSTK))
+ vmcs_writel(GUEST_S_CET, 0);
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
- vmx->vcpu.arch.cr0 = cr0;
- vmx_set_cr0(vcpu, cr0); /* enter rmode */
- vmx_set_cr4(vcpu, 0);
- vmx_set_efer(vcpu, 0);
-
- vmx_update_exception_bitmap(vcpu);
-
vpid_sync_context(vmx->vpid);
- if (init_event)
- vmx_clear_hlt(vcpu);
+
+ vmx_update_fb_clear_dis(vcpu, vmx);
}
-static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
+void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
{
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
}
-static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
+void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
{
if (!enable_vnmi ||
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
@@ -4501,13 +5041,13 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
}
-static void vmx_inject_irq(struct kvm_vcpu *vcpu)
+void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
uint32_t intr;
int irq = vcpu->arch.interrupt.nr;
- trace_kvm_inj_virq(irq);
+ trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);
++vcpu->stat.irq_injections;
if (vmx->rmode.vm86_active) {
@@ -4529,7 +5069,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
vmx_clear_hlt(vcpu);
}
-static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
+void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4607,7 +5147,7 @@ bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
GUEST_INTR_STATE_NMI));
}
-static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
if (to_vmx(vcpu)->nested.nested_run_pending)
return -EBUSY;
@@ -4619,32 +5159,37 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !vmx_nmi_blocked(vcpu);
}
+bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
+{
+ return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+}
+
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
return false;
- return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
- (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+ return __vmx_interrupt_blocked(vcpu);
}
-static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
if (to_vmx(vcpu)->nested.nested_run_pending)
return -EBUSY;
- /*
- * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
- * e.g. if the IRQ arrived asynchronously after checking nested events.
- */
+ /*
+ * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
+ * e.g. if the IRQ arrived asynchronously after checking nested events.
+ */
if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
return -EBUSY;
return !vmx_interrupt_blocked(vcpu);
}
-static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
void __user *ret;
@@ -4664,7 +5209,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
return init_rmode_tss(kvm, ret);
}
-static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
{
to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
return 0;
@@ -4710,7 +5255,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
if (kvm_emulate_instruction(vcpu, 0)) {
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
- return kvm_vcpu_halt(vcpu);
+ return kvm_emulate_halt_noskip(vcpu);
}
return 1;
}
@@ -4748,10 +5293,16 @@ bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
return true;
- return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&
+ return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) &&
(kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
}
+static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.guest_fpu.fpstate->xfd &&
+ !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS);
+}
+
static int handle_exception_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4763,12 +5314,39 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
vect_info = vmx->idt_vectoring_info;
intr_info = vmx_get_intr_info(vcpu);
+ /*
+ * Machine checks are handled by handle_exception_irqoff(), or by
+ * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by
+ * vmx_vcpu_enter_exit().
+ */
if (is_machine_check(intr_info) || is_nmi(intr_info))
- return 1; /* handled by handle_exception_nmi_irqoff() */
+ return 1;
+
+ /*
+ * Queue the exception here instead of in handle_nm_fault_irqoff().
+ * This ensures the nested_vmx check is not skipped so vmexit can
+ * be reflected to L1 (when it intercepts #NM) before reaching this
+ * point.
+ */
+ if (is_nm_fault(intr_info)) {
+ kvm_queue_exception_p(vcpu, NM_VECTOR,
+ is_xfd_nm_fault(vcpu) ? vcpu->arch.guest_fpu.xfd_err : 0);
+ return 1;
+ }
if (is_invalid_opcode(intr_info))
return handle_ud(vcpu);
+ if (WARN_ON_ONCE(is_ve_fault(intr_info))) {
+ struct vmx_ve_information *ve_info = vmx->ve_info;
+
+ WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION,
+ "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason);
+ dump_vmcs(vcpu);
+ kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE");
+ return 1;
+ }
+
error_code = 0;
if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
@@ -4829,8 +5407,35 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
dr6 = vmx_get_exit_qual(vcpu);
if (!(vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+ /*
+ * If the #DB was due to ICEBP, a.k.a. INT1, skip the
+ * instruction. ICEBP generates a trap-like #DB, but
+ * despite its interception control being tied to #DB,
+ * is an instruction intercept, i.e. the VM-Exit occurs
+ * on the ICEBP itself. Use the inner "skip" helper to
+ * avoid single-step #DB and MTF updates, as ICEBP is
+ * higher priority. Note, skipping ICEBP still clears
+ * STI and MOVSS blocking.
+ *
+ * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
+ * if single-step is enabled in RFLAGS and STI or MOVSS
+ * blocking is active, as the CPU doesn't set the bit
+ * on VM-Exit due to #DB interception. VM-Entry has a
+ * consistency check that a single-step #DB is pending
+ * in this scenario as the previous instruction cannot
+ * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
+ * don't modify RFLAGS), therefore the one instruction
+ * delay when activating single-step breakpoints must
+ * have already expired. Note, the CPU sets/clears BS
+ * as appropriate for all other VM-Exits types.
+ */
if (is_icebp(intr_info))
WARN_ON(!skip_emulated_instruction(vcpu));
+ else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
return 1;
@@ -4907,8 +5512,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
return kvm_fast_pio(vcpu, size, port, in);
}
-static void
-vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
/*
* Patch in the VMCALL instruction:
@@ -4936,18 +5540,11 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
val = (val & ~vmcs12->cr0_guest_host_mask) |
(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
- if (!nested_guest_cr0_valid(vcpu, val))
- return 1;
-
if (kvm_set_cr0(vcpu, val))
return 1;
vmcs_writel(CR0_READ_SHADOW, orig_val);
return 0;
} else {
- if (to_vmx(vcpu)->nested.vmxon &&
- !nested_host_cr0_valid(vcpu, val))
- return 1;
-
return kvm_set_cr0(vcpu, val);
}
}
@@ -4971,7 +5568,13 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
static int handle_desc(struct kvm_vcpu *vcpu)
{
- WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
+ /*
+ * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this
+ * and other code needs to be updated if UMIP can be guest owned.
+ */
+ BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP);
+
+ WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP));
return kvm_emulate_instruction(vcpu, 0);
}
@@ -4996,6 +5599,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
return kvm_complete_insn_gp(vcpu, err);
case 3:
WARN_ON_ONCE(enable_unrestricted_guest);
+
err = kvm_set_cr3(vcpu, val);
return kvm_complete_insn_gp(vcpu, err);
case 4:
@@ -5021,14 +5625,13 @@ static int handle_cr(struct kvm_vcpu *vcpu)
}
break;
case 2: /* clts */
- WARN_ONCE(1, "Guest should always own CR0.TS");
- vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
- trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
- return kvm_skip_emulated_instruction(vcpu);
+ KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
+ return -EIO;
case 1: /*mov from cr*/
switch (cr) {
case 3:
WARN_ON_ONCE(enable_unrestricted_guest);
+
val = kvm_read_cr3(vcpu);
kvm_register_write(vcpu, reg, val);
trace_kvm_cr_read(cr, val);
@@ -5042,7 +5645,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
break;
case 3: /* lmsw */
val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
- trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
+ trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val));
kvm_lmsw(vcpu, val);
return kvm_skip_emulated_instruction(vcpu);
@@ -5068,7 +5671,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
if (!kvm_require_dr(vcpu, dr))
return 1;
- if (kvm_x86_ops.get_cpl(vcpu) > 0)
+ if (vmx_get_cpl(vcpu) > 0)
goto out;
dr7 = vmcs_readl(GUEST_DR7);
@@ -5105,10 +5708,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
reg = DEBUG_REG_ACCESS_REG(exit_qualification);
if (exit_qualification & TYPE_MOV_FROM_DR) {
- unsigned long val;
-
- kvm_get_dr(vcpu, dr, &val);
- kvm_register_write(vcpu, reg, val);
+ kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
err = 0;
} else {
err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
@@ -5118,7 +5718,7 @@ out:
return kvm_complete_insn_gp(vcpu, err);
}
-static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
get_debugreg(vcpu->arch.db[0], 0);
get_debugreg(vcpu->arch.db[1], 1);
@@ -5129,9 +5729,15 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
+
+ /*
+ * exc_debug expects dr6 to be cleared after it runs, avoid that it sees
+ * a stale dr6 from the guest.
+ */
+ set_debugreg(DR6_RESERVED, 6);
}
-static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
vmcs_writel(GUEST_DR7, val);
}
@@ -5195,9 +5801,16 @@ static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
static int handle_apic_write(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
- u32 offset = exit_qualification & 0xfff;
- /* APIC-write VM exit is trap-like and thus no need to adjust IP */
+ /*
+ * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and
+ * hardware has done any necessary aliasing, offset adjustments, etc...
+ * for the access. I.e. the correct value has already been written to
+ * the vAPIC page for the correct 16-byte chunk. KVM needs only to
+ * retrieve the register value and emulate the access.
+ */
+ u32 offset = exit_qualification & 0xff0;
+
kvm_apic_write_nodecode(vcpu, offset);
return 1;
}
@@ -5261,11 +5874,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
static int handle_ept_violation(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification;
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
gpa_t gpa;
- u64 error_code;
-
- exit_qualification = vmx_get_exit_qual(vcpu);
/*
* EPT violation happened while executing iret from NMI,
@@ -5279,27 +5889,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
- trace_kvm_page_fault(gpa, exit_qualification);
-
- /* Is it a read fault? */
- error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
- ? PFERR_USER_MASK : 0;
- /* Is it a write fault? */
- error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
- ? PFERR_WRITE_MASK : 0;
- /* Is it a fetch fault? */
- error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
- ? PFERR_FETCH_MASK : 0;
- /* ept page table entry is present? */
- error_code |= (exit_qualification &
- (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
- EPT_VIOLATION_EXECUTABLE))
- ? PFERR_PRESENT_MASK : 0;
-
- error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
- PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
-
- vcpu->arch.exit_qualification = exit_qualification;
+ trace_kvm_page_fault(vcpu, gpa, exit_qualification);
/*
* Check that the GPA doesn't exceed physical memory limits, as that is
@@ -5309,17 +5899,17 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
* would also use advanced VM-exit information for EPT violations to
* reconstruct the page fault error code.
*/
- if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
+ if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa)))
return kvm_emulate_instruction(vcpu, 0);
- return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
+ return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification);
}
static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
gpa_t gpa;
- if (!vmx_can_emulate_instruction(vcpu, NULL, 0))
+ if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
return 1;
/*
@@ -5338,7 +5928,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
- WARN_ON_ONCE(!enable_vnmi);
+ if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
+ return -EIO;
+
exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
++vcpu->stat.nmi_window_exits;
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -5346,6 +5938,38 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
return 1;
}
+/*
+ * Returns true if emulation is required (due to the vCPU having invalid state
+ * with unsrestricted guest mode disabled) and KVM can't faithfully emulate the
+ * current vCPU state.
+ */
+static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!vmx->vt.emulation_required)
+ return false;
+
+ /*
+ * It is architecturally impossible for emulation to be required when a
+ * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if
+ * guest state is invalid and unrestricted guest is disabled, i.e. KVM
+ * should synthesize VM-Fail instead emulation L2 code. This path is
+ * only reachable if userspace modifies L2 guest state after KVM has
+ * performed the nested VM-Enter consistency checks.
+ */
+ if (vmx->nested.nested_run_pending)
+ return true;
+
+ /*
+ * KVM only supports emulating exceptions if the vCPU is in Real Mode.
+ * If emulation is required, KVM can't perform a successful VM-Enter to
+ * inject the exception.
+ */
+ return !vmx->rmode.vm86_active &&
+ (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
+}
+
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5355,28 +5979,31 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
intr_window_requested = exec_controls_get(vmx) &
CPU_BASED_INTR_WINDOW_EXITING;
- while (vmx->emulation_required && count-- != 0) {
+ while (vmx->vt.emulation_required && count-- != 0) {
if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
return handle_interrupt_window(&vmx->vcpu);
if (kvm_test_request(KVM_REQ_EVENT, vcpu))
return 1;
+ /*
+ * Ensure that any updates to kvm->buses[] observed by the
+ * previous instruction (emulated or otherwise) are also
+ * visible to the instruction KVM is about to emulate.
+ */
+ smp_rmb();
+
if (!kvm_emulate_instruction(vcpu, 0))
return 0;
- if (vmx->emulation_required && !vmx->rmode.vm86_active &&
- vcpu->arch.exception.pending) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror =
- KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ if (vmx_unhandleable_emulation_required(vcpu)) {
+ kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
- return kvm_vcpu_halt(vcpu);
+ return kvm_emulate_halt_noskip(vcpu);
}
/*
@@ -5391,36 +6018,14 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
return 1;
}
-static void grow_ple_window(struct kvm_vcpu *vcpu)
+int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned int old = vmx->ple_window;
-
- vmx->ple_window = __grow_ple_window(old, ple_window,
- ple_window_grow,
- ple_window_max);
-
- if (vmx->ple_window != old) {
- vmx->ple_window_dirty = true;
- trace_kvm_ple_window_update(vcpu->vcpu_id,
- vmx->ple_window, old);
+ if (vmx_unhandleable_emulation_required(vcpu)) {
+ kvm_prepare_emulation_failure_exit(vcpu);
+ return 0;
}
-}
-static void shrink_ple_window(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned int old = vmx->ple_window;
-
- vmx->ple_window = __shrink_ple_window(old, ple_window,
- ple_window_shrink,
- ple_window);
-
- if (vmx->ple_window != old) {
- vmx->ple_window_dirty = true;
- trace_kvm_ple_window_update(vcpu->vcpu_id,
- vmx->ple_window, old);
- }
+ return 1;
}
/*
@@ -5456,19 +6061,16 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
u64 pcid;
u64 gla;
} operand;
+ int gpr_index;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
-
- if (type > 3) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
+ gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
+ type = kvm_register_read(vcpu, gpr_index);
/* According to the Intel instruction reference, the memory operand
* is read even if it isn't needed (e.g., for type==all)
@@ -5506,28 +6108,52 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
return 1;
}
-static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!vmx->req_immediate_exit &&
- !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
- kvm_lapic_expired_hv_timer(vcpu);
+ /*
+ * In the *extremely* unlikely scenario that this is a spurious VM-Exit
+ * due to the timer expiring while it was "soft" disabled, just eat the
+ * exit and re-enter the guest.
+ */
+ if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
return EXIT_FASTPATH_REENTER_GUEST;
- }
- return EXIT_FASTPATH_NONE;
+ /*
+ * If the timer expired because KVM used it to force an immediate exit,
+ * then mission accomplished.
+ */
+ if (force_immediate_exit)
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
+ /*
+ * If L2 is active, go down the slow path as emulating the guest timer
+ * expiration likely requires synthesizing a nested VM-Exit.
+ */
+ if (is_guest_mode(vcpu))
+ return EXIT_FASTPATH_NONE;
+
+ kvm_lapic_expired_hv_timer(vcpu);
+ return EXIT_FASTPATH_REENTER_GUEST;
}
static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
- handle_fastpath_preemption_timer(vcpu);
+ /*
+ * This non-fastpath handler is reached if and only if the preemption
+ * timer was being used to emulate a guest timer while L2 is active.
+ * All other scenarios are supposed to be handled in the fastpath.
+ */
+ WARN_ON_ONCE(!is_guest_mode(vcpu));
+ kvm_lapic_expired_hv_timer(vcpu);
return 1;
}
/*
* When nested=0, all VMX instruction VM Exits filter here. The handlers
- * are overwritten by nested_vmx_setup() when nested=1.
+ * are overwritten by nested_vmx_hardware_setup() when nested=1.
*/
static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
{
@@ -5535,6 +6161,12 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_tdx_instruction(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
#ifndef CONFIG_X86_SGX_KVM
static int handle_encls(struct kvm_vcpu *vcpu)
{
@@ -5550,9 +6182,56 @@ static int handle_encls(struct kvm_vcpu *vcpu)
static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
{
- vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
- vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
- return 0;
+ /*
+ * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
+ * VM-Exits. Unconditionally set the flag here and leave the handling to
+ * vmx_handle_exit().
+ */
+ to_vt(vcpu)->exit_reason.bus_lock_detected = true;
+ return 1;
+}
+
+static int handle_notify(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qual = vmx_get_exit_qual(vcpu);
+ bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;
+
+ ++vcpu->stat.notify_window_exits;
+
+ /*
+ * Notify VM exit happened while executing iret from NMI,
+ * "blocked by NMI" bit has to be set before next VM entry.
+ */
+ if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+
+ if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
+ context_invalid) {
+ vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
+ vcpu->run->notify.flags = context_invalid ?
+ KVM_NOTIFY_CONTEXT_INVALID : 0;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int vmx_get_msr_imm_reg(struct kvm_vcpu *vcpu)
+{
+ return vmx_get_instr_info_reg(vmcs_read32(VMX_INSTRUCTION_INFO));
+}
+
+static int handle_rdmsr_imm(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_rdmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_get_msr_imm_reg(vcpu));
+}
+
+static int handle_wrmsr_imm(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_get_msr_imm_reg(vcpu));
}
/*
@@ -5612,18 +6291,24 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
[EXIT_REASON_ENCLS] = handle_encls,
[EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
+ [EXIT_REASON_NOTIFY] = handle_notify,
+ [EXIT_REASON_SEAMCALL] = handle_tdx_instruction,
+ [EXIT_REASON_TDCALL] = handle_tdx_instruction,
+ [EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm,
+ [EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm,
};
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
- u32 *intr_info, u32 *error_code)
+void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ *reason = vmx->vt.exit_reason.full;
*info1 = vmx_get_exit_qual(vcpu);
- if (!(vmx->exit_reason.failed_vmentry)) {
+ if (!(vmx->vt.exit_reason.failed_vmentry)) {
*info2 = vmx->idt_vectoring_info;
*intr_info = vmx_get_intr_info(vcpu);
if (is_exception_with_error_code(*intr_info))
@@ -5637,6 +6322,15 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
}
}
+void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code)
+{
+ *intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+ if (is_exception_with_error_code(*intr_info))
+ *error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+ else
+ *error_code = 0;
+}
+
static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
{
if (vmx->pml_pg) {
@@ -5648,32 +6342,40 @@ static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u16 pml_idx, pml_tail_index;
u64 *pml_buf;
- u16 pml_idx;
+ int i;
pml_idx = vmcs_read16(GUEST_PML_INDEX);
/* Do nothing if PML buffer is empty */
- if (pml_idx == (PML_ENTITY_NUM - 1))
+ if (pml_idx == PML_HEAD_INDEX)
return;
+ /*
+ * PML index always points to the next available PML buffer entity
+ * unless PML log has just overflowed.
+ */
+ pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1;
- /* PML index always points to next available PML buffer entity */
- if (pml_idx >= PML_ENTITY_NUM)
- pml_idx = 0;
- else
- pml_idx++;
-
+ /*
+ * PML log is written backwards: the CPU first writes the entry 511
+ * then the entry 510, and so on.
+ *
+ * Read the entries in the same order they were written, to ensure that
+ * the dirty ring is filled in the same order the CPU wrote them.
+ */
pml_buf = page_address(vmx->pml_pg);
- for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
+
+ for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) {
u64 gpa;
- gpa = pml_buf[pml_idx];
+ gpa = pml_buf[i];
WARN_ON(gpa & (PAGE_SIZE - 1));
kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
}
/* reset PML index */
- vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
}
static void vmx_dump_sel(char *name, uint32_t sel)
@@ -5707,6 +6409,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 vmentry_ctl, vmexit_ctl;
u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
+ u64 tertiary_exec_control;
unsigned long cr4;
int efer_slot;
@@ -5720,9 +6423,16 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
cr4 = vmcs_readl(GUEST_CR4);
- secondary_exec_control = 0;
+
if (cpu_has_secondary_exec_ctrls())
secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ else
+ secondary_exec_control = 0;
+
+ if (cpu_has_tertiary_exec_ctrls())
+ tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
+ else
+ tertiary_exec_control = 0;
pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
@@ -5790,6 +6500,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
+ if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE)
+ pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
+ vmcs_readl(GUEST_S_CET), vmcs_readl(GUEST_SSP),
+ vmcs_readl(GUEST_INTR_SSP_TABLE));
pr_err("*** Host State ***\n");
pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
@@ -5820,11 +6534,16 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
+ if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE)
+ pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
+ vmcs_readl(HOST_S_CET), vmcs_readl(HOST_SSP),
+ vmcs_readl(HOST_INTR_SSP_TABLE));
pr_err("*** Control State ***\n");
- pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
- pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
- pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
+ pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
+ cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
+ pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
+ pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
vmcs_read32(EXCEPTION_BITMAP),
vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
@@ -5866,6 +6585,24 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
pr_err("Virtual processor ID = 0x%04x\n",
vmcs_read16(VIRTUAL_PROCESSOR_ID));
+ if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+ struct vmx_ve_information *ve_info = vmx->ve_info;
+ u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS);
+
+ /*
+ * If KVM is dumping the VMCS, then something has gone wrong
+ * already. Derefencing an address from the VMCS, which could
+ * very well be corrupted, is a terrible idea. The virtual
+ * address is known so use it.
+ */
+ pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
+ ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
+ pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
+ ve_info->exit_reason, ve_info->delivery,
+ ve_info->exit_qualification,
+ ve_info->guest_linear_address,
+ ve_info->guest_physical_address, ve_info->eptp_index);
+ }
}
/*
@@ -5875,7 +6612,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- union vmx_exit_reason exit_reason = vmx->exit_reason;
+ union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
u32 vectoring_info = vmx->idt_vectoring_info;
u16 exit_handler_index;
@@ -5891,16 +6628,13 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
vmx_flush_pml_buffer(vcpu);
/*
- * We should never reach this point with a pending nested VM-Enter, and
- * more specifically emulation of L2 due to invalid guest state (see
- * below) should never happen as that means we incorrectly allowed a
- * nested VM-Enter with an invalid vmcs12.
+ * KVM should never reach this point with a pending nested VM-Enter.
+ * More specifically, short-circuiting VM-Entry to emulate L2 due to
+ * invalid guest state should never happen as that means KVM knowingly
+ * allowed a nested VM-Enter with an invalid vmcs12. More below.
*/
- WARN_ON_ONCE(vmx->nested.nested_run_pending);
-
- /* If guest state is invalid, start emulating */
- if (vmx->emulation_required)
- return handle_invalid_guest_state(vcpu);
+ if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
+ return -EIO;
if (is_guest_mode(vcpu)) {
/*
@@ -5923,10 +6657,30 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
*/
nested_mark_vmcs12_pages_dirty(vcpu);
+ /*
+ * Synthesize a triple fault if L2 state is invalid. In normal
+ * operation, nested VM-Enter rejects any attempt to enter L2
+ * with invalid state. However, those checks are skipped if
+ * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
+ * L2 state is invalid, it means either L1 modified SMRAM state
+ * or userspace provided bad state. Synthesize TRIPLE_FAULT as
+ * doing so is architecturally allowed in the RSM case, and is
+ * the least awful solution for the userspace case without
+ * risking false positives.
+ */
+ if (vmx->vt.emulation_required) {
+ nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+ return 1;
+ }
+
if (nested_vmx_reflect_vmexit(vcpu))
return 1;
}
+ /* If guest state is invalid, start emulating. L2 is handled above. */
+ if (vmx->vt.emulation_required)
+ return handle_invalid_guest_state(vcpu);
+
if (exit_reason.failed_vmentry) {
dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -5945,32 +6699,15 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return 0;
}
- /*
- * Note:
- * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by
- * delivery event since it indicates guest is accessing MMIO.
- * The vm-exit can be triggered again after return to guest that
- * will cause infinite loop.
- */
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
(exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
exit_reason.basic != EXIT_REASON_PML_FULL &&
exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
- exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {
- int ndata = 3;
-
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
- vcpu->run->internal.data[0] = vectoring_info;
- vcpu->run->internal.data[1] = exit_reason.full;
- vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
- if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
- vcpu->run->internal.data[ndata++] =
- vmcs_read64(GUEST_PHYSICAL_ADDRESS);
- }
- vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
- vcpu->run->internal.ndata = ndata;
+ exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
+ exit_reason.basic != EXIT_REASON_NOTIFY &&
+ exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) {
+ kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA);
return 0;
}
@@ -5998,9 +6735,11 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
goto unexpected_vmexit;
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
return kvm_emulate_wrmsr(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
+ return handle_wrmsr_imm(vcpu);
else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
return handle_preemption_timer(vcpu);
else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
@@ -6021,28 +6760,20 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
unexpected_vmexit:
- vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
- exit_reason.full);
dump_vmcs(vcpu);
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror =
- KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = exit_reason.full;
- vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ kvm_prepare_unexpected_reason_exit(vcpu, exit_reason.full);
return 0;
}
-static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
int ret = __vmx_handle_exit(vcpu, exit_fastpath);
/*
- * Even when current exit reason is handled by KVM internally, we
- * still need to exit to user space when bus lock detected to inform
- * that there is a bus lock in guest.
+ * Exit to user space when bus lock detected to inform that there is
+ * a bus lock in guest.
*/
- if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
+ if (vmx_get_exit_reason(vcpu).bus_lock_detected) {
if (ret > 0)
vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
@@ -6052,77 +6783,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return ret;
}
-/*
- * Software based L1D cache flush which is used when microcode providing
- * the cache control MSR is not loaded.
- *
- * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
- * flush it is required to read in 64 KiB because the replacement algorithm
- * is not exactly LRU. This could be sized at runtime via topology
- * information but as all relevant affected CPUs have 32KiB L1D cache size
- * there is no point in doing so.
- */
-static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
-{
- int size = PAGE_SIZE << L1D_CACHE_ORDER;
-
- /*
- * This code is only executed when the the flush mode is 'cond' or
- * 'always'
- */
- if (static_branch_likely(&vmx_l1d_flush_cond)) {
- bool flush_l1d;
-
- /*
- * Clear the per-vcpu flush bit, it gets set again
- * either from vcpu_run() or from one of the unsafe
- * VMEXIT handlers.
- */
- flush_l1d = vcpu->arch.l1tf_flush_l1d;
- vcpu->arch.l1tf_flush_l1d = false;
-
- /*
- * Clear the per-cpu flush bit, it gets set again from
- * the interrupt handlers.
- */
- flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
- kvm_clear_cpu_l1tf_flush_l1d();
-
- if (!flush_l1d)
- return;
- }
-
- vcpu->stat.l1d_flush++;
-
- if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
- native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
- return;
- }
-
- asm volatile(
- /* First ensure the pages are in the TLB */
- "xorl %%eax, %%eax\n"
- ".Lpopulate_tlb:\n\t"
- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
- "addl $4096, %%eax\n\t"
- "cmpl %%eax, %[size]\n\t"
- "jne .Lpopulate_tlb\n\t"
- "xorl %%eax, %%eax\n\t"
- "cpuid\n\t"
- /* Now fill the cache */
- "xorl %%eax, %%eax\n"
- ".Lfill_cache:\n"
- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
- "addl $64, %%eax\n\t"
- "cmpl %%eax, %[size]\n\t"
- "jne .Lfill_cache\n\t"
- "lfence\n"
- :: [flush_pages] "r" (vmx_l1d_flush_pages),
- [size] "r" (size)
- : "eax", "ebx", "ecx", "edx");
-}
-
-static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
int tpr_threshold;
@@ -6189,12 +6850,19 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
}
secondary_exec_controls_set(vmx, sec_exec_control);
- vmx_update_msr_bitmap(vcpu);
+ vmx_update_msr_bitmap_x2apic(vcpu);
}
-static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
+void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
{
- struct page *page;
+ const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memory_slot *slot;
+ struct page *refcounted_page;
+ unsigned long mmu_seq;
+ kvm_pfn_t pfn;
+ bool writable;
/* Defer reload until vmcs01 is the current VMCS. */
if (is_guest_mode(vcpu)) {
@@ -6206,25 +6874,82 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
return;
- page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
- if (is_error_page(page))
+ /*
+ * Explicitly grab the memslot using KVM's internal slot ID to ensure
+ * KVM doesn't unintentionally grab a userspace memslot. It _should_
+ * be impossible for userspace to create a memslot for the APIC when
+ * APICv is enabled, but paranoia won't hurt in this case.
+ */
+ slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
return;
- vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
- vmx_flush_tlb_current(vcpu);
+ /*
+ * Ensure that the mmu_notifier sequence count is read before KVM
+ * retrieves the pfn from the primary MMU. Note, the memslot is
+ * protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb()
+ * in kvm_mmu_invalidate_end().
+ */
+ mmu_seq = kvm->mmu_invalidate_seq;
+ smp_rmb();
/*
- * Do not pin apic access page in memory, the MMU notifier
- * will call us again if it is migrated or swapped out.
+ * No need to retry if the memslot does not exist or is invalid. KVM
+ * controls the APIC-access page memslot, and only deletes the memslot
+ * if APICv is permanently inhibited, i.e. the memslot won't reappear.
*/
- put_page(page);
+ pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page);
+ if (is_error_noslot_pfn(pfn))
+ return;
+
+ read_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn))
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+ else
+ vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));
+
+ /*
+ * Do not pin the APIC access page in memory so that it can be freely
+ * migrated, the MMU notifier will call us again if it is migrated or
+ * swapped out. KVM backs the memslot with anonymous memory, the pfn
+ * should always point at a refcounted page (if the pfn is valid).
+ */
+ if (!WARN_ON_ONCE(!refcounted_page))
+ kvm_release_page_clean(refcounted_page);
+
+ /*
+ * No need for a manual TLB flush at this point, KVM has already done a
+ * flush if there were SPTEs pointing at the previous page.
+ */
+ read_unlock(&vcpu->kvm->mmu_lock);
}
-static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
u16 status;
u8 old;
+ /*
+ * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI
+ * is only relevant for if and only if Virtual Interrupt Delivery is
+ * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's
+ * vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested
+ * VM-Exit, otherwise L1 with run with a stale SVI.
+ */
+ if (is_guest_mode(vcpu)) {
+ /*
+ * KVM is supposed to forward intercepted L2 EOIs to L1 if VID
+ * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC.
+ * Note, userspace can stuff state while L2 is active; assert
+ * that VID is disabled if and only if the vCPU is in KVM_RUN
+ * to avoid false positives if userspace is setting APIC state.
+ */
+ WARN_ON_ONCE(vcpu->wants_to_run &&
+ nested_cpu_has_vid(get_vmcs12(vcpu)));
+ to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true;
+ return;
+ }
+
if (max_isr == -1)
max_isr = 0;
@@ -6254,59 +6979,53 @@ static void vmx_set_rvi(int vector)
}
}
-static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
-{
- /*
- * When running L2, updating RVI is only relevant when
- * vmcs12 virtual-interrupt-delivery enabled.
- * However, it can be enabled only when L1 also
- * intercepts external-interrupts and in that case
- * we should not update vmcs02 RVI but instead intercept
- * interrupt. Therefore, do nothing when running L2.
- */
- if (!is_guest_mode(vcpu))
- vmx_set_rvi(max_irr);
-}
-
-static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
int max_irr;
- bool max_irr_updated;
+ bool got_posted_interrupt;
+
+ if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
+ return -EIO;
- WARN_ON(!vcpu->arch.apicv_active);
- if (pi_test_on(&vmx->pi_desc)) {
- pi_clear_on(&vmx->pi_desc);
+ if (pi_test_on(&vt->pi_desc)) {
+ pi_clear_on(&vt->pi_desc);
/*
* IOMMU can write to PID.ON, so the barrier matters even on UP.
* But on x86 this is just a compiler barrier anyway.
*/
smp_mb__after_atomic();
- max_irr_updated =
- kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
-
- /*
- * If we are running L2 and L1 has a new pending interrupt
- * which can be injected, we should re-evaluate
- * what should be done with this new L1 interrupt.
- * If L1 intercepts external-interrupts, we should
- * exit from L2 to L1. Otherwise, interrupt should be
- * delivered directly to L2.
- */
- if (is_guest_mode(vcpu) && max_irr_updated) {
- if (nested_exit_on_intr(vcpu))
- kvm_vcpu_exiting_guest_mode(vcpu);
- else
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- }
+ got_posted_interrupt =
+ kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr);
} else {
max_irr = kvm_lapic_find_highest_irr(vcpu);
+ got_posted_interrupt = false;
}
- vmx_hwapic_irr_update(vcpu, max_irr);
+
+ /*
+ * Newly recognized interrupts are injected via either virtual interrupt
+ * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is
+ * disabled in two cases:
+ *
+ * 1) If L2 is running and the vCPU has a new pending interrupt. If L1
+ * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
+ * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected
+ * into L2, but KVM doesn't use virtual interrupt delivery to inject
+ * interrupts into L2, and so KVM_REQ_EVENT is again needed.
+ *
+ * 2) If APICv is disabled for this vCPU, assigned devices may still
+ * attempt to post interrupts. The posted interrupt vector will cause
+ * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
+ */
+ if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
+ vmx_set_rvi(max_irr);
+ else if (got_posted_interrupt)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
return max_irr;
}
-static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
if (!kvm_vcpu_apicv_active(vcpu))
return;
@@ -6317,79 +7036,105 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
}
-static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- pi_clear_on(&vmx->pi_desc);
- memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
-}
-
-void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
+void vmx_do_interrupt_irqoff(unsigned long entry);
+void vmx_do_nmi_irqoff(void);
-static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
- unsigned long entry)
+static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
{
- kvm_before_interrupt(vcpu);
- vmx_do_interrupt_nmi_irqoff(entry);
- kvm_after_interrupt(vcpu);
+ /*
+ * Save xfd_err to guest_fpu before interrupt is enabled, so the
+ * MSR value is not clobbered by the host activity before the guest
+ * has chance to consume it.
+ *
+ * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM
+ * interception may have been caused by L1 interception. Per the SDM,
+ * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1.
+ *
+ * Note, XFD_ERR is updated _before_ the #NM interception check, i.e.
+ * unlike CR2 and DR6, the value is not a payload that is attached to
+ * the #NM exception.
+ */
+ if (is_xfd_nm_fault(vcpu))
+ rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
}
-static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
+static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
{
- const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
- u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
-
/* if exit due to PF check for async PF */
if (is_page_fault(intr_info))
- vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
+ vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
+ /* if exit due to NM, handle before interrupts are enabled */
+ else if (is_nm_fault(intr_info))
+ handle_nm_fault_irqoff(vcpu);
/* Handle machine checks before interrupts are enabled */
else if (is_machine_check(intr_info))
kvm_machine_check();
- /* We need to handle NMIs before interrupts are enabled */
- else if (is_nmi(intr_info))
- handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
}
-static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
+static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
+ u32 intr_info)
{
- u32 intr_info = vmx_get_intr_info(vcpu);
unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
- gate_desc *desc = (gate_desc *)host_idt_base + vector;
- if (WARN_ONCE(!is_external_intr(intr_info),
- "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
+ if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
+ "unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
- handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
+ /*
+ * Invoke the kernel's IRQ handler for the vector. Use the FRED path
+ * when it's available even if FRED isn't fully enabled, e.g. even if
+ * FRED isn't supported in hardware, in order to avoid the indirect
+ * CALL in the non-FRED path.
+ */
+ kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
+ if (IS_ENABLED(CONFIG_X86_FRED))
+ fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
+ else
+ vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
+ kvm_after_interrupt(vcpu);
+
+ vcpu->arch.at_instruction_boundary = true;
}
-static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (to_vt(vcpu)->emulation_required)
+ return;
- if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
- handle_external_interrupt_irqoff(vcpu);
- else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
- handle_exception_nmi_irqoff(vmx);
+ switch (vmx_get_exit_reason(vcpu).basic) {
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
+ break;
+ case EXIT_REASON_EXCEPTION_NMI:
+ handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu));
+ break;
+ case EXIT_REASON_MCE_DURING_VMENTRY:
+ kvm_machine_check();
+ break;
+ default:
+ break;
+ }
}
/*
* The kvm parameter can be NULL (module initialization, or invocation before
* VM creation). Be sure to check the kvm parameter before using it.
*/
-static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
+bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
{
switch (index) {
case MSR_IA32_SMBASE:
+ if (!IS_ENABLED(CONFIG_KVM_SMM))
+ return false;
/*
* We cannot do SMM unless we can run the guest in big
* real mode.
*/
return enable_unrestricted_guest || emulate_invalid_guest_state;
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
return nested;
case MSR_AMD64_VIRT_SPEC_CTRL:
+ case MSR_AMD64_TSC_RATIO:
/* This is AMD only. */
return false;
default:
@@ -6473,13 +7218,17 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
case INTR_TYPE_SOFT_EXCEPTION:
vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
fallthrough;
- case INTR_TYPE_HARD_EXCEPTION:
- if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
- u32 err = vmcs_read32(error_code_field);
- kvm_requeue_exception_e(vcpu, vector, err);
- } else
- kvm_requeue_exception(vcpu, vector);
+ case INTR_TYPE_HARD_EXCEPTION: {
+ u32 error_code = 0;
+
+ if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK)
+ error_code = vmcs_read32(error_code_field);
+
+ kvm_requeue_exception(vcpu, vector,
+ idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK,
+ error_code);
break;
+ }
case INTR_TYPE_SOFT_INTR:
vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
fallthrough;
@@ -6498,7 +7247,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
IDT_VECTORING_ERROR_CODE);
}
-static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
+void vmx_cancel_injection(struct kvm_vcpu *vcpu)
{
__vmx_complete_interrupts(vcpu,
vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
@@ -6512,9 +7261,14 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
{
int i, nr_msrs;
struct perf_guest_switch_msr *msrs;
+ struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);
+
+ pmu->host_cross_mapped_mask = 0;
+ if (pmu->pebs_enable & pmu->global_ctrl)
+ intel_pmu_cross_mapped_check(pmu);
/* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
- msrs = perf_guest_get_msrs(&nr_msrs);
+ msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
if (!msrs)
return;
@@ -6526,13 +7280,13 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host, false);
}
-static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
+static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 tscl;
u32 delta_tsc;
- if (vmx->req_immediate_exit) {
+ if (force_immediate_exit) {
vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
vmx->loaded_vmcs->hv_timer_soft_disabled = false;
} else if (vmx->hv_deadline_tsc != -1) {
@@ -6560,42 +7314,115 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
}
}
-static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
+ unsigned int flags)
{
- switch (to_vmx(vcpu)->exit_reason.basic) {
+ u64 hostval = this_cpu_read(x86_spec_ctrl_current);
+
+ if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
+ return;
+
+ if (flags & VMX_RUN_SAVE_SPEC_CTRL)
+ vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL);
+
+ /*
+ * If the guest/host SPEC_CTRL values differ, restore the host value.
+ *
+ * For legacy IBRS, the IBRS bit always needs to be written after
+ * transitioning from a less privileged predictor mode, regardless of
+ * whether the guest/host values differ.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
+ vmx->spec_ctrl != hostval)
+ native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval);
+
+ barrier_nospec();
+}
+
+static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
+{
+ /*
+ * If L2 is active, some VMX preemption timer exits can be handled in
+ * the fastpath even, all other exits must use the slow path.
+ */
+ if (is_guest_mode(vcpu) &&
+ vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER)
+ return EXIT_FASTPATH_NONE;
+
+ switch (vmx_get_exit_reason(vcpu).basic) {
case EXIT_REASON_MSR_WRITE:
- return handle_fastpath_set_msr_irqoff(vcpu);
+ return handle_fastpath_wrmsr(vcpu);
+ case EXIT_REASON_MSR_WRITE_IMM:
+ return handle_fastpath_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_get_msr_imm_reg(vcpu));
case EXIT_REASON_PREEMPTION_TIMER:
- return handle_fastpath_preemption_timer(vcpu);
+ return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
+ case EXIT_REASON_HLT:
+ return handle_fastpath_hlt(vcpu);
+ case EXIT_REASON_INVD:
+ return handle_fastpath_invd(vcpu);
default:
return EXIT_FASTPATH_NONE;
}
}
+noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu)
+{
+ if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI ||
+ !is_nmi(vmx_get_intr_info(vcpu)))
+ return;
+
+ kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
+ else
+ vmx_do_nmi_irqoff();
+ kvm_after_interrupt(vcpu);
+}
+
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
- struct vcpu_vmx *vmx)
+ unsigned int flags)
{
- kvm_guest_enter_irqoff();
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
- /* L1D Flush includes CPU buffer clear to mitigate MDS */
- if (static_branch_unlikely(&vmx_l1d_should_flush))
- vmx_l1d_flush(vcpu);
- else if (static_branch_unlikely(&mds_user_clear))
- mds_clear_cpu_buffers();
+ guest_state_enter_irqoff();
+
+ vmx_l1d_flush(vcpu);
+
+ vmx_disable_fb_clear(vmx);
if (vcpu->arch.cr2 != native_read_cr2())
native_write_cr2(vcpu->arch.cr2);
vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
- vmx->loaded_vmcs->launched);
+ flags);
vcpu->arch.cr2 = native_read_cr2();
+ vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
+
+ vmx->idt_vectoring_info = 0;
+
+ vmx_enable_fb_clear(vmx);
- kvm_guest_exit_irqoff();
+ if (unlikely(vmx->fail)) {
+ vmx->vt.exit_reason.full = 0xdead;
+ goto out;
+ }
+
+ vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+ if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry))
+ vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+
+ vmx_handle_nmi(vcpu);
+
+out:
+ guest_state_exit_irqoff();
}
-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
+ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
@@ -6604,12 +7431,24 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->soft_vnmi_blocked))
vmx->loaded_vmcs->entry_time = ktime_get();
- /* Don't enter VMX if guest state is invalid, let the exit handler
- start emulation until we arrive back to a valid state */
- if (vmx->emulation_required)
+ /*
+ * Don't enter VMX if guest state is invalid, let the exit handler
+ * start emulation until we arrive back to a valid state. Synthesize a
+ * consistency check VM-Exit due to invalid guest state and bail.
+ */
+ if (unlikely(vmx->vt.emulation_required)) {
+ vmx->fail = 0;
+
+ vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE;
+ vmx->vt.exit_reason.failed_vmentry = 1;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
+ vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
+ vmx->vt.exit_intr_info = 0;
return EXIT_FASTPATH_NONE;
+ }
- trace_kvm_entry(vcpu);
+ trace_kvm_entry(vcpu, force_immediate_exit);
if (vmx->ple_window_dirty) {
vmx->ple_window_dirty = false;
@@ -6626,7 +7465,21 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+ vcpu->arch.regs_dirty = 0;
+
+ if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
+ set_debugreg(vcpu->arch.dr6, 6);
+ if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
+ vmx_reload_guest_debugctl(vcpu);
+
+ /*
+ * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
+ * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+ * it switches back to the current->mm, which can occur in KVM context
+ * when switching to a temporary mm to patch kernel code, e.g. if KVM
+ * toggles a static key while handling a VM-Exit.
+ */
cr3 = __get_current_cr3_fast();
if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
vmcs_writel(HOST_CR3, cr3);
@@ -6647,8 +7500,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
- kvm_load_guest_xsave_state(vcpu);
-
pt_guest_enter(vmx);
atomic_switch_perf_msrs(vmx);
@@ -6656,43 +7507,17 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_passthrough_lbr_msrs(vcpu);
if (enable_preemption_timer)
- vmx_update_hv_timer(vcpu);
+ vmx_update_hv_timer(vcpu, force_immediate_exit);
+ else if (force_immediate_exit)
+ smp_send_reschedule(vcpu->cpu);
kvm_wait_lapic_expire(vcpu);
- /*
- * If this vCPU has touched SPEC_CTRL, restore the guest's value if
- * it's non-zero. Since vmentry is serialising on affected CPUs, there
- * is no need to worry about the conditional branch over the wrmsr
- * being speculatively taken.
- */
- x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
-
/* The actual VMENTER/EXIT is in the .noinstr.text section. */
- vmx_vcpu_enter_exit(vcpu, vmx);
-
- /*
- * We do not use IBRS in the kernel. If this vCPU has used the
- * SPEC_CTRL MSR it may have left it on; save the value and
- * turn it off. This is much more efficient than blindly adding
- * it to the atomic save/restore list. Especially as the former
- * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
- *
- * For non-nested case:
- * If the L01 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- *
- * For nested case:
- * If the L02 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- */
- if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
- vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
-
- x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
+ vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
/* All fields are clean at this point */
- if (static_branch_unlikely(&enable_evmcs)) {
+ if (kvm_is_using_evmcs()) {
current_evmcs->hv_clean_fields |=
HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
@@ -6700,8 +7525,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
}
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
- if (vmx->host_debugctlmsr)
- update_debugctlmsr(vmx->host_debugctlmsr);
+ if (vcpu->arch.host_debugctl)
+ update_debugctlmsr(vcpu->arch.host_debugctl);
#ifndef CONFIG_X86_64
/*
@@ -6716,41 +7541,26 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
loadsegment(es, __USER_DS);
#endif
- vmx_register_cache_reset(vcpu);
-
pt_guest_exit(vmx);
- kvm_load_host_xsave_state(vcpu);
-
if (is_guest_mode(vcpu)) {
/*
* Track VMLAUNCH/VMRESUME that have made past guest state
* checking.
*/
if (vmx->nested.nested_run_pending &&
- !vmx->exit_reason.failed_vmentry)
+ !vmx_get_exit_reason(vcpu).failed_vmentry)
++vcpu->stat.nested_run;
vmx->nested.nested_run_pending = 0;
}
- vmx->idt_vectoring_info = 0;
-
- if (unlikely(vmx->fail)) {
- vmx->exit_reason.full = 0xdead;
+ if (unlikely(vmx->fail))
return EXIT_FASTPATH_NONE;
- }
- vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
- if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
- kvm_machine_check();
-
- if (likely(!vmx->exit_reason.failed_vmentry))
- vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-
- trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX);
+ trace_kvm_exit(vcpu, KVM_ISA_VMX);
- if (unlikely(vmx->exit_reason.failed_vmentry))
+ if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry))
return EXIT_FASTPATH_NONE;
vmx->loaded_vmcs->launched = 1;
@@ -6758,13 +7568,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
- if (is_guest_mode(vcpu))
- return EXIT_FASTPATH_NONE;
-
- return vmx_exit_handlers_fastpath(vcpu);
+ return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
}
-static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
+void vmx_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6773,17 +7580,20 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
free_vpid(vmx->vpid);
nested_vmx_free_vcpu(vcpu);
free_loaded_vmcs(vmx->loaded_vmcs);
+ free_page((unsigned long)vmx->ve_info);
}
-static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
+int vmx_vcpu_create(struct kvm_vcpu *vcpu)
{
struct vmx_uret_msr *tsx_ctrl;
struct vcpu_vmx *vmx;
- int i, cpu, err;
+ int i, err;
BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
vmx = to_vmx(vcpu);
+ INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list);
+
err = -ENOMEM;
vmx->vpid = allocate_vpid();
@@ -6800,10 +7610,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
goto free_vpid;
}
- for (i = 0; i < kvm_nr_uret_msrs; ++i) {
- vmx->guest_uret_msrs[i].data = 0;
+ for (i = 0; i < kvm_nr_uret_msrs; ++i)
vmx->guest_uret_msrs[i].mask = -1ull;
- }
if (boot_cpu_has(X86_FEATURE_RTM)) {
/*
* TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
@@ -6812,43 +7620,30 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
*/
tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
if (tsx_ctrl)
- vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
+ tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
}
err = alloc_loaded_vmcs(&vmx->vmcs01);
if (err < 0)
goto free_pml;
- /* The MSR bitmap starts with all ones */
- bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
- bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+ /*
+ * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
+ * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
+ * feature only for vmcs01, KVM currently isn't equipped to realize any
+ * performance benefits from enabling it for vmcs02.
+ */
+ if (kvm_is_using_evmcs() &&
+ (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
-#ifdef CONFIG_X86_64
- vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
-#endif
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
- if (kvm_cstate_in_guest(vcpu->kvm)) {
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+ evmcs->hv_enlightenments_control.msr_bitmap = 1;
}
- vmx->msr_bitmap_mode = 0;
vmx->loaded_vmcs = &vmx->vmcs01;
- cpu = get_cpu();
- vmx_vcpu_load(vcpu, cpu);
- vcpu->cpu = cpu;
- init_vmcs(vmx);
- vmx_vcpu_put(vcpu);
- put_cpu();
+
if (cpu_need_virtualize_apic_accesses(vcpu)) {
- err = alloc_apic_access_page(vcpu->kvm);
+ err = kvm_alloc_apic_access_page(vcpu->kvm);
if (err)
goto free_vmcs;
}
@@ -6859,26 +7654,23 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
goto free_vmcs;
}
- if (nested)
- memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
- else
- memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
+ err = -ENOMEM;
+ if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+ struct page *page;
- vcpu_setup_sgx_lepubkeyhash(vcpu);
+ BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);
- vmx->nested.posted_intr_nv = -1;
- vmx->nested.current_vmptr = -1ull;
- vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
+ /* ve_info must be page aligned. */
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!page)
+ goto free_vmcs;
- vcpu->arch.microcode_version = 0x100000000ULL;
- vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
+ vmx->ve_info = page_to_virt(page);
+ }
- /*
- * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
- * or POSTED_INTR_WAKEUP_VECTOR.
- */
- vmx->pi_desc.nv = POSTED_INTR_VECTOR;
- vmx->pi_desc.sn = 1;
+ if (vmx_can_use_ipiv(vcpu))
+ WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
+ __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID);
return 0;
@@ -6894,10 +7686,10 @@ free_vpid:
#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
-static int vmx_vm_init(struct kvm *kvm)
+int vmx_vm_init(struct kvm *kvm)
{
if (!ple_gap)
- kvm->arch.pause_in_guest = true;
+ kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
switch (l1tf_mitigation) {
@@ -6905,6 +7697,7 @@ static int vmx_vm_init(struct kvm *kvm)
case L1TF_MITIGATION_FLUSH_NOWARN:
/* 'I explicitly don't care' is set */
break;
+ case L1TF_MITIGATION_AUTO:
case L1TF_MITIGATION_FLUSH:
case L1TF_MITIGATION_FLUSH_NOSMT:
case L1TF_MITIGATION_FULL:
@@ -6922,82 +7715,40 @@ static int vmx_vm_init(struct kvm *kvm)
break;
}
}
+
+ if (enable_pml)
+ kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES;
return 0;
}
-static int __init vmx_check_processor_compat(void)
+static inline bool vmx_ignore_guest_pat(struct kvm *kvm)
{
- struct vmcs_config vmcs_conf;
- struct vmx_capability vmx_cap;
-
- if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
- !this_cpu_has(X86_FEATURE_VMX)) {
- pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id());
- return -EIO;
- }
-
- if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
- return -EIO;
- if (nested)
- nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept);
- if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
- printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
- smp_processor_id());
- return -EIO;
- }
- return 0;
+ /*
+ * Non-coherent DMA devices need the guest to flush CPU properly.
+ * In that case it is not possible to map all guest RAM as WB, so
+ * always trust guest PAT.
+ */
+ return !kvm_arch_has_noncoherent_dma(kvm) &&
+ kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT);
}
-static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
- u8 cache;
- u64 ipat = 0;
-
- /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
- * memory aliases with conflicting memory types and sometimes MCEs.
- * We have to be careful as to what are honored and when.
- *
- * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to
- * UC. The effective memory type is UC or WC depending on guest PAT.
- * This was historically the source of MCEs and we want to be
- * conservative.
- *
- * When there is no need to deal with noncoherent DMA (e.g., no VT-d
- * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The
- * EPT memory type is set to WB. The effective memory type is forced
- * WB.
- *
- * Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The
- * EPT memory type is used to emulate guest CD/MTRR.
+ /*
+ * Force UC for host MMIO regions, as allowing the guest to access MMIO
+ * with cacheable accesses will result in Machine Checks.
*/
+ if (is_mmio)
+ return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
- if (is_mmio) {
- cache = MTRR_TYPE_UNCACHABLE;
- goto exit;
- }
-
- if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
- ipat = VMX_EPT_IPAT_BIT;
- cache = MTRR_TYPE_WRBACK;
- goto exit;
- }
-
- if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
- ipat = VMX_EPT_IPAT_BIT;
- if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- cache = MTRR_TYPE_WRBACK;
- else
- cache = MTRR_TYPE_UNCACHABLE;
- goto exit;
- }
+ /* Force WB if ignoring guest PAT */
+ if (vmx_ignore_guest_pat(vcpu->kvm))
+ return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
- cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
-
-exit:
- return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
+ return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
}
-static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
+static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
{
/*
* These bits in the secondary execution controls field
@@ -7011,7 +7762,6 @@ static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_DESC;
- u32 new_ctl = vmx->secondary_exec_control;
u32 cur_ctl = secondary_exec_controls_get(vmx);
secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
@@ -7034,7 +7784,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
} while (0)
- entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ entry = kvm_find_cpuid_entry(vcpu, 0x1);
cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME));
cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME));
cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC));
@@ -7050,32 +7800,20 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID));
cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE));
- entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
+ entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE));
cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP));
cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP));
cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU));
cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP));
cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57));
+ cr4_fixed1_update(X86_CR4_CET, ecx, feature_bit(SHSTK));
+ cr4_fixed1_update(X86_CR4_CET, edx, feature_bit(IBT));
-#undef cr4_fixed1_update
-}
-
-static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (kvm_mpx_supported()) {
- bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
+ entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1);
+ cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM));
- if (mpx_enabled) {
- vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
- vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
- } else {
- vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
- vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
- }
- }
+#undef cr4_fixed1_update
}
static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
@@ -7085,7 +7823,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
int i;
for (i = 0; i < PT_CPUID_LEAVES; i++) {
- best = kvm_find_cpuid_entry(vcpu, 0x14, i);
+ best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
if (!best)
return;
vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
@@ -7095,12 +7833,13 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
}
/* Get the number of configurable Address Ranges for filtering */
- vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps,
+ vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_num_address_ranges);
/* Initialize and clear the no dependency bits */
vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
- RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);
+ RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
+ RTIT_CTL_BRANCH_EN);
/*
* If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise
@@ -7118,12 +7857,11 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
/*
- * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and
- * MTCFreq can be set
+ * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
*/
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
- RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);
+ RTIT_CTL_MTC_RANGE);
/* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
@@ -7143,45 +7881,49 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
/* unmask address range configure area */
- for (i = 0; i < vmx->pt_desc.addr_range; i++)
+ for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
-static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
- vcpu->arch.xsaves_enabled = false;
+ /*
+ * XSAVES is effectively enabled if and only if XSAVE is also exposed
+ * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
+ * set if and only if XSAVE is supported.
+ */
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
+ guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES);
- if (cpu_has_secondary_exec_ctrls()) {
- vmx_compute_secondary_exec_control(vmx);
- vmcs_set_secondary_exec_control(vmx);
- }
+ vmx_setup_uret_msrs(vmx);
- if (nested_vmx_allowed(vcpu))
- to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ if (cpu_has_secondary_exec_ctrls())
+ vmcs_set_secondary_exec_control(vmx,
+ vmx_secondary_exec_control(vmx));
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
+ vmx->msr_ia32_feature_control_valid_bits |=
FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
else
- to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+ vmx->msr_ia32_feature_control_valid_bits &=
~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
- if (nested_vmx_allowed(vcpu)) {
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
nested_vmx_cr_fixed1_bits_update(vcpu);
- nested_vmx_entry_exit_ctls_update(vcpu);
- }
if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
- guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
+ guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT))
update_intel_pt_cfg(vcpu);
if (boot_cpu_has(X86_FEATURE_RTM)) {
struct vmx_uret_msr *msr;
msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
if (msr) {
- bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
+ bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM);
vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
}
}
@@ -7189,12 +7931,12 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
set_cr4_guest_host_mask(vmx);
vmx_write_encls_bitmap(vcpu, NULL);
- if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX))
vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
else
vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
- if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
vmx->msr_ia32_feature_control_valid_bits |=
FEAT_CTL_SGX_LC_ENABLED;
else
@@ -7205,6 +7947,60 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmx_update_exception_bitmap(vcpu);
}
+static __init u64 vmx_get_perf_capabilities(void)
+{
+ u64 perf_cap = PERF_CAP_FW_WRITES;
+ u64 host_perf_cap = 0;
+
+ if (!enable_pmu)
+ return 0;
+
+ if (boot_cpu_has(X86_FEATURE_PDCM))
+ rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
+
+ if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) {
+ x86_perf_get_lbr(&vmx_lbr_caps);
+
+ /*
+ * KVM requires LBR callstack support, as the overhead due to
+ * context switching LBRs without said support is too high.
+ * See intel_pmu_create_guest_lbr_event() for more info.
+ */
+ if (!vmx_lbr_caps.has_callstack)
+ memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps));
+ else if (vmx_lbr_caps.nr)
+ perf_cap |= host_perf_cap & PERF_CAP_LBR_FMT;
+ }
+
+ if (vmx_pebs_supported()) {
+ perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
+
+ /*
+ * Disallow adaptive PEBS as it is functionally broken, can be
+ * used by the guest to read *host* LBRs, and can be used to
+ * bypass userspace event filters. To correctly and safely
+ * support adaptive PEBS, KVM needs to:
+ *
+ * 1. Account for the ADAPTIVE flag when (re)programming fixed
+ * counters.
+ *
+ * 2. Gain support from perf (or take direct control of counter
+ * programming) to support events without adaptive PEBS
+ * enabled for the hardware counter.
+ *
+ * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with
+ * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1.
+ *
+ * 4. Document which PMU events are effectively exposed to the
+ * guest via adaptive PEBS, and make adaptive PEBS mutually
+ * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary.
+ */
+ perf_cap &= ~PERF_CAP_PEBS_BASELINE;
+ }
+
+ return perf_cap;
+}
+
static __init void vmx_set_cpu_caps(void)
{
kvm_set_cpu_caps();
@@ -7220,19 +8016,27 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
if (vmx_pt_mode_is_host_guest())
kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
+ if (vmx_pebs_supported()) {
+ kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
+ kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
+ }
+
+ if (!enable_pmu)
+ kvm_cpu_cap_clear(X86_FEATURE_PDCM);
+ kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
if (!enable_sgx) {
kvm_cpu_cap_clear(X86_FEATURE_SGX);
kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
kvm_cpu_cap_clear(X86_FEATURE_SGX1);
kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
}
if (vmx_umip_emulated())
kvm_cpu_cap_set(X86_FEATURE_UMIP);
/* CPUID 0xD.1 */
- supported_xss = 0;
if (!cpu_has_vmx_xsaves())
kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
@@ -7244,73 +8048,100 @@ static __init void vmx_set_cpu_caps(void)
if (cpu_has_vmx_waitpkg())
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
-}
-static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
-{
- to_vmx(vcpu)->req_immediate_exit = true;
+ /*
+ * Disable CET if unrestricted_guest is unsupported as KVM doesn't
+ * enforce CET HW behaviors in emulator. On platforms with
+ * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code
+ * fails, so disable CET in this case too.
+ */
+ if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest ||
+ !cpu_has_vmx_basic_no_hw_errcode_cc()) {
+ kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
+ kvm_cpu_cap_clear(X86_FEATURE_IBT);
+ }
}
-static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
- struct x86_instruction_info *info)
+static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ unsigned long *exit_qualification)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
unsigned short port;
- bool intercept;
int size;
+ bool imm;
+
+ /*
+ * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
+ * VM-exits depend on the 'unconditional IO exiting' VM-execution
+ * control.
+ *
+ * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
+ */
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
if (info->intercept == x86_intercept_in ||
info->intercept == x86_intercept_ins) {
port = info->src_val;
size = info->dst_bytes;
+ imm = info->src_type == OP_IMM;
} else {
port = info->dst_val;
size = info->src_bytes;
+ imm = info->dst_type == OP_IMM;
}
- /*
- * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
- * VM-exits depend on the 'unconditional IO exiting' VM-execution
- * control.
- *
- * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
- */
- if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
- intercept = nested_cpu_has(vmcs12,
- CPU_BASED_UNCOND_IO_EXITING);
- else
- intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
- /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
- return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+ *exit_qualification = ((unsigned long)port << 16) | (size - 1);
+
+ if (info->intercept == x86_intercept_ins ||
+ info->intercept == x86_intercept_outs)
+ *exit_qualification |= BIT(4);
+
+ if (info->rep_prefix)
+ *exit_qualification |= BIT(5);
+
+ if (imm)
+ *exit_qualification |= BIT(6);
+
+ return nested_vmx_check_io_bitmaps(vcpu, port, size);
}
-static int vmx_check_intercept(struct kvm_vcpu *vcpu,
- struct x86_instruction_info *info,
- enum x86_intercept_stage stage,
- struct x86_exception *exception)
+int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned long exit_qualification = 0;
+ u32 vm_exit_reason;
+ u64 exit_insn_len;
switch (info->intercept) {
- /*
- * RDPID causes #UD if disabled through secondary execution controls.
- * Because it is marked as EmulateOnUD, we need to intercept it here.
- * Note, RDPID is hidden behind ENABLE_RDTSCP.
- */
case x86_intercept_rdpid:
+ /*
+ * RDPID causes #UD if not enabled through secondary execution
+ * controls (ENABLE_RDTSCP). Note, the implicit MSR access to
+ * TSC_AUX is NOT subject to interception, i.e. checking only
+ * the dedicated execution control is architecturally correct.
+ */
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
exception->vector = UD_VECTOR;
exception->error_code_valid = false;
return X86EMUL_PROPAGATE_FAULT;
}
- break;
+ return X86EMUL_CONTINUE;
case x86_intercept_in:
case x86_intercept_ins:
case x86_intercept_out:
case x86_intercept_outs:
- return vmx_check_intercept_io(vcpu, info);
+ if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification))
+ return X86EMUL_CONTINUE;
+
+ vm_exit_reason = EXIT_REASON_IO_INSTRUCTION;
+ break;
case x86_intercept_lgdt:
case x86_intercept_lidt:
@@ -7323,15 +8154,54 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
return X86EMUL_CONTINUE;
- /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
+ if (info->intercept == x86_intercept_lldt ||
+ info->intercept == x86_intercept_ltr ||
+ info->intercept == x86_intercept_sldt ||
+ info->intercept == x86_intercept_str)
+ vm_exit_reason = EXIT_REASON_LDTR_TR;
+ else
+ vm_exit_reason = EXIT_REASON_GDTR_IDTR;
+ /*
+ * FIXME: Decode the ModR/M to generate the correct exit
+ * qualification for memory operands.
+ */
+ break;
+
+ case x86_intercept_hlt:
+ if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING))
+ return X86EMUL_CONTINUE;
+
+ vm_exit_reason = EXIT_REASON_HLT;
+ break;
+
+ case x86_intercept_pause:
+ /*
+ * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides
+ * with vanilla NOPs in the emulator. Apply the interception
+ * check only to actual PAUSE instructions. Don't check
+ * PAUSE-loop-exiting, software can't expect a given PAUSE to
+ * exit, i.e. KVM is within its rights to allow L2 to execute
+ * the PAUSE.
+ */
+ if ((info->rep_prefix != REPE_PREFIX) ||
+ !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING))
+ return X86EMUL_CONTINUE;
+
+ vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION;
break;
/* TODO: check more intercepts... */
default:
- break;
+ return X86EMUL_UNHANDLEABLE;
}
- return X86EMUL_UNHANDLEABLE;
+ exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip);
+ if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH)
+ return X86EMUL_UNHANDLEABLE;
+
+ __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification,
+ exit_insn_len);
+ return X86EMUL_INTERCEPTED;
}
#ifdef CONFIG_X86_64
@@ -7353,8 +8223,8 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
return 0;
}
-static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
- bool *expired)
+int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired)
{
struct vcpu_vmx *vmx;
u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
@@ -7373,9 +8243,9 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
delta_tsc = 0;
/* Convert to host delta tsc if tsc scaling is enabled */
- if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+ if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
delta_tsc && u64_shl_div_u64(delta_tsc,
- kvm_tsc_scaling_ratio_frac_bits,
+ kvm_caps.tsc_scaling_ratio_frac_bits,
vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
return -ERANGE;
@@ -7393,58 +8263,36 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
return 0;
}
-static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
{
to_vmx(vcpu)->hv_deadline_tsc = -1;
}
#endif
-static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
-{
- if (!kvm_pause_in_guest(vcpu->kvm))
- shrink_ple_window(vcpu);
-}
-
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (WARN_ON_ONCE(!enable_pml))
+ return;
+
if (is_guest_mode(vcpu)) {
vmx->nested.update_vmcs01_cpu_dirty_logging = true;
return;
}
/*
- * Note, cpu_dirty_logging_count can be changed concurrent with this
+ * Note, nr_memslots_dirty_logging can be changed concurrent with this
* code, but in that case another update request will be made and so
* the guest will never run with a stale PML value.
*/
- if (vcpu->kvm->arch.cpu_dirty_logging_count)
+ if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
else
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
}
-static int vmx_pre_block(struct kvm_vcpu *vcpu)
-{
- if (pi_pre_block(vcpu))
- return 1;
-
- if (kvm_lapic_hv_timer_in_use(vcpu))
- kvm_lapic_switch_to_sw_timer(vcpu);
-
- return 0;
-}
-
-static void vmx_post_block(struct kvm_vcpu *vcpu)
-{
- if (kvm_x86_ops.set_hv_timer)
- kvm_lapic_switch_to_hv_timer(vcpu);
-
- pi_post_block(vcpu);
-}
-
-static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
@@ -7454,7 +8302,8 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
~FEAT_CTL_LMCE_ENABLED;
}
-static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+#ifdef CONFIG_KVM_SMM
+int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
/* we need a nested vmexit to enter SMM, postpone if run is pending */
if (to_vmx(vcpu)->nested.nested_run_pending)
@@ -7462,10 +8311,17 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !is_smm(vcpu);
}
-static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /*
+ * TODO: Implement custom flows for forcing the vCPU out/in of L2 on
+ * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong
+ * SMI and RSM only modify state that is saved and restored via SMRAM.
+ * E.g. most MSRs are left untouched, but many are modified by VM-Exit
+ * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM.
+ */
vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
if (vmx->nested.smm.guest_mode)
nested_vmx_vmexit(vcpu, -1, 0, 0);
@@ -7476,7 +8332,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
return 0;
}
-static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
+int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int ret;
@@ -7491,22 +8347,24 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
if (ret)
return ret;
+ vmx->nested.nested_run_pending = 1;
vmx->nested.smm.guest_mode = false;
}
return 0;
}
-static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
+void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
/* RSM will cause a vmexit anyway. */
}
+#endif
-static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
}
-static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
+void vmx_migrate_timers(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu)) {
struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
@@ -7516,157 +8374,80 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
}
}
-static void hardware_unsetup(void)
+void vmx_hardware_unsetup(void)
{
+ kvm_set_posted_intr_wakeup_handler(NULL);
+
if (nested)
nested_vmx_hardware_unsetup();
free_kvm_area();
}
-static bool vmx_check_apicv_inhibit_reasons(ulong bit)
-{
- ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
- BIT(APICV_INHIBIT_REASON_HYPERV);
-
- return supported & BIT(bit);
-}
-
-static struct kvm_x86_ops vmx_x86_ops __initdata = {
- .hardware_unsetup = hardware_unsetup,
-
- .hardware_enable = hardware_enable,
- .hardware_disable = hardware_disable,
- .cpu_has_accelerated_tpr = report_flexpriority,
- .has_emulated_msr = vmx_has_emulated_msr,
-
- .vm_size = sizeof(struct kvm_vmx),
- .vm_init = vmx_vm_init,
-
- .vcpu_create = vmx_create_vcpu,
- .vcpu_free = vmx_free_vcpu,
- .vcpu_reset = vmx_vcpu_reset,
-
- .prepare_guest_switch = vmx_prepare_switch_to_guest,
- .vcpu_load = vmx_vcpu_load,
- .vcpu_put = vmx_vcpu_put,
-
- .update_exception_bitmap = vmx_update_exception_bitmap,
- .get_msr_feature = vmx_get_msr_feature,
- .get_msr = vmx_get_msr,
- .set_msr = vmx_set_msr,
- .get_segment_base = vmx_get_segment_base,
- .get_segment = vmx_get_segment,
- .set_segment = vmx_set_segment,
- .get_cpl = vmx_get_cpl,
- .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
- .set_cr0 = vmx_set_cr0,
- .is_valid_cr4 = vmx_is_valid_cr4,
- .set_cr4 = vmx_set_cr4,
- .set_efer = vmx_set_efer,
- .get_idt = vmx_get_idt,
- .set_idt = vmx_set_idt,
- .get_gdt = vmx_get_gdt,
- .set_gdt = vmx_set_gdt,
- .set_dr7 = vmx_set_dr7,
- .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
- .cache_reg = vmx_cache_reg,
- .get_rflags = vmx_get_rflags,
- .set_rflags = vmx_set_rflags,
-
- .tlb_flush_all = vmx_flush_tlb_all,
- .tlb_flush_current = vmx_flush_tlb_current,
- .tlb_flush_gva = vmx_flush_tlb_gva,
- .tlb_flush_guest = vmx_flush_tlb_guest,
-
- .run = vmx_vcpu_run,
- .handle_exit = vmx_handle_exit,
- .skip_emulated_instruction = vmx_skip_emulated_instruction,
- .update_emulated_instruction = vmx_update_emulated_instruction,
- .set_interrupt_shadow = vmx_set_interrupt_shadow,
- .get_interrupt_shadow = vmx_get_interrupt_shadow,
- .patch_hypercall = vmx_patch_hypercall,
- .set_irq = vmx_inject_irq,
- .set_nmi = vmx_inject_nmi,
- .queue_exception = vmx_queue_exception,
- .cancel_injection = vmx_cancel_injection,
- .interrupt_allowed = vmx_interrupt_allowed,
- .nmi_allowed = vmx_nmi_allowed,
- .get_nmi_mask = vmx_get_nmi_mask,
- .set_nmi_mask = vmx_set_nmi_mask,
- .enable_nmi_window = vmx_enable_nmi_window,
- .enable_irq_window = vmx_enable_irq_window,
- .update_cr8_intercept = vmx_update_cr8_intercept,
- .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
- .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
- .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
- .load_eoi_exitmap = vmx_load_eoi_exitmap,
- .apicv_post_state_restore = vmx_apicv_post_state_restore,
- .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
- .hwapic_irr_update = vmx_hwapic_irr_update,
- .hwapic_isr_update = vmx_hwapic_isr_update,
- .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
- .sync_pir_to_irr = vmx_sync_pir_to_irr,
- .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
- .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
-
- .set_tss_addr = vmx_set_tss_addr,
- .set_identity_map_addr = vmx_set_identity_map_addr,
- .get_mt_mask = vmx_get_mt_mask,
-
- .get_exit_info = vmx_get_exit_info,
-
- .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
-
- .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
-
- .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
- .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
- .write_tsc_offset = vmx_write_tsc_offset,
- .write_tsc_multiplier = vmx_write_tsc_multiplier,
-
- .load_mmu_pgd = vmx_load_mmu_pgd,
-
- .check_intercept = vmx_check_intercept,
- .handle_exit_irqoff = vmx_handle_exit_irqoff,
-
- .request_immediate_exit = vmx_request_immediate_exit,
-
- .sched_in = vmx_sched_in,
-
- .cpu_dirty_log_size = PML_ENTITY_NUM,
- .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
-
- .pre_block = vmx_pre_block,
- .post_block = vmx_post_block,
-
- .pmu_ops = &intel_pmu_ops,
- .nested_ops = &vmx_nested_ops,
-
- .update_pi_irte = pi_update_irte,
- .start_assignment = vmx_pi_start_assignment,
+void vmx_vm_destroy(struct kvm *kvm)
+{
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
-#ifdef CONFIG_X86_64
- .set_hv_timer = vmx_set_hv_timer,
- .cancel_hv_timer = vmx_cancel_hv_timer,
-#endif
+ free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
+}
- .setup_mce = vmx_setup_mce,
+/*
+ * Note, the SDM states that the linear address is masked *after* the modified
+ * canonicality check, whereas KVM masks (untags) the address and then performs
+ * a "normal" canonicality check. Functionally, the two methods are identical,
+ * and when the masking occurs relative to the canonicality check isn't visible
+ * to software, i.e. KVM's behavior doesn't violate the SDM.
+ */
+gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags)
+{
+ int lam_bit;
+ unsigned long cr3_bits;
- .smi_allowed = vmx_smi_allowed,
- .enter_smm = vmx_enter_smm,
- .leave_smm = vmx_leave_smm,
- .enable_smi_window = vmx_enable_smi_window,
+ if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG))
+ return gva;
- .can_emulate_instruction = vmx_can_emulate_instruction,
- .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
- .migrate_timers = vmx_migrate_timers,
+ if (!is_64_bit_mode(vcpu))
+ return gva;
- .msr_filter_changed = vmx_msr_filter_changed,
- .complete_emulated_msr = kvm_complete_insn_gp,
+ /*
+ * Bit 63 determines if the address should be treated as user address
+ * or a supervisor address.
+ */
+ if (!(gva & BIT_ULL(63))) {
+ cr3_bits = kvm_get_active_cr3_lam_bits(vcpu);
+ if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48)))
+ return gva;
- .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
-};
+ /* LAM_U48 is ignored if LAM_U57 is set. */
+ lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47;
+ } else {
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP))
+ return gva;
+
+ lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47;
+ }
+
+ /*
+ * Untag the address by sign-extending the lam_bit, but NOT to bit 63.
+ * Bit 63 is retained from the raw virtual address so that untagging
+ * doesn't change a user access to a supervisor access, and vice versa.
+ */
+ return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
+}
+
+static unsigned int vmx_handle_intel_pt_intr(void)
+{
+ struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+
+ /* '0' on failure so that the !PT case can use a RET0 static call. */
+ if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
+ return 0;
+
+ kvm_make_request(KVM_REQ_PMI, vcpu);
+ __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
+ (unsigned long *)&vcpu->arch.pmu.global_status);
+ return 1;
+}
static __init void vmx_setup_user_return_msrs(void)
{
@@ -7694,31 +8475,52 @@ static __init void vmx_setup_user_return_msrs(void)
kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
}
-static __init int hardware_setup(void)
+static void __init vmx_setup_me_spte_mask(void)
+{
+ u64 me_mask = 0;
+
+ /*
+ * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to
+ * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems,
+ * boot_cpu_data.x86_phys_bits holds the actual physical address
+ * w/o the KeyID bits, and kvm_host.maxphyaddr equals to
+ * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits.
+ */
+ if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr)
+ me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
+ kvm_host.maxphyaddr - 1);
+
+ /*
+ * Unlike SME, host kernel doesn't support setting up any
+ * MKTME KeyID on Intel platforms. No memory encryption
+ * bits should be included into the SPTE.
+ */
+ kvm_mmu_set_me_spte_mask(0, me_mask);
+}
+
+__init int vmx_hardware_setup(void)
{
unsigned long host_bndcfgs;
struct desc_ptr dt;
- int r, ept_lpage_level;
+ int r;
store_idt(&dt);
host_idt_base = dt.address;
vmx_setup_user_return_msrs();
- if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
- return -EIO;
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
if (boot_cpu_has(X86_FEATURE_MPX)) {
- rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
- WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
+ rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs);
+ WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
}
if (!cpu_has_vmx_mpx())
- supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
- XFEATURE_MASK_BNDCSR);
+ kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
+ XFEATURE_MASK_BNDCSR);
if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
!(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
@@ -7732,10 +8534,18 @@ static __init int hardware_setup(void)
/* NX support is required for shadow paging. */
if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
- pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n");
+ pr_err_ratelimited("NX (Execute Disable) not supported\n");
return -EOPNOTSUPP;
}
+ /*
+ * Shadow paging doesn't have a (further) performance penalty
+ * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it
+ * by default
+ */
+ if (!enable_ept)
+ allow_smaller_maxphyaddr = true;
+
if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
enable_ept_ad_bits = 0;
@@ -7748,23 +8558,27 @@ static __init int hardware_setup(void)
if (!cpu_has_virtual_nmis())
enable_vnmi = 0;
+#ifdef CONFIG_X86_SGX_KVM
+ if (!cpu_has_vmx_encls_vmexit())
+ enable_sgx = false;
+#endif
+
/*
* set_apic_access_page_addr() is used to reload apic access
* page upon invalidation. No need to do anything if not
* using the APIC_ACCESS_ADDR VMCS field.
*/
if (!flexpriority_enabled)
- vmx_x86_ops.set_apic_access_page_addr = NULL;
+ vt_x86_ops.set_apic_access_page_addr = NULL;
if (!cpu_has_vmx_tpr_shadow())
- vmx_x86_ops.update_cr8_intercept = NULL;
+ vt_x86_ops.update_cr8_intercept = NULL;
#if IS_ENABLED(CONFIG_HYPERV)
if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
&& enable_ept) {
- vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
- vmx_x86_ops.tlb_remote_flush_with_range =
- hv_remote_flush_tlb_with_range;
+ vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+ vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
}
#endif
@@ -7776,34 +8590,38 @@ static __init int hardware_setup(void)
ple_window_shrink = 0;
}
- if (!cpu_has_vmx_apicv()) {
+ if (!cpu_has_vmx_apicv())
enable_apicv = 0;
- vmx_x86_ops.sync_pir_to_irr = NULL;
- }
+ if (!enable_apicv)
+ vt_x86_ops.sync_pir_to_irr = NULL;
- if (cpu_has_vmx_tsc_scaling()) {
- kvm_has_tsc_control = true;
- kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
- kvm_tsc_scaling_ratio_frac_bits = 48;
- }
+ if (!enable_apicv || !cpu_has_vmx_ipiv())
+ enable_ipiv = false;
- kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
+ if (cpu_has_vmx_tsc_scaling())
+ kvm_caps.has_tsc_control = true;
+
+ kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
+ kvm_caps.tsc_scaling_ratio_frac_bits = 48;
+ kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
+ kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
if (enable_ept)
kvm_mmu_set_ept_masks(enable_ept_ad_bits,
cpu_has_vmx_ept_execute_only());
-
- if (!enable_ept)
- ept_lpage_level = 0;
- else if (cpu_has_vmx_ept_1g_page())
- ept_lpage_level = PG_LEVEL_1G;
- else if (cpu_has_vmx_ept_2m_page())
- ept_lpage_level = PG_LEVEL_2M;
else
- ept_lpage_level = PG_LEVEL_4K;
- kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
+ vt_x86_ops.get_mt_mask = NULL;
+
+ /*
+ * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
+ * bits to shadow_zero_check.
+ */
+ vmx_setup_me_spte_mask();
+
+ kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(),
+ ept_caps_to_lpage_level(vmx_capability.ept));
/*
* Only enable PML when hardware supports PML feature, and both EPT
@@ -7812,19 +8630,14 @@ static __init int hardware_setup(void)
if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
enable_pml = 0;
- if (!enable_pml)
- vmx_x86_ops.cpu_dirty_log_size = 0;
-
if (!cpu_has_vmx_preemption_timer())
enable_preemption_timer = false;
if (enable_preemption_timer) {
u64 use_timer_freq = 5000ULL * 1000 * 1000;
- u64 vmx_msr;
- rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
cpu_preemption_timer_multi =
- vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+ vmx_misc_preemption_timer_rate(vmcs_config.misc);
if (tsc_khz)
use_timer_freq = (u64)tsc_khz * 1000;
@@ -7840,151 +8653,110 @@ static __init int hardware_setup(void)
}
if (!enable_preemption_timer) {
- vmx_x86_ops.set_hv_timer = NULL;
- vmx_x86_ops.cancel_hv_timer = NULL;
- vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
+ vt_x86_ops.set_hv_timer = NULL;
+ vt_x86_ops.cancel_hv_timer = NULL;
}
- kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
-
- kvm_mce_cap_supported |= MCG_LMCE_P;
+ kvm_caps.supported_mce_cap |= MCG_LMCE_P;
+ kvm_caps.supported_mce_cap |= MCG_CMCI_P;
if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
return -EINVAL;
- if (!enable_ept || !cpu_has_vmx_intel_pt())
+ if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
pt_mode = PT_MODE_SYSTEM;
+ if (pt_mode == PT_MODE_HOST_GUEST)
+ vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
+ else
+ vt_init_ops.handle_intel_pt_intr = NULL;
setup_default_sgx_lepubkeyhash();
+ vmx_set_cpu_caps();
+
+ /*
+ * Configure nested capabilities after core CPU capabilities so that
+ * nested support can be conditional on base support, e.g. so that KVM
+ * can hide/show features based on kvm_cpu_cap_has().
+ */
if (nested) {
- nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
- vmx_capability.ept);
+ nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept);
r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
if (r)
return r;
}
- vmx_set_cpu_caps();
-
r = alloc_kvm_area();
- if (r)
+ if (r && nested)
nested_vmx_hardware_unsetup();
- return r;
-}
-static struct kvm_x86_init_ops vmx_init_ops __initdata = {
- .cpu_has_kvm_support = cpu_has_kvm_support,
- .disabled_by_bios = vmx_disabled_by_bios,
- .check_processor_compatibility = vmx_check_processor_compat,
- .hardware_setup = hardware_setup,
+ kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
- .runtime_ops = &vmx_x86_ops,
-};
+ /*
+ * On Intel CPUs that lack self-snoop feature, letting the guest control
+ * memory types may result in unexpected behavior. So always ignore guest
+ * PAT on those CPUs and map VM as writeback, not allowing userspace to
+ * disable the quirk.
+ *
+ * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is
+ * supported, UC is slow enough to cause issues with some older guests (e.g.
+ * an old version of bochs driver uses ioremap() instead of ioremap_wc() to
+ * map the video RAM, causing wayland desktop to fail to get started
+ * correctly). To avoid breaking those older guests that rely on KVM to force
+ * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the
+ * safer (for performance) default behavior.
+ *
+ * On top of this, non-coherent DMA devices need the guest to flush CPU
+ * caches properly. This also requires honoring guest PAT, and is forced
+ * independent of the quirk in vmx_ignore_guest_pat().
+ */
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
-static void vmx_cleanup_l1d_flush(void)
-{
- if (vmx_l1d_flush_pages) {
- free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
- vmx_l1d_flush_pages = NULL;
- }
- /* Restore state so sysfs ignores VMX */
- l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+ kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
+
+ return r;
}
-static void vmx_exit(void)
+void vmx_exit(void)
{
-#ifdef CONFIG_KEXEC_CORE
- RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
- synchronize_rcu();
-#endif
-
- kvm_exit();
-
-#if IS_ENABLED(CONFIG_HYPERV)
- if (static_branch_unlikely(&enable_evmcs)) {
- int cpu;
- struct hv_vp_assist_page *vp_ap;
- /*
- * Reset everything to support using non-enlightened VMCS
- * access later (e.g. when we reload the module with
- * enlightened_vmcs=0)
- */
- for_each_online_cpu(cpu) {
- vp_ap = hv_get_vp_assist_page(cpu);
-
- if (!vp_ap)
- continue;
-
- vp_ap->nested_control.features.directhypercall = 0;
- vp_ap->current_nested_vmcs = 0;
- vp_ap->enlighten_vmentry = 0;
- }
+ allow_smaller_maxphyaddr = false;
- static_branch_disable(&enable_evmcs);
- }
-#endif
vmx_cleanup_l1d_flush();
- allow_smaller_maxphyaddr = false;
+ kvm_x86_vendor_exit();
}
-module_exit(vmx_exit);
-static int __init vmx_init(void)
+int __init vmx_init(void)
{
int r, cpu;
-#if IS_ENABLED(CONFIG_HYPERV)
- /*
- * Enlightened VMCS usage should be recommended and the host needs
- * to support eVMCS v1 or above. We can also disable eVMCS support
- * with module parameter.
- */
- if (enlightened_vmcs &&
- ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
- (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
- KVM_EVMCS_VERSION) {
- int cpu;
+ KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx);
- /* Check that we have assist pages on all online CPUs */
- for_each_online_cpu(cpu) {
- if (!hv_get_vp_assist_page(cpu)) {
- enlightened_vmcs = false;
- break;
- }
- }
-
- if (enlightened_vmcs) {
- pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
- static_branch_enable(&enable_evmcs);
- }
+ if (!kvm_is_vmx_supported())
+ return -EOPNOTSUPP;
- if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
- vmx_x86_ops.enable_direct_tlbflush
- = hv_enable_direct_tlbflush;
+ /*
+ * Note, VMCS and eVMCS configuration only touch VMX knobs/variables,
+ * i.e. there's nothing to unwind if a later step fails.
+ */
+ hv_init_evmcs();
- } else {
- enlightened_vmcs = false;
- }
-#endif
+ /*
+ * Parse the VMCS config and VMX capabilities before anything else, so
+ * that the information is available to all setup flows.
+ */
+ if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
+ return -EIO;
- r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
+ r = kvm_x86_vendor_init(&vt_init_ops);
if (r)
return r;
- /*
- * Must be called after kvm_init() so enable_ept is properly set
- * up. Hand the parameter mitigation value in which was stored in
- * the pre module init parser. If no parameter was given, it will
- * contain 'auto' which will be turned into the default 'cond'
- * mitigation mode.
- */
- r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
- if (r) {
- vmx_exit();
- return r;
- }
+ /* Must be called after common x86 init so enable_ept is setup. */
+ r = vmx_setup_l1d_flush();
+ if (r)
+ goto err_l1d_flush;
for_each_possible_cpu(cpu) {
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
@@ -7992,20 +8764,11 @@ static int __init vmx_init(void)
pi_init_cpu(cpu);
}
-#ifdef CONFIG_KEXEC_CORE
- rcu_assign_pointer(crash_vmclear_loaded_vmcss,
- crash_vmclear_local_loaded_vmcss);
-#endif
vmx_check_vmcs12_offsets();
- /*
- * Shadow paging doesn't have a (further) performance penalty
- * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it
- * by default
- */
- if (!enable_ept)
- allow_smaller_maxphyaddr = true;
-
return 0;
+
+err_l1d_flush:
+ kvm_x86_vendor_exit();
+ return r;
}
-module_init(vmx_init);